In [1]:
# Install pandas via the %pip magic (rather than a shell `pip`) so the package
# lands in the environment of the running kernel.
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# python-dotenv provides load_dotenv() and requests provides the HTTP client;
# both are imported and used further down in this notebook.
%pip install python-dotenv requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import os
import requests
from dotenv import load_dotenv
import time
import re

In [4]:
# Load the user table. The file has no header row, so column names are
# supplied here; the multi-character "::" separator is handled by the
# python parsing engine.
df_users = pd.read_csv(
    "../datasets/users.dat",
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
    header=None,
    names=["Id", "Gender", "Age", "Occupation", "Zip-code"],
    index_col="Id",
    # Zip codes are identifiers, not numbers: pin them to str so leading
    # zeros (e.g. "02460") are kept no matter what dtype inference would pick.
    dtype={"Zip-code": str},
)
df_users

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,02460
5,M,25,20,55455
...,...,...,...,...
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060


In [6]:
# Load the ratings table, indexed by the (UserID, MovieID) pair.
# The file has no header row and uses "::" between fields.
df_ratings = pd.read_csv(
    "../datasets/ratings.dat",
    sep="::",
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    index_col=["UserID", "MovieID"],
    encoding="ISO-8859-1",
    engine="python",
)
df_ratings

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating,Timestamp
UserID,MovieID,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
...,...,...,...
6040,1091,1,956716541
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


In [74]:
# Load the movie table, indexed by movie Id. Each row carries the display
# name and a "|"-separated genre string.
df_movies = pd.read_csv(
    "../datasets/movies.dat",
    sep="::",
    header=None,
    names=["Id", "Name", "Genres"],
    index_col="Id",
    encoding="ISO-8859-1",
    engine="python",
)
df_movies

Unnamed: 0_level_0,Name,Genres
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
...,...,...
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama


In [75]:
# Expand the "|"-separated genre string into one 0/1 indicator column per genre.
df_genres = (
    df_movies["Genres"]
    .str
    .get_dummies(sep="|")
)

In [76]:
# Keep only the movie name and place the genre indicator columns next to it,
# replacing the original "Genres" string column.
df_movies = pd.concat(
    [
        df_movies[["Name"]],
        df_genres,
    ],
    axis=1,
)
df_movies

Unnamed: 0_level_0,Name,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3950,Tigerland (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3951,Two Family House (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [79]:
# Populate the process environment via python-dotenv before reading the token.
load_dotenv()

# Fail fast when the token is absent: otherwise the Authorization header would
# literally read "Bearer None" and the lookups would run with an invalid
# credential instead of stopping here with a clear message.
_tmdb_token = os.getenv("TMDB_API_TOKEN")
if not _tmdb_token:
    raise RuntimeError(
        "TMDB_API_TOKEN is not set; define it in the environment or in a .env file."
    )

# Request headers shared by every TMDB call in this notebook.
headers = {
    "Authorization": f"Bearer {_tmdb_token}",
    "Accept": "application/json"
}

In [86]:
def get_tmdb_metadata(row, timeout=10):
    """Search TMDB for one movie and return selected fields of the first hit.

    The title and release year are parsed from ``row['Name']``, which is
    expected to end in a four-digit year in parentheses, e.g.
    ``"Toy Story (1995)"``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``'Name'`` entry.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up on this movie.
        Without a timeout a single stalled connection would block the whole
        ``DataFrame.apply`` run indefinitely.

    Returns
    -------
    pandas.Series
        ``adult``, ``original_language``, ``overview``, ``popularity``,
        ``vote_average`` and ``vote_count`` taken from the first search
        result. An empty Series is returned when the name cannot be parsed,
        the response status is not 200, the search has no results, or the
        request raises.
    """
    fields = (
        'adult',
        'original_language',
        'overview',
        'popularity',
        'vote_average',
        'vote_count',
    )

    match = re.match(r"(.*?)\s*\((\d{4})\)\s*$", row['Name'])
    if not match:
        # Explicit dtype avoids the empty-Series default-dtype warning on older pandas.
        return pd.Series(dtype=object)

    title = match.group(1).strip()
    year = match.group(2)

    params = {
        "query": title,
        "primary_release_year": year,
        "page": 1
    }

    try:
        response = requests.get(
            "https://api.themoviedb.org/3/search/movie",
            headers=headers,
            params=params,
            timeout=timeout
        )
        # Short pause between consecutive requests.
        time.sleep(0.05)

        if response.status_code != 200:
            # Surface auth / rate-limit / server problems instead of silently
            # producing an empty row.
            print(f"TMDB returned HTTP {response.status_code} for {title}")
            return pd.Series(dtype=object)

        results = response.json().get('results', [])
        if results:
            best = results[0]
            return pd.Series({field: best.get(field) for field in fields})
        return pd.Series(dtype=object)

    except Exception as e:
        # Best effort: report the failure and let the remaining rows proceed.
        print(f"Error fetching data for {title}: {str(e)}")
        return pd.Series(dtype=object)

In [87]:
# Query TMDB once per movie. Only the "Name" column is needed for the lookup,
# so the genre indicator columns are not packed into every row passed along.
tmdb_metadata = df_movies[["Name"]].apply(get_tmdb_metadata, axis=1)

# Drop any metadata columns left behind by an earlier run of this cell before
# attaching the fresh ones; otherwise re-running the cell would append a
# second copy of every metadata column.
df_movies = pd.concat(
    [
        df_movies.drop(columns=tmdb_metadata.columns, errors="ignore"),
        tmdb_metadata,
    ],
    axis=1,
)

In [88]:
# Display the movie table, which now carries the TMDB columns appended above.
df_movies

Unnamed: 0_level_0,Name,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,Sci-Fi,Thriller,War,Western,adult,original_language,overview,popularity,vote_average,vote_count
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,0,0,0,0,...,0,0,0,0,False,en,"Led by Woody, Andy's toys live happily in his ...",4.124,7.968,18679.0
2,Jumanji (1995),0,1,0,1,0,0,0,0,1,...,0,0,0,0,False,en,When siblings Judy and Peter discover an encha...,1.168,7.200,10680.0
3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,...,0,0,0,0,False,en,A family wedding reignites the ancient feud be...,3.531,6.462,392.0
4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,...,0,0,0,0,False,en,"Cheated on, mistreated and stepped on, the wom...",3.530,6.300,170.0
5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,...,0,0,0,0,False,en,Just when George Banks has recovered from his ...,3.781,6.232,746.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,0,...,0,0,0,0,False,en,"Greg Focker is ready to marry his girlfriend, ...",4.200,6.675,6017.0
3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,0,...,0,0,0,0,False,en,The drug-induced utopias of four Coney Island ...,2.287,8.015,10154.0
3950,Tigerland (2000),0,0,0,0,0,0,0,1,0,...,0,0,0,0,False,en,A group of recruits go through Advanced Infant...,3.531,6.629,563.0
3951,Two Family House (2000),0,0,0,0,0,0,0,1,0,...,0,0,0,0,False,en,Buddy Visalo (Michael Rispoli) is a factory wo...,0.637,6.400,18.0
