In [1]:
import copy
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    RetryError
)
from tqdm.notebook import tqdm  # Jupyter-compatible progress bar


In [2]:
# RATINGS DATASET
# Read the tab-separated ratings file; rows the parser cannot handle are skipped.
df_ratings = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
    on_bad_lines='skip',
    low_memory=False,
)

# Preview the first three rows, then report the frame's dimensions.
display(df_ratings.head(n=3))
print(f'Number of rows and columns in ratings dataset are {df_ratings.shape[0] } and {df_ratings.shape[1]}')

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2157
1,tt0000002,5.5,293
2,tt0000003,6.5,2199


Number of rows and columns in ratings dataset are 1615604 and 3


In [3]:
# Check for unique movie ids
# Collect every rated title id into a set so later membership tests are cheap.
valid_movie_ids = set(df_ratings['tconst'].tolist())

print(f"Unique movies with ratings are {len(valid_movie_ids)}")

Unique movies with ratings are 1615604


In [4]:
# CONTENT DATASET -- settings used by the chunked read of the titles file

# Only the columns that the later cells actually use
usecols = [
    'tconst',
    'titleType',
    'primaryTitle',
    'startYear',
    'genres',
]

# Column dtypes handed to the reader
dtypes = {
    'tconst': 'category',
    'startYear': 'category',
}

# Number of rows read per chunk
chunk_size = 100000

# Genre strings that are treated as "no genre information"
invalid_genres = {'[]', '\\N'}

# Holder for the filtered chunks
results = []

# Keep only rated movies that carry usable genre information
def process_chunk(chunk):
    """Return the rows of ``chunk`` that pass all four filters.

    A row is kept when its ``genres`` value is present, is not one of the
    module-level ``invalid_genres`` markers, its ``titleType`` equals
    ``'movie'``, and its ``tconst`` appears in the module-level
    ``valid_movie_ids`` set.
    """
    genres = chunk['genres']
    has_genre = genres.notna() & ~genres.isin(invalid_genres)
    is_movie = chunk['titleType'] == 'movie'
    is_rated = chunk['tconst'].isin(valid_movie_ids)
    return chunk[has_genre & is_movie & is_rated]

# Reading the movie file 100000 records per chunk (4 parallel threads).
# The chunked reader is opened as a context manager so its underlying file
# handle is closed as soon as every chunk has been consumed; previously the
# reader was never closed. The executor is listed second so it is shut down
# (waiting for all filter tasks) before the reader is closed.
with pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    chunksize=chunk_size,
    on_bad_lines='skip',
    usecols=usecols,
    dtype=dtypes,
    low_memory=False,
) as chunks, ThreadPoolExecutor(max_workers=4) as executor:
    # executor.map preserves chunk order, so the result keeps file order
    results = list(executor.map(process_chunk, chunks))

# Concatenate results
df_imdb = pd.concat(results, ignore_index=True, copy=False)

df_imdb.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres
0,tt0000009,movie,Miss Jerry,1894,Romance
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport"
2,tt0000574,movie,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
3,tt0000591,movie,The Prodigal Son,1907,Drama
4,tt0000615,movie,Robbery Under Arms,1907,Drama


In [5]:
# Report how many rows and columns survived the movie/genre/rating filters.
print(f'Number of rows and columns in movies dataset are {df_imdb.shape[0] } and {df_imdb.shape[1]}')

Number of rows and columns in movies dataset are 323475 and 5


In [6]:
# Attach the rating columns to the filtered movies; an inner join on tconst
# keeps only ids present in both frames.
final_data = pd.merge(df_imdb, df_ratings, on='tconst', how='inner')
final_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000009,movie,Miss Jerry,1894,Romance,5.4,224
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,558
2,tt0000574,movie,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,987
3,tt0000591,movie,The Prodigal Son,1907,Drama,5.6,31
4,tt0000615,movie,Robbery Under Arms,1907,Drama,3.9,28


In [7]:
# Bayesian Average for minimizing new movie bias
def bayesian_average(df, vote_quantile=0.50):
    """Return a copy of ``df`` with a vote-weighted ``adjusted_rating`` column.

    Each rating is pulled toward the overall mean in proportion to how few
    votes it has::

        adjusted = v / (v + m) * R + m / (v + m) * C

    where ``v`` is ``numVotes``, ``R`` is ``averageRating``, ``C`` is the mean
    of ``averageRating`` and ``m`` is the ``vote_quantile`` quantile of
    ``numVotes``. The result is rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``averageRating`` and ``numVotes`` columns.
    vote_quantile : float, default 0.50
        Quantile of ``numVotes`` used as the vote threshold ``m``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    mean_rating = df['averageRating'].mean()
    minvote_threshold = df['numVotes'].quantile(vote_quantile)
    total_weight = df['numVotes'] + minvote_threshold
    adjusted = (
        (df['numVotes'] / total_weight) * df['averageRating']
        + (minvote_threshold / total_weight) * mean_rating
    )
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    return df.assign(adjusted_rating=adjusted.round(2))
# Score every movie, then order the rows by title id.
final_data = bayesian_average(final_data).sort_values(by='tconst')
final_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,adjusted_rating
0,tt0000009,movie,Miss Jerry,1894,Romance,5.4,224,5.56
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,558,5.39
2,tt0000574,movie,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,987,6.01
3,tt0000591,movie,The Prodigal Son,1907,Drama,5.6,31,5.96
4,tt0000615,movie,Robbery Under Arms,1907,Drama,3.9,28,5.46


In [8]:
# Plain Python list of the title ids, in the frame's current row order.
imdb_ids = final_data['tconst'].tolist()

In [10]:
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    # Retry only network/HTTP failures; a missing token will not fix itself.
    retry=retry_if_exception_type(requests.exceptions.RequestException),
)
def fetch_tmdb_data(imdb_id):
    """Query TMDB's ``/find`` endpoint for an IMDb id and return the JSON body.

    The bearer token is read from the ``TMDB_BEARER_TOKEN`` environment
    variable rather than being embedded in the notebook.
    NOTE(review): a token was previously committed in this cell; it should be
    treated as exposed and rotated.

    Parameters
    ----------
    imdb_id : str
        IMDb title id, interpolated into the request URL.

    Returns
    -------
    dict
        The decoded JSON response.

    Raises
    ------
    RuntimeError
        If ``TMDB_BEARER_TOKEN`` is unset or empty (not retried).
    tenacity.RetryError
        If all five attempts fail with a ``requests`` exception, including
        the HTTP error raised by ``raise_for_status``.
    """
    token = os.environ.get("TMDB_BEARER_TOKEN")
    if not token:
        raise RuntimeError("TMDB_BEARER_TOKEN environment variable is not set")

    url = f"https://api.themoviedb.org/3/find/{imdb_id}?external_source=imdb_id"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    response = requests.get(url, headers=headers, timeout=20)
    response.raise_for_status()
    return response.json()

def process_imdb_id(imdb_id):
    """Look up one IMDb id and return ``(imdb_id, language_or_status)``.

    The second element is the first movie result's ``original_language``
    (``'not_found'`` if that key is absent), the text
    ``'No Movie Data in TMDB'`` when there are no movie results, or a
    ``'Processing error: ...'`` message when anything raises. Errors are
    reported in the returned value instead of propagating, so a single failed
    lookup does not abort a batch.
    """
    try:
        matches = fetch_tmdb_data(imdb_id).get('movie_results', [])
        if not matches:
            return imdb_id, 'No Movie Data in TMDB'
        language = matches[0].get('original_language', 'not_found')
        return imdb_id, language
    except Exception as e:
        return imdb_id, f'Processing error: {str(e)}'
    

# Look up the language for the first 10 ids only, using up to 20 worker
# threads, with a progress bar over the results.
with ThreadPoolExecutor(max_workers=20) as executor:
    results = list(
        tqdm(
            executor.map(process_imdb_id, imdb_ids[:10]),
            desc="Processing IMDB IDs",
            total=len(imdb_ids[:10]),
        )
    )

# Build df_lang from the (tconst, language) pairs
data = {
    'tconst': [tconst for tconst, _ in results],
    'original_language': [language for _, language in results],
}
df_lang = pd.DataFrame(data)


Processing IMDB IDs:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# The export below is disabled, so running this cell does not write
# 'language_dictionary.xlsx'; a later cell reads a file of that name.
# df_lang.to_excel('language_dictionary.xlsx')
# Preview the first rows of the language lookup table.
df_lang.head()

Unnamed: 0,tconst,original_language
0,tt0000009,en
1,tt0000147,en
2,tt0000574,en
3,tt0000591,fr
4,tt0000615,en


In [12]:
# Load the tconst -> original_language table from disk.
# NOTE(review): no active code in the visible cells writes this file (the
# to_excel call on df_lang is commented out), so it presumably has to exist
# from an earlier run -- confirm before a fresh Restart & Run All.
language_dict = pd.read_excel('language_dictionary.xlsx')
language_dict.head()

Unnamed: 0,tconst,original_language
0,tt0000009,en
1,tt0000147,en
2,tt0000574,en
3,tt0000591,fr
4,tt0000615,en


In [13]:
# Attach the language to every movie (inner join on tconst).
# Any original_language column left over from a previous run of this cell is
# dropped first; otherwise a re-run would produce original_language_x /
# original_language_y and later lookups of 'original_language' would fail.
final_data = (
    final_data
    .drop(columns='original_language', errors='ignore')
    .merge(language_dict, on='tconst', how='inner')
)
final_data.to_excel('Final Data.xlsx')

In [14]:
# Preview the merged frame, which now includes the original_language column.
final_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,adjusted_rating,original_language
0,tt0000009,movie,Miss Jerry,1894,Romance,5.4,224,5.56,en
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,558,5.39,en
2,tt0000574,movie,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,987,6.01,en
3,tt0000591,movie,The Prodigal Son,1907,Drama,5.6,31,5.96,fr
4,tt0000615,movie,Robbery Under Arms,1907,Drama,3.9,28,5.46,en


In [15]:
# Ask which genre to rank by, then work on an independent copy so that
# final_data itself is not modified by the following cells.
user_input = input('Which genre do you want to check for: ')
movie_list = final_data.copy(deep=True)

def genre_rank(user_input, genre_input):
    """Return the 1-based position of a genre in a comma-separated genre string.

    The comparison is on whole genre names, ignoring case and surrounding
    whitespace, so ``'Music'`` does not match ``'Musical'``.

    Parameters
    ----------
    user_input : str
        Genre name to look for.
    genre_input : str
        Comma-separated genres, e.g. ``'Action,Adventure,Biography'``.

    Returns
    -------
    int
        Position of the first matching genre (1 for the first entry), or 0
        when the genre is not listed, ``user_input`` is blank, or either
        argument is not a string (for example a missing value).
    """
    if not isinstance(user_input, str) or not isinstance(genre_input, str):
        return 0

    wanted = user_input.strip().casefold()
    if not wanted:
        # A blank query previously matched every row via the substring test
        return 0

    for position, genre in enumerate(genre_input.split(','), start=1):
        if genre.strip().casefold() == wanted:
            return position
    return 0

# Store, for each movie, the rank of the requested genre within its genre
# string (0 means the genre is not listed).
movie_list[f'has_{user_input}'] = movie_list['genres'].map(
    lambda genres: genre_rank(user_input, genres)
)

# Preview only the movies that list the genre at all.
movie_list.loc[movie_list[f'has_{user_input}'].ne(0)].head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,adjusted_rating,original_language,has_Action
2,tt0000574,movie,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,987,6.01,en,1
57,tt0002574,movie,What Happened to Mary,1912,"Action,Drama,Thriller",5.9,38,6.05,en,1
131,tt0003545,movie,Who Will Marry Mary?,1913,"Action,Adventure",5.2,30,5.84,en,1
171,tt0003747,movie,Cameo Kirby,1914,"Action,Drama,Romance",6.4,18,6.19,en,1
203,tt0003897,movie,The Exploits of Elaine,1914,Action,6.1,107,6.11,en,1


In [16]:
user_lang = input('Enter language that you want to see movies in:')

# Keep movies that list the genre and match the language, then order them by
# genre rank (ascending) and by adjusted rating (descending) within each rank.
movie_list = (
    movie_list[
        movie_list[f'has_{user_input}'].ne(0)
        & movie_list['original_language'].eq(user_lang)
    ]
    .sort_values(
        by=[f'has_{user_input}', 'adjusted_rating'],
        ascending=[True, False],
    )
)

movie_list.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,adjusted_rating,original_language,has_Action
38198,tt0068182,movie,"Aguirre, the Wrath of God",1972,"Action,Adventure,Biography",7.8,65316,7.8,de,1
66794,tt0119472,movie,Knockin' on Heaven's Door,1997,"Action,Comedy,Crime",7.8,35446,7.8,de,1
70972,tt0130827,movie,Run Lola Run,1998,"Action,Crime,Thriller",7.6,213461,7.6,de,1
72317,tt0135790,movie,Bang Boom Bang - Ein todsicheres Ding,1999,"Action,Comedy,Crime",7.5,11113,7.49,de,1
140294,tt0765432,movie,The Baader Meinhof Complex,2008,"Action,Biography,Crime",7.3,40986,7.3,de,1


In [17]:
# Write the filtered, sorted list to an Excel file named after the chosen genre.
movie_list.to_excel(f'movie_has_{user_input}_list.xlsx')