In [2]:
#download, clean and merge imdb film dataset


#import modules, packages and libraries ~ 10mins

import pandas as pd
import requests as req
import gzip
from io import BytesIO
import time
import seaborn as sns
sns.set()


#get film datasets

#IMDb dataset urls (tab-separated, gzip-compressed)
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #per-title entries with titleId, region and language


def _load_imdb_table(url, timeout=(10, 600)):
    """Download one gzip-compressed IMDb TSV dump and parse it into a DataFrame.

    Args:
        url: address of the .tsv.gz file.
        timeout: (connect, read) timeout in seconds handed to requests, so a
            stalled connection fails instead of hanging the cell forever.

    Returns:
        The parsed table as a pandas DataFrame.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never fed to the gzip/CSV parsers.
    """
    response = req.get(url, timeout=timeout)
    response.raise_for_status()
    #the compressed and decompressed byte buffers are released when this function returns
    return pd.read_csv(BytesIO(gzip.decompress(response.content)), delimiter='\t', low_memory=False)


#download, decompress and read each dump into a dataframe
titles = _load_imdb_table(url_title_basics)
crew = _load_imdb_table(url_crew)
ratings = _load_imdb_table(url_ratings)
names = _load_imdb_table(url_names)
langs = _load_imdb_table(url_langs)



#clean data

#titles that have at least one English-language entry in the akas table
desired_langs = ['en']
filtered_langs = langs[langs['language'].isin(desired_langs)]
tconsts_filtered_langs = filtered_langs['titleId'].tolist()

#titles that have at least one entry for an English-speaking region
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
filtered_regions = langs[langs['region'].isin(desired_regions)]
tconsts_filtered_regions = filtered_regions['titleId'].tolist()

#remove unsuitable films
is_movie = titles['titleType'] == 'movie'
has_genres = titles['genres'] != r'\N'
not_adult = pd.to_numeric(titles['isAdult'], errors='coerce') == 0
#compare years as numbers rather than as text; the \N marker coerces to NaN and fails the test
from_1955_on = pd.to_numeric(titles['startYear'], errors='coerce') >= 1955

#copy() gives an independent frame, so the column assignment below does not write into a slice
titles = titles[is_movie & has_genres & not_adult & from_1955_on].copy()
titles['isAdult'] = pd.to_numeric(titles['isAdult'], errors='coerce')

#a film must appear in both the language list and the region list
titles = titles[(titles['tconst'].isin(tconsts_filtered_langs) & (titles['tconst'].isin(tconsts_filtered_regions)))]

#get tconsts of the films still present in titles
film_tconsts = titles['tconst'].tolist()

#keep only the crew and rating rows that belong to those films
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#columns that are not needed in the final dataset, listed per table
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

#drop them from each table
titles = titles.drop(remove_from_titles, axis='columns')
crew = crew.drop(remove_from_crew, axis='columns')
ratings = ratings.drop(remove_from_ratings, axis='columns')
names = names.drop(remove_from_names, axis='columns')



#merge relational tables

crew_data = crew.copy()

#merge crew data with names table to get respective names rather than nconst
crew_data['nconst'] = crew_data['nconst'].str.split(', ')
crew_data = crew_data.explode('nconst')
crew_data = pd.merge(crew_data, names, on='nconst', how='left')

#people without a matching name would otherwise be joined in below as the literal text 'nan'
crew_data = crew_data.dropna(subset=['primaryName'])

#one row per film, one column per crew category, names joined with ', '
crew_data = crew_data.pivot_table(
    index='tconst',
    columns='category',
    values='primaryName',
    aggfunc=lambda x: ', '.join(str(item) for item in x),
).reset_index()
crew_data.columns.name = None

#format and restructure columns
#select the wanted categories by name instead of renaming by position: the set of
#categories present in the data decides which columns the pivot produces, so a fixed
#positional list breaks (or mislabels columns) as soon as that set changes.
#reindex also creates an empty column for any wanted category that is absent.
crew_columns = ['actor', 'actress', 'cinematographer', 'composer', 'director', 'editor', 'producer', 'writer']
crew_data = crew_data.reindex(columns=['tconst'] + crew_columns)

#merge datasets for one complete table
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')



#remove data-sparse films

#keep films that are missing fewer than 4 of these six crew roles
columns_check = ['director', 'cinematographer', 'editor', 'writer', 'composer', 'producer']
film_data = film_data[film_data[columns_check].isna().sum(axis=1) < 4]

#the IMDb files mark a missing value with the literal text \N, which dropna() does not
#recognise, so films without a runtime have to be removed explicitly
film_data = film_data[film_data['runtimeMinutes'] != r'\N']

#drop films missing any of the remaining required fields
film_data = film_data.dropna(subset=['actor', 'actress', 'runtimeMinutes', 'averageRating', 'genres'])

#add columns for plot and poster path
film_data['plot'] = 'NaN'
film_data['poster'] = 'NaN'


film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,writer,plot,poster
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,Steven Rogers,,
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,José Luis García Sánchez,,
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",James C. Houston,,Pat Boyette,,Don Russell,Henry Garcia,,
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yôko Mizuki,,
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,Dino Risi,,Marcello Girosi,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96858,tt9911196,The Marriage Escape,2020,103,"Comedy,Drama",7.5,"Herman Finkers, Ferdi Stofmeel","Johanna ter Steege, Leonie ter Braak",,Daniël Polman,Johan Nijenhuis,,Ingmar Menning,"Radek Bajgar, Maarten Lebens, Mirka Zlatníková",,
96860,tt9914192,No Gogó do Paulinho,2020,98,Comedy,5.3,"Maurício Manfrini, Alan Rocha, Alex Teix",Cacau Protásio,,Fabiano Krieger,Roberto Santucci,,André Carreira,"Paulo Cursino, Odete Damico, Sergio Martorelli",,
96862,tt9914942,Life Without Sara Amat,2019,74,Drama,6.7,Biel Rossell,"Maria Morera, Francesca Piñón, Anna Sabaté",Gris Jordana,Pau Vallvé,Laura Jou,Raúl Román,,"Coral Cruz, Pep Puig",,
96866,tt9916170,The Rehearsal,2019,51,Drama,7.0,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,"Lillah Halla, Melissa de Raaf",,


In [None]:
#TMDB api with multi-threading

import concurrent.futures
import time
import os

#theMovieDB api call for film plot summary and poster
def fetchDetails(film_id):
    """Request the details of one film from the TMDB API.

    The bearer token is read from the TMDB_API_TOKEN environment variable
    instead of being stored in the notebook, where it would be published
    together with the code.

    Args:
        film_id: identifier placed in the /3/movie/{film_id} path.

    Returns:
        The requests response object; the status is not checked here, so
        callers should test `response.ok` before reading the body.

    Raises:
        RuntimeError: if TMDB_API_TOKEN is not set.
    """
    import os  #local import so the function does not depend on another cell's imports

    token = os.environ.get('TMDB_API_TOKEN')
    if not token:
        raise RuntimeError('Set the TMDB_API_TOKEN environment variable to a TMDB API read access token')

    url = f'https://api.themoviedb.org/3/movie/{film_id}'

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}"
    }

    #a timeout stops one stalled connection from blocking a worker thread indefinitely
    response = req.get(url, headers=headers, timeout=30)

    return response

#get film poster and plot for given batch of films
def doBatch(batch):
    """Fetch TMDB details for every film in `batch` and store plot and poster in film_data.

    Requests are sent in chunks of 50 through a thread pool. The global
    request_counter is advanced once per response and a 10 second pause is
    taken every 50 responses, so the pause falls between chunks of requests
    rather than after the whole batch has already been sent.

    Args:
        batch: slice of film_data; its 'tconst' column supplies the ids to
            fetch and its index labels select the rows of the global
            film_data that are updated.
    """
    global request_counter
    #os.cpu_count() may return None when the count cannot be determined
    MAX_THREADS = min(os.cpu_count() or 1, 1000)
    CHUNK_SIZE = 50

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        for start in range(0, len(batch), CHUNK_SIZE):
            chunk = batch.iloc[start:start + CHUNK_SIZE]
            results = list(executor.map(doFetch, chunk['tconst']))

            for index, response in zip(chunk.index, results):

                request_counter += 1

                if response.ok:

                    details = response.json()

                    #.get() tolerates payloads that lack one of the two fields
                    overview = details.get('overview')
                    if overview:
                        film_data.at[index, 'plot'] = overview

                    poster_path = details.get('poster_path')
                    if poster_path:
                        film_data.at[index, 'poster'] = poster_path

                if request_counter % 50 == 0:
                    time.sleep(10)
  
   
def doFetch(film_id):
    """Worker run by the thread pool: fetch the details of a single film."""
    response = fetchDetails(film_id)
    return response


#process batches of 1,000 films at a time
batch_size = 1000

#ceiling division: no trailing empty batch when the film count is an exact multiple of batch_size
num_batches = (len(film_data) + batch_size - 1) // batch_size
request_counter = 0
print(num_batches)

for i in range(num_batches):
    start_index = i * batch_size
    end_index = (i + 1) * batch_size

    #slice DataFrame to get the current batch
    current_batch = film_data.iloc[start_index:end_index]

    #fetch details for the current batch
    doBatch(current_batch)

    time.sleep(1)

    print(f"Done batch {i + 1}")
    

In [5]:
from tmdb_calls import doBatch
import concurrent.futures
from multiprocessing import Manager

if __name__ == '__main__':

    #namespace proxy used to hand film_data and a counter to the worker process
    manager = Manager()
    shared_data = manager.Namespace()
    shared_data.film_data = film_data
    shared_data.number = 1

    with concurrent.futures.ProcessPoolExecutor() as process_executor:

        future = process_executor.submit(doBatch, shared_data)

        #result() blocks until the worker is done and re-raises any exception it hit,
        #so the completion message below is only printed after a successful run
        future.result()
        print("All batches completed")

        print(shared_data.number)


All batches completed
1


In [4]:
#display the current contents of film_data
film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,writer,plot,poster
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,Steven Rogers,,
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,José Luis García Sánchez,,
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",James C. Houston,,Pat Boyette,,Don Russell,Henry Garcia,,
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yôko Mizuki,,
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,Dino Risi,,Marcello Girosi,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96858,tt9911196,The Marriage Escape,2020,103,"Comedy,Drama",7.5,"Herman Finkers, Ferdi Stofmeel","Johanna ter Steege, Leonie ter Braak",,Daniël Polman,Johan Nijenhuis,,Ingmar Menning,"Radek Bajgar, Maarten Lebens, Mirka Zlatníková",,
96860,tt9914192,No Gogó do Paulinho,2020,98,Comedy,5.3,"Maurício Manfrini, Alan Rocha, Alex Teix",Cacau Protásio,,Fabiano Krieger,Roberto Santucci,,André Carreira,"Paulo Cursino, Odete Damico, Sergio Martorelli",,
96862,tt9914942,Life Without Sara Amat,2019,74,Drama,6.7,Biel Rossell,"Maria Morera, Francesca Piñón, Anna Sabaté",Gris Jordana,Pau Vallvé,Laura Jou,Raúl Román,,"Coral Cruz, Pep Puig",,
96866,tt9916170,The Rehearsal,2019,51,Drama,7.0,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,"Lillah Halla, Melissa de Raaf",,


In [None]:
#TEST  fetch credits #############################
def fetchCredits(film_id):
    """Request the credits of one film from the TMDB API.

    The bearer token is read from the TMDB_API_TOKEN environment variable
    instead of being stored in the notebook, where it would be published
    together with the code.

    Args:
        film_id: identifier placed in the /3/movie/{film_id}/credits path.

    Returns:
        The requests response object; the status is not checked here, so
        callers should test `response.ok` before reading the body.

    Raises:
        RuntimeError: if TMDB_API_TOKEN is not set.
    """
    import os  #local import so the function does not depend on another cell's imports

    token = os.environ.get('TMDB_API_TOKEN')
    if not token:
        raise RuntimeError('Set the TMDB_API_TOKEN environment variable to a TMDB API read access token')

    url = f'https://api.themoviedb.org/3/movie/{film_id}/credits?language=en-US'

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}"
    }

    #a timeout stops one stalled connection from blocking the loop indefinitely
    response = req.get(url, headers=headers, timeout=30)

    return response

def filter_crew_by_department(crew, department):
    """Return the entries of `crew` whose 'known_for_department' equals `department`.

    Args:
        crew: iterable of dicts, each carrying a 'known_for_department' key.
        department: department name to match exactly.

    Returns:
        A new list with the matching entries, in their original order.
    """
    return [person for person in crew if person['known_for_department'] == department]


def do(items):
    """Count the films for which TMDB credits could fill at least one empty crew column.

    For every row the credits are fetched by 'tconst'. A film counts as
    helped when one of its empty role columns has at least one matching
    person in the credits.

    Args:
        items: DataFrame with a 'tconst' column and the role columns
            'director', 'editor', 'producer', 'cinematographer', 'composer'.

    Returns:
        Number of rows for which at least one empty role could be filled.
    """
    #film_data column -> department searched when that column is empty
    role_departments = {
        'director': 'Directing',
        'editor': 'Editing',
        'producer': 'Production',
        'cinematographer': 'Camera',
        'composer': 'Sound',
    }

    total = 0

    for index, row in items.iterrows():
        # Further film credits, cast, producers etc
        api_credits = fetchCredits(row['tconst'])

        if not api_credits.ok:
            continue

        #directors, editors, producers etc. are listed under 'crew' in the credits
        #payload; the 'cast' list only holds the performers
        #NOTE(review): crew entries also carry a per-film 'department' field, which may be
        #a better match than 'known_for_department' used by the helper - confirm
        crew_members = api_credits.json().get('crew') or []

        helped = any(
            pd.isnull(row[column]) and filter_crew_by_department(crew_members, department)
            for column, department in role_departments.items()
        )

        if helped:
            total += 1

    return total

#run the experiment on 100 random samples taken from a copy of the film data
film_data_copy = film_data.copy()
for i in range(1,101):
    films = film_data_copy.sample(n=1000) #n = sample size
    x = 0 + do(films)
    print('Nulls filled per 1000 film: ', x)



In [None]:
#TEST #############################

#missing-value statistics for the crew columns of the raw imdb film data
columns_check = ['director', 'cinematographer', 'editor', 'writer', 'composer', 'producer']
missing_flags = film_data[columns_check].isna()
nan_counts = missing_flags.sum(axis=1)
sample = film_data[nan_counts < 3]

#number of missing values per column, first as one table and then line by line
print(missing_flags.sum())
print('')
for column in columns_check:
    print(f'Missing {column}: {film_data[column].isna().sum()}')

In [None]:
#export film data to csv and json

#convert csv to json
def csv_to_json(input_csv, output_json):
    """Convert a CSV file into a JSON file holding an array of row objects.

    Every apostrophe (') is removed from the serialized text before it is
    written to disk.

    Args:
        input_csv: path of the CSV file to read.
        output_json: path of the JSON file to write (UTF-8).
    """
    # One dictionary per CSV row
    records = pd.read_csv(input_csv).to_dict(orient='records')

    # Serialize the rows as an indented JSON array
    serialized = pd.Series(records).to_json(
        orient='records',
        lines=False,
        default_handler=str,
        indent=2,
    )

    # Strip apostrophes, then write the result
    with open(output_json, 'w', encoding='utf-8') as handle:
        handle.write(serialized.replace("'", ""))

#put the films in random order, then write the csv and derive the json from it
csv_path = 'webpage/films.csv'
json_path = 'webpage/films.json'

film_data = film_data.sample(frac=1)
film_data.to_csv(csv_path)
csv_to_json(csv_path, json_path)