In [82]:
#download, clean and merge imdb film dataset


#import modules, packages and libraries ~ 10mins

import pandas as pd
import numpy as np
import requests as req
import gzip
from io import BytesIO
import time
import seaborn as sns
sns.set()


#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #alternate-title rows carrying titleId, region and language


def _load_imdb_table(url, timeout=(10, 300)):
    """Download one gzipped TSV file and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the ``.tsv.gz`` file to fetch.
    timeout : tuple
        ``(connect, read)`` timeout in seconds passed to ``requests.get`` so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The tab-separated contents of the file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of handing an
        error page to the gzip decoder.
    """
    response = req.get(url, timeout=timeout)
    response.raise_for_status()

    #decompress while parsing; the compressed and decompressed bytes are released
    #when this function returns instead of staying in the notebook namespace
    with gzip.open(BytesIO(response.content), 'rb') as tsv_file:
        return pd.read_csv(tsv_file, delimiter='\t', low_memory=False)


#download, decompress and read each file into a dataframe, one at a time
titles = _load_imdb_table(url_title_basics)
crew = _load_imdb_table(url_crew)
ratings = _load_imdb_table(url_ratings)
names = _load_imdb_table(url_names)
langs = _load_imdb_table(url_langs)

print('Downloaded data')


#clean data

#titles that have at least one English-language row in the akas table
desired_langs = ['en']
filtered_langs = langs[langs['language'].isin(desired_langs)]
tconsts_filtered_langs = filtered_langs['titleId'].tolist()

#titles that have at least one akas row for an English-speaking region
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
filtered_regions = langs[langs['region'].isin(desired_regions)]
tconsts_filtered_regions = filtered_regions['titleId'].tolist()

#remove unsuitable films
titles = titles[(titles['titleType'] == 'movie') & (titles['genres'] != r'\N')]

#compare on numeric views rather than writing a converted column back into the
#filtered frame (which raises SettingWithCopyWarning) or comparing year strings
#lexicographically; r'\N' and any other non-numeric value coerces to NaN and so
#fails both comparisons
is_not_adult = pd.to_numeric(titles['isAdult'], errors='coerce') == 0
released_since_1955 = pd.to_numeric(titles['startYear'], errors='coerce') >= 1955
titles = titles[is_not_adult & released_since_1955]

#a film must appear in both the language list and the region list
titles = titles[titles['tconst'].isin(tconsts_filtered_langs) & titles['tconst'].isin(tconsts_filtered_regions)]

#keep only the crew and rating rows that belong to the remaining films
film_tconsts = titles['tconst'].tolist()
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)

print('Cleaned data 1')


#merge relational tables

#credit categories that become columns of the final table, in output order;
#any other category present in the crew data is ignored
crew_categories = ['actor', 'actress', 'cinematographer', 'composer', 'director', 'editor', 'producer', 'writer']

#pivot only the categories that are kept, so the (slow) python-level join below
#is not run for credits that would be dropped afterwards
crew_data = crew[crew['category'].isin(crew_categories)].copy()

#merge crew data with names table to get respective names rather than nconst
#NOTE(review): the split/explode only matters if one row can hold several
#', '-separated nconst values - confirm against the data
crew_data['nconst'] = crew_data['nconst'].str.split(', ')
crew_data = crew_data.explode('nconst')
crew_data = pd.merge(crew_data, names, on='nconst', how='left')

#a credit whose nconst has no match in names has a NaN primaryName; drop it so
#it is not joined into the film's credits as the literal text 'nan'
crew_data = crew_data.dropna(subset=['primaryName'])

#one row per film, one column per category, names joined with ', '
crew_data = crew_data.pivot_table(
    index='tconst',
    columns='category',
    values='primaryName',
    aggfunc=lambda x: ', '.join(str(item) for item in x),
).reset_index()

#format and restructure columns: select by name rather than renaming by
#position, so a category missing from the data yields an all-NaN column
#instead of a length-mismatch error or mislabelled columns
crew_data = crew_data.reindex(columns=['tconst'] + crew_categories).rename_axis(columns=None)

#merge datasets for one complete table
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

print('Merged tables')



#remove data-sparse films

#keep films with fewer than four of these six crew columns missing
columns_check = ['director', 'cinematographer', 'editor', 'writer', 'composer', 'producer']
film_data = film_data[film_data[columns_check].isna().sum(axis=1) < 4]

#every remaining film needs an actor, an actress, a runtime, a rating and genres
film_data = film_data.dropna(subset=['actor', 'actress', 'runtimeMinutes', 'averageRating', 'genres'])

#a missing runtime arrives as the text r'\N' (the same marker that is filtered
#out of genres earlier in this notebook), which dropna does not treat as missing;
#.copy() makes the result an independent frame before new columns are added
film_data = film_data[film_data['runtimeMinutes'] != r'\N'].copy()

#add columns for plot and poster path
film_data['plot'] = np.nan
film_data['poster'] = np.nan

print('Cleaned data 2')



film_data

Downloaded data
Cleaned data 1
Merged tables
Cleaned data 2


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,writer,plot,poster
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,Steven Rogers,,
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,José Luis García Sánchez,,
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",James C. Houston,,Pat Boyette,,Don Russell,Henry Garcia,,
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yôko Mizuki,,
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,Dino Risi,,Marcello Girosi,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96955,tt9911196,The Marriage Escape,2020,103,"Comedy,Drama",7.5,"Herman Finkers, Ferdi Stofmeel","Johanna ter Steege, Leonie ter Braak",,Daniël Polman,Johan Nijenhuis,,Ingmar Menning,"Radek Bajgar, Maarten Lebens, Mirka Zlatníková",,
96957,tt9914192,No Gogó do Paulinho,2020,98,Comedy,5.3,"Maurício Manfrini, Alan Rocha, Alex Teix",Cacau Protásio,,Fabiano Krieger,Roberto Santucci,,André Carreira,"Paulo Cursino, Odete Damico, Sergio Martorelli",,
96959,tt9914942,Life Without Sara Amat,2019,74,Drama,6.7,Biel Rossell,"Maria Morera, Francesca Piñón, Anna Sabaté",Gris Jordana,Pau Vallvé,Laura Jou,Raúl Román,,"Coral Cruz, Pep Puig",,
96963,tt9916170,The Rehearsal,2019,51,Drama,7.0,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,"Lillah Halla, Melissa de Raaf",,


In [83]:
#get film plot and poster with tmdb api ~ >2hrs


#call api/details for each film with multiprocessing and multithreading


from tmdb_calls import doBatch
import concurrent.futures
from multiprocessing import Manager

if __name__ == '__main__':

    agg_list = []

    batch_size = 1000

    #one start offset per batch; unlike len // batch_size + 1 this does not add
    #an empty trailing batch when the row count is an exact multiple of batch_size
    batch_starts = range(0, len(film_data), batch_size)
    num_batches = len(batch_starts)
    print(f"Total batches: {num_batches}")

    #the context managers shut down the worker pool first and the manager process
    #second, so neither is left running after the cell finishes
    with Manager() as manager, concurrent.futures.ProcessPoolExecutor(8) as process_executor:

        #namespace used to hand a batch to the worker and read it back afterwards
        shared_data = manager.Namespace()

        for i, start_index in enumerate(batch_starts):

            end_index = start_index + batch_size

            shared_data.film_data = film_data.iloc[start_index:end_index]

            #each batch is submitted and awaited before the next one is started
            #NOTE(review): doBatch lives in tmdb_calls and is not visible here; it is
            #presumably what replaces shared_data.film_data with the enriched batch
            future = process_executor.submit(doBatch, shared_data)

            #result() re-raises anything doBatch raised; report it instead of
            #silently continuing, but keep going so a long run is not lost
            try:
                future.result()
            except Exception as exc:
                print(f"Batch {i + 1} of {num_batches} failed: {exc!r}")

            agg_list.append(shared_data.film_data)

            print(f"{min(end_index, len(film_data))} films completed")

    #pd.concat raises on an empty list, so leave film_data untouched if there were no batches
    if agg_list:
        film_data = pd.concat(agg_list, ignore_index=True)


#films whose plot column is still empty are dropped
film_data = film_data.dropna(subset=['plot'])


print('Fetched film summaries and poster')

python(61338) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Total batches: 62


python(61340) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


1000 films completed
2000 films completed
3000 films completed
4000 films completed
5000 films completed
6000 films completed
7000 films completed
8000 films completed
9000 films completed
10000 films completed
11000 films completed
12000 films completed
13000 films completed
14000 films completed
15000 films completed
16000 films completed
17000 films completed
18000 films completed
19000 films completed
20000 films completed
21000 films completed
22000 films completed
23000 films completed
24000 films completed
25000 films completed
26000 films completed
27000 films completed
28000 films completed
29000 films completed
30000 films completed
31000 films completed
32000 films completed
33000 films completed
34000 films completed
35000 films completed
36000 films completed
37000 films completed
38000 films completed
39000 films completed
40000 films completed
41000 films completed
42000 films completed
43000 films completed
44000 films completed
45000 films completed
46000 films complet

In [84]:
#export film data to json

#shuffle order (no random_state is passed, so the order differs on every run)
film_data = film_data.sample(frac=1)

#to_json writes straight to the given path and returns None in that case,
#so there is no result worth binding to a name
film_data.to_json('webpage/films.json', orient="records")

print('Exported as json')

Exported as json
