In [1]:
#import modules and packages

import pandas as pd #data processing
import numpy as np #maths
import requests as req #get data from url
import gzip
from io import BytesIO

import seaborn as sns
sns.set()


In [2]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz'#actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #link table for names against nconst


#download from url
res_title_basics = req.get(url_title_basics).content
res_crew = req.get(url_crew).content
res_ratings = req.get(url_ratings).content
res_names = req.get(url_names).content
res_lang = req.get(url_langs).content


#decompress
title_basics_gzip = gzip.decompress(res_title_basics)
crew_basics_gzip = gzip.decompress(res_crew)
title_ratings_gzip = gzip.decompress(res_ratings)
names_gzip = gzip.decompress(res_names)
title_langs_gzip = gzip.decompress(res_lang)


#read csv into dataframes
titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
crew = pd.read_csv(BytesIO(crew_basics_gzip), delimiter='\t')
ratings = pd.read_csv(BytesIO(title_ratings_gzip), delimiter='\t')
names = pd.read_csv(BytesIO(names_gzip), delimiter='\t')
langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')



  titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
  langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')


In [3]:
#clean data

#filter only English-speaking regions
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
filtered_langs = langs[langs['region'].isin(desired_regions)]
tconsts_filtered_langs = filtered_langs['titleId'].tolist()


#remove non-movie rows, and movies from before 1960
titles = titles[titles['titleType'] == 'movie']
titles = titles[titles['startYear'] >= '1955']
titles = titles[titles['startYear'] != '/N']
titles = titles[titles['genres'] != '/N']
titles = titles[titles['tconst'].isin(tconsts_filtered_langs)]

#get tconsts for remaining non-movie rows
film_tconsts = titles['tconst'].tolist()

#remove corresponding non-movie rows
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [4]:
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
27745,tt0028248,Shipmates o' Mine,2022,87,Musical
31041,tt0031603,Made in Germany - Die dramatische Geschichte d...,1957,101,"Biography,Drama"
34798,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
35961,tt0036606,"Another Time, Another Place",1983,118,"Drama,War"
38005,tt0038687,Let There Be Light,1980,58,"Documentary,War"
...,...,...,...,...,...
10445291,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller"
10445330,tt9916270,Il talento del calabrone,2020,84,Thriller
10445375,tt9916362,Coven,2020,92,"Drama,History"
10445407,tt9916428,The Secret of China,2019,\N,"Adventure,History,War"


In [5]:
crew

Unnamed: 0,tconst,nconst,category
219759,tt0028248,nm0526951,actor
219760,tt0028248,nm0308249,actor
219761,tt0028248,nm0012499,actress
219762,tt0028248,nm0088942,actor
219763,tt0028248,nm0228839,actor
...,...,...,...
59882899,tt9916428,nm0422639,actress
59882900,tt9916428,nm0910951,director
59882901,tt9916428,nm8680851,actor
59882902,tt9916428,nm3370295,actor


In [6]:
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13138587,nm9993714,Romeo del Rosario
13138588,nm9993716,Essias Loberg
13138589,nm9993717,Harikrishnan Rajan
13138590,nm9993718,Aayush Nair


In [7]:
ratings

Unnamed: 0,tconst,averageRating
12167,tt0028248,4.2
14824,tt0031603,6.5
18025,tt0035423,6.4
19023,tt0036606,6.4
20720,tt0038687,7.4
...,...,...
1385641,tt9916170,7.0
1385642,tt9916190,3.7
1385649,tt9916270,5.8
1385653,tt9916362,6.4


In [8]:
#merge datasets for one complete table

crew_data = crew.pivot_table(index='tconst', columns='category', values='nconst', aggfunc=lambda x: ', '.join(x)).reset_index()
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self'])
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0028248,Shipmates o' Mine,2022,87,Musical,4.2,"nm0526951, nm0308249, nm0088942, nm0228839, nm...",nm0012499,,,nm0593632,,,,"nm0068760, nm0669260"
1,tt0031603,Made in Germany - Die dramatische Geschichte d...,1957,101,"Biography,Drama",6.5,"nm0705384, nm0386076","nm0549032, nm0754240",nm0643218,nm0521440,nm0772191,,nm0345106,,"nm0530752, nm0726114"
2,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"nm0413168, nm0000630, nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,nm0737216
3,tt0036606,"Another Time, Another Place",1983,118,"Drama,War",6.4,"nm0561155, nm0269416, nm0743027",nm0517642,nm0005683,nm0572820,nm0705535,,nm0675294,,"nm0450407, nm0485392"
4,tt0038687,Let There Be Light,1980,58,"Documentary,War",7.4,nm0404158,,,,nm0001379,,,,nm0442105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330558,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",3.7,"nm0302466, nm5464699, nm2018573, nm2009056",,"nm8262250, nm5785503",nm7879007,nm7308376,nm4877791,nm10299811,,
330559,tt9916270,Il talento del calabrone,2020,84,Thriller,5.8,"nm0144812, nm3080119","nm2063290, nm9428255",nm0130846,nm2747888,nm1480867,,"nm1799384, nm0656465",,nm10538402
330560,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432
330561,tt9916428,The Secret of China,2019,\N,"Adventure,History,War",3.5,"nm3611859, nm9445072, nm8594703, nm8680851, nm...",nm0422639,,,nm0910951,,,,


In [9]:
#remove movies that don't have enough data (roughly 70,000)

rows_with_no_info = film_data[film_data[['actor', 'actress', 'cinematographer', 'producer', 'editor', 'composer', 'averageRating']].isna().all(axis=1)]
film_data = film_data.drop(rows_with_no_info.index)

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0028248,Shipmates o' Mine,2022,87,Musical,4.2,"nm0526951, nm0308249, nm0088942, nm0228839, nm...",nm0012499,,,nm0593632,,,,"nm0068760, nm0669260"
1,tt0031603,Made in Germany - Die dramatische Geschichte d...,1957,101,"Biography,Drama",6.5,"nm0705384, nm0386076","nm0549032, nm0754240",nm0643218,nm0521440,nm0772191,,nm0345106,,"nm0530752, nm0726114"
2,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"nm0413168, nm0000630, nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,nm0737216
3,tt0036606,"Another Time, Another Place",1983,118,"Drama,War",6.4,"nm0561155, nm0269416, nm0743027",nm0517642,nm0005683,nm0572820,nm0705535,,nm0675294,,"nm0450407, nm0485392"
4,tt0038687,Let There Be Light,1980,58,"Documentary,War",7.4,nm0404158,,,,nm0001379,,,,nm0442105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330557,tt9916188,Minotaur,\N,\N,Thriller,,,,,,nm2410311,,nm0865189,,
330558,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",3.7,"nm0302466, nm5464699, nm2018573, nm2009056",,"nm8262250, nm5785503",nm7879007,nm7308376,nm4877791,nm10299811,,
330559,tt9916270,Il talento del calabrone,2020,84,Thriller,5.8,"nm0144812, nm3080119","nm2063290, nm9428255",nm0130846,nm2747888,nm1480867,,"nm1799384, nm0656465",,nm10538402
330560,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432


In [10]:
#export cleaned data to csv

film_data.to_csv('webpage/film_data.csv')