In [1]:
#import modules and packages

import pandas as pd #data processing
import numpy as np #maths
import requests as req #get data from url
import gzip
from io import BytesIO

import seaborn as sns
sns.set()


In [2]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #link table for names against nconst


#download from url
res_title_basics = req.get(url_title_basics).content
res_crew = req.get(url_crew).content
res_ratings = req.get(url_ratings).content
res_names = req.get(url_names).content
res_lang = req.get(url_langs).content


#decompress
title_basics_gzip = gzip.decompress(res_title_basics)
crew_basics_gzip = gzip.decompress(res_crew)
title_ratings_gzip = gzip.decompress(res_ratings)
names_gzip = gzip.decompress(res_names)
title_langs_gzip = gzip.decompress(res_lang)


#read csv into dataframes
titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
crew = pd.read_csv(BytesIO(crew_basics_gzip), delimiter='\t')
ratings = pd.read_csv(BytesIO(title_ratings_gzip), delimiter='\t')
names = pd.read_csv(BytesIO(names_gzip), delimiter='\t')
langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')



  titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
  langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')


In [3]:
#clean data

#filter only English-speaking regions
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
desired_langs = ['en']
filtered_regions = langs[langs['region'].isin(desired_regions)]
filtered_langs = langs[langs['language'].isin(desired_langs)]
tconsts_filtered_regions = filtered_regions['titleId'].tolist()
tconsts_filtered_langs = filtered_langs['titleId'].tolist()


#remove non-movie rows, and movies from before 1960
titles = titles[titles['titleType'] == 'movie']
titles = titles[titles['startYear'] >= '1955']
titles = titles[titles['startYear'] != '/N']
titles = titles[titles['genres'] != '/N']
titles = titles[titles['tconst'].isin(tconsts_filtered_langs)]
titles = titles[titles['tconst'].isin(tconsts_filtered_regions)]


#get tconsts for remaining non-movie rows
film_tconsts = titles['tconst'].tolist()

#remove corresponding non-movie rows
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [4]:
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
34798,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
38753,tt0039442,"Habla, mudita",1973,88,Drama
41678,tt0042423,The Dungeon of Harrow,1962,86,Horror
45039,tt0045853,Hadaka no taishô,1958,92,Comedy
45466,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance"
...,...,...,...,...,...
10460540,tt9915592,A Husband to Rent,1974,\N,Comedy
10460648,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy"
10460788,tt9916170,The Rehearsal,2019,51,Drama
10460881,tt9916362,Coven,2020,92,"Drama,History"


In [5]:
crew

Unnamed: 0,tconst,nconst,category
283669,tt0035423,nm0107463,editor
283670,tt0035423,nm0000212,actress
283671,tt0035423,nm0413168,actor
283672,tt0035423,nm0000630,actor
283673,tt0035423,nm0005227,actor
...,...,...,...
59958234,tt9916428,nm8594703,actor
59958235,tt9916428,nm0422639,actress
59958236,tt9916428,nm0910951,director
59958237,tt9916428,nm8680851,actor


In [6]:
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13155956,nm9993714,Romeo del Rosario
13155957,nm9993716,Essias Loberg
13155958,nm9993717,Harikrishnan Rajan
13155959,nm9993718,Aayush Nair


In [7]:
ratings

Unnamed: 0,tconst,averageRating
18057,tt0035423,6.4
21386,tt0039442,6.1
23769,tt0042423,3.4
26576,tt0045853,7.4
26950,tt0046292,6.5
...,...,...
1388372,tt9915592,5.8
1388384,tt9915872,6.4
1388397,tt9916170,7.0
1388409,tt9916362,6.4


In [8]:
#merge datasets for one complete table

crew_data = crew.pivot_table(index='tconst', columns='category', values='nconst', aggfunc=lambda x: ', '.join(x)).reset_index()
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self'])
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"nm0413168, nm0000630, nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,nm0737216
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"nm0007023, nm0019330","nm0544330, nm0370455",nm0190834,nm0006280,nm0349426,nm0215327,,nm0651204,nm0305869
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"nm0367706, nm0574106","nm0389534, nm0120684",nm0396837,,nm0102124,,nm0751113,,nm0305239
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"nm0462013, nm0441961","nm0031884, nm0590775, nm0620390",nm0620014,nm0006191,nm0394687,,,nm0441407,nm0594666
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"nm0001120, nm0885203","nm0000047, nm0884518",,,nm0728271,,nm0320981,,"nm0031196, nm0173728, nm0280919, nm0953790"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99343,tt9915592,A Husband to Rent,1974,\N,Comedy,5.8,"nm1165023, nm1033328, nm1171534, nm1173203, nm...","nm2794834, nm2151352",nm1437188,,,,,,nm8514564
99344,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy",6.4,,"nm0755725, nm4392634",,,nm8063415,,,,nm2507310
99345,tt9916170,The Rehearsal,2019,51,Drama,7.0,"nm8370368, nm8742390, nm0578248",nm0406305,nm1204748,,nm5412267,nm6703006,,,"nm6743460, nm3245789"
99346,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432


In [9]:
#remove movies that don't have enough data (roughly 70,000)

rows_with_no_info = film_data[film_data[['actor', 'actress', 'cinematographer', 'producer', 'editor', 'composer', 'averageRating']].isna().all(axis=1)]
film_data = film_data.drop(rows_with_no_info.index)

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"nm0413168, nm0000630, nm0005227",nm0000212,nm0238698,nm0448843,nm0003506,nm0107463,nm0465298,,nm0737216
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"nm0007023, nm0019330","nm0544330, nm0370455",nm0190834,nm0006280,nm0349426,nm0215327,,nm0651204,nm0305869
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"nm0367706, nm0574106","nm0389534, nm0120684",nm0396837,,nm0102124,,nm0751113,,nm0305239
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"nm0462013, nm0441961","nm0031884, nm0590775, nm0620390",nm0620014,nm0006191,nm0394687,,,nm0441407,nm0594666
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"nm0001120, nm0885203","nm0000047, nm0884518",,,nm0728271,,nm0320981,,"nm0031196, nm0173728, nm0280919, nm0953790"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99343,tt9915592,A Husband to Rent,1974,\N,Comedy,5.8,"nm1165023, nm1033328, nm1171534, nm1173203, nm...","nm2794834, nm2151352",nm1437188,,,,,,nm8514564
99344,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy",6.4,,"nm0755725, nm4392634",,,nm8063415,,,,nm2507310
99345,tt9916170,The Rehearsal,2019,51,Drama,7.0,"nm8370368, nm8742390, nm0578248",nm0406305,nm1204748,,nm5412267,nm6703006,,,"nm6743460, nm3245789"
99346,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432


In [10]:
#shuffle order of films
film_data = film_data.sample(frac=1)

#export cleaned data to csv
film_data.to_csv('webpage/film_data.csv')