In [7]:
#import modules and packages

import pandas as pd #data processing
import numpy as np #maths
import requests as req #get data from url
import gzip
from io import BytesIO

import seaborn as sns
sns.set()
import statsmodels.api as sm
from sklearn.cluster import KMeans


In [8]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz'#actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #link table for names against nconst


#download from url
res_title_basics = req.get(url_title_basics).content
res_crew = req.get(url_crew).content
res_ratings = req.get(url_ratings).content
res_names = req.get(url_names).content
res_lang = req.get(url_langs).content


#decompress
title_basics_gzip = gzip.decompress(res_title_basics)
crew_basics_gzip = gzip.decompress(res_crew)
title_ratings_gzip = gzip.decompress(res_ratings)
names_gzip = gzip.decompress(res_names)
title_langs_gzip = gzip.decompress(res_lang)


#read csv into dataframes
titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
crew = pd.read_csv(BytesIO(crew_basics_gzip), delimiter='\t')
ratings = pd.read_csv(BytesIO(title_ratings_gzip), delimiter='\t')
names = pd.read_csv(BytesIO(names_gzip), delimiter='\t')
langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')



  titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
  langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')


In [9]:
#clean data

#filter only English-speaking regions
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
filtered_langs = langs[langs['region'].isin(desired_regions)]
tconsts_filtered_langs = filtered_langs['titleId'].tolist()


#remove non-movie rows, and movies from before 1930
titles = titles[titles['titleType'] == 'movie']
titles = titles[titles['startYear'] >= '1930']
titles = titles[titles['tconst'].isin(tconsts_filtered_langs)]

#get tconsts for remaining non-movie rows
film_tconsts = titles['tconst'].tolist()

#remove corresponding non-movie rows
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [10]:
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
3816,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western"
15784,tt0016029,The Little Colonel,1935,81,"Comedy,Family,Musical"
17700,tt0017961,Happiness,1935,95,"Comedy,Drama"
18369,tt0018644,Anarkali,1930,118,\N
18591,tt0018867,Escape from Hong Kong,1942,60,"Adventure,Mystery,War"
...,...,...,...,...,...
10427577,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller"
10427616,tt9916270,Il talento del calabrone,2020,84,Thriller
10427661,tt9916362,Coven,2020,92,"Drama,History"
10427693,tt9916428,The Secret of China,2019,\N,"Adventure,History,War"


In [11]:
crew

Unnamed: 0,tconst,nconst,category
20812,tt0003854,nm0108653,actor
20813,tt0003854,nm0823633,actor
20814,tt0003854,nm0337083,actor
20815,tt0003854,nm0919737,actress
20816,tt0003854,nm0382954,actor
...,...,...,...
59761046,tt9916428,nm0422639,actress
59761047,tt9916428,nm0910951,director
59761048,tt9916428,nm8680851,actor
59761049,tt9916428,nm3370295,actor


In [12]:
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13125737,nm9993714,Romeo del Rosario
13125738,nm9993716,Essias Loberg
13125739,nm9993717,Harikrishnan Rajan
13125740,nm9993718,Aayush Nair


In [13]:
ratings

Unnamed: 0,tconst,averageRating
1392,tt0003854,3.7
4520,tt0016029,7.0
5330,tt0017961,7.3
5707,tt0018867,5.6
5930,tt0019403,7.6
...,...,...
1383251,tt9916170,7.0
1383252,tt9916190,3.7
1383259,tt9916270,5.8
1383263,tt9916362,6.4


In [14]:
#merge datasets for one complete table

crew_data = crew.pivot_table(index='tconst', columns='category', values='nconst', aggfunc=lambda x: ', '.join(x)).reset_index()
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self'])
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western",3.7,"nm0108653, nm0823633, nm0337083, nm0382954, nm...",nm0919737,nm0576037,,nm0170973,nm0266933,,,nm0795851
1,tt0016029,The Little Colonel,1935,81,"Comedy,Family,Musical",7.0,"nm0000859, nm0517099","nm0000073, nm0892867","nm0005877, nm0587926",nm0006203,nm0124877,,,,"nm0175902, nm0426531"
2,tt0017961,Happiness,1935,95,"Comedy,Drama",7.3,"nm0802512, nm0320559, nm0474428, nm2610509, nm...","nm0626035, nm0947279",nm0873934,,nm0575963,,,nm0882532,
3,tt0018644,Anarkali,1930,118,\N,,"nm0082234, nm0883388","nm1998268, nm13525648, nm0701190",nm0409780,,nm0159342,,,,nm13705715
4,tt0018867,Escape from Hong Kong,1942,60,"Adventure,Mystery,War",5.6,"nm0388280, nm0855969, nm0140504, nm0222596, nm...",nm0520467,nm0106608,,nm0631438,nm0942639,,,nm0151949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350130,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",3.7,"nm0302466, nm5464699, nm2018573, nm2952127",,"nm8262250, nm5785503",nm7879007,nm7308376,nm4877791,nm10299811,,
350131,tt9916270,Il talento del calabrone,2020,84,Thriller,5.8,"nm0144812, nm3080119","nm2063290, nm9428255",nm0130846,nm2747888,nm1480867,,"nm1799384, nm0656465",,nm10538402
350132,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432
350133,tt9916428,The Secret of China,2019,\N,"Adventure,History,War",3.5,"nm3611859, nm9445072, nm8594703, nm8680851, nm...",nm0422639,,,nm0910951,,,,


In [15]:
#remove movies that don't have enough data (roughly 70,000)

rows_with_no_info = film_data[film_data[['actor', 'actress', 'cinematographer', 'producer', 'editor', 'composer', 'averageRating']].isna().all(axis=1)]
film_data = film_data.drop(rows_with_no_info.index)

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western",3.7,"nm0108653, nm0823633, nm0337083, nm0382954, nm...",nm0919737,nm0576037,,nm0170973,nm0266933,,,nm0795851
1,tt0016029,The Little Colonel,1935,81,"Comedy,Family,Musical",7.0,"nm0000859, nm0517099","nm0000073, nm0892867","nm0005877, nm0587926",nm0006203,nm0124877,,,,"nm0175902, nm0426531"
2,tt0017961,Happiness,1935,95,"Comedy,Drama",7.3,"nm0802512, nm0320559, nm0474428, nm2610509, nm...","nm0626035, nm0947279",nm0873934,,nm0575963,,,nm0882532,
3,tt0018644,Anarkali,1930,118,\N,,"nm0082234, nm0883388","nm1998268, nm13525648, nm0701190",nm0409780,,nm0159342,,,,nm13705715
4,tt0018867,Escape from Hong Kong,1942,60,"Adventure,Mystery,War",5.6,"nm0388280, nm0855969, nm0140504, nm0222596, nm...",nm0520467,nm0106608,,nm0631438,nm0942639,,,nm0151949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350129,tt9916188,Minotaur,\N,\N,Thriller,,,,,,nm2410311,,nm0865189,,
350130,tt9916190,Safeguard,2020,95,"Action,Adventure,Thriller",3.7,"nm0302466, nm5464699, nm2018573, nm2952127",,"nm8262250, nm5785503",nm7879007,nm7308376,nm4877791,nm10299811,,
350131,tt9916270,Il talento del calabrone,2020,84,Thriller,5.8,"nm0144812, nm3080119","nm2063290, nm9428255",nm0130846,nm2747888,nm1480867,,"nm1799384, nm0656465",,nm10538402
350132,tt9916362,Coven,2020,92,"Drama,History",6.4,"nm0107165, nm0266723","nm3766704, nm10678594",,nm5813626,nm1893148,,"nm2970042, nm4065853, nm1086949",,nm3471432


In [None]:
#export cleaned data to csv

film_data.to_csv('film_data')