In [2]:
#import modules and packages

import pandas as pd #data processing
import numpy as np #maths
import requests as req #get data from url
import gzip
from io import BytesIO

import seaborn as sns
sns.set()
import statsmodels.api as sm
from sklearn.cluster import KMeans


In [3]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst

#seconds to wait on the server before giving up; without a timeout requests can block forever
REQUEST_TIMEOUT = 60


def load_imdb_table(url, timeout=REQUEST_TIMEOUT):
    """Download one gzipped, tab-separated dump and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the ``.tsv.gz`` file to fetch.
    timeout : float
        Passed to ``requests.get``; seconds to wait for the server.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one column per TSV field.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    """
    response = req.get(url, timeout=timeout)
    #fail here on an HTTP error instead of handing an error page to gzip
    response.raise_for_status()
    #the raw and decompressed bytes are local to this call, so they are
    #released as soon as the DataFrame has been built
    raw_tsv = gzip.decompress(response.content)
    #low_memory=False infers each column's dtype from the whole file at once,
    #which avoids the mixed-type columns (DtypeWarning) that chunked parsing
    #produces; it needs more memory while parsing
    return pd.read_csv(BytesIO(raw_tsv), delimiter='\t', low_memory=False)


#download, decompress and read each dataset into a dataframe
titles = load_imdb_table(url_title_basics)
crew = load_imdb_table(url_crew)
ratings = load_imdb_table(url_ratings)
names = load_imdb_table(url_names)


  titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')


In [4]:
#clean data

#keep only movies released in 1930 or later
titles = titles[titles['titleType'] == 'movie']

#compare years as numbers rather than as strings: a string comparison raises
#TypeError if the column holds a mix of ints and strings, and only lets the
#'\N' (unknown year) placeholder through by accident of character ordering.
#Unknown years become NaN here and are kept explicitly, matching the rows the
#old string comparison kept.
start_year = pd.to_numeric(titles['startYear'], errors='coerce')
titles = titles[start_year.isna() | (start_year >= 1930)]

#tconsts of the movies that survived the filters above
film_tconsts = titles['tconst'].tolist()

#keep only the crew and rating rows that belong to those movies
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

#NOTE: this cell rebinds titles/crew/ratings/names, so it must be run exactly
#once after the load cell; a second run fails because the columns are gone
titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [5]:
#show the titles table (the notebook renders a truncated preview)
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
3816,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western"
11636,tt0011801,Tötet nicht mehr,2019,\N,"Action,Crime"
13081,tt0013274,Istoriya grazhdanskoy voyny,2021,94,Documentary
15176,tt0015414,La tierra de los toros,2000,60,\N
15484,tt0015724,Dama de noche,1993,102,"Drama,Mystery,Romance"
...,...,...,...,...,...
10332627,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary
10332654,tt9916680,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary
10332666,tt9916706,Dankyavar Danka,2013,\N,Comedy
10332676,tt9916730,6 Gunn,2017,116,Drama


In [6]:
#show the crew table: one row per (tconst, nconst, category)
crew

Unnamed: 0,tconst,nconst,category
20811,tt0003854,nm0108653,actor
20812,tt0003854,nm0823633,actor
20813,tt0003854,nm0337083,actor
20814,tt0003854,nm0919737,actress
20815,tt0003854,nm0382954,actor
...,...,...,...
59185545,tt9916754,nm9272490,director
59185546,tt9916754,nm8349149,director
59185547,tt9916754,nm9272489,cinematographer
59185548,tt9916754,nm10538638,cinematographer


In [7]:
#show the names table that maps nconst to primaryName
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13029342,nm9993714,Romeo del Rosario
13029343,nm9993716,Essias Loberg
13029344,nm9993717,Harikrishnan Rajan
13029345,nm9993718,Aayush Nair


In [8]:
#show the ratings table: averageRating per tconst
ratings

Unnamed: 0,tconst,averageRating
1388,tt0003854,3.7
3590,tt0013274,6.8
4264,tt0015414,5.2
4379,tt0015724,6.1
4498,tt0016029,7.0
...,...,...
1371415,tt9916270,5.8
1371419,tt9916362,6.4
1371424,tt9916428,3.5
1371430,tt9916538,8.6


In [45]:
#merge datasets for one complete table

#one row per film and one column per crew category; when a film has several
#people in the same category their nconsts are joined into a single
#comma-separated string
crew_data = crew.pivot_table(index='tconst', columns='category', values='nconst', aggfunc=', '.join).reset_index()

#these categories are not wanted in the final table; errors='ignore' keeps the
#cell working when one of them does not occur in the downloaded data
#(drop would otherwise raise KeyError for the missing column)
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self'], errors='ignore')

#left joins keep every film in titles, leaving NaN where a film has no rating
#or no crew rows
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western",3.7,"nm0108653, nm0823633, nm0337083, nm0382954, nm...",nm0919737,nm0576037,,nm0170973,nm0266933,,,nm0795851
1,tt0011801,Tötet nicht mehr,2019,\N,"Action,Crime",,"nm0459029, nm0681726, nm0726256, nm0776458, nm...",nm0692612,nm1773808,,,,,,nm0483944
2,tt0013274,Istoriya grazhdanskoy voyny,2021,94,Documentary,6.8,,,,,"nm0412842, nm0895048",,nm13054604,,
3,tt0015414,La tierra de los toros,2000,60,\N,5.2,,,,,,,,,
4,tt0015724,Dama de noche,1993,102,"Drama,Mystery,Romance",6.1,"nm0844752, nm0194720","nm0869732, nm0650495",nm0006509,nm0255550,nm0529960,nm1457911,nm0600039,,nm1597742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628265,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary,,nm9272513,,"nm9272492, nm9272489, nm8349149, nm9275317",,"nm9272490, nm9272491",,,,
628266,tt9916680,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary,,,,"nm10538579, nm10538578, nm10538577",,nm0652213,nm4762061,,,nm10538576
628267,tt9916706,Dankyavar Danka,2013,\N,Comedy,,"nm1778107, nm2585097, nm5697682","nm9722080, nm3272130","nm9722084, nm10538597",,nm7764440,,nm2983963,,nm7933903
628268,tt9916730,6 Gunn,2017,116,Drama,7.6,"nm6096005, nm0059461, nm13233318, nm4852679",,nm1957275,,nm10538612,nm9785908,"nm10538614, nm10538613",,


In [55]:
#remove movies that don't have enough data (roughly 70,000)

#a film is removed only when every one of these fields is empty
fields_checked = ['actor', 'actress', 'cinematographer', 'producer', 'editor', 'composer', 'averageRating']
is_empty_row = film_data[fields_checked].isna().all(axis=1)

#keep the removed rows around for inspection, then drop them by index label
rows_with_no_info = film_data.loc[is_empty_row]
film_data = film_data.drop(index=rows_with_no_info.index)

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0003854,Dodge City Trail,1936,56,"Drama,Music,Western",3.7,"nm0108653, nm0823633, nm0337083, nm0382954, nm...",nm0919737,nm0576037,,nm0170973,nm0266933,,,nm0795851
1,tt0011801,Tötet nicht mehr,2019,\N,"Action,Crime",,"nm0459029, nm0681726, nm0726256, nm0776458, nm...",nm0692612,nm1773808,,,,,,nm0483944
2,tt0013274,Istoriya grazhdanskoy voyny,2021,94,Documentary,6.8,,,,,"nm0412842, nm0895048",,nm13054604,,
3,tt0015414,La tierra de los toros,2000,60,\N,5.2,,,,,,,,,
4,tt0015724,Dama de noche,1993,102,"Drama,Mystery,Romance",6.1,"nm0844752, nm0194720","nm0869732, nm0650495",nm0006509,nm0255550,nm0529960,nm1457911,nm0600039,,nm1597742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628265,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary,,nm9272513,,"nm9272492, nm9272489, nm8349149, nm9275317",,"nm9272490, nm9272491",,,,
628266,tt9916680,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary,,,,"nm10538579, nm10538578, nm10538577",,nm0652213,nm4762061,,,nm10538576
628267,tt9916706,Dankyavar Danka,2013,\N,Comedy,,"nm1778107, nm2585097, nm5697682","nm9722080, nm3272130","nm9722084, nm10538597",,nm7764440,,nm2983963,,nm7933903
628268,tt9916730,6 Gunn,2017,116,Drama,7.6,"nm6096005, nm0059461, nm13233318, nm4852679",,nm1957275,,nm10538612,nm9785908,"nm10538614, nm10538613",,


In [56]:
#report, for each field of interest, how many films have a value and how many
#do not

#column in film_data -> plural label used in the printed report
presence_labels = {
    'director': 'directors',
    'writer': 'writers',
    'actor': 'actors',
    'actress': 'actresses',
    'cinematographer': 'cinematographers',
    'producer': 'producers',
    'editor': 'editors',
    'composer': 'composers',
    'averageRating': 'ratings',
}

total_films = len(film_data)

for column, label in presence_labels.items():
    #count on the boolean mask instead of building a filtered copy of
    #film_data for every field just to take its length
    num_present = int(film_data[column].notna().sum())
    num_missing = total_films - num_present
    print(f'Films with 1 or more {label}: {num_present}')
    print(f'Films with no {label}: {num_missing}')



Films with 1 or more directors: 486434
Films with no directors: 72065
Films with 1 or more writers: 266333
Films with no writers: 292166
Films with 1 or more actors: 429015
Films with no actors: 129484
Films with 1 or more actresses: 373436
Films with no actresses: 185063
Films with 1 or more cinematographers: 290134
Films with no cinematographers: 268365
Films with 1 or more producers: 278710
Films with no producers: 279789
Films with 1 or more editors: 184429
Films with no editors: 374070
Films with 1 or more composers: 261652
Films with no composers: 296847
Films with 1 or more ratings: 294669
Films with no ratings: 263830


In [None]:
#export cleaned data to csv

#index=False: the row index only holds leftover row positions from the merge
#and filtering steps; writing it would add an unnamed first column that comes
#back as 'Unnamed: 0' when the file is read again
#NOTE(review): the output path has no .csv extension; left unchanged so that
#anything already reading 'film_data' keeps working
film_data.to_csv('film_data', index=False)