In [1]:
#import modules and packages

import pandas as pd #data processing
import requests as req #get data from url
import gzip
from io import BytesIO

import seaborn as sns
#apply seaborn's default plot theme globally (no plots are drawn in the cells visible here)
sns.set()


In [2]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #people attached to each title (tconst, nconst, category, ...)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #alternative titles per titleId, with region and language


def load_imdb_table(url, timeout=60):
    """Download one gzipped, tab-separated dataset and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of a ``.tsv.gz`` file.
    timeout : float or tuple, optional
        Passed to ``requests.get``; seconds to wait for the server before
        giving up instead of hanging the kernel.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Missing values are left as the literal ``\\N``
        strings found in the file, because the cleaning cell compares
        against that marker.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        reported as such rather than as a gzip decoding failure.
    """
    response = req.get(url, timeout=timeout)
    response.raise_for_status()
    raw_tsv = gzip.decompress(response.content)
    # low_memory=False infers each column's dtype from the whole file rather
    # than chunk by chunk, which avoids the mixed-type DtypeWarning and keeps
    # text columns (e.g. startYear) uniformly str for later comparisons.
    return pd.read_csv(BytesIO(raw_tsv), delimiter='\t', low_memory=False)


#download, decompress and read each dataset into a dataframe
#(one table at a time, so the raw bytes of a file can be freed once it is parsed)
titles = load_imdb_table(url_title_basics)
crew = load_imdb_table(url_crew)
ratings = load_imdb_table(url_ratings)
names = load_imdb_table(url_names)
langs = load_imdb_table(url_langs)



  titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t')
  langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t')


In [3]:
#preview the titles table as loaded (pandas truncates the display of long frames)
titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10470493,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10470494,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10470495,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10470496,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [8]:
#clean data
#note: this cell overwrites titles/crew/ratings/names in place, so re-running it
#requires re-running the loading cell first

#collect titles that have an entry for an English-speaking region and an entry in English
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
desired_langs = ['en']
filtered_regions = langs[langs['region'].isin(desired_regions)]
filtered_langs = langs[langs['language'].isin(desired_langs)]
tconsts_filtered_regions = filtered_regions['titleId'].tolist()
tconsts_filtered_langs = filtered_langs['titleId'].tolist()
#NOTE(review): region and language are tested on separate rows, so the two matches
#may come from different entries of the same title - confirm this is intended


#keep only movies from MIN_START_YEAR onwards that have a known year and genre
MIN_START_YEAR = 1955
titles = titles[titles['titleType'] == 'movie']
#compare years as numbers rather than as text: unknown years are the literal '\N',
#which to_numeric turns into NaN, and NaN >= year is False so those rows are dropped
start_years = pd.to_numeric(titles['startYear'], errors='coerce')
titles = titles[start_years >= MIN_START_YEAR]
titles = titles[titles['genres'] != r'\N']
titles = titles[(titles['tconst'].isin(tconsts_filtered_langs)) & (titles['tconst'].isin(tconsts_filtered_regions))]



#get tconsts of the movies that survived the filters
film_tconsts = titles['tconst'].tolist()

#keep only crew and rating rows that belong to those movies
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [9]:
#preview titles after the cleaning cell (movies only, unused columns dropped)
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
34798,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
38753,tt0039442,"Habla, mudita",1973,88,Drama
41678,tt0042423,The Dungeon of Harrow,1962,86,Horror
45039,tt0045853,Hadaka no taishô,1958,92,Comedy
45466,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance"
...,...,...,...,...,...
10469923,tt9915592,A Husband to Rent,1974,\N,Comedy
10470031,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy"
10470171,tt9916170,The Rehearsal,2019,51,Drama
10470264,tt9916362,Coven,2020,92,"Drama,History"


In [10]:
#preview crew after the cleaning cell (one row per title/person/category)
crew

Unnamed: 0,tconst,nconst,category
283671,tt0035423,nm0107463,editor
283672,tt0035423,nm0000212,actress
283673,tt0035423,nm0413168,actor
283674,tt0035423,nm0000630,actor
283675,tt0035423,nm0005227,actor
...,...,...,...
60021082,tt9916428,nm8594703,actor
60021083,tt9916428,nm0422639,actress
60021084,tt9916428,nm0910951,director
60021085,tt9916428,nm8680851,actor


In [11]:
#preview names after the cleaning cell (nconst -> primaryName lookup; not filtered by title)
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13167334,nm9993714,Romeo del Rosario
13167335,nm9993716,Essias Loberg
13167336,nm9993717,Harikrishnan Rajan
13167337,nm9993718,Aayush Nair


In [12]:
#preview ratings after the cleaning cell (tconst -> averageRating)
ratings

Unnamed: 0,tconst,averageRating
18080,tt0035423,6.4
21409,tt0039442,6.1
23795,tt0042423,3.4
26603,tt0045853,7.4
26977,tt0046292,6.5
...,...,...
1391364,tt9915592,5.8
1391376,tt9915872,6.4
1391389,tt9916170,7.0
1391401,tt9916362,6.4


In [13]:
crew_data = crew.copy()

# Merge crew data with names table to get respective names rather than nconst
# (split/explode turns any comma-separated nconst cell into one row per id)
crew_data['nconst'] = crew_data['nconst'].str.split(', ')
crew_data = crew_data.explode('nconst')
crew_data = pd.merge(crew_data, names, on='nconst', how='left')

# An nconst with no match in names gets NaN from the left merge; drop those rows
# so that ', '.join below only ever sees strings.
crew_data = crew_data.dropna(subset=['primaryName'])

# One row per title, one column per crew category, names joined with ', '.
# Passing values as a scalar makes the category values themselves the column
# names, so the layout follows whatever categories the data contains instead of
# relying on a hard-coded list of exactly twelve names in a fixed order.
crew_data = crew_data.pivot_table(
    index='tconst',
    columns='category',
    values='primaryName',
    aggfunc=lambda people: ', '.join(people),
).reset_index()
crew_data.columns.name = None  # remove the leftover 'category' label on the column axis

crew_data


Unnamed: 0,tconst,actor,actress,archive_footage,archive_sound,cinematographer,composer,director,editor,producer,production_designer,self,writer
0,tt0035423,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,,,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,,,Steven Rogers
1,tt0039442,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",,,Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,Mario Ortiz,,José Luis García Sánchez
2,tt0042423,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",,,James C. Houston,,Pat Boyette,,Don Russell,,,Henry Garcia
3,tt0045853,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",,,Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yasuhide Kato,,Yôko Mizuki
4,tt0046292,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,,,Dino Risi,,Marcello Girosi,,,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96369,tt9915592,"Farhad Hamidi, Nosratolah Vahdat, Armais Varta...","Zhaleh Karimi, Soroor Rajai",,,Azizollah Rafie,,,,,,,Mohammad Reza Koofard
96370,tt9915872,,"Hinako Saeki, Fumika Shimizu",,,,,Hideki Kiyota,,,,,Ryuho Okawa
96371,tt9916170,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,,,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,,,"Lillah Halla, Melissa de Raaf"
96372,tt9916362,"Alex Brendemühl, Daniel Fanego","Amaia Aberasturi, Garazi Urkola",,,,Maite Arroitajauregi,Pablo Agüero,,"Iker Ganuza, Fred Prémel, Koldo Zuazua",,,Katell Guillou


In [14]:
#merge datasets for one complete table

#crew_data is overwritten here, so on a re-run these columns are already gone;
#errors='ignore' keeps the cell re-runnable and tolerant of an absent category
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self'], errors='ignore')

#left merges keep every cleaned title even when it has no rating or no crew row
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,,Steven Rogers
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,Mario Ortiz,José Luis García Sánchez
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",James C. Houston,,Pat Boyette,,Don Russell,,Henry Garcia
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yasuhide Kato,Yôko Mizuki
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,Dino Risi,,Marcello Girosi,,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96473,tt9915592,A Husband to Rent,1974,\N,Comedy,5.8,"Farhad Hamidi, Nosratolah Vahdat, Armais Varta...","Zhaleh Karimi, Soroor Rajai",Azizollah Rafie,,,,,,Mohammad Reza Koofard
96474,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy",6.4,,"Hinako Saeki, Fumika Shimizu",,,Hideki Kiyota,,,,Ryuho Okawa
96475,tt9916170,The Rehearsal,2019,51,Drama,7.0,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,,"Lillah Halla, Melissa de Raaf"
96476,tt9916362,Coven,2020,92,"Drama,History",6.4,"Alex Brendemühl, Daniel Fanego","Amaia Aberasturi, Garazi Urkola",,Maite Arroitajauregi,Pablo Agüero,,"Iker Ganuza, Fred Prémel, Koldo Zuazua",,Katell Guillou


In [19]:
#final cleaning, remove films with a lack of data

#drop rows where every descriptive column is missing
info_columns = ['actor', 'actress', 'genres','cinematographer', 'director', 'producer', 'editor', 'composer', 'averageRating', 'writer']
rows_with_no_info = film_data[film_data[info_columns].isna().all(axis=1)]
film_data = film_data.drop(rows_with_no_info.index)

#keep only titles that list at least one actor AND at least one actress
film_data = film_data.dropna(subset=['actor', 'actress'])

#unknown runtimes are stored as the literal '\N' string, not NaN, so dropna alone
#would keep them; coerce to numbers and keep only rows with a usable runtime
#(the column itself is left untouched)
runtime_minutes = pd.to_numeric(film_data['runtimeMinutes'], errors='coerce')
film_data = film_data[runtime_minutes.notna()]

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,production_designer,writer
16380,tt0107615,Angel of the Road,1993,89,"Action,Drama",5.2,"Yau-Kei Tang, Siu Chung Mok, Kuan Tai Chen, Ro...",Loletta Lee,Wah-Sing Choi,,Barry Chung,Shao-Hsi Chang,,,
36926,tt10160886,Sneakerella,2022,112,"Comedy,Family,Musical",4.9,"Chosen Jacobs, Bryan Terrell Clark","Lexi Underwood, Devyn Nekoda",,,Elizabeth Allen Rosenbaum,,,,"Tamara Chestna, Mindy Stern, George Gore II, D..."
49653,tt14197038,The Piper,2023,95,Horror,5.0,"Julian Sands, Oliver Savell","Charlotte Hope, Kate Nichols",,,Erlingur Thoroddsen,,"Les Weldon, Jeffrey Greenstein, Bernard Kira, ...",,
61251,tt2076216,He's Way More Famous Than You,2013,96,Comedy,3.7,"Ryan Spahn, Michael Ausiello","Halley Feiffer, Ashlie Atkinson",Austin F. Schmidt,Jeff Beal,Michael Urie,,"Michael Anderson, Christopher Sepulveda, Geoff...",,
38552,tt10584480,Thorp,2020,103,"Comedy,Drama,Romance",5.0,"Walker Hare, Otoja Abit","Kim Blanck, Alice Callahan",Michael Girandola,Jarkko Hietanen,Dennis Donovan,,"Liz Printz, Diana Rivera Vera",Joann Cowley,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38226,tt10488984,Breadwinner,1972,102,"Drama,Family",5.5,"Akbar Alemi, Reza Beyk Imanverdi, Sasan, Jamsh...",Sepideh,Ghodratallah Ehsani,,Aman Manteghi,Rubik Mansuri,Esmail Koushan,,Soheila Nasr
18978,tt0119981,Hikers,1997,95,"Comedy,Romance",5.9,"Benoît Poelvoorde, Philippe Harel","Karin Viard, Géraldine Pailhas",Gilles Henry,Philippe Eidel,,,Adeline Lecallier,,"Eric Assous, Dodine Herry, Nelly Ryher"
85483,tt6340500,Inuyashiki,2018,127,"Action,Drama,Mystery",6.6,"Noritake Kinashi, Takeru Satoh, Kanata Hongô",Fumi Nikaidô,,Yutaka Yamada,Shinsuke Sato,,"Morio Amagi, Kei Kajimoto",,"Hiroya Oku, Hiroshi Hashimoto"
15938,tt0105592,Tiger Claws,1991,92,Action,5.4,"Nick Dibley, Fern Figueiredo, Jack Vorvis",Cynthia Rothrock,"Mark Willis, Curtis Petersen",Varouje,Kelly Makin,,Jalal Merhi,,J. Stephen Maunder


In [20]:
#shuffle order and export cleaned films to csv
SHUFFLE_SEED = 42  #fixed seed so a fresh run on the same data reproduces the same row order
film_data = film_data.sample(frac=1, random_state=SHUFFLE_SEED)
#the (shuffled) index is written too, as an unnamed first column of the csv
film_data.to_csv('webpage/film_data.csv')

In [21]:
def csv_to_json(input_csv, output_json):
    """Convert a CSV file into a JSON file holding an array of row objects.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file to read with ``pandas.read_csv`` (default settings).
    output_json : str or path-like
        Destination file, written as UTF-8. It receives one JSON array with
        one object per CSV row, keyed by column name and indented by two
        spaces. Empty cells become ``null``.

    Returns
    -------
    None
    """
    df = pd.read_csv(input_csv)

    # Serialise straight from the DataFrame; values the encoder cannot handle
    # natively fall back to str().
    json_text = df.to_json(orient='records', lines=False, default_handler=str, indent=2)

    # Open the target only once the text exists, so a failed conversion does
    # not leave a truncated output file behind.
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json_file.write(json_text)

# Convert the exported film CSV into the JSON file stored next to it in webpage/
csv_to_json('webpage/film_data.csv', 'webpage/film_data.json')