In [1]:
#import modules and packages

import pandas as pd #data processing
import requests as req #get data from url
import gzip
from io import BytesIO


import seaborn as sns
sns.set()


In [2]:
#get film datasets

#set urls
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' #film name, year, runtime, genres
url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' #actors, actresses, cinematographers, directors (redundant)
url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' #ratings for films (not all)
url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' #link table for names against nconst
url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' #link table for names against nconst


#download from url
res_title_basics = req.get(url_title_basics).content
res_crew = req.get(url_crew).content
res_ratings = req.get(url_ratings).content
res_names = req.get(url_names).content
res_lang = req.get(url_langs).content


#decompress
title_basics_gzip = gzip.decompress(res_title_basics)
crew_basics_gzip = gzip.decompress(res_crew)
title_ratings_gzip = gzip.decompress(res_ratings)
names_gzip = gzip.decompress(res_names)
title_langs_gzip = gzip.decompress(res_lang)


#read csv into dataframes
titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t',low_memory=False)
crew = pd.read_csv(BytesIO(crew_basics_gzip), delimiter='\t',low_memory=False)
ratings = pd.read_csv(BytesIO(title_ratings_gzip), delimiter='\t',low_memory=False)
names = pd.read_csv(BytesIO(names_gzip), delimiter='\t',low_memory=False)
langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t',low_memory=False)



In [3]:
#clean data

#filter only English-speaking regions
desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
desired_langs = ['en']
filtered_regions = langs[langs['region'].isin(desired_regions)]
filtered_langs = langs[langs['language'].isin(desired_langs)]
tconsts_filtered_regions = filtered_regions['titleId'].tolist()
tconsts_filtered_langs = filtered_langs['titleId'].tolist()


#remove non-movie rows, and movies from before 1960
titles = titles[titles['titleType'] == 'movie']
titles = titles[(titles['startYear'] >= '1955') & (titles['startYear'] != r'\N')]
titles = titles[titles['genres'] != r'\N']
titles = titles[(titles['tconst'].isin(tconsts_filtered_langs)) & (titles['tconst'].isin(tconsts_filtered_regions))]


#get tconsts for remaining non-movie rows
film_tconsts = titles['tconst'].tolist()

#remove corresponding non-movie rows
crew = crew[crew['tconst'].isin(film_tconsts)]
ratings = ratings[ratings['tconst'].isin(film_tconsts)]

#set columns to remove from dataset
remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
remove_from_crew = ['ordering','job','characters']
remove_from_ratings = ['numVotes']
remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

titles = titles.drop(columns=remove_from_titles)
crew = crew.drop(columns=remove_from_crew)
ratings = ratings.drop(columns=remove_from_ratings)
names = names.drop(columns=remove_from_names)


In [4]:
titles

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
34798,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance"
38753,tt0039442,"Habla, mudita",1973,88,Drama
41678,tt0042423,The Dungeon of Harrow,1962,86,Horror
45039,tt0045853,Hadaka no taishô,1958,92,Comedy
45466,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance"
...,...,...,...,...,...
10503647,tt9915592,A Husband to Rent,1974,\N,Comedy
10503755,tt9915872,The Last White Witch,2019,97,"Comedy,Drama,Fantasy"
10503895,tt9916170,The Rehearsal,2019,51,Drama
10503988,tt9916362,Coven,2020,92,"Drama,History"


In [5]:
crew

Unnamed: 0,tconst,nconst,category
283691,tt0035423,nm0107463,editor
283692,tt0035423,nm0000212,actress
283693,tt0035423,nm0413168,actor
283694,tt0035423,nm0000630,actor
283695,tt0035423,nm0005227,actor
...,...,...,...
60225365,tt9916428,nm8594703,actor
60225366,tt9916428,nm0422639,actress
60225367,tt9916428,nm0910951,director
60225368,tt9916428,nm8680851,actor


In [6]:
names

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman
...,...,...
13205093,nm9993714,Romeo del Rosario
13205094,nm9993716,Essias Loberg
13205095,nm9993717,Harikrishnan Rajan
13205096,nm9993718,Aayush Nair


In [7]:
ratings

Unnamed: 0,tconst,averageRating
18137,tt0035423,6.4
21468,tt0039442,6.1
23852,tt0042423,3.4
26665,tt0045853,7.4
27039,tt0046292,6.5
...,...,...
1394940,tt9915592,5.8
1394952,tt9915872,6.4
1394965,tt9916170,7.0
1394977,tt9916362,6.4


In [8]:
crew_data = crew.copy()

# Merge crew data with names table to get respective names rather than nconst
crew_data['nconst'] = crew_data['nconst'].str.split(', ')
crew_data = crew_data.explode('nconst')
crew_data = pd.merge(crew_data, names, on='nconst', how='left')

crew_data = crew_data.pivot_table(
    index=['tconst'],
    columns=['category'],
    values=['primaryName'],
    aggfunc=lambda x: ', '.join(str(item) for item in x),
).reset_index()

crew_data.columns = [' '.join(col).strip() for col in crew_data.columns.values]
crew_data.columns = ['tconst', 'actor', 'actress', 'archive_footage', 'archive_sound', 'cinematographer', 'composer', 'director', 'editor', 'producer', 'production_designer', 'self', 'writer']

In [9]:
#merge datasets for one complete table
crew_data = crew_data.drop(columns=['archive_footage','archive_sound','self', 'production_designer'])
film_data = pd.merge(titles, ratings, on='tconst', how='left')
film_data = pd.merge(film_data, crew_data, on='tconst', how='left')


In [10]:
#final cleaning, remove films with a lack of data

rows_with_no_info = film_data[film_data[['actor', 'actress', 'genres','cinematographer', 'director', 'producer', 'editor', 'composer', 'averageRating', 'writer']].isna().all(axis=1)]
film_data = film_data.drop(rows_with_no_info.index)
film_data= film_data.dropna(subset=['actor', 'actress'])
film_data = film_data.dropna(subset=['runtimeMinutes'])
film_data = film_data.dropna(subset=['averageRating'])

film_data

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,actor,actress,cinematographer,composer,director,editor,producer,writer
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",6.4,"Hugh Jackman, Liev Schreiber, Breckin Meyer",Meg Ryan,Stuart Dryburgh,Rolfe Kent,James Mangold,David Brenner,Cathy Konrad,Steven Rogers
1,tt0039442,"Habla, mudita",1973,88,Drama,6.1,"José Luis López Vázquez, Francisco Algora","Kiti Mánver, Hanna Haxmann",Luis Cuadrado,Franz Schubert,Manuel Gutiérrez Aragón,Pablo G. del Amo,,José Luis García Sánchez
2,tt0042423,The Dungeon of Harrow,1962,86,Horror,3.4,"Russ Harvey, William McNulty","Helen Hogan, Michele Buquor",James C. Houston,,Pat Boyette,,Don Russell,Henry Garcia
3,tt0045853,Hadaka no taishô,1958,92,Comedy,7.4,"Keiju Kobayashi, Daisuke Katô","Kyôko Aoyama, Aiko Mimasu, Yasuko Nakada",Asakazu Nakai,Toshirô Mayuzumi,Hiromichi Horikawa,,,Yôko Mizuki
4,tt0046292,The Sign of Venus,1955,97,"Comedy,Drama,Romance",6.5,"Vittorio De Sica, Raf Vallone","Sophia Loren, Franca Valeri",,,Dino Risi,,Marcello Girosi,"Edoardo Anton, Luigi Comencini, Ennio Flaiano,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97080,tt9914942,Life Without Sara Amat,2019,74,Drama,6.7,Biel Rossell,"Maria Morera, Francesca Piñón, Anna Sabaté",Gris Jordana,Pau Vallvé,Laura Jou,Raúl Román,,"Coral Cruz, Pep Puig"
97082,tt9915592,A Husband to Rent,1974,\N,Comedy,5.8,"Farhad Hamidi, Nosratolah Vahdat, Armais Varta...","Zhaleh Karimi, Soroor Rajai",Azizollah Rafie,,,,,Mohammad Reza Koofard
97084,tt9916170,The Rehearsal,2019,51,Drama,7.0,"Pablo Lafuente, Kelner Macêdo, Germano Melo",Julia Ianina,Barbara Alvarez,,Tamar Guimaraes,Beatriz Pomar,,"Lillah Halla, Melissa de Raaf"
97085,tt9916362,Coven,2020,92,"Drama,History",6.4,"Alex Brendemühl, Daniel Fanego","Amaia Aberasturi, Garazi Urkola",,Maite Arroitajauregi,Pablo Agüero,,"Iker Ganuza, Fred Prémel, Koldo Zuazua",Katell Guillou


In [11]:
#shuffle order and export cleaned films to csv
film_data = film_data.sample(frac=1)
film_data.to_csv('webpage/film_data.csv')

In [12]:
#convert csv to json
def csv_to_json(input_csv, output_json):
    # Read CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Convert DataFrame to a list of dictionaries (JSON-like structure)
    json_list = df.to_dict(orient='records')

    # Write the list of dictionaries to a JSON file as an array
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json_str = pd.Series(json_list).to_json(orient='records', lines=False, default_handler=str, indent=2)
        json_str_no_apostrophes = json_str.replace("'", "")
        json_file.write(json_str_no_apostrophes)

csv_to_json('webpage/film_data.csv', 'webpage/film_data.json')