In [1]:
# IMDB2SQL
# we're taking the IMDB data and converting it to SQL tables
# along the way, doing some data cleaning and manipulation
# goal here is to have a database that is minimal for the purpose of the project
# project is to replicate oracle of bacon
# so we only need movies, actors, and the relationships between them
import pandas as pd

# import name.basics.tsv
name_basics = pd.read_csv('name.basics.tsv', sep='\t')
# re-index on nconst
name_basics.set_index('nconst', inplace=True)
# random sample
name_basics.sample(5)

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm14774559,Mark R. Poff,\N,\N,\N,\N
nm6807114,Krysti Pryde,\N,\N,\N,\N
nm11972685,Kurt Richter,\N,\N,actor,tt13262184
nm14636002,Zack Huffman,\N,\N,producer,tt16390652
nm10698972,Jan Koch,\N,\N,visual_effects,"tt9253926,tt9255182,tt9364406,tt9181732"


In [2]:
# EDAs
# find all entries with \N in any column and set to NaN
name_basics = name_basics.replace('\\N', pd.NA)
# count missing values
name_basics.describe(include='all')

Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
count,13779152,624696,233853,11116963,12236455
unique,10594057,542,484,22999,5748263
top,Alex,1980,2021,actor,tt0123338
freq,512,10067,7429,2448369,8289


In [3]:
# change birthyear and deathyear to int
name_basics['birthYear'] = pd.to_numeric(name_basics['birthYear'], errors='coerce')
name_basics['deathYear'] = pd.to_numeric(name_basics['deathYear'], errors='coerce')
name_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13779210 entries, nm0000001 to nm9993719
Data columns (total 5 columns):
 #   Column             Dtype  
---  ------             -----  
 0   primaryName        object 
 1   birthYear          float64
 2   deathYear          float64
 3   primaryProfession  object 
 4   knownForTitles     object 
dtypes: float64(2), object(3)
memory usage: 630.8+ MB


In [4]:
# import title.basics.tsv
title_basics = pd.read_csv('title.basics.tsv', sep='\t')
title_basics = title_basics.replace('\\N', pd.NA)
title_basics.set_index('tconst', inplace=True)
title_basics.sample(5)

  title_basics = pd.read_csv('title.basics.tsv', sep='\t')


Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt26898027,movie,The Heiress,The Heiress,0,2023,,84.0,Horror
tt3921780,tvEpisode,Padre Pio's Later Years,Padre Pio's Later Years,0,1999,,,Biography
tt1501343,video,Cinematic Titanic: Blood of the Vampires,Cinematic Titanic: Blood of the Vampires,0,2009,,83.0,Comedy
tt7137414,tvSeries,Wacky World Adventures,Wacky World Adventures,0,2017,,,Animation
tt2274582,tvSeries,Shark Wranglers,Shark Wranglers,0,2012,,,"Adventure,Reality-TV"


In [5]:
# convert certain columns to numeric
title_basics['startYear'] = pd.to_numeric(title_basics['startYear'], errors='coerce')
title_basics['endYear'] = pd.to_numeric(title_basics['endYear'], errors='coerce')
title_basics['runtimeMinutes'] = pd.to_numeric(title_basics['runtimeMinutes'], errors='coerce')
title_basics['isAdult'] = pd.to_numeric(title_basics['isAdult'], errors='coerce')
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11059690 entries, tt0000001 to tt9916880
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   titleType       object 
 1   primaryTitle    object 
 2   originalTitle   object 
 3   isAdult         float64
 4   startYear       float64
 5   endYear         float64
 6   runtimeMinutes  float64
 7   genres          object 
dtypes: float64(4), object(4)
memory usage: 759.4+ MB


In [6]:
# quick sanity check
title_basics.sample(5)

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt13592456,tvEpisode,Episode dated 4 December 2020,Episode dated 4 December 2020,0.0,2020.0,,,News
tt6581442,tvEpisode,Lords of the Flies,Lords of the Flies,0.0,2016.0,,,"Adventure,Reality-TV"
tt0789758,tvEpisode,Pilot,Pilot,0.0,2006.0,,,"Comedy,Drama"
tt2873816,short,"Small Nations, Big Games: Palestinian Soccer G...","Small Nations, Big Games: Palestinian Soccer G...",0.0,2010.0,,10.0,"Documentary,Short"
tt31121576,tvEpisode,Los Angeles Lakers @ Boston Celtics,Los Angeles Lakers @ Boston Celtics,0.0,2024.0,,,


In [7]:
# import title.principals.tsv
title_principals = pd.read_csv('title.principals.tsv', sep='\t')
title_principals = title_principals.replace('\\N', pd.NA)

# check whether there are repeated entries for tconst
title_principals['tconst'].value_counts().head(5)

# logical check
if title_principals['tconst'].value_counts().max() == 1:
    print('No repeated entries for tconst')
else:
    print('There are repeated entries for tconst')


There are repeated entries for tconst


In [8]:
# ok so we can't re-index on tconst

title_principals.head(25)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0005690,producer,producer,
3,tt0000001,4,nm0374658,cinematographer,director of photography,
4,tt0000002,1,nm0721526,director,,
5,tt0000002,2,nm1335271,composer,,
6,tt0000003,1,nm0721526,director,,
7,tt0000003,2,nm1770680,producer,producer,
8,tt0000003,3,nm0721526,producer,producer,
9,tt0000003,4,nm1335271,composer,,


In [9]:
title_principals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87799799 entries, 0 to 87799798
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tconst      object
 1   ordering    int64 
 2   nconst      object
 3   category    object
 4   job         object
 5   characters  object
dtypes: int64(1), object(5)
memory usage: 3.9+ GB


In [10]:
# import title.ratings.tsv
title_ratings = pd.read_csv('title.ratings.tsv', sep='\t')
title_ratings = title_ratings.replace('\\N', pd.NA)
# re-index on tconst
title_ratings.set_index('tconst', inplace=True)
title_ratings.sample(5)

Unnamed: 0_level_0,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt12721864,1.4,30
tt9034160,7.5,40
tt10309832,6.6,89
tt0011925,5.4,38
tt2836138,7.4,39


In [11]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1474118 entries, tt0000001 to tt9916880
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   averageRating  1474118 non-null  float64
 1   numVotes       1474118 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 33.7+ MB


In [12]:
# import title.crew.tsv
title_crew = pd.read_csv('title.crew.tsv', sep='\t')
title_crew = title_crew.replace('\\N', pd.NA)
# re-index on tconst
title_crew.set_index('tconst', inplace=True)
title_crew.sample(5)

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt4037816,,
tt13784096,,nm0393007
tt14924416,nm3436296,nm3436296
tt11602026,,
tt14973234,"nm8847198,nm7726992","nm8847198,nm12726397,nm7418629,nm7726992"


In [13]:
# import title.akas.tsv
title_akas = pd.read_csv('title.akas.tsv', sep='\t')
title_akas = title_akas.replace('\\N', pd.NA)
# re-index on titleId
title_akas.set_index('titleId', inplace=True)
title_akas.sample(5)

Unnamed: 0_level_0,ordering,title,region,language,types,attributes,isOriginalTitle
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt15053472,7,2021年4月22日 のエピソード,JP,ja,,,0
tt1573202,4,Rasender Stillstand - Das Theater des Christop...,DE,,imdbDisplay,,0
tt16391044,1,Folge 389,,,original,,1
tt2069669,7,エピソード #10.33,JP,ja,,,0
tt6505174,4,Épisode #1.428,FR,fr,,,0


In [14]:
title_akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49556194 entries, tt0000001 to tt9916880
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   ordering         int64 
 1   title            object
 2   region           object
 3   language         object
 4   types            object
 5   attributes       object
 6   isOriginalTitle  int64 
dtypes: int64(2), object(5)
memory usage: 3.0+ GB


In [15]:
# import title.episode.tsv
title_episode = pd.read_csv('title.episode.tsv', sep='\t')
title_episode = title_episode.replace('\\N', pd.NA)
# re-index on tconst
title_episode.set_index('tconst', inplace=True)
title_episode.sample(5)

Unnamed: 0_level_0,parentTconst,seasonNumber,episodeNumber
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt12014202,tt0185058,11.0,1.0
tt13173064,tt13154212,1.0,1.0
tt3487438,tt3485248,1.0,2.0
tt17526690,tt0442646,,
tt27163505,tt22299710,1.0,1.0


In [16]:
title_episode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8480573 entries, tt0031458 to tt9916880
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   parentTconst   object
 1   seasonNumber   object
 2   episodeNumber  object
dtypes: object(3)
memory usage: 258.8+ MB


In [17]:
# set seasonNumber and episodeNumber to numeric
title_episode['seasonNumber'] = pd.to_numeric(title_episode['seasonNumber'], errors='coerce')
title_episode['episodeNumber'] = pd.to_numeric(title_episode['episodeNumber'], errors='coerce')
title_episode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8480573 entries, tt0031458 to tt9916880
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   parentTconst   object 
 1   seasonNumber   float64
 2   episodeNumber  float64
dtypes: float64(2), object(1)
memory usage: 258.8+ MB


In [19]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost/imdb_pg')

In [20]:
name_basics.to_sql('name_basics', engine, index=True, if_exists='replace')
print('name_basics done')

name_basics done


In [44]:
# ok now do the same for the other dataframes
title_principals.to_sql('title_principals', engine, index=True, if_exists='replace')
print('title_principals done')

In [None]:

title_basics.to_sql('title_basics', engine, index=True, if_exists='replace')
print('title_basics done')

title_ratings.to_sql('title_ratings', engine, index=True, if_exists='replace')
print('title_ratings done')

title_crew.to_sql('title_crew', engine, index=True, if_exists='replace')
print('title_crew done')

title_akas.to_sql('title_akas', engine, index=True, if_exists='replace')
print('title_akas done')

title_episode.to_sql('title_episode', engine, index=True, if_exists='replace')
print('title_episode done')

# now dispose of the engine
engine.dispose()
