In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
budget_df = pd.read_csv('data/hollywood_domestic_boxoffice.csv', header=0)

#Update column names to match.  Drop unneeded columns
budget_df.rename(columns = {"title": "primary_title"}, inplace = True)
budget_df.drop(columns = ["rank", "studio"], inplace = True)

# #Add in year of release for accuracy
budget_df['title_year'] = budget_df['primary_title'] + ' - ' + budget_df['year'].astype(str)
budget_df.head()

Unnamed: 0,primary_title,lifetime_gross,year,title_year
0,Star Wars: The Force Awakens,936662225,2015,Star Wars: The Force Awakens - 2015
1,Avengers: Endgame,857190335,2019,Avengers: Endgame - 2019
2,Avatar,760507625,2009,Avatar - 2009
3,Black Panther,700059566,2018,Black Panther - 2018
4,Avengers: Infinity War,678815482,2018,Avengers: Infinity War - 2018


In [10]:
#No null values
budget_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16542 entries, 0 to 16541
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   primary_title   16542 non-null  object
 1   lifetime_gross  16542 non-null  int64 
 2   year            16542 non-null  int64 
 3   title_year      16542 non-null  object
dtypes: int64(2), object(2)
memory usage: 517.1+ KB


In [11]:
##need: Primary Title, Genres
##match on: title_year

imdb_titles_df = pd.read_csv('data/imdb.title.basics.csv.gz')

In [12]:
#Some null values in runtime, genre
imdb_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


**Import IMDB Title AKAs**

In [13]:
#Get title akas and merge with imdb titles
imdb_akas_df = pd.read_csv('data/imdb.title.akas.csv.gz')
imdb_akas_df.head()

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [14]:
#Drop columns.  We only need id and title
imdb_akas_df = imdb_akas_df[['title_id', 'title']]

In [15]:
#No null values
imdb_akas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331703 entries, 0 to 331702
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   title_id  331703 non-null  object
 1   title     331703 non-null  object
dtypes: object(2)
memory usage: 5.1+ MB


**Merge Title AKAs with Title Basics**

In [16]:
#May not be alternate titles for each movie.  Should add them only where appropriate - NOT LEFT JOIN
#Merge with imdb titles so that there is a full row of data for each alternate title
#Merge on tconst

#rename column to match
imdb_akas_df.rename(columns = {"title_id": "tconst", "title": "alternate_title"}, inplace = True)

In [17]:
titles_with_akas = imdb_akas_df.merge(imdb_titles_df, how='left', on='tconst')
titles_with_akas.head(20)

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
5,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
6,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
7,tt0369610,Jurski svijet,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
8,tt0369610,Olam ha'Yura,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
9,tt0369610,Jurassic World: Mundo Jurásico,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"


In [18]:
# titles_with_akas.loc[(titles_with_akas['tconst'] == 'tt0063540')]


In [19]:
titles_with_akas[titles_with_akas.duplicated()]

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
5,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
6,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
11,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
13,tt0369610,Jurassic World 3D,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
14,tt0369610,Jurassic World 3D,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...
331688,tt9705860,Dusan Vukotic hrvatski okarovac,Dusan Vukotic Croatian Oscar Winner,Dusan Vukotic hrvatski okarovac,2011,55.0,Documentary
331690,tt9723084,Anderswo. Allein in Afrika,Anderswo. Allein in Afrika,Anderswo. Allein in Afrika,2018,103.0,"Adventure,Documentary"
331696,tt9755806,Big Shark,Big Shark,Big Shark,2019,,Horror
331698,tt9827784,Sayonara kuchibiru,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"


In [20]:
titles_with_akas = titles_with_akas.drop_duplicates()

In [21]:
titles_with_akas

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...
331695,tt9755806,Большая Акула,Big Shark,Big Shark,2019,,Horror
331697,tt9827784,Sayonara kuchibiru,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"
331699,tt9827784,Farewell Song,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"
331700,tt9880178,La atención,The Attention,La atención,2019,61.0,Documentary


In [22]:
#Convert year to string in order to create title_year column
titles_with_akas['start_year'] = titles_with_akas['start_year'].astype(str)
#Combine year and title for merging
titles_with_akas['aka_title_year'] = titles_with_akas['alternate_title'] + ' - ' + titles_with_akas['start_year']
titles_with_akas.head()



Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres,aka_title_year
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Джурасик свят - 2015
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurashikku warudo - 2015
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World: O Mundo dos Dinossauros - 2015
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",O Mundo dos Dinossauros - 2015
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015


**Merge in Revenue Data for each Title AKA**

In [23]:
titles_and_rev = titles_with_akas.merge(budget_df, how='left', left_on='aka_title_year', right_on='title_year')
titles_and_rev.head()
# titles_and_rev.loc[(titles_and_rev['tconst'] == 'tt1637725')]

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Джурасик свят - 2015,,,,
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurashikku warudo - 2015,,,,
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World: O Mundo dos Dinossauros - 2015,,,,
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",O Mundo dos Dinossauros - 2015,,,,
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015


In [24]:
#No duplicates
titles_and_rev[titles_and_rev.duplicated()]

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year


In [25]:
#Still only 3200 movies with all data
titles_and_rev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266869 entries, 0 to 266868
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           266869 non-null  object 
 1   alternate_title  266869 non-null  object 
 2   primary_title_x  266869 non-null  object 
 3   original_title   266859 non-null  object 
 4   start_year       266869 non-null  object 
 5   runtime_minutes  239721 non-null  float64
 6   genres           263489 non-null  object 
 7   aka_title_year   266869 non-null  object 
 8   primary_title_y  3214 non-null    object 
 9   lifetime_gross   3214 non-null    float64
 10  year             3214 non-null    float64
 11  title_year       3214 non-null    object 
dtypes: float64(3), object(9)
memory usage: 26.5+ MB


In [26]:
titles_and_rev.head()

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Джурасик свят - 2015,,,,
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurashikku warudo - 2015,,,,
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World: O Mundo dos Dinossauros - 2015,,,,
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",O Mundo dos Dinossauros - 2015,,,,
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015


In [27]:
no_nas = titles_and_rev.dropna(subset=['lifetime_gross'])

In [28]:
no_nas.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3214 entries, 4 to 266833
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           3214 non-null   object 
 1   alternate_title  3214 non-null   object 
 2   primary_title_x  3214 non-null   object 
 3   original_title   3214 non-null   object 
 4   start_year       3214 non-null   object 
 5   runtime_minutes  3184 non-null   float64
 6   genres           3210 non-null   object 
 7   aka_title_year   3214 non-null   object 
 8   primary_title_y  3214 non-null   object 
 9   lifetime_gross   3214 non-null   float64
 10  year             3214 non-null   float64
 11  title_year       3214 non-null   object 
dtypes: float64(3), object(9)
memory usage: 326.4+ KB


In [29]:
no_nas.head()

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015
37,tt0401729,John Carter,John Carter,John Carter,2012,132.0,"Action,Adventure,Sci-Fi",John Carter - 2012,John Carter,73078100.0,2012.0,John Carter - 2012
111,tt1179034,From Paris with Love,From Paris with Love,From Paris with Love,2010,92.0,"Action,Crime,Thriller",From Paris with Love - 2010,From Paris with Love,24077427.0,2010.0,From Paris with Love - 2010
158,tt1194173,The Bourne Legacy,The Bourne Legacy,The Bourne Legacy,2012,135.0,"Action,Adventure,Thriller",The Bourne Legacy - 2012,The Bourne Legacy,113203870.0,2012.0,The Bourne Legacy - 2012
174,tt1219289,Limitless,Limitless,Limitless,2011,105.0,"Mystery,Sci-Fi,Thriller",Limitless - 2011,Limitless,79249455.0,2011.0,Limitless - 2011


In [30]:
#Duplicated title ids (tconst)
no_nas.loc[(no_nas.duplicated(subset='tconst') == True)]


Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
39498,tt1637725,Teddy Bear,Ted,Ted,2012,106.0,"Comedy,Fantasy",Teddy Bear - 2012,Teddy Bear,16138.0,2012.0,Teddy Bear - 2012
41171,tt1243974,Welcome Back,Aloha,Aloha,2015,105.0,"Comedy,Drama,Romance",Welcome Back - 2015,Welcome Back,1388365.0,2015.0,Welcome Back - 2015
147787,tt2581480,The Channel,The Channel,The Channel,2016,93.0,"Drama,Horror,Thriller",The Channel - 2016,The Channel,18319.0,2016.0,The Channel - 2016
219212,tt5162658,Oro,Gold,Oro,2017,103.0,"Adventure,Drama,History",Oro - 2017,Oro,5539.0,2017.0,Oro - 2017
252893,tt4814290,Three,Te3n,Te3n,2016,136.0,"Drama,Mystery,Thriller",Three - 2016,Three,119550.0,2016.0,Three - 2016


**Manually Remove Incorrectly Duplicated Movie_IDs**

In [31]:
#Have the same ID & are same movie in IMDB
#Rev data has a separate movie in 2012 called Teddy Bear
#Manually remove duplicates for these 5 cases
no_nas = no_nas.reset_index()
no_nas.loc[(no_nas['tconst'] == 'tt1637725')]
no_nas = no_nas.drop([616,616])
no_nas.loc[(no_nas['tconst'] == 'tt1637725')]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
615,39493,tt1637725,Ted,Ted,Ted,2012,106.0,"Comedy,Fantasy",Ted - 2012,Ted,218815487.0,2012.0,Ted - 2012


In [32]:
no_nas.loc[(no_nas['tconst'] == 'tt1243974')]
no_nas = no_nas.drop([635,635])
no_nas.loc[(no_nas['tconst'] == 'tt1243974')]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
634,41164,tt1243974,Aloha,Aloha,Aloha,2015,105.0,"Comedy,Drama,Romance",Aloha - 2015,Aloha,21067116.0,2015.0,Aloha - 2015


In [33]:
no_nas.loc[(no_nas['tconst'] == 'tt2581480')]
no_nas = no_nas.drop([2055,2055])
no_nas.loc[(no_nas['tconst'] == 'tt2581480')]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
2056,147787,tt2581480,The Channel,The Channel,The Channel,2016,93.0,"Drama,Horror,Thriller",The Channel - 2016,The Channel,18319.0,2016.0,The Channel - 2016


In [34]:
no_nas.loc[(no_nas['tconst'] == 'tt5162658')]
no_nas = no_nas.drop([2752,2752])
no_nas.loc[(no_nas['tconst'] == 'tt5162658')]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
2751,219210,tt5162658,Gold,Gold,Oro,2017,103.0,"Adventure,Drama,History",Gold - 2017,Gold,7227038.0,2017.0,Gold - 2017


In [35]:
no_nas.loc[(no_nas['tconst'] == 'tt4814290')]
no_nas = no_nas.drop([3057,3057])
no_nas.loc[(no_nas['tconst'] == 'tt4814290')]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
3056,252892,tt4814290,Te3n,Te3n,Te3n,2016,136.0,"Drama,Mystery,Thriller",Te3n - 2016,Te3n,331507.0,2016.0,Te3n - 2016


In [36]:
#Check that no more duplicate IDs
no_nas.loc[(no_nas.duplicated(subset='tconst') == True)]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year


In [37]:
#Still have some movies with the same title/year, but these are valid and have the correct imdb data
no_nas.loc[(no_nas.duplicated(subset='aka_title_year') == True)]

Unnamed: 0,index,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
220,14616,tt2230954,Gone,Gone,Gone,2012,50.0,Drama,Gone - 2012,Gone,11682205.0,2012.0,Gone - 2012
248,16531,tt1967651,Unconditional,Unconditional Love,Unconditional,2012,92.0,"Drama,Thriller",Unconditional - 2012,Unconditional,1005800.0,2012.0,Unconditional - 2012
330,21586,tt2613286,Playback,Playback,Dur d'être Dieu,2012,66.0,Documentary,Playback - 2012,Playback,264.0,2012.0,Playback - 2012
349,22692,tt1727825,Rebirth,Rebirth,Yôkame no semi,2011,147.0,Drama,Rebirth - 2011,Rebirth,12358.0,2011.0,Rebirth - 2011
433,28153,tt1380279,Rebirth,Rebirth,Rebirth,2011,105.0,Documentary,Rebirth - 2011,Rebirth,12358.0,2011.0,Rebirth - 2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3121,258578,tt3634104,Consumed,Consumed,Consumed,2015,73.0,Drama,Consumed - 2015,Consumed,20476.0,2015.0,Consumed - 2015
3146,260785,tt4793074,Sisters,Sisters,Sisters,2015,53.0,"Biography,Documentary,Music",Sisters - 2015,Sisters,87044645.0,2015.0,Sisters - 2015
3150,261059,tt5097070,Dancer,Dancer,Dancer,2016,85.0,"Biography,Documentary",Dancer - 2016,Dancer,71917.0,2016.0,Dancer - 2016
3170,263022,tt4382872,Extraction,Extraction,Extraction,2015,92.0,"Action,Adventure,Crime",Extraction - 2015,Extraction,16775.0,2015.0,Extraction - 2015


In [38]:
#No completely duplicated rows
no_nas.duplicated().sum()

0

In [39]:
no_nas['start_year'].isna().sum()

0

In [40]:
#Create Master Table
#Will keep original_title - There are a few duplicates, so use tconst for comparison
master_table = no_nas.drop(columns=['index', 'alternate_title', 'primary_title_y', 'primary_title_x', 'aka_title_year', 'year', 'title_year'])
master_table

Unnamed: 0,tconst,original_title,start_year,runtime_minutes,genres,lifetime_gross
0,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0
1,tt0401729,John Carter,2012,132.0,"Action,Adventure,Sci-Fi",73078100.0
2,tt1179034,From Paris with Love,2010,92.0,"Action,Crime,Thriller",24077427.0
3,tt1194173,The Bourne Legacy,2012,135.0,"Action,Adventure,Thriller",113203870.0
4,tt1219289,Limitless,2011,105.0,"Mystery,Sci-Fi,Thriller",79249455.0
...,...,...,...,...,...,...
3209,tt7089878,Rizu to aoi tori,2018,90.0,"Animation,Drama,Fantasy",63204.0
3210,tt7208564,Blood Fest,2018,92.0,"Comedy,Horror",82774.0
3211,tt7290740,Unrest,2017,45.0,Drama,40081.0
3212,tt7342204,Ai kaen seupikeu,2017,119.0,"Comedy,Drama",63239.0


In [41]:
def clean(col_name):
    cleaned = col_name.title()
    return cleaned
master_table.columns = master_table.columns.map(clean)


In [42]:
master_table = master_table.rename(columns={'Tconst':'Title_ID', 'Original_Title':'Title', 'Start_Year':'Year'})

In [43]:
master_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3209 entries, 0 to 3213
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title_ID         3209 non-null   object 
 1   Title            3209 non-null   object 
 2   Year             3209 non-null   object 
 3   Runtime_Minutes  3179 non-null   float64
 4   Genres           3205 non-null   object 
 5   Lifetime_Gross   3209 non-null   float64
dtypes: float64(2), object(4)
memory usage: 175.5+ KB


In [44]:
master_table.to_csv(path_or_buf="data/Master_Table.csv")

**Creating People Master Table**

**Read in master table**

In [50]:
#Read in master table
master_df = master_table
master_df.head()

Unnamed: 0,Title_ID,Title,Year,Runtime_Minutes,Genres,Lifetime_Gross
0,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0
1,tt0401729,John Carter,2012,132.0,"Action,Adventure,Sci-Fi",73078100.0
2,tt1179034,From Paris with Love,2010,92.0,"Action,Crime,Thriller",24077427.0
3,tt1194173,The Bourne Legacy,2012,135.0,"Action,Adventure,Thriller",113203870.0
4,tt1219289,Limitless,2011,105.0,"Mystery,Sci-Fi,Thriller",79249455.0


In [51]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3209 entries, 0 to 3213
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title_ID         3209 non-null   object 
 1   Title            3209 non-null   object 
 2   Year             3209 non-null   object 
 3   Runtime_Minutes  3179 non-null   float64
 4   Genres           3205 non-null   object 
 5   Lifetime_Gross   3209 non-null   float64
dtypes: float64(2), object(4)
memory usage: 175.5+ KB


**Pull In Principal Data & Explore**

In [52]:
principals = pd.read_csv('data/imdb.title.principals.csv.gz')
names = pd.read_csv('data/imdb.name.basics.csv.gz')
names_only = names[['nconst', 'primary_name']]
principals_names = principals.merge(names_only, on='nconst', how='left')
principals_names.drop(columns = ["ordering", "job", "characters"], inplace = True)
principals_names.rename(columns={'tconst':'Title_ID', 'nconst':'Name_ID', 'category':'Role', 'primary_name':'Name'}, inplace = True)
principals_names.head()

Unnamed: 0,Title_ID,Name_ID,Role,Name
0,tt0111414,nm0246005,actor,Tommy Dysart
1,tt0111414,nm0398271,director,Frank Howson
2,tt0111414,nm3739909,producer,Barry Porter-Robinson
3,tt0323808,nm0059247,editor,Sean Barton
4,tt0323808,nm3579312,actress,Brittania Nicol


In [53]:
principals_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1028186 entries, 0 to 1028185
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   Title_ID  1028186 non-null  object
 1   Name_ID   1028186 non-null  object
 2   Role      1028186 non-null  object
 3   Name      1027912 non-null  object
dtypes: object(4)
memory usage: 39.2+ MB


In [54]:
principals_names['Name'].value_counts()

Kevin MacLeod          378
William Shakespeare    160
Eric Roberts           148
Brahmanandam           126
Sen Arima              103
                      ... 
Sabine Buchanan          1
Malin Nicander           1
Clive Chisnall           1
Jigmet Dewa Lhamo        1
Gonzalo Herrerias        1
Name: Name, Length: 575032, dtype: int64

In [55]:
principals_names['Role'].value_counts()

actor                  256718
director               146393
actress                146208
producer               113724
cinematographer         80091
composer                77063
writer                  74357
self                    65424
editor                  55512
production_designer      9373
archive_footage          3307
archive_sound              16
Name: Role, dtype: int64

In [56]:
people_master = master_df.merge(principals_names, how='left', on='Title_ID')
people_master = people_master.dropna(subset=['Name'])
people_master.head()

Unnamed: 0,Title_ID,Title,Year,Runtime_Minutes,Genres,Lifetime_Gross,Name_ID,Role,Name
0,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0,nm0189777,producer,Patrick Crowley
1,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0,nm0695435,actor,Chris Pratt
2,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0,nm0397171,actress,Bryce Dallas Howard
3,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0,nm1339223,actor,Ty Simpkins
4,tt0369610,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",652270625.0,nm0339460,actress,Judy Greer


In [57]:
people_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30854 entries, 0 to 30856
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title_ID         30854 non-null  object 
 1   Title            30854 non-null  object 
 2   Year             30854 non-null  object 
 3   Runtime_Minutes  30627 non-null  float64
 4   Genres           30819 non-null  object 
 5   Lifetime_Gross   30854 non-null  float64
 6   Name_ID          30854 non-null  object 
 7   Role             30854 non-null  object 
 8   Name             30854 non-null  object 
dtypes: float64(2), object(7)
memory usage: 2.4+ MB


In [58]:
people_master.to_csv(path_or_buf='data/People_Master.csv')