**Import Revenue Data**

In [32]:
import pandas as pd
import numpy as np
# Load the domestic box-office table, rename "title" so it lines up with the
# IMDB column name, and discard the columns that are not used below.
budget_df = (
    pd.read_csv('data/hollywood_domestic_boxoffice.csv', header=0)
    .rename(columns={"title": "primary_title"})
    .drop(columns=["rank", "studio"])
    # Build a "<title> - <year>" key so titles are matched together with their release year
    .assign(title_year=lambda df: df['primary_title'] + ' - ' + df['year'].astype(str))
)
budget_df.head()

Unnamed: 0,primary_title,lifetime_gross,year,title_year
0,Star Wars: The Force Awakens,936662225,2015,Star Wars: The Force Awakens - 2015
1,Avengers: Endgame,857190335,2019,Avengers: Endgame - 2019
2,Avatar,760507625,2009,Avatar - 2009
3,Black Panther,700059566,2018,Black Panther - 2018
4,Avengers: Infinity War,678815482,2018,Avengers: Infinity War - 2018


In [33]:
# Inspect dtypes and non-null counts; the summary below shows no null values in any column.
budget_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16542 entries, 0 to 16541
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   primary_title   16542 non-null  object
 1   lifetime_gross  16542 non-null  int64 
 2   year            16542 non-null  int64 
 3   title_year      16542 non-null  object
dtypes: int64(2), object(2)
memory usage: 517.1+ KB


**Import IMDB Title Data**

In [34]:
# Load the IMDB title basics table.
# Needed from it downstream: primary title and genres.
# Originally intended to be matched on title_year; the active cells below join it
# to the alternate-title table on tconst instead.

imdb_titles_df = pd.read_csv('data/imdb.title.basics.csv.gz')
# Disabled column drop, kept for reference: start_year is used later to build aka_title_year.
# imdb_titles_df = imdb_titles_df.drop(columns = ["start_year", "runtime_minutes"])


In [35]:
# Check dtypes and null counts: runtime_minutes and genres have missing values
# (original_title has a handful as well).
imdb_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [36]:
# titles_and_rev = budget_df.merge(imdb_titles_df, how='left', on='title_year')
# titles_and_rev.head()

In [37]:
# Only ~3k of the ~17k box-office titles matched IMDB data when joining directly on title_year.
# Alternate titles (AKAs) need to be brought in to improve the match rate.
# titles_and_rev.info()

**Import IMDB Title AKAs**

In [38]:
# Load the IMDB alternate-title (AKA) table; it is merged with the title basics further down.
imdb_akas_df = pd.read_csv('data/imdb.title.akas.csv.gz')
imdb_akas_df.head()

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [39]:
# Keep just the title identifier and the alternate-title text; the other columns are not used.
imdb_akas_df = imdb_akas_df.loc[:, ['title_id', 'title']]

In [40]:
# Inspect dtypes and non-null counts; both remaining columns are fully populated.
imdb_akas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331703 entries, 0 to 331702
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   title_id  331703 non-null  object
 1   title     331703 non-null  object
dtypes: object(2)
memory usage: 5.1+ MB


**Merge Title AKAs with Title Basics**

In [41]:
# A movie may have no alternate titles at all. The AKA table is therefore used as
# the left side of the merge in the next cell: every alternate-title row gets the
# full title-basics columns, and titles without any AKA row are not carried over.
# The merge key is tconst.

# Rename the AKA columns to match: title_id -> tconst (the merge key) and
# title -> alternate_title (so it stays distinct from primary_title).
# Reassigning instead of using inplace=True avoids mutating a column subset of
# the loaded table in place and keeps the step chainable.
imdb_akas_df = imdb_akas_df.rename(columns={"title_id": "tconst", "title": "alternate_title"})

In [42]:
# Attach the title-basics columns to the alternate titles, matching on tconst.
# The AKA frame is the left side of a left join, so every AKA row is kept.
titles_with_akas = pd.merge(
    imdb_akas_df,
    imdb_titles_df,
    on='tconst',
    how='left',
)
titles_with_akas.head(20)

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
5,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
6,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
7,tt0369610,Jurski svijet,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
8,tt0369610,Olam ha'Yura,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
9,tt0369610,Jurassic World: Mundo Jurásico,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"


In [43]:
# titles_with_akas.loc[(titles_with_akas['tconst'] == 'tt0063540')]

In [44]:
# Show rows that exactly repeat an earlier row (all columns equal).
# NOTE(review): presumably these arise because AKA rows that differed only in the
# dropped columns (region, language, ...) now look identical — confirm.
titles_with_akas[titles_with_akas.duplicated()]

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
5,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
6,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
11,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
13,tt0369610,Jurassic World 3D,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
14,tt0369610,Jurassic World 3D,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...
331688,tt9705860,Dusan Vukotic hrvatski okarovac,Dusan Vukotic Croatian Oscar Winner,Dusan Vukotic hrvatski okarovac,2011,55.0,Documentary
331690,tt9723084,Anderswo. Allein in Afrika,Anderswo. Allein in Afrika,Anderswo. Allein in Afrika,2018,103.0,"Adventure,Documentary"
331696,tt9755806,Big Shark,Big Shark,Big Shark,2019,,Horror
331698,tt9827784,Sayonara kuchibiru,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"


In [45]:
# Keep a single copy of each fully identical row.
titles_with_akas = titles_with_akas.drop_duplicates()

In [46]:
# Display the de-duplicated frame (the rendered output is truncated to the first and last rows).
titles_with_akas

Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...
331695,tt9755806,Большая Акула,Big Shark,Big Shark,2019,,Horror
331697,tt9827784,Sayonara kuchibiru,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"
331699,tt9827784,Farewell Song,Farewell Song,Sayonara kuchibiru,2019,116.0,"Music,Romance"
331700,tt9880178,La atención,The Attention,La atención,2019,61.0,Documentary


In [47]:
# Build the "<alternate title> - <year>" merge key.
# start_year is converted to text first so it can be concatenated with the title.
# .assign returns a new frame instead of writing columns into the frame returned
# by drop_duplicates(), which can emit SettingWithCopyWarning on pandas versions
# without copy-on-write.
start_year_text = titles_with_akas['start_year'].astype(str)
titles_with_akas = titles_with_akas.assign(
    start_year=start_year_text,
    # Same "<title> - <year>" layout as the title_year key on the box-office table
    aka_title_year=titles_with_akas['alternate_title'] + ' - ' + start_year_text,
)
titles_with_akas.head()


Unnamed: 0,tconst,alternate_title,primary_title,original_title,start_year,runtime_minutes,genres,aka_title_year
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Джурасик свят - 2015
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurashikku warudo - 2015
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World: O Mundo dos Dinossauros - 2015
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",O Mundo dos Dinossauros - 2015
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015


**Merge in Revenue Data for each Title AKA**

In [48]:
# Look up box-office revenue for every alternate title using the "<title> - <year>" keys.
# Left join: alternate titles without a revenue match keep NaN in the revenue columns.
# primary_title exists in both frames, so pandas suffixes it _x (IMDB) and _y (box office).
titles_and_rev = pd.merge(
    titles_with_akas,
    budget_df,
    how='left',
    left_on='aka_title_year',
    right_on='title_year',
)
titles_and_rev.head()

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year
0,tt0369610,Джурасик свят,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Джурасик свят - 2015,,,,
1,tt0369610,Jurashikku warudo,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurashikku warudo - 2015,,,,
2,tt0369610,Jurassic World: O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World: O Mundo dos Dinossauros - 2015,,,,
3,tt0369610,O Mundo dos Dinossauros,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",O Mundo dos Dinossauros - 2015,,,,
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015


In [49]:
# Sanity check: list any fully duplicated rows after the merge (the result is empty).
titles_and_rev[titles_and_rev.duplicated()]

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year


In [50]:
# Still only about 3,200 rows (3,214) carry box-office values after the merge.
# NOTE(review): rows here are title aliases, so the number of distinct movies may be lower — confirm.
titles_and_rev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266869 entries, 0 to 266868
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           266869 non-null  object 
 1   alternate_title  266869 non-null  object 
 2   primary_title_x  266869 non-null  object 
 3   original_title   266859 non-null  object 
 4   start_year       266869 non-null  object 
 5   runtime_minutes  239721 non-null  float64
 6   genres           263489 non-null  object 
 7   aka_title_year   266869 non-null  object 
 8   primary_title_y  3214 non-null    object 
 9   lifetime_gross   3214 non-null    float64
 10  year             3214 non-null    float64
 11  title_year       3214 non-null    object 
dtypes: float64(3), object(9)
memory usage: 26.5+ MB


In [51]:
# Keep only the rows that matched a box-office record (lifetime_gross is present).
no_nas = titles_and_rev[titles_and_rev['lifetime_gross'].notna()]

In [52]:
# Confirm the filtered frame: the revenue columns should now be fully populated.
no_nas.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3214 entries, 4 to 266833
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           3214 non-null   object 
 1   alternate_title  3214 non-null   object 
 2   primary_title_x  3214 non-null   object 
 3   original_title   3214 non-null   object 
 4   start_year       3214 non-null   object 
 5   runtime_minutes  3184 non-null   float64
 6   genres           3210 non-null   object 
 7   aka_title_year   3214 non-null   object 
 8   primary_title_y  3214 non-null   object 
 9   lifetime_gross   3214 non-null   float64
 10  year             3214 non-null   float64
 11  title_year       3214 non-null   object 
dtypes: float64(3), object(9)
memory usage: 326.4+ KB


**Add in principal data & names**

In [53]:
# Load the title principals table and the name lookup table.
principals = pd.read_csv('data/imdb.title.principals.csv.gz')
names = pd.read_csv('data/imdb.name.basics.csv.gz')

# Only the person id and display name are needed from the names table.
names_only = names.loc[:, ['nconst', 'primary_name']]

# Add each principal's name (left join keeps principals without a name match),
# then discard the columns that are not used in the analysis.
principals_names = (
    pd.merge(principals, names_only, how='left', on='nconst')
    .drop(columns=["ordering", "job", "characters"])
)

**Merge Principal Data with Title Data**

In [54]:
# Join the people onto the movies that have revenue, keyed by tconst
# (default inner join: only titles present in both frames remain).
Merged_People_Movie_Data = pd.merge(no_nas, principals_names, on="tconst")

# Fold "actress" into "actor" so both are grouped under one role label.
Merged_People_Movie_Data['category'] = Merged_People_Movie_Data['category'].replace({"actress": "actor"})
Merged_People_Movie_Data.head()


Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year,nconst,category,primary_name
0,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015,nm0189777,producer,Patrick Crowley
1,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015,nm0695435,actor,Chris Pratt
2,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015,nm0397171,actor,Bryce Dallas Howard
3,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015,nm1339223,actor,Ty Simpkins
4,tt0369610,Jurassic World,Jurassic World,Jurassic World,2015,124.0,"Action,Adventure,Sci-Fi",Jurassic World - 2015,Jurassic World,652270625.0,2015.0,Jurassic World - 2015,nm0339460,actor,Judy Greer


In [59]:
def _rows_for_role(role):
    """Return the rows of Merged_People_Movie_Data whose category equals `role`."""
    has_role = Merged_People_Movie_Data['category'] == role
    return Merged_People_Movie_Data.loc[has_role]


# One frame per role of interest
actor_df = _rows_for_role('actor')
director_df = _rows_for_role('director')
cinematographer_df = _rows_for_role('cinematographer')
writer_df = _rows_for_role('writer')

In [56]:
# Rank actors by their average domestic box office and show the 50 highest.
# (A SUM could summarise track record better, since the data is limited.)
#
# The aggregate is stored under its own name so that actor_df keeps the raw
# per-credit rows. Overwriting actor_df here meant that, on a top-to-bottom run,
# any later cell grouping actor_df again would aggregate this already-averaged
# table instead of the raw rows.
#
# numeric_only=True restricts the mean to the numeric columns (runtime_minutes,
# lifetime_gross, year); without it pandas >= 2.0 raises on the text columns.
actor_avg_gross_df = (
    actor_df
    .groupby('primary_name')
    .mean(numeric_only=True)
    .sort_values(by='lifetime_gross', ascending=False)
)
actor_avg_gross_df.head(50)

Unnamed: 0_level_0,runtime_minutes,lifetime_gross,year
primary_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ty Simpkins,124.0,652270600.0,2015.0
Carrie Fisher,152.0,620181400.0,2017.0
Craig T. Nelson,118.0,608581700.0,2018.0
Sarah Vowell,118.0,608581700.0,2018.0
Huck Milner,118.0,608581700.0,2018.0
Diego Luna,133.0,532177300.0,2016.0
Alan Tudyk,133.0,532177300.0,2016.0
Daisy Ridley,125.0,520004300.0,2016.0
Ellen DeGeneres,97.0,486295600.0,2016.0
Kaitlin Olson,97.0,486295600.0,2016.0


In [60]:
# Rank actors by TOTAL domestic box office across their credited movies.
#
# The table is rebuilt from Merged_People_Movie_Data rather than from whatever
# actor_df currently holds, so the result does not depend on which cells ran
# before this one (grouping an already-aggregated actor_df would just return
# that aggregate again).
#
# numeric_only=True keeps the numeric columns only; without it pandas >= 2.0
# would concatenate the text columns. Only lifetime_gross is meaningful as a
# sum: the runtime_minutes and year totals are by-products.
# NOTE(review): if one movie matched revenue through several alternate titles,
# its gross may be counted more than once per actor — confirm tconst/nconst
# pairs are unique before relying on these totals.
actor_df = (
    Merged_People_Movie_Data
    .loc[Merged_People_Movie_Data['category'] == 'actor']
    .groupby('primary_name')
    .sum(numeric_only=True)
    .sort_values(by='lifetime_gross', ascending=False)
)
actor_df.head(50)

Unnamed: 0_level_0,runtime_minutes,lifetime_gross,year
primary_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Robert Downey Jr.,1627.0,4448033000.0,24169.0
Chris Evans,1425.0,3525348000.0,22154.0
Chris Hemsworth,1562.0,2980078000.0,24179.0
Mark Ruffalo,1502.0,2698892000.0,22158.0
Dwayne Johnson,1821.0,2489222000.0,34251.0
Jennifer Lawrence,1692.0,2230092000.0,28195.0
Chris Pratt,895.0,1948200000.0,14101.0
Vin Diesel,1122.0,1864814000.0,18132.0
Scarlett Johansson,1150.0,1847908000.0,20139.0
Mark Wahlberg,2296.0,1784375000.0,40282.0


In [57]:
# Spot-check one actor: list every merged row for Carrie Fisher.
# Only one Star Wars movie shows up; Carrie Fisher has a single actor row in the merged data.
# NOTE(review): attributed to limited coverage in imdb.title.principals — confirm it is
# not caused by the title/year matching instead.
Merged_People_Movie_Data.loc[(Merged_People_Movie_Data['primary_name'] == 'Carrie Fisher')]

Unnamed: 0,tconst,alternate_title,primary_title_x,original_title,start_year,runtime_minutes,genres,aka_title_year,primary_title_y,lifetime_gross,year,title_year,nconst,category,primary_name
7269,tt2527336,Star Wars: The Last Jedi,Star Wars: The Last Jedi,Star Wars: Episode VIII - The Last Jedi,2017,152.0,"Action,Adventure,Fantasy",Star Wars: The Last Jedi - 2017,Star Wars: The Last Jedi,620181382.0,2017.0,Star Wars: The Last Jedi - 2017,nm0000402,actor,Carrie Fisher
