In [20]:
import pandas as pd
import gzip as gz
import sqlite3
from zipfile import ZipFile

In [21]:
# Load the movie budget table directly from the gzipped CSV; pandas handles
# the decompression itself when given the path and compression type.
budgets = pd.read_csv(
    'data/tn.movie_budgets.csv.gz',
    compression='gzip',
    encoding='latin1',
)

budgets

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [22]:
# Unpack the zipped SQLite database and open a connection to it.
imdb = 'data/im.db.zip'
# Bind the archive handle to `archive` rather than `zip`: a `with ... as zip`
# at notebook scope would overwrite the builtin zip() for every later cell.
with ZipFile(imdb, 'r') as archive:
    archive.extractall('data/imdb_unzipped')

conn = sqlite3.connect('data/imdb_unzipped/im.db')

In [23]:
# One row per (movie, person) credit: principals joined to the person's name
# and death year, and to the movie's primary title.
q = """
SELECT pr.movie_id, pr.person_id, pr.category, p.primary_name, p.death_year, m.primary_title AS 'Movie Title'
FROM principals pr
    JOIN persons p
        USING(person_id)
    JOIN movie_basics m
        USING(movie_id)
"""
principals_df = pd.read_sql(sql=q, con=conn)

In [24]:
# Replace NULL death years with the sentinel 0 so those entries can be kept.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment and does not update principals_df
# when pandas Copy-on-Write is active.
principals_df['death_year'] = principals_df['death_year'].fillna(0)

# Keep only rows with no recorded death year and drop the now-constant column
# in the same expression. alive_ppl is therefore a new DataFrame, not a slice
# of principals_df that is mutated in place (which triggers
# SettingWithCopyWarning).
alive_ppl = (
    principals_df
    .loc[principals_df['death_year'] == 0]
    .drop(columns=['death_year'])
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [25]:
# Summarise principals_df: columns, non-null counts, dtypes and memory usage
principals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1027912 entries, 0 to 1027911
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   movie_id      1027912 non-null  object 
 1   person_id     1027912 non-null  object 
 2   category      1027912 non-null  object 
 3   primary_name  1027912 non-null  object 
 4   death_year    1027912 non-null  float64
 5   Movie Title   1027912 non-null  object 
dtypes: float64(1), object(5)
memory usage: 47.1+ MB


In [26]:
# Remove extraneous '$' and ',' symbols from the money columns.
# regex=False makes both replacements literal: '$' is a regex end-of-string
# anchor, and the default of str.replace's `regex` flag differs between
# pandas versions.
for money_col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    budgets[money_col] = (
        budgets[money_col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )

In [27]:
# Cast the cleaned money columns from strings to numeric dtype,
# one column at a time
cash_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
budgets[cash_columns] = budgets[cash_columns].apply(pd.to_numeric, axis=0)

In [28]:
# Parse the release_date strings into pandas datetimes
budgets['release_date'] = budgets['release_date'].pipe(pd.to_datetime)

In [29]:
# Profitability measure: worldwide gross minus production budget
budgets['net_earnings'] = budgets['worldwide_gross'].sub(budgets['production_budget'])
budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,net_earnings
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2351345279
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,635063875
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,1072413963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,999721747


In [30]:
# Restrict budgets to the target range: releases on or after 2012-01-01
is_recent = budgets['release_date'] >= '2012-01-01'
recent = budgets.loc[is_recent]

# Sanity check: the earliest remaining release date
recent['release_date'].min()

Timestamp('2012-01-06 00:00:00')

In [31]:
# Summarise the filtered frame: row count, non-null counts and dtypes
recent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1666 entries, 2 to 5780
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 1666 non-null   int64         
 1   release_date       1666 non-null   datetime64[ns]
 2   movie              1666 non-null   object        
 3   production_budget  1666 non-null   int64         
 4   domestic_gross     1666 non-null   int64         
 5   worldwide_gross    1666 non-null   int64         
 6   net_earnings       1666 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 104.1+ KB


In [32]:
# Attach the living principals to each recent movie by matching titles, then
# discard the duplicate title column that came from the principals side.
# NOTE(review): the join key is the title text only, so distinct movies that
# share a title would be matched to each other's people - confirm acceptable.
combined = (
    recent
    .merge(alive_ppl, how='inner', left_on='movie', right_on='Movie Title')
    .drop(columns=['Movie Title'])
)
combined

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,net_earnings,movie_id,person_id,category,primary_name
0,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,tt6565702,nm0564215,actor,James McAvoy
1,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,tt6565702,nm1055413,actor,Michael Fassbender
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,tt6565702,nm2225369,actress,Jennifer Lawrence
3,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,tt6565702,nm0396558,actor,Nicholas Hoult
4,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,tt6565702,nm1334526,director,Simon Kinberg
...,...,...,...,...,...,...,...,...,...,...,...
18183,81,2015-09-29,A Plague So Pleasant,1400,0,0,-1400,tt2107644,nm4572412,actress,Eva Boehnke
18184,81,2015-09-29,A Plague So Pleasant,1400,0,0,-1400,tt2107644,nm4767160,actor,David Chandler
18185,81,2015-09-29,A Plague So Pleasant,1400,0,0,-1400,tt2107644,nm5497357,actor,Maxwell Moody
18186,81,2015-09-29,A Plague So Pleasant,1400,0,0,-1400,tt2107644,nm4766951,director,Benjamin Roberds


In [33]:
# Summarise the merged frame: row count, non-null counts and dtypes
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18188 entries, 0 to 18187
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 18188 non-null  int64         
 1   release_date       18188 non-null  datetime64[ns]
 2   movie              18188 non-null  object        
 3   production_budget  18188 non-null  int64         
 4   domestic_gross     18188 non-null  int64         
 5   worldwide_gross    18188 non-null  int64         
 6   net_earnings       18188 non-null  int64         
 7   movie_id           18188 non-null  object        
 8   person_id          18188 non-null  object        
 9   category           18188 non-null  object        
 10  primary_name       18188 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(5)
memory usage: 1.7+ MB


In [35]:
# Keep only the title, the money columns and the person's name.
# drop() without inplace returns a new DataFrame, so `combined` keeps all of
# its columns and this cell can be re-run. Binding `streamlined = combined`
# and dropping in place would alias the same object and strip these columns
# from `combined` as well.
streamlined = combined.drop(
    columns=['id', 'movie_id', 'person_id', 'release_date', 'category']
)
streamlined

Unnamed: 0,movie,production_budget,domestic_gross,worldwide_gross,net_earnings,primary_name
0,Dark Phoenix,350000000,42762350,149762350,-200237650,James McAvoy
1,Dark Phoenix,350000000,42762350,149762350,-200237650,Michael Fassbender
2,Dark Phoenix,350000000,42762350,149762350,-200237650,Jennifer Lawrence
3,Dark Phoenix,350000000,42762350,149762350,-200237650,Nicholas Hoult
4,Dark Phoenix,350000000,42762350,149762350,-200237650,Simon Kinberg
...,...,...,...,...,...,...
18183,A Plague So Pleasant,1400,0,0,-1400,Eva Boehnke
18184,A Plague So Pleasant,1400,0,0,-1400,David Chandler
18185,A Plague So Pleasant,1400,0,0,-1400,Maxwell Moody
18186,A Plague So Pleasant,1400,0,0,-1400,Benjamin Roberds


In [36]:
# Group individuals by their mean net earnings
# NOTE(review): grouping is on the display name, so different people who
# share a name would be averaged together - confirm acceptable.
individual = (
    streamlined
    .groupby('primary_name')['net_earnings']
    .mean()
    .reset_index()
)

individual.describe()

Unnamed: 0,net_earnings
count,12779.0
mean,59420930.0
std,142788500.0
min,-200237600.0
25%,-2096670.0
50%,4221211.0
75%,57129150.0
max,1433855000.0


In [None]:
#individual['max_earnings'] = individual['net_earnings'].max()
#individual.info()