Exploratory Data Analysis: Summary Tables

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [11]:
final_df = pd.read_csv('./data/final_df.csv')
final_df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [12]:
# Change figures to USD millions 
final_df['production_budget_x'] = np.round(final_df['production_budget_x'].map(lambda x: x/1_000_000), 2)
final_df['worldwide_gross'] = np.round(final_df['worldwide_gross'].map(lambda x: x/1_000_000), 2)
final_df['net_revenue'] = np.round(final_df['net_revenue'].map(lambda x: x/1_000_000), 2)
final_df['roi'] = np.round(final_df['roi'], 2)

In [13]:
final_df.head()

Unnamed: 0,release_date,production_budget_x,worldwide_gross,roi,net_revenue,title_year,studio,studio_size
0,2015-07-10,0.1,41.66,416.56,41.56,The Gallows 2015,WB (NL),Large
1,2014-10-03,6.5,256.86,39.52,250.36,Annabelle 2014,WB (NL),Large
2,2016-07-22,5.0,148.81,29.76,143.81,Lights Out 2016,WB (NL),Large
3,2017-08-11,15.0,305.38,20.36,290.38,Annabelle: Creation 2017,WB (NL),Large
4,2017-09-08,35.0,697.46,19.93,662.46,It 2017,WB (NL),Large


**Table 1: Film ROI Distributions By Studio Size**

In [7]:
final_df.groupby('studio_size')['roi'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
studio_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Large,1085.0,4.665825,14.240693,0.0,1.39,2.62,4.51,416.56
Medium,97.0,2.555876,4.369614,0.01,0.43,1.0,2.62,31.02
Small,32.0,1.605625,2.086806,0.01,0.24,0.79,2.4575,9.54


**Table 2: Average ROI and Average Net Revenue on Films Released over 2010-2018 by Studio**

In [8]:
final_df.pivot_table(values = ['roi', 'net_revenue'], index = ['studio', 'studio_size']).sort_values(by = 'roi', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,net_revenue,roi
studio,studio_size,Unnamed: 2_level_1,Unnamed: 3_level_1
WB (NL),Large,172.764324,17.495405
FD,Large,41.065556,12.568889
UTV,Medium,95.010000,12.173333
Orch.,Small,21.350000,9.540000
GrtIndia,Medium,233.500000,8.780000
BH Tilt,Medium,10.436000,7.894000
Uni.,Large,180.411652,6.886174
Par.,Large,133.670972,6.856806
TriS,Large,63.703333,5.917222
MBox,Small,12.700000,5.880000


**Table 3: ROI and Net Revenue of Top 15 Films**

In [9]:
final_df.pivot_table(values = ['roi', 'net_revenue'], index = 'title_year').sort_values(by = 'roi', ascending = False)

Unnamed: 0_level_0,net_revenue,roi
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1
The Gallows 2015,41.56,416.56
The Devil Inside 2012,100.76,101.76
Insidious 2011,98.37,66.58
Unfriended 2015,63.36,64.36
Paranormal Activity 2 2010,174.51,59.17
Split 2017,273.96,55.79
Get Out 2017,250.37,51.07
Chernobyl Diaries 2012,41.41,42.41
Paranormal Activity 3 2011,202.04,41.41
Annabelle 2014,250.36,39.52


**Extra**

**Question**
What are the characteristics of large, medium, and small studios?

In [36]:
# From 2010-2018, the average net revenue, production budget and ROI for in millions of USD
np.round(final_df.pivot_table(values = ['net_revenue', 'production_budget_x', 'roi'], index = 'studio_size'), 2)

Unnamed: 0_level_0,net_revenue,production_budget_x,roi
studio_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Large,119.04,51.84,4.67
Medium,10.8,15.07,2.56
Small,1.43,4.5,1.61


In [37]:
# From 2010-2018, the median net revenue, production budget and ROI for in millions of USD
np.round(final_df.pivot_table(values = ['net_revenue', 'production_budget_x', 'roi'], index = 'studio_size', aggfunc = 'median'), 2)

Unnamed: 0_level_0,net_revenue,production_budget_x,roi
studio_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Large,42.91,30.0,2.62
Medium,-0.08,11.0,1.0
Small,-0.36,4.0,0.79


In [38]:
final_df['studio_size'].value_counts()

Large     1085
Medium      97
Small       32
Name: studio_size, dtype: int64

**Question** Which studios have the highest ROI?

In [93]:
final_df.pivot_table(values = 'roi', index = 'studio_size', aggfunc = ['min', 'mean', 'median', 'max'])

Unnamed: 0_level_0,min,mean,median,max
Unnamed: 0_level_1,roi,roi,roi,roi
studio_size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Large,0.000138,4.665849,2.619925,416.56474
Medium,0.005275,2.556121,0.995556,31.016276
Small,0.009503,1.606314,0.792297,9.538213


**Question** What are the characteristics of production budgets of differently sized studios?

In [95]:
np.round(final_df.pivot_table(values = 'production_budget_x', index = 'studio_size', aggfunc = ['min', 'mean', 'median', 'max']), 2)

Unnamed: 0_level_0,min,mean,median,max
Unnamed: 0_level_1,production_budget_x,production_budget_x,production_budget_x,production_budget_x
studio_size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Large,0.05,51.84,30.0,410.6
Medium,0.25,15.07,11.0,77.5
Small,0.6,4.5,4.0,9.5


In [106]:
# The above ideally can be graphed, comparing the ranges of production budgets and where the top films fall

**Question** What are the production budget sizes and gross sales of the films with the top ROIs?

In [70]:
final_df = final_df.sort_values(by = 'roi', ascending = False)
final_df.reset_index()

In [77]:
final_df.loc[0:15, ['production_budget_x', 'worldwide_gross', 'roi', 'title_year', 'studio', 'studio_size']]

Unnamed: 0,production_budget_x,worldwide_gross,roi,title_year,studio,studio_size
0,0.10,41.656474,416.564740,The Gallows 2015,WB (NL),Large
37,1.00,101.759490,101.759490,The Devil Inside 2012,Par.,Large
109,1.50,99.870886,66.580591,Insidious 2011,FD,Large
118,1.00,64.364198,64.364198,Unfriended 2015,Uni.,Large
38,3.00,177.512032,59.170677,Paranormal Activity 2 2010,Par.,Large
119,5.00,278.964806,55.792961,Split 2017,Uni.,Large
120,5.00,255.367951,51.073590,Get Out 2017,Uni.,Large
233,1.00,42.411721,42.411721,Chernobyl Diaries 2012,WB,Large
39,5.00,207.039844,41.407969,Paranormal Activity 3 2011,Par.,Large
1,6.50,256.862920,39.517372,Annabelle 2014,WB (NL),Large
