In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, validation_curve,cross_val_score,KFold
from sklearn.linear_model import Lasso,LassoCV,LinearRegression,RidgeCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from math import sqrt

## Explore the data

In [2]:
oscars_df = pd.read_csv('oscar_movies_data_2.csv')

In [3]:
oscars_df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
oscars_df.head()

Unnamed: 0,IMDBId,movie title,language,country,runtime (mins),mpaarating,metacritic score,budget,distributionCompany,Awards,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,tt0091763,Platoon,English,USA,120,R,92.0,6000000.0,Orion Pictures,Best Actor in a Supporting Role,...,,,,,,,,,,
1,tt0082979,Reds,English,USA,195,PG,76.0,32000000.0,Paramount Pictures,Best Actor in a Leading Role,...,,,,,,,,,,
2,tt0084434,An Officer and a Gentleman,English,USA,124,R,75.0,7500000.0,Paramount Pictures,Best Actor in a Supporting Role,...,,,,,,,,,,
3,tt0119360,In & Out,English,USA,90,PG-13,70.0,35000000.0,Paramount Pictures,Best Actress in a Supporting Role,...,,,,,,,,,,
4,tt0292542,Son of the Bride,Spanish,Argentina,123,R,68.0,11936760.0,Sony Pictures Classics,Best Foreign Language Film,...,,,,,,,,,,


In [5]:
numerics = ['int16','int32','int64','float16','float32','float64']
numerical_vars = list(oscars_df.select_dtypes(include=numerics).columns)
oscars_data = oscars_df[numerical_vars]
oscars_data.shape

(1136, 92)

In [6]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 92 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   runtime (mins)                            1136 non-null   int64  
 1   metacritic score                          1136 non-null   float64
 2   budget                                    1072 non-null   float64
 3   Total_Noms                                1136 non-null   int64  
 4   Award_Year                                1124 non-null   float64
 5   Awards_Best Actor in a Leading Role       479 non-null    float64
 6   Awards_Best Actor in a Supporting Role    479 non-null    float64
 7   Awards_Best Actress in a Leading Role     479 non-null    float64
 8   Awards_Best Art Direction-Set Decoration  479 non-null    float64
 9   Awards_Best Cinematography                479 non-null    float64
 10  Awards_Best Director                

In [7]:
oscars_data['budget'].fillna(oscars_data['budget'].mean(),inplace=True)
oscars_data.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 92 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   runtime (mins)                            1136 non-null   int64  
 1   metacritic score                          1136 non-null   float64
 2   budget                                    1136 non-null   float64
 3   Total_Noms                                1136 non-null   int64  
 4   Award_Year                                1136 non-null   float64
 5   Awards_Best Actor in a Leading Role       1136 non-null   float64
 6   Awards_Best Actor in a Supporting Role    1136 non-null   float64
 7   Awards_Best Actress in a Leading Role     1136 non-null   float64
 8   Awards_Best Art Direction-Set Decoration  1136 non-null   float64
 9   Awards_Best Cinematography                1136 non-null   float64
 10  Awards_Best Director                

## Baseline Model

In [9]:
baseline_df = oscars_data.drop(columns=['Award_Avg_Score','Director_Avg_Score','Total_Director_Movies',\
                                        'Lead_Actor_Avg_Score','Total_Movies_Lead_Actor',\
                                        'Genre_Avg_Score','Avg_Rating_by_writer','Award_Year','Total_Noms'])

In [10]:
def ols_model(X,y):
    '''
    Scale and print results summary
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train_scale = X_train.copy()
  
    scale = StandardScaler().fit(X_train_scale)
    
    X_train_scale = scale.transform(X_train_scale)
    model = sm.OLS(y_train, sm.add_constant(X_train_scale))
    results = model.fit()

    return results.summary()

In [28]:
def train_and_test_linear(X,y):
    '''
    Scale data and perform a linear regression on it and cross validation on it 
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=10)
    
    X_train_scale = X_train.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scaled = scale.fit_transform(X_train_scale)
    X_test_scaled = scale.transform(X_test_scale)
  
    lm = LinearRegression()
    lm.fit(X_train_scale,y_train)
    y_pred = lm.predict(X_test_scale)
        
    print(f'Linear Regression val R^2: {lm.score(X_train_scale, y_train):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_test,y_pred)):.3f}')
    #return y_pred

In [41]:
def scale_test_and_train_Lasso(X,y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=10)
    
    X_train_scale = X_train.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    
    lasso = LassoCV()
    lasso.fit(X_train_scale,y_train)
    
    lasso.score(X_train_scale,y_train)
    
    y_pred = lasso.predict(X_test_scale)
    
    
    print(f'Linear Regression val R^2: {lasso.score(X_train_scale, y_train):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_test,y_pred)):.3f}')
    
    return lasso.coef_

## Run initial regressions on baseline model

In [42]:
X = baseline_df.drop(columns='metacritic score')
y = baseline_df['metacritic score']

In [43]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.337
Linear Regression val RME: 13.547


In [44]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.317
Model:,OLS,Adj. R-squared:,0.252
Method:,Least Squares,F-statistic:,4.863
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,2.35e-32
Time:,15:06:13,Log-Likelihood:,-3480.2
No. Observations:,908,AIC:,7120.0
Df Residuals:,828,BIC:,7505.0
Df Model:,79,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.9559,0.388,185.244,0.000,71.194,72.718
x1,0.5820,0.523,1.113,0.266,-0.444,1.608
x2,-0.4601,0.561,-0.821,0.412,-1.561,0.640
x3,2.2643,0.443,5.109,0.000,1.394,3.134
x4,1.5582,0.431,3.617,0.000,0.713,2.404
x5,1.1254,0.431,2.612,0.009,0.280,1.971
x6,0.3634,0.416,0.873,0.383,-0.454,1.181
x7,0.8721,0.404,2.161,0.031,0.080,1.664
x8,0.9201,0.418,2.202,0.028,0.100,1.740

0,1,2,3
Omnibus:,43.379,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.095
Skew:,-0.462,Prob(JB):,1.09e-12
Kurtosis:,3.776,Cond. No.,1.17e+16


## High condition score add in engineered features

In [45]:
oscars_data

Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,Awards_Best Cinematography,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,120,92.0,6.000000e+06,7,1987.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,195,76.0,3.200000e+07,12,1981.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,124,75.0,7.500000e+06,6,1982.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,90,70.0,3.500000e+07,1,1997.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123,68.0,1.193676e+07,1,2001.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,110,77.0,4.100000e+07,1,2002.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1132,105,65.0,1.000000e+08,3,1999.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1133,143,40.0,7.000000e+07,3,2004.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1134,107,57.0,4.425875e+07,3,1989.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
oscars_data.corr()

Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,Awards_Best Cinematography,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
runtime (mins),1.000000,-0.019315,0.209810,0.355896,0.062201,0.132043,0.078923,0.005028,0.086136,0.033169,...,0.008801,0.018310,-0.017806,-0.013792,0.069414,0.023093,0.102951,-0.039734,0.130759,-0.111100
metacritic score,-0.019315,1.000000,-0.154606,0.327954,-0.005633,0.080313,0.025476,-0.013267,-0.042860,-0.028671,...,0.010307,-0.019014,0.029249,0.037558,-0.002373,-0.044491,0.069278,0.067315,0.023753,0.017157
budget,0.209810,-0.154606,1.000000,0.046763,0.034585,-0.100454,-0.087471,-0.070021,-0.020378,-0.026026,...,0.005100,-0.009568,0.059949,-0.017394,0.129906,0.025417,-0.019045,-0.044876,0.016457,-0.057594
Total_Noms,0.355896,0.327954,0.046763,1.000000,0.071295,0.237237,0.119842,0.046157,0.046992,0.026253,...,-0.002622,0.040079,0.012545,0.024378,0.075626,0.014534,0.041133,-0.037707,0.073881,-0.016481
Award_Year,0.062201,-0.005633,0.034585,0.071295,1.000000,0.017762,0.014441,0.015956,0.008851,0.006575,...,0.009254,0.010692,0.006631,0.012536,0.009039,0.003843,0.008715,0.007546,0.009788,0.007576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Writer_Lawrence Kasdan,0.023093,-0.044491,0.025417,0.014534,0.003843,-0.025391,-0.021232,-0.020024,0.112292,-0.010270,...,-0.005738,-0.004681,-0.004681,-0.007037,-0.005235,1.000000,-0.005235,-0.005235,-0.005738,-0.008136
Writer_Paul Thomas Anderson,0.102951,0.069278,-0.019045,0.041133,0.008715,-0.021441,0.087874,-0.016908,-0.012200,-0.008672,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,1.000000,-0.004421,-0.004845,-0.006870
Writer_Pedro Almodóvar,-0.039734,0.067315,-0.044876,-0.037707,0.007546,-0.021441,-0.017929,-0.016908,-0.012200,-0.008672,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,-0.004421,1.000000,-0.004845,-0.006870
Writer_Quentin Tarantino,0.130759,0.023753,0.016457,0.073881,0.009788,0.018081,0.028665,-0.018530,-0.013370,-0.009504,...,-0.005310,-0.004332,-0.004332,-0.006512,-0.004845,-0.005738,-0.004845,-0.004845,1.000000,-0.007529


In [47]:
X= oscars_data.drop(columns='metacritic score')
y= oscars_data['metacritic score']

In [48]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.496
Linear Regression val RME: 12.610


In [49]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.476
Model:,OLS,Adj. R-squared:,0.419
Method:,Least Squares,F-statistic:,8.446
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,7.02e-69
Time:,15:06:18,Log-Likelihood:,-3360.1
No. Observations:,908,AIC:,6898.0
Df Residuals:,819,BIC:,7326.0
Df Model:,88,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.9559,0.342,210.300,0.000,71.284,72.628
x1,-1.0436,0.481,-2.170,0.030,-1.988,-0.100
x2,-0.8966,0.505,-1.775,0.076,-1.888,0.095
x3,4.5289,0.526,8.618,0.000,3.497,5.560
x4,-0.4230,0.685,-0.618,0.537,-1.767,0.921
x5,0.5335,0.429,1.244,0.214,-0.308,1.375
x6,0.1762,0.399,0.442,0.659,-0.607,0.960
x7,0.6696,0.388,1.727,0.085,-0.091,1.431
x8,-0.3599,0.374,-0.963,0.336,-1.093,0.374

0,1,2,3
Omnibus:,37.198,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.143
Skew:,-0.414,Prob(JB):,5.8e-11
Kurtosis:,3.749,Cond. No.,4190000000000000.0


### Going to perform a Lasso Regression on the model due to high P-values and still a high coefficient number

In [50]:
lasso = scale_test_and_train_Lasso(X,y)

Linear Regression val R^2: 0.465
Linear Regression val RME: 12.217


In [51]:
list(zip(oscars_data.drop(columns='metacritic score'),lasso))

[('runtime (mins)', -0.17472467502870734),
 ('budget', -1.0661970430072607),
 ('Total_Noms', 3.6081355284697754),
 ('Award_Year', -0.0),
 ('Awards_Best Actor in a Leading Role', 0.05912693609249956),
 ('Awards_Best Actor in a Supporting Role', 0.0),
 ('Awards_Best Actress in a Leading Role', 0.0),
 ('Awards_Best Art Direction-Set Decoration', -0.05111838723649458),
 ('Awards_Best Cinematography', -0.0),
 ('Awards_Best Director', 0.18726256823456555),
 ('Awards_Best Film Editing', -0.0),
 ('Awards_Best Motion Picture of the Year', 0.0),
 ('Awards_Best Picture', -0.0),
 ('Awards_Best Sound', -0.9843416481123445),
 ('Award_Avg_Score', 4.112856690258802),
 ('Director_Avg_Score', 2.3814336285386557),
 ('Total_Director_Movies', 0.1849894603041282),
 ('Lead_Actor_Avg_Score', 0.0),
 ('Total_Movies_Lead_Actor', -1.3546006762349094),
 ('Genre_Avg_Score', -0.0),
 ('Avg_Rating_by_writer', 0.8283719094703328),
 ('Buena Vista Pictures', -0.30611012888371836),
 ('Columbia Pictures', -0.68334103987670

## Pull out non zeroed out columns perform a simple linear regression and look at condition number

In [52]:
smaller_df = oscars_data[['metacritic score','runtime (mins)','budget','Total_Noms','Awards_Best Actor in a Leading Role',\
                          'Awards_Best Director','Awards_Best Art Direction-Set Decoration','Awards_Best Sound','Award_Avg_Score',\
                          'Director_Avg_Score','Total_Director_Movies','Total_Movies_Lead_Actor','Avg_Rating_by_writer',\
                          'Buena Vista Pictures','Columbia Pictures','Paramount Pictures','Sony Pictures Classics',\
                          'Universal Pictures','Walt Disney Studios Motion Pictures','Warner Bros.','Genre_Action','Genre_Adventure','Genre_Animation',\
                          'Genre_Crime','Genre_Documentary','Genre_Horror','Subgenre_Music','Subgenre_Romance','Subgenre_Thriller','Director_Martin Scorsese'\
                        ,'Director_Ron Howard','Director_Tim Burton','Lead_Actor_Michael Douglas','Lead_Actor_Robin Williams','Lead_Actor_Tom Hanks','Summer',\
                          'G','Not Rated','PG','PG-13','Writer_Anthony McCarten','Writer_Paul Thomas Anderson','Writer_Pedro Almodóvar']]