In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, validation_curve,cross_val_score,KFold
from sklearn.linear_model import Lasso,LassoCV,LinearRegression,RidgeCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from math import sqrt

## Explore the data

In [2]:
oscars_df = pd.read_csv('oscar_movies_data_2.csv')

In [3]:
#oscars_df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
oscars_df.head()

Unnamed: 0.1,Unnamed: 0,IMDBId,movie title,language,country,runtime (mins),mpaarating,metacritic score,budget,distributionCompany,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,0,tt0091763,Platoon,English,USA,120,R,92.0,6000000.0,Orion Pictures,...,,,,,,,,,,
1,7,tt0082979,Reds,English,USA,195,PG,76.0,32000000.0,Paramount Pictures,...,,,,,,,,,,
2,19,tt0084434,An Officer and a Gentleman,English,USA,124,R,75.0,7500000.0,Paramount Pictures,...,,,,,,,,,,
3,25,tt0119360,In & Out,English,USA,90,PG-13,70.0,35000000.0,Paramount Pictures,...,,,,,,,,,,
4,26,tt0292542,Son of the Bride,Spanish,Argentina,123,R,68.0,11936760.0,Sony Pictures Classics,...,,,,,,,,,,


In [5]:
numerics = ['int16','int32','int64','float16','float32','float64']
numerical_vars = list(oscars_df.select_dtypes(include=numerics).columns)
oscars_data = oscars_df[numerical_vars]
oscars_data.shape

(1136, 93)

In [6]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 93 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Unnamed: 0                                1136 non-null   int64  
 1   runtime (mins)                            1136 non-null   int64  
 2   metacritic score                          1136 non-null   float64
 3   budget                                    1072 non-null   float64
 4   Total_Noms                                1136 non-null   int64  
 5   Award_Year                                1124 non-null   float64
 6   Awards_Best Actor in a Leading Role       479 non-null    float64
 7   Awards_Best Actor in a Supporting Role    479 non-null    float64
 8   Awards_Best Actress in a Leading Role     479 non-null    float64
 9   Awards_Best Art Direction-Set Decoration  479 non-null    float64
 10  Awards_Best Cinematography          

In [7]:
oscars_data['budget'].fillna(oscars_data['budget'].mean(),inplace=True)
oscars_data.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 93 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Unnamed: 0                                1136 non-null   int64  
 1   runtime (mins)                            1136 non-null   int64  
 2   metacritic score                          1136 non-null   float64
 3   budget                                    1136 non-null   float64
 4   Total_Noms                                1136 non-null   int64  
 5   Award_Year                                1136 non-null   float64
 6   Awards_Best Actor in a Leading Role       1136 non-null   float64
 7   Awards_Best Actor in a Supporting Role    1136 non-null   float64
 8   Awards_Best Actress in a Leading Role     1136 non-null   float64
 9   Awards_Best Art Direction-Set Decoration  1136 non-null   float64
 10  Awards_Best Cinematography          

## Baseline Model

In [9]:
baseline_df = oscars_data.drop(columns=['Award_Avg_Score','Director_Avg_Score','Total_Director_Movies',\
                                        'Lead_Actor_Avg_Score','Total_Movies_Lead_Actor',\
                                        'Genre_Avg_Score','Avg_Rating_by_writer','Award_Year','Total_Noms'])

In [37]:
def ols_model(X,y):
    '''
    Scale and print results summary
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train_scale = X.copy()
  
    scale = StandardScaler().fit(X_train_scale)
    
    X_train_scale = scale.transform(X_train_scale)
    model = sm.OLS(y_train, sm.add_constant(X_train_scale))
    results = model.fit()

    return results.summary()

In [42]:
def train_and_test_linear(X,y):
    '''
    Scale data and perform a linear regression on it and cross validation on it 
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    
    X_train_scale = X_train.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scaled = scale.fit_transform(X_train_scale)
    X_test_scaled = scale.transform(X_test_scale)
  
    lm = LinearRegression()
    lm.fit(X_train_scale,y_train)
    y_pred = lm.predict(X_test_scale)
        
    print(f'Linear Regression val R^2: {lm.score(X_train_scale, y_train):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_test,y_pred)):.3f}')
    #return y_pred

In [43]:
def scale_test_and_train_Lasso(X_train,X_test):
    
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)
    
    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scaled = scale.fit_transform(X_train_scale)
    X_val_scaled = scale.transform(X_val_scale)
    X_test_scaled = scale.transform(X_test_scale)
    
    lasso = LassoCV()
    lasso.fit(X_train_scale,y_train)
    
    lasso.score(X_val_scale,y_val)
    
    y_pred = lasso.predict(X_test_scale) 
    
    list(zip(baseline_df.drop(columns='metacritic score'),lm.coef_))
    
    lm.intercept_
    
    print(mean_squared_error(y_test,y_pred))
    print(mean_absolute_error(y_test,y_pred))
    rms = sqrt(mean_squared_error(y_test,y_pred))
    rms
    return X_train, X_test, y_train, y_test

## Run initial regressions on baseline model

In [44]:
X = baseline_df.drop(columns='metacritic score')
y = baseline_df['metacritic score']

In [45]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.339
Linear Regression val RME: 12.634


In [46]:
ols_model(X,y)

ValueError: endog and exog matrices are different sizes

## High condition score add in engineered features

In [32]:
oscars_data

Unnamed: 0.1,Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,0,120,92.0,6.000000e+06,7,1987.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,195,76.0,3.200000e+07,12,1981.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,124,75.0,7.500000e+06,6,1982.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,90,70.0,3.500000e+07,1,1997.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26,123,68.0,1.193676e+07,1,2001.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,3292,110,77.0,4.100000e+07,1,2002.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1132,3293,105,65.0,1.000000e+08,3,1999.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1133,3296,143,40.0,7.000000e+07,3,2004.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1134,3299,107,57.0,4.425875e+07,3,1989.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
oscars_data.corr()

Unnamed: 0.1,Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
Unnamed: 0,1.000000,-0.202851,-0.293779,0.287660,-0.361148,-0.041778,-0.190280,-0.180021,-0.140746,0.083369,...,0.001159,-0.070953,0.020358,0.020144,-0.058848,0.025631,-0.044722,-0.060705,-0.065390,-0.026727
runtime (mins),-0.202851,1.000000,-0.019315,0.209810,0.355896,0.062201,0.132043,0.078923,0.005028,0.086136,...,0.008801,0.018310,-0.017806,-0.013792,0.069414,0.023093,0.102951,-0.039734,0.130759,-0.111100
metacritic score,-0.293779,-0.019315,1.000000,-0.154606,0.327954,-0.005633,0.080313,0.025476,-0.013267,-0.042860,...,0.010307,-0.019014,0.029249,0.037558,-0.002373,-0.044491,0.069278,0.067315,0.023753,0.017157
budget,0.287660,0.209810,-0.154606,1.000000,0.046763,0.034585,-0.100454,-0.087471,-0.070021,-0.020378,...,0.005100,-0.009568,0.059949,-0.017394,0.129906,0.025417,-0.019045,-0.044876,0.016457,-0.057594
Total_Noms,-0.361148,0.355896,0.327954,0.046763,1.000000,0.071295,0.237237,0.119842,0.046157,0.046992,...,-0.002622,0.040079,0.012545,0.024378,0.075626,0.014534,0.041133,-0.037707,0.073881,-0.016481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Writer_Lawrence Kasdan,0.025631,0.023093,-0.044491,0.025417,0.014534,0.003843,-0.025391,-0.021232,-0.020024,0.112292,...,-0.005738,-0.004681,-0.004681,-0.007037,-0.005235,1.000000,-0.005235,-0.005235,-0.005738,-0.008136
Writer_Paul Thomas Anderson,-0.044722,0.102951,0.069278,-0.019045,0.041133,0.008715,-0.021441,0.087874,-0.016908,-0.012200,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,1.000000,-0.004421,-0.004845,-0.006870
Writer_Pedro Almodóvar,-0.060705,-0.039734,0.067315,-0.044876,-0.037707,0.007546,-0.021441,-0.017929,-0.016908,-0.012200,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,-0.004421,1.000000,-0.004845,-0.006870
Writer_Quentin Tarantino,-0.065390,0.130759,0.023753,0.016457,0.073881,0.009788,0.018081,0.028665,-0.018530,-0.013370,...,-0.005310,-0.004332,-0.004332,-0.006512,-0.004845,-0.005738,-0.004845,-0.004845,1.000000,-0.007529


In [34]:
X= oscars_data.drop(columns=['metacritic score'])
y= oscars_data['metacritic score']

In [35]:
split_test_val_linear(X,y)

Linear Regression val R^2: 0.257
Linear Regression val RME: 11.591


In [36]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.523
Model:,OLS,Adj. R-squared:,0.452
Method:,Least Squares,F-statistic:,7.291
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,9.09e-53
Time:,14:36:50,Log-Likelihood:,-2488.8
No. Observations:,681,AIC:,5158.0
Df Residuals:,591,BIC:,5565.0
Df Model:,89,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.8576,0.385,186.773,0.000,71.102,72.613
x1,-1.9313,0.543,-3.556,0.000,-2.998,-0.865
x2,-1.1693,0.542,-2.158,0.031,-2.234,-0.105
x3,-1.0798,0.580,-1.862,0.063,-2.218,0.059
x4,3.9283,0.597,6.582,0.000,2.756,5.100
x5,-0.2264,0.778,-0.291,0.771,-1.754,1.301
x6,0.1560,0.489,0.319,0.750,-0.804,1.116
x7,-0.0676,0.470,-0.144,0.886,-0.991,0.856
x8,0.1528,0.446,0.343,0.732,-0.723,1.029

0,1,2,3
Omnibus:,16.776,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19.156
Skew:,-0.311,Prob(JB):,6.92e-05
Kurtosis:,3.536,Cond. No.,9000000000000000.0


### Going to perform a Lasso Regression on the model due to high P-values and still a high coefficient number