In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, validation_curve,cross_val_score,KFold
from sklearn.linear_model import Lasso,LassoCV,LinearRegression,RidgeCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from math import sqrt

## Explore the data

In [2]:
oscars_df = pd.read_csv('oscar_movies_data_2.csv')

In [3]:
oscars_df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
oscars_df.head()

Unnamed: 0,IMDBId,movie title,language,country,runtime (mins),mpaarating,metacritic score,budget,distributionCompany,Awards,...,Writer_Aaron Sorkin,Writer_Eric Roth,Writer_Henry James,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,tt0091763,Platoon,English,USA,120,R,92.0,6000000.0,Orion Pictures,Best Actor in a Supporting Role,...,,,,,,,,,,
1,tt0082979,Reds,English,USA,195,PG,76.0,32000000.0,Paramount Pictures,Best Actor in a Leading Role,...,,,,,,,,,,
2,tt0084434,An Officer and a Gentleman,English,USA,124,R,75.0,7500000.0,Paramount Pictures,Best Actor in a Supporting Role,...,,,,,,,,,,
3,tt0119360,In & Out,English,USA,90,PG-13,70.0,35000000.0,Paramount Pictures,Best Actress in a Supporting Role,...,,,,,,,,,,
4,tt0292542,Son of the Bride,Spanish,Argentina,123,R,68.0,11936760.0,Sony Pictures Classics,Best Foreign Language Film,...,,,,,,,,,,


In [5]:
numerics = ['int16','int32','int64','float16','float32','float64']
numerical_vars = list(oscars_df.select_dtypes(include=numerics).columns)
oscars_data = oscars_df[numerical_vars]
oscars_data.shape

(1136, 90)

In [6]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 90 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   runtime (mins)                            1136 non-null   int64  
 1   metacritic score                          1136 non-null   float64
 2   budget                                    1072 non-null   float64
 3   Total_Noms                                1136 non-null   int64  
 4   Award_Year                                1124 non-null   float64
 5   Awards_Best Actor in a Leading Role       479 non-null    float64
 6   Awards_Best Actor in a Supporting Role    479 non-null    float64
 7   Awards_Best Actress in a Leading Role     479 non-null    float64
 8   Awards_Best Art Direction-Set Decoration  479 non-null    float64
 9   Awards_Best Cinematography                479 non-null    float64
 10  Awards_Best Director                

In [7]:
oscars_data['budget'].fillna(oscars_data['budget'].mean(),inplace=True)
oscars_data.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 90 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   runtime (mins)                            1136 non-null   int64  
 1   metacritic score                          1136 non-null   float64
 2   budget                                    1136 non-null   float64
 3   Total_Noms                                1136 non-null   int64  
 4   Award_Year                                1136 non-null   float64
 5   Awards_Best Actor in a Leading Role       1136 non-null   float64
 6   Awards_Best Actor in a Supporting Role    1136 non-null   float64
 7   Awards_Best Actress in a Leading Role     1136 non-null   float64
 8   Awards_Best Art Direction-Set Decoration  1136 non-null   float64
 9   Awards_Best Cinematography                1136 non-null   float64
 10  Awards_Best Director                

## Baseline Model

In [9]:
baseline_df = oscars_data.drop(columns=['Director_Avg_Score','Total_Director_Movies',\
                                        'Lead_Actor_Avg_Score','Total_Movies_Lead_Actor',\
                                        'Avg_Rating_by_writer','Award_Year','Total_Noms'])

In [10]:
def ols_model(X,y):
    '''
    Scale and print results summary
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train_scale = X_train.copy()
  
    scale = StandardScaler().fit(X_train_scale)
    
    X_train_scale = scale.transform(X_train_scale)
    model = sm.OLS(y_train, sm.add_constant(X_train_scale))
    results = model.fit()

    return results.summary()

In [11]:
def train_and_test_linear(X,y):
    '''
    Scale data and perform a linear regression on it and cross validation on it 
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=10)
    
    X_train_scale = X_train.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scaled = scale.fit_transform(X_train_scale)
    X_test_scaled = scale.transform(X_test_scale)
  
    lm = LinearRegression()
    lm.fit(X_train_scale,y_train)
    y_pred = lm.predict(X_test_scale)
        
    print(f'Linear Regression val R^2: {lm.score(X_train_scale, y_train):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_test,y_pred)):.3f}')
    #return y_pred

In [12]:
def scale_test_and_train_Lasso(X,y):
    
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,random_state=10)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)
    
    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    X_val_scale = scale.transform(X_val_scale)
    
    lasso = LassoCV()
    lasso.fit(X_train_scale,y_train)
    
    lasso.score(X_val_scale,y_val)
    
    y_pred = lasso.predict(X_val_scale)
    
    
    print(f'Linear Regression val R^2: {lasso.score(X_train_scale, y_train):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_val,y_pred)):.3f}')
    
    return lasso.coef_

In [13]:
def scale_test_and_train_ridge(X,y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=10)
    
    X_train_scale = X_train.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    
    ridge = RidgeCV(cv=5)
    ridge.fit(X_train_scale,y_train)
    
    ridge.score(X_train_scale,y_train)
    
    y_pred = ridge.predict(X_test_scale)
    
    
    print(f'Ridge Regression val R^2: {ridge.score(X_train_scale, y_train):.3f}')
    print(f'Ridge Regression val RME: {sqrt(mean_squared_error(y_test,y_pred)):.3f}')

## Run initial regressions on baseline model

In [14]:
X = baseline_df.drop(columns='metacritic score')
y = baseline_df['metacritic score']

In [15]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.329
Linear Regression val RME: 13.470


In [16]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.311
Model:,OLS,Adj. R-squared:,0.245
Method:,Least Squares,F-statistic:,4.725
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,4.62e-31
Time:,15:52:27,Log-Likelihood:,-3484.3
No. Observations:,908,AIC:,7129.0
Df Residuals:,828,BIC:,7513.0
Df Model:,79,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.9559,0.390,184.411,0.000,71.190,72.722
x1,0.5455,0.526,1.037,0.300,-0.487,1.578
x2,-0.3159,0.563,-0.561,0.575,-1.421,0.789
x3,2.3301,0.446,5.227,0.000,1.455,3.205
x4,1.5429,0.433,3.560,0.000,0.692,2.393
x5,1.1257,0.433,2.597,0.010,0.275,1.976
x6,0.4238,0.419,1.011,0.312,-0.399,1.247
x7,0.8638,0.405,2.132,0.033,0.069,1.659
x8,0.8990,0.420,2.139,0.033,0.074,1.724

0,1,2,3
Omnibus:,45.734,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.886
Skew:,-0.482,Prob(JB):,2.69e-13
Kurtosis:,3.775,Cond. No.,1.01e+16


## High condition score add in engineered features

In [17]:
oscars_data

Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,Awards_Best Cinematography,...,Writer_Aaron Sorkin,Writer_Eric Roth,Writer_Henry James,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,120,92.0,6.000000e+06,7,1987.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,195,76.0,3.200000e+07,12,1981.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,124,75.0,7.500000e+06,6,1982.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,90,70.0,3.500000e+07,1,1997.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123,68.0,1.193676e+07,1,2001.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,110,77.0,4.100000e+07,1,2002.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1132,105,65.0,1.000000e+08,3,1999.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1133,143,40.0,7.000000e+07,3,2004.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1134,107,57.0,4.425875e+07,3,1989.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
oscars_data.corr()

Unnamed: 0,runtime (mins),metacritic score,budget,Total_Noms,Award_Year,Awards_Best Actor in a Leading Role,Awards_Best Actor in a Supporting Role,Awards_Best Actress in a Leading Role,Awards_Best Art Direction-Set Decoration,Awards_Best Cinematography,...,Writer_Aaron Sorkin,Writer_Eric Roth,Writer_Henry James,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
runtime (mins),1.000000,-0.019315,0.209810,0.355896,0.062201,0.132043,0.078923,0.005028,0.086136,0.033169,...,0.008801,0.080321,-0.015080,-0.013792,0.069414,0.023093,0.102951,-0.039734,0.130759,-0.111100
metacritic score,-0.019315,1.000000,-0.154606,0.327954,-0.005633,0.080313,0.025476,-0.013267,-0.042860,-0.028671,...,0.010307,-0.025596,-0.034371,0.037558,-0.002373,-0.044491,0.069278,0.067315,0.023753,0.017157
budget,0.209810,-0.154606,1.000000,0.046763,0.034585,-0.100454,-0.087471,-0.070021,-0.020378,-0.026026,...,0.005100,0.049677,-0.011493,-0.017394,0.129906,0.025417,-0.019045,-0.044876,0.016457,-0.057594
Total_Noms,0.355896,0.327954,0.046763,1.000000,0.071295,0.237237,0.119842,0.046157,0.046992,0.026253,...,-0.002622,0.067612,-0.014988,0.024378,0.075626,0.014534,0.041133,-0.037707,0.073881,-0.016481
Award_Year,0.062201,-0.005633,0.034585,0.071295,1.000000,0.017762,0.014441,0.015956,0.008851,0.006575,...,0.009254,0.008951,-0.140809,0.012536,0.009039,0.003843,0.008715,0.007546,0.009788,0.007576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Writer_Lawrence Kasdan,0.023093,-0.044491,0.025417,0.014534,0.003843,-0.025391,-0.021232,-0.020024,0.112292,-0.010270,...,-0.005738,-0.004681,-0.004681,-0.007037,-0.005235,1.000000,-0.005235,-0.005235,-0.005738,-0.008136
Writer_Paul Thomas Anderson,0.102951,0.069278,-0.019045,0.041133,0.008715,-0.021441,0.087874,-0.016908,-0.012200,-0.008672,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,1.000000,-0.004421,-0.004845,-0.006870
Writer_Pedro Almodóvar,-0.039734,0.067315,-0.044876,-0.037707,0.007546,-0.021441,-0.017929,-0.016908,-0.012200,-0.008672,...,-0.004845,-0.003952,-0.003952,-0.005942,-0.004421,-0.005235,-0.004421,1.000000,-0.004845,-0.006870
Writer_Quentin Tarantino,0.130759,0.023753,0.016457,0.073881,0.009788,0.018081,0.028665,-0.018530,-0.013370,-0.009504,...,-0.005310,-0.004332,-0.004332,-0.006512,-0.004845,-0.005738,-0.004845,-0.004845,1.000000,-0.007529


In [21]:
X= oscars_data.drop(columns=['metacritic score'])
y= oscars_data['metacritic score']

In [22]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.434
Linear Regression val RME: 13.060


In [23]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.416
Model:,OLS,Adj. R-squared:,0.355
Method:,Least Squares,F-statistic:,6.803
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,4.89e-53
Time:,15:52:47,Log-Likelihood:,-3409.0
No. Observations:,908,AIC:,6992.0
Df Residuals:,821,BIC:,7411.0
Df Model:,86,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.9559,0.361,199.513,0.000,71.248,72.664
x1,-0.6854,0.505,-1.357,0.175,-1.676,0.306
x2,-0.6470,0.532,-1.215,0.225,-1.692,0.398
x3,4.8729,0.553,8.813,0.000,3.788,5.958
x4,-0.0776,0.725,-0.107,0.915,-1.500,1.345
x5,0.9629,0.449,2.145,0.032,0.082,1.844
x6,0.2946,0.421,0.700,0.484,-0.531,1.120
x7,0.5250,0.408,1.285,0.199,-0.277,1.327
x8,-0.0224,0.392,-0.057,0.954,-0.793,0.748

0,1,2,3
Omnibus:,37.197,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43.817
Skew:,-0.445,Prob(JB):,3.06e-10
Kurtosis:,3.606,Cond. No.,5480000000000000.0


### Going to perform a Lasso Regression on the model due to high P-values and still a high coefficient number

In [24]:
lasso = scale_test_and_train_Lasso(X,y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Linear Regression val R^2: 0.411
Linear Regression val RME: 11.294


In [25]:
list(zip(oscars_data.drop(columns='metacritic score'),lasso))

[('runtime (mins)', -0.0),
 ('budget', -0.4969680606417507),
 ('Total_Noms', 4.607823786248296),
 ('Award_Year', -0.0),
 ('Awards_Best Actor in a Leading Role', 0.09046779393389977),
 ('Awards_Best Actor in a Supporting Role', 0.0),
 ('Awards_Best Actress in a Leading Role', 0.0),
 ('Awards_Best Art Direction-Set Decoration', -0.0),
 ('Awards_Best Cinematography', -0.0),
 ('Awards_Best Director', 0.07982141996591134),
 ('Awards_Best Film Editing', -0.0),
 ('Awards_Best Motion Picture of the Year', 0.569503441442804),
 ('Awards_Best Picture', 0.0),
 ('Awards_Best Sound', -0.8240475322213064),
 ('Director_Avg_Score', 2.9141025399453406),
 ('Total_Director_Movies', 0.0),
 ('Lead_Actor_Avg_Score', 0.0),
 ('Total_Movies_Lead_Actor', -1.4242656912025542),
 ('Avg_Rating_by_writer', 0.9007655349945209),
 ('Buena Vista Pictures', -0.5863591818966543),
 ('Columbia Pictures', -0.6416805685464253),
 ('Fox Searchlight Pictures', 0.011484438968058326),
 ('Miramax', 0.0),
 ('Paramount Pictures', -1.1

## Pull out non zeroed out columns perform a simple linear regression and look at condition number

In [76]:
smaller_df = oscars_data[['metacritic score','runtime (mins)','budget','Total_Noms','Awards_Best Actor in a Leading Role',\
                          'Awards_Best Director','Awards_Best Art Direction-Set Decoration','Awards_Best Sound','Award_Avg_Score',\
                          'Director_Avg_Score','Total_Director_Movies','Total_Movies_Lead_Actor','Avg_Rating_by_writer',\
                          'Buena Vista Pictures','Columbia Pictures','Paramount Pictures','Sony Pictures Classics',\
                          'Universal Pictures','Walt Disney Studios Motion Pictures','Warner Bros.','Genre_Action','Genre_Adventure','Genre_Animation',\
                          'Genre_Crime','Genre_Documentary','Genre_Horror','Subgenre_Music','Subgenre_Romance','Subgenre_Thriller','Director_Martin Scorsese'\
                        ,'Director_Ron Howard','Director_Tim Burton','Lead_Actor_Michael Douglas','Lead_Actor_Robin Williams','Lead_Actor_Tom Hanks','Summer',\
                          'G','Not Rated','PG','PG-13','Writer_Anthony McCarten','Writer_Paul Thomas Anderson','Writer_Pedro Almodóvar']]

In [77]:
smaller_df.corr()

Unnamed: 0,metacritic score,runtime (mins),budget,Total_Noms,Awards_Best Actor in a Leading Role,Awards_Best Director,Awards_Best Art Direction-Set Decoration,Awards_Best Sound,Award_Avg_Score,Director_Avg_Score,...,Lead_Actor_Robin Williams,Lead_Actor_Tom Hanks,Summer,G,Not Rated,PG,PG-13,Writer_Anthony McCarten,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar
metacritic score,1.0,-0.019315,-0.154606,0.327954,0.080313,0.068001,-0.04286,-0.129017,0.461919,0.2747819,...,-0.098025,0.041248,0.009943,0.042005,0.082425,-0.087342,-0.125632,-0.019014,0.069278,0.067315
runtime (mins),-0.019315,1.0,0.20981,0.355896,0.132043,0.02314,0.086136,0.033011,0.105799,-0.005963884,...,0.014891,0.013655,-0.081928,-0.192144,-0.116102,-0.127112,0.067625,0.01831,0.102951,-0.039734
budget,-0.154606,0.20981,1.0,0.046763,-0.100454,-0.052227,-0.020378,0.044777,-0.089565,-0.04937678,...,0.00028,0.067131,0.073144,0.092223,-0.060157,0.116777,0.202384,-0.009568,-0.019045,-0.044876
Total_Noms,0.327954,0.355896,0.046763,1.0,0.237237,-0.032434,0.046992,-0.054768,0.274428,0.06914907,...,-0.016094,0.06695,-0.086312,-0.04716,-0.110333,-0.005239,0.01054,0.040079,0.041133,-0.037707
Awards_Best Actor in a Leading Role,0.080313,0.132043,-0.100454,0.237237,1.0,-0.033319,-0.059168,-0.033319,0.056577,-0.009187256,...,0.073164,0.075445,-0.061732,-0.038954,-0.056641,0.023451,-0.087654,-0.019169,-0.021441,-0.021441
Awards_Best Director,0.068001,0.02314,-0.052227,-0.032434,-0.033319,1.0,-0.018959,-0.010676,0.075682,0.03094825,...,-0.009234,-0.013476,-0.031861,-0.018149,-0.018149,-0.027392,-0.062878,-0.006142,-0.00687,0.123203
Awards_Best Art Direction-Set Decoration,-0.04286,0.086136,-0.020378,0.046992,-0.059168,-0.018959,1.0,-0.018959,-0.020615,-0.03118542,...,0.095476,-0.023931,0.003427,0.025978,-0.032229,0.064852,0.03351,-0.010907,-0.0122,-0.0122
Awards_Best Sound,-0.129017,0.033011,0.044777,-0.054768,-0.033319,-0.010676,-0.018959,1.0,-0.075008,-0.04385853,...,-0.009234,-0.013476,0.010521,-0.018149,-0.018149,0.016809,0.053457,-0.006142,-0.00687,-0.00687
Award_Avg_Score,0.461919,0.105799,-0.089565,0.274428,0.056577,0.075682,-0.020615,-0.075008,1.0,0.1565074,...,-0.09034,-0.00213,-0.060606,-0.070591,0.087438,-0.191414,-0.01714,0.061344,0.028327,0.021487
Director_Avg_Score,0.274782,-0.005964,-0.049377,0.069149,-0.009187,0.030948,-0.031185,-0.043859,0.156507,1.0,...,-0.072683,0.012415,-0.052624,0.015799,0.000376,-0.045109,-0.002726,0.044568,0.077947,0.081331


In [78]:
X = smaller_df.drop(columns ='metacritic score')
y = smaller_df['metacritic score']

In [79]:
train_and_test_linear(X,y)

Linear Regression val R^2: 0.482
Linear Regression val RME: 12.388


In [80]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.454
Model:,OLS,Adj. R-squared:,0.427
Method:,Least Squares,F-statistic:,17.12
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,1.13e-86
Time:,15:35:53,Log-Likelihood:,-3378.5
No. Observations:,908,AIC:,6843.0
Df Residuals:,865,BIC:,7050.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.9559,0.340,211.772,0.000,71.289,72.623
x1,-0.9928,0.441,-2.253,0.024,-1.858,-0.128
x2,-1.0293,0.461,-2.231,0.026,-1.935,-0.124
x3,4.4954,0.410,10.976,0.000,3.692,5.299
x4,0.3920,0.365,1.075,0.283,-0.323,1.107
x5,0.4098,0.358,1.146,0.252,-0.292,1.112
x6,-0.4299,0.351,-1.223,0.222,-1.120,0.260
x7,-0.8678,0.353,-2.456,0.014,-1.561,-0.174
x8,3.8118,0.396,9.618,0.000,3.034,4.590

0,1,2,3
Omnibus:,39.263,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.043
Skew:,-0.454,Prob(JB):,6.09e-11
Kurtosis:,3.647,Cond. No.,3.04


## Satisfied with final features will now select the final model

In [81]:
def cross_val_linear(X,y):
    '''
    Scale data and perform a linear regression on it and cross validation on it 
    '''
    X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=10)
    
    kf = KFold(n_splits=5, shuffle=True, random_state = 71)
    r2_scores, rme_scores = [], [] #collect the validation results for both models

    for train_ind, val_ind in kf.split(X,y):
    
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_val, y_val = X.iloc[val_ind], y.iloc[val_ind] 
        
        scale = StandardScaler()
        X_train_scale = scale.fit_transform(X_train)
        X_val_scale = scale.transform(X_val)
        
       
        lm = LinearRegression()
      
        

        lm.fit(X_train_scale, y_train)
        
        y_pred = lm.predict(X_val_scale)
        r2_scores.append(lm.score(X_val_scale, y_val))
        rme_scores.append(sqrt(mean_squared_error(y_val,y_pred)))
                        

    print('Scaled regression scores: ', r2_scores)
    print('Scaled regression RME scores: ',rme_scores)
    print(f'Scaled mean cv r^2: {np.mean(r2_scores):.3f} +- {np.std(r2_scores):.3f}')
    print(f'Scaled mean cv r^2: {np.mean(rme_scores):.3f} +- {np.std(rme_scores):.3f}')

In [82]:
cross_val_linear(X,y)

Scaled regression scores:  [0.39817403485632696, 0.3904281172927626, 0.443709119314424, 0.4645313018512045, 0.33716138221385383]
Scaled regression RME scores:  [11.16941737303148, 9.783981084692718, 10.290636555549806, 9.81704616471414, 10.428534615707315]
Scaled mean cv r^2: 0.407 +- 0.044
Scaled mean cv r^2: 10.298 +- 0.504


In [83]:
lasso = scale_test_and_train_Lasso(X,y)

Linear Regression val R^2: 0.481
Linear Regression val RME: 12.314


In [89]:
scale_test_and_train_ridge(X,y)

Ridge Regression val R^2: 0.482
Ridge Regression val RME: 12.371
