In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, validation_curve,cross_val_score,KFold
from sklearn.linear_model import Lasso,LassoCV,LinearRegression,RidgeCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from math import sqrt

## Explore the data

In [2]:
oscars_df = pd.read_csv('oscar_movies_data_2.csv')

In [3]:
#oscars_df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
oscars_df.head()

Unnamed: 0.1,Unnamed: 0,IMDBId,movie title,language,country,runtime (mins),mpaarating,metacritic score,budget,distributionCompany,...,Writer_Aaron Sorkin,Writer_Anthony McCarten,Writer_George Miller,Writer_Joel Coen,Writer_John Logan,Writer_Lawrence Kasdan,Writer_Paul Thomas Anderson,Writer_Pedro Almodóvar,Writer_Quentin Tarantino,Writer_Woody Allen
0,0,tt0091763,Platoon,English,USA,120,R,92.0,6000000.0,Orion Pictures,...,,,,,,,,,,
1,7,tt0082979,Reds,English,USA,195,PG,76.0,32000000.0,Paramount Pictures,...,,,,,,,,,,
2,19,tt0084434,An Officer and a Gentleman,English,USA,124,R,75.0,7500000.0,Paramount Pictures,...,,,,,,,,,,
3,25,tt0119360,In & Out,English,USA,90,PG-13,70.0,35000000.0,Paramount Pictures,...,,,,,,,,,,
4,26,tt0292542,Son of the Bride,Spanish,Argentina,123,R,68.0,11936760.0,Sony Pictures Classics,...,,,,,,,,,,


In [5]:
numerics = ['int16','int32','int64','float16','float32','float64']
numerical_vars = list(oscars_df.select_dtypes(include=numerics).columns)
oscars_data = oscars_df[numerical_vars]
oscars_data.shape

(1136, 93)

In [6]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 93 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Unnamed: 0                                1136 non-null   int64  
 1   runtime (mins)                            1136 non-null   int64  
 2   metacritic score                          1136 non-null   float64
 3   budget                                    1072 non-null   float64
 4   Total_Noms                                1136 non-null   int64  
 5   Award_Year                                1124 non-null   float64
 6   Awards_Best Actor in a Leading Role       479 non-null    float64
 7   Awards_Best Actor in a Supporting Role    479 non-null    float64
 8   Awards_Best Actress in a Leading Role     479 non-null    float64
 9   Awards_Best Art Direction-Set Decoration  479 non-null    float64
 10  Awards_Best Cinematography          

In [7]:
oscars_data['budget'].fillna(oscars_data['budget'].mean(),inplace=True)
oscars_data.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
oscars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 93 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Unnamed: 0                                1136 non-null   int64  
 1   runtime (mins)                            1136 non-null   int64  
 2   metacritic score                          1136 non-null   float64
 3   budget                                    1136 non-null   float64
 4   Total_Noms                                1136 non-null   int64  
 5   Award_Year                                1136 non-null   float64
 6   Awards_Best Actor in a Leading Role       1136 non-null   float64
 7   Awards_Best Actor in a Supporting Role    1136 non-null   float64
 8   Awards_Best Actress in a Leading Role     1136 non-null   float64
 9   Awards_Best Art Direction-Set Decoration  1136 non-null   float64
 10  Awards_Best Cinematography          

## Baseline Model

In [9]:
baseline_df = oscars_data.drop(columns=['Award_Avg_Score','Director_Avg_Score','Total_Director_Movies',\
                                        'Lead_Actor_Avg_Score','Total_Movies_Lead_Actor',\
                                        'Genre_Avg_Score','Avg_Rating_by_writer','Award_Year','Total_Noms'])

In [19]:
def ols_model(X,y):
    '''
    Scale and print results summary
    '''
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)
    X_train_scale = X_train.copy()
  
    scale = StandardScaler().fit(X_train_scale)
    
    X_train_scale = scale.transform(X_train_scale)
    model = sm.OLS(y_train, sm.add_constant(X_train_scale))
    results = model.fit()

    return results.summary()

In [28]:
def split_test_val_linear(X,y):
    '''
    Scale data and perform a linear regression on it and cross validation on it 
    '''
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)
    
    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values
    
    scale = StandardScaler()

    X_train_scaled = scale.fit_transform(X_train_scale)
    X_val_scaled = scale.transform(X_val_scale)
    X_test_scaled = scale.transform(X_test_scale)
  
    lm = LinearRegression()
    lm.fit(X_train_scale,y_train)
    y_pred = lm.predict(X_val_scale)
        
    print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')
    print(f'Linear Regression val RME: {sqrt(mean_squared_error(y_val,y_pred)):.3f}')
    #return y_pred

## Run initial regressions on baseline model

In [29]:
X = baseline_df.drop(columns='metacritic score')
y = baseline_df['metacritic score']

In [30]:
split_test_val_linear(X,y)

Linear Regression val R^2: 0.072
Linear Regression val RME: 12.952


In [31]:
ols_model(X,y)

0,1,2,3
Dep. Variable:,metacritic score,R-squared:,0.388
Model:,OLS,Adj. R-squared:,0.307
Method:,Least Squares,F-statistic:,4.758
Date:,"Sun, 11 Oct 2020",Prob (F-statistic):,2.75e-29
Time:,14:33:43,Log-Likelihood:,-2573.8
No. Observations:,681,AIC:,5310.0
Df Residuals:,600,BIC:,5676.0
Df Model:,80,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,71.8576,0.433,166.102,0.000,71.008,72.707
x1,-3.1601,0.594,-5.318,0.000,-4.327,-1.993
x2,0.1420,0.582,0.244,0.807,-1.002,1.286
x3,-0.5662,0.634,-0.893,0.372,-1.811,0.679
x4,1.4815,0.508,2.918,0.004,0.484,2.479
x5,1.2313,0.499,2.465,0.014,0.250,2.212
x6,0.1439,0.492,0.293,0.770,-0.822,1.109
x7,0.3768,0.474,0.796,0.426,-0.553,1.307
x8,0.7198,0.455,1.583,0.114,-0.173,1.612

0,1,2,3
Omnibus:,26.409,Durbin-Watson:,2.05
Prob(Omnibus):,0.0,Jarque-Bera (JB):,31.987
Skew:,-0.405,Prob(JB):,1.13e-07
Kurtosis:,3.686,Cond. No.,1.14e+16


## High condition score add in engineered features