In [1]:
%config InlineBackend.figure_formats = ['retina']

import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
# Load the movie dataset from the working directory.
# The file carries an 'Unnamed: 0' integer column (presumably an index saved
# with the CSV -- confirm); it is dropped explicitly before modelling.
oscars_df = pd.read_csv('oscar_movies_data.csv')

In [3]:
# Per-column overview: non-null counts and dtypes (shows which columns have gaps).
oscars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532 entries, 0 to 1531
Data columns (total 93 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Unnamed: 0                                     1532 non-null   int64  
 1   IMDBId                                         908 non-null    object 
 2   movie title                                    1532 non-null   object 
 3   language                                       1532 non-null   object 
 4   country                                        1532 non-null   object 
 5   runtime (mins)                                 1532 non-null   int64  
 6   mpaarating                                     1532 non-null   object 
 7   metacritic score                               1532 non-null   float64
 8   budget                                         1211 non-null   float64
 9   distributionCompany                            1523 

## MVP Model

In [21]:
# Dtypes that count as "numeric" when selecting candidate model inputs.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Names of every oscars_df column whose dtype appears in `numerics`.
numerical_vars = oscars_df.select_dtypes(include=numerics).columns.tolist()

In [62]:
# Keep all rows, but only the columns listed in numerical_vars.
oscars_numeric_df = oscars_df.loc[:,numerical_vars]

In [63]:
# Pairwise correlation matrix of the numeric columns (displayed as the cell output).
oscars_numeric_df.corr()

Unnamed: 0.1,Unnamed: 0,runtime (mins),metacritic score,budget,Total_Movies_Lead_Actor_Director,Total_Movies_First_Supporting_Actor_Director,Total_Movies_Second_Supporting_Actor_Director,Buena Vista Pictures,Columbia Pictures,Focus Features,...,Tony Kushner,Valerie Curtin,William Goldman,Winston Groom,G,Not Rated,PG,PG-13,R,Unrated
Unnamed: 0,1.000000,-0.112974,-0.151401,0.146535,0.170993,0.301774,0.296890,0.090260,0.002451,-0.028815,...,-0.003440,0.104433,0.099885,-0.107712,0.082745,0.016747,0.091126,0.028380,-0.125753,-0.019296
runtime (mins),-0.112974,1.000000,-0.023589,0.249181,0.088444,-0.013948,0.026571,-0.037870,0.013802,-0.053639,...,0.266500,-0.105789,-0.238844,0.068806,-0.177472,-0.133952,-0.127174,0.040058,0.173080,-0.051979
metacritic score,-0.151401,-0.023589,1.000000,-0.186416,0.000218,-0.057804,-0.062670,-0.055294,-0.047187,0.112641,...,0.027835,-0.165807,0.068378,0.095400,0.057504,0.054021,-0.113506,-0.108521,0.146526,-0.011982
budget,0.146535,0.249181,-0.186416,1.000000,0.112670,0.116715,0.118473,0.163029,-0.012280,-0.120425,...,0.063660,-0.092502,-0.128443,0.007216,0.101507,-0.086315,0.079554,0.223022,-0.277408,
Total_Movies_Lead_Actor_Director,0.170993,0.088444,0.000218,0.112670,1.000000,0.530756,0.581119,0.007054,-0.005962,0.079072,...,0.012893,-0.102551,0.012893,0.009047,-0.001908,-0.041135,0.019394,0.007696,-0.005869,-0.007916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Not Rated,0.016747,-0.133952,0.054021,-0.086315,-0.041135,-0.020223,-0.012763,,,,...,-0.039193,-0.048387,-0.039193,-0.027496,-0.030235,1.000000,-0.083764,-0.107074,-0.177110,-0.010303
PG,0.091126,-0.127174,-0.113506,0.079554,0.019394,0.037916,0.023141,0.040442,0.164679,-0.063443,...,-0.084781,0.084317,0.374448,-0.059479,-0.079049,-0.083764,1.000000,-0.279945,-0.463056,-0.026938
PG-13,0.028380,0.040058,-0.108521,0.223022,0.007696,-0.007071,-0.007009,-0.052138,0.009606,-0.054390,...,-0.150188,-0.185419,-0.150188,0.148293,-0.101046,-0.107074,-0.279945,1.000000,-0.591915,-0.034434
R,-0.125753,0.173080,0.146526,-0.277408,-0.005869,-0.027929,-0.019332,-0.125851,-0.115591,0.111919,...,0.240772,0.143908,-0.131851,-0.092502,-0.167140,-0.177110,-0.463056,-0.591915,1.000000,-0.056957


In [None]:
# Compute the correlation matrix once; it drives both the colours and the axis limits.
corr_matrix = oscars_numeric_df.corr()

fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(corr_matrix, cmap="seismic", annot=True, vmin=-1, vmax=1, ax=ax)

# Extend the y-limits half a unit past the first and last row.
# NOTE(review): presumably a workaround for clipped top/bottom heatmap rows in
# some matplotlib versions -- confirm it is still needed.
ax.set_ylim(len(corr_matrix) + 0.5, -0.5);

In [None]:
# Impute BEFORE slicing out the design matrix. `budget` is missing for part of
# the data (1211 of 1532 rows are non-null) and scikit-learn estimators raise
# on NaN input, so X must already be complete when it reaches
# PolynomialFeatures / LinearRegression further down.
# NOTE(review): 0 is a placeholder value for a missing budget; consider median
# imputation or dropping those rows once the MVP is in place.
oscars_model_df = oscars_numeric_df.fillna(0)

# Features: every numeric column except the target and the 'Unnamed: 0' column.
X = oscars_model_df.drop(columns=['metacritic score', 'Unnamed: 0'])
# Target: the metacritic score.
y = oscars_model_df['metacritic score']

In [None]:
# Replace every missing value in the numeric frame with 0.
# Rebinding the name only affects cells that read oscars_numeric_df after this
# point; objects derived from the frame earlier are not changed by this line.
oscars_numeric_df = oscars_numeric_df.fillna(0)

In [None]:
# Re-check non-null counts and dtypes after the fill.
oscars_numeric_df.info()

In [None]:
# statsmodels' OLS fits exactly the columns it is given and does not add an
# intercept on its own. Add the constant explicitly so this baseline is
# comparable with the scikit-learn LinearRegression fit (which includes an
# intercept by default) and so the residuals are centred on zero.
X_const = sm.add_constant(X)

# missing='drop' discards any row that still contains a NaN in y or X_const.
mvp_model = sm.OLS(y, X_const, missing='drop')
mvp_fit = mvp_model.fit()
mvp_fit.summary()

In [None]:
# Residuals vs. predictions for the statsmodels MVP fit.
fig, ax = plt.subplots(figsize=(10, 7))

# Both arrays come from the statsmodels results object; change this if working with sklearn.
ax.scatter(mvp_fit.predict(), mvp_fit.resid)

# Dashed reference line at zero residual.
ax.axhline(0, linestyle='--', color='gray')
ax.set_xlabel('Predicted Values', fontsize=18)
ax.set_ylabel('Residuals', fontsize=18);

In [None]:
# Polynomial expansion of the MVP features (PolynomialFeatures defaults).
# PolynomialFeatures is already imported in the top import cell.
p = PolynomialFeatures()
X_poly = p.fit_transform(X)

# Hold out 20% of the rows for evaluation. An R^2 computed on the training rows
# says little here: the expanded matrix can have more columns than there are
# rows, which lets the fit reproduce the training targets almost exactly.
# random_state is fixed so the split (and the scores) are reproducible.
X_poly_train, X_poly_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

lr_full = LinearRegression()
lr_full.fit(X_poly_train, y_train)

# Displayed as (train R^2, held-out R^2); a large gap indicates overfitting.
lr_full.score(X_poly_train, y_train), lr_full.score(X_poly_test, y_test)

In [None]:
# (rows, columns) of the feature matrix before the polynomial expansion.
X.shape

In [None]:
# (rows, columns) after the polynomial expansion, for comparison with X.shape.
X_poly.shape

In [None]:
# Plain linear regression on the un-expanded features.
# LinearRegression, mean_squared_error and r2_score are imported in the top
# import cell, so no imports are repeated here.

# Let SKLearn fit the best line
lm = LinearRegression()
lm.fit(X, y)

In [None]:
# Fitted coefficients, one per column of X (same order as X's columns).
lm.coef_

In [None]:
# Fitted intercept term of the sklearn model.
lm.intercept_