# 11 Modeling for IMDB Rating

## 11.01 Imports


### 11.01.01 Python Imports


In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

### 11.01.02 Feature Film Import


In [32]:
df = pd.read_csv('../Bens_Data/Disney_Films_For_Visual.csv')
df.drop(columns=['Unnamed: 0','index'], inplace=True)

### 11.02 Determine and Create Features List

In [33]:
df.columns

Index(['DFL_title', 'tconst', 'titleType', 'primaryTitle', 'startYear',
       'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'newurl',
       'mpaarating', 'rlsdt', 'budget', 'worldwide', 'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 'WTR_FILM_COUNT',
       'action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'fi', 'history', 'horror',
       'music', 'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       'worldwide_adj', 'ROI', 'rlsdt_dt', 'rlsdt_mo', 'rlsdt_day',
       'rlsdt_daynm', 'rlsdt_dayofwk', 'rlsdt_season', 'rlsdt_season_NM'],
      dtype='object')

In [34]:
df.head()

Unnamed: 0,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,newurl,...,budget_adj,worldwide_adj,ROI,rlsdt_dt,rlsdt_mo,rlsdt_day,rlsdt_daynm,rlsdt_dayofwk,rlsdt_season,rlsdt_season_NM
0,101 Dalmatians,tt0115433,movie,101 Dalmatians,1996,103,"Adventure,Comedy,Crime",5.7,109712.0,https://www.imdb.com/title/tt0115433,...,75000000,320689294,3.275857,1996-11-27,11.0,27.0,Wednesday,4.0,3.0,Fall
1,102 Dalmatians,tt0211181,movie,102 Dalmatians,2000,100,"Adventure,Comedy,Family",4.8,37056.0,https://www.imdb.com/title/tt0211181,...,85000000,183611771,1.160138,2000-11-22,11.0,22.0,Wednesday,4.0,3.0,Fall
2,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea",1954,127,"Adventure,Drama,Family",7.2,33109.0,https://www.imdb.com/title/tt0046672,...,9000000,0,-1.0,1955-07-20,7.0,20.0,Wednesday,4.0,2.0,Summer
3,A Bug's Life,tt0120623,movie,A Bug's Life,1998,95,"Adventure,Animation,Comedy",7.2,284538.0,https://www.imdb.com/title/tt0120623,...,120000000,363258859,2.027157,1998-11-25,11.0,25.0,Wednesday,4.0,3.0,Fall
4,A Christmas Carol,tt1067106,movie,A Christmas Carol,2009,96,"Adventure,Animation,Comedy",6.8,112582.0,https://www.imdb.com/title/tt1067106,...,200000000,325286646,0.626433,2009-11-06,11.0,6.0,Friday,6.0,3.0,Fall


In [67]:
features = [ 'startYear',
       'runtimeMinutes', 
        #'averageRating', 
        #'numVotes', 
       'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 
       'WTR_FILM_COUNT','action', 'adventure', 'animation', 
       'biography', 'comedy', 'crime','documentary', 'drama', 
       'family', 'fantasy', 'fi', 'history', 'horror','music', 
       'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       #'worldwide_adj', 
       #'ROI', 
       'rlsdt_mo', 'rlsdt_day',
       'rlsdt_dayofwk', 'rlsdt_season']

### 11.03 Null Model

In [35]:
df['base'] = df['averageRating'].mean()
RMSE = np.sqrt(metrics.mean_squared_error(df['averageRating'], df['base']))
RMSE

0.9473993212555134

The Baseline model has a Root Mean Squared Error of 0.947%. On average the predicted IMDB Rating is about 0.947% off from the true average rating.

### 11.04 Pipeline

In [68]:
X = df[features]
y = df['averageRating']

In [69]:
def modelfunc(X, y):
    pipelines = [
        ('LINEAR REGRESSION', (Pipeline([ ('LR', LinearRegression())]))),
        ('DECISION TREE', (Pipeline([ ('TREE', DecisionTreeRegressor())]))),
        ('BAGGED TREE', (Pipeline([ ('BAG', BaggingRegressor())]))),
        ('RANDOM FOREST', (Pipeline([ ('RAND', RandomForestRegressor())]))),
        ('ADABOOST', (Pipeline([ ('ADA', AdaBoostRegressor())]))),
        ('KNN', (Pipeline([ ('sc', StandardScaler()),('KNN', KNeighborsRegressor())]))),
        ('LASSO',(Pipeline([ ('sc', StandardScaler()),('LASSO', LassoCV())]))),
        ('RIDGE',(Pipeline([ ('sc', StandardScaler()),('RIDGE', RidgeCV())])))
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
    for pipe_name ,model in pipelines:
        print(pipe_name)
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        trainscore = model.score(X_train, y_train)
        testscore = model.score(X_test, y_test)
        crossval = cross_val_score(model, X_train, y_train).mean()
        rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
        rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
        print (f'Model = {model}')
        print (f'Train Score = {trainscore}')
        print (f'Test Score = {testscore}')
        print (f'Cross Val Score = {crossval}')
        print (f'RMSE Train = {rmsetr}')
        print (f'RMSE Test = {rmsete}')
        print('')
        print('')

In [70]:
modelfunc(X, y)

LINEAR REGRESSION
Model = Pipeline(steps=[('LR', LinearRegression())])
Train Score = 0.5517311515488699
Test Score = 0.2155114311442371
Cross Val Score = 0.29518047620023
RMSE Train = 0.6443852302955745
RMSE Test = 0.797934945394043


DECISION TREE
Model = Pipeline(steps=[('TREE', DecisionTreeRegressor())])
Train Score = 1.0
Test Score = -0.29121208792137465
Cross Val Score = -0.2018084000820147
RMSE Train = 1.4825870773051724e-16
RMSE Test = 1.0237006217459101


BAGGED TREE
Model = Pipeline(steps=[('BAG', BaggingRegressor())])
Train Score = 0.8911945685934295
Test Score = 0.19211855526353605
Cross Val Score = 0.3407706526413383
RMSE Train = 0.31746919111388044
RMSE Test = 0.809744472726496


RANDOM FOREST
Model = Pipeline(steps=[('RAND', RandomForestRegressor())])
Train Score = 0.9127780118534572
Test Score = 0.32750538269121854
Cross Val Score = 0.3436448202702593
RMSE Train = 0.2842426796397624
RMSE Test = 0.7387858721001742


ADABOOST
Model = Pipeline(steps=[('ADA', AdaBoostRegress

### 11.05 Test and Tune


Lasso performed the best according to the function above. Let's tune.

In [84]:
# Grid Search
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
scoring_func = metrics.make_scorer(mean_squared_error)

# lss = Lasso()
# rand.fit(X_train, y_train)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

params = {
    'alpha':np.arange(0.001,10,0.001)
    #[0.0001, 0.01, 0.02, 0.05, 0.2, 0.2, 0.5, 1 , 5, 10, 100]  Rather than us guessing, and since we don't have that much data, we're going feed it a range of alphas
}
gs = GridSearchCV(Lasso(), 
                 param_grid = params, 
                 verbose = 1)

gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 9999 candidates, totalling 49995 fits
0.39625125994157934
{'alpha': 0.026000000000000002}


In [79]:
### RIDGE didn't perform as well once we numvotes and worldwide revenue.  We won't know these items at the time of release

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
# scoring_func = metrics.make_scorer(mean_squared_error)

# rdg = Ridge()
# # rand.fit(X_train, y_train)

# params = {
#     'alpha': [0, 0.2, 0.01, 1, 5, 10]  
# }
# gs = GridSearchCV(Ridge(), 
#                  param_grid = params, 
#                  verbose = 1)

# gs.fit(X_train, y_train)
# print(gs.best_score_)
# print(gs.best_params_)

### 11.06 Final Model and Evaluation


In [102]:
# Final Model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

lss = Lasso(alpha=0.026)

lss.fit(X_train_sc, y_train)
y_pred_train = lss.predict(X_train_sc)
y_pred_test = lss.predict(X_test_sc)
trainscore = lss.score(X_train_sc, y_train)
testscore = lss.score(X_test_sc, y_test)
crossval = cross_val_score(lss, X_train_sc, y_train).mean()
rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print (f'Train Score = {trainscore}')
print (f'Test Score = {testscore}')
print (f'Cross Val Score = {crossval}')
print (f'RMSE Train = {rmsetr}')
print (f'RMSE Test = {rmsete}')

Train Score = 0.5275369340400811
Test Score = 0.25770715219307805
Cross Val Score = 0.3962512599415794
RMSE Train = 0.6615462771227589
RMSE Test = 0.7761788660454407
