# 12 Modeling for Worldwide Rev

## 12.01 Imports


### 12.01.01 Python Imports


In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

### 12.01.02 Feature Film Import


Read in Data

In [123]:
df = pd.read_csv('../Bens_Data/Disney_Films_For_Visual.csv')
df.drop(columns=['Unnamed: 0','index'], inplace=True)

### 12.02 Determine and Create Features List

In [124]:
df.columns

Index(['DFL_title', 'tconst', 'titleType', 'primaryTitle', 'startYear',
       'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'newurl',
       'mpaarating', 'rlsdt', 'budget', 'worldwide', 'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 'WTR_FILM_COUNT',
       'action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'fi', 'history', 'horror',
       'music', 'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       'worldwide_adj', 'ROI', 'rlsdt_dt', 'rlsdt_mo', 'rlsdt_day',
       'rlsdt_daynm', 'rlsdt_dayofwk', 'rlsdt_season', 'rlsdt_season_NM'],
      dtype='object')

In [133]:
features = [ 'startYear',
       'runtimeMinutes', 
        #'averageRating', 
        #'numVotes', 
       'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 
       'WTR_FILM_COUNT','action', 'adventure', 'animation', 
       'biography', 'comedy', 'crime','documentary', 'drama', 
       'family', 'fantasy', 'fi', 'history', 'horror','music', 
       'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       #'worldwide_adj', 
       #'ROI', 
       'rlsdt_mo', 'rlsdt_day',
       'rlsdt_dayofwk', 'rlsdt_season']

In [125]:
df.head()

Unnamed: 0,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,newurl,...,budget_adj,worldwide_adj,ROI,rlsdt_dt,rlsdt_mo,rlsdt_day,rlsdt_daynm,rlsdt_dayofwk,rlsdt_season,rlsdt_season_NM
0,101 Dalmatians,tt0115433,movie,101 Dalmatians,1996,103,"Adventure,Comedy,Crime",5.7,109712.0,https://www.imdb.com/title/tt0115433,...,75000000,320689294,3.275857,1996-11-27,11.0,27.0,Wednesday,4.0,3.0,Fall
1,102 Dalmatians,tt0211181,movie,102 Dalmatians,2000,100,"Adventure,Comedy,Family",4.8,37056.0,https://www.imdb.com/title/tt0211181,...,85000000,183611771,1.160138,2000-11-22,11.0,22.0,Wednesday,4.0,3.0,Fall
2,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea",1954,127,"Adventure,Drama,Family",7.2,33109.0,https://www.imdb.com/title/tt0046672,...,9000000,0,-1.0,1955-07-20,7.0,20.0,Wednesday,4.0,2.0,Summer
3,A Bug's Life,tt0120623,movie,A Bug's Life,1998,95,"Adventure,Animation,Comedy",7.2,284538.0,https://www.imdb.com/title/tt0120623,...,120000000,363258859,2.027157,1998-11-25,11.0,25.0,Wednesday,4.0,3.0,Fall
4,A Christmas Carol,tt1067106,movie,A Christmas Carol,2009,96,"Adventure,Animation,Comedy",6.8,112582.0,https://www.imdb.com/title/tt1067106,...,200000000,325286646,0.626433,2009-11-06,11.0,6.0,Friday,6.0,3.0,Fall


In [126]:
df.shape

(431, 57)

In [127]:
df = df[df['worldwide_adj'] > 0]
df.shape

(304, 57)

In [132]:
df['worldwide_adj'].mean(), df['worldwide_adj'].min(), df['worldwide_adj'].max()

(247717829.2993421, 3000, 2797501328)

### 12.03 Null Model


In [130]:
df['base'] = df['worldwide_adj'].mean()
RMSE = np.sqrt(metrics.mean_squared_error(df['worldwide_adj'], df['base']))
RMSE

359926713.1040875

The Baseline model has a Root Mean Squared Error of 359926713%. On average the predicted Worldwide Rev is about 3599267133% off from the true average rating.
There is such huge variance in the worldwide numbers, our mean 247 M looks really odd.  Our Max is 2.7B, and our Min in 3k.  

### 12.04 Pipeline

In [134]:
X = df[features]
y = df['worldwide_adj']

In [135]:
def modelfunc(X, y):
    pipelines = [
        ('LINEAR REGRESSION', (Pipeline([ ('LR', LinearRegression())]))),
        ('DECISION TREE', (Pipeline([ ('TREE', DecisionTreeRegressor())]))),
        ('BAGGED TREE', (Pipeline([ ('BAG', BaggingRegressor())]))),
        ('RANDOM FOREST', (Pipeline([ ('RAND', RandomForestRegressor())]))),
        ('ADABOOST', (Pipeline([ ('ADA', AdaBoostRegressor())]))),
        ('KNN', (Pipeline([ ('sc', StandardScaler()),('KNN', KNeighborsRegressor())]))),
        ('LASSO',(Pipeline([ ('sc', StandardScaler()),('LASSO', LassoCV())]))),
        ('RIDGE',(Pipeline([ ('sc', StandardScaler()),('RIDGE', RidgeCV())])))
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
    for pipe_name ,model in pipelines:
        print(pipe_name)
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        trainscore = model.score(X_train, y_train)
        testscore = model.score(X_test, y_test)
        crossval = cross_val_score(model, X_train, y_train).mean()
        rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
        rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
        print (f'Model = {model}')
        print (f'Train Score = {trainscore}')
        print (f'Test Score = {testscore}')
        print (f'Cross Val Score = {crossval}')
        print (f'RMSE Train = {rmsetr}')
        print (f'RMSE Test = {rmsete}')
        print('')
        print('')

In [136]:
modelfunc(X, y)

LINEAR REGRESSION
Model = Pipeline(steps=[('LR', LinearRegression())])
Train Score = 0.6885725272796499
Test Score = 0.6126560653309894
Cross Val Score = 0.5473284992955734
RMSE Train = 194509556.6107299
RMSE Test = 242770358.1038976


DECISION TREE
Model = Pipeline(steps=[('TREE', DecisionTreeRegressor())])
Train Score = 1.0
Test Score = 0.7700046471005294
Cross Val Score = 0.14519574470768437
RMSE Train = 0.0
RMSE Test = 187071147.21656126


BAGGED TREE
Model = Pipeline(steps=[('BAG', BaggingRegressor())])
Train Score = 0.9095728240008082
Test Score = 0.7284024506166735
Cross Val Score = 0.4907877391645905
RMSE Train = 104812231.46802893
RMSE Test = 203287278.83349523


RANDOM FOREST
Model = Pipeline(steps=[('RAND', RandomForestRegressor())])
Train Score = 0.931013999081614
Test Score = 0.6445500892698575
Cross Val Score = 0.5166706535923009
RMSE Train = 91546757.81259257
RMSE Test = 232560784.49395558


ADABOOST
Model = Pipeline(steps=[('ADA', AdaBoostRegressor())])
Train Score = 0.

### 12.05 Test and Tune


Lasso performed the best according to the function above. Let's tune.

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
scoring_func = metrics.make_scorer(mean_squared_error)

# lss = Lasso()
# rand.fit(X_train, y_train)
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

params = {
    'alpha':np.arange(0.001,10,0.001),
    'max_iter':np.arange(1000,10000,1000),
    'tol':[1e-2]       # Not 100% sure what this does but got it from StackOverflow and it cleared up the convergance warnings.
    #[0.0001, 0.01, 0.02, 0.05, 0.2, 0.2, 0.5, 1 , 5, 10, 100]  Rather than us guessing, and since we don't have that much data, we're going feed it a range of alphas
}
gs = GridSearchCV(Lasso(), 
                 param_grid = params, 
                 verbose = 1)

gs.fit(X_train_sc, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 89991 candidates, totalling 449955 fits
0.5473286281458967
{'alpha': 9.998999999999999, 'max_iter': 1000, 'tol': 0.01}


In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 


sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

params = {
    
}
gs = GridSearchCV(Lasso(), 
                 param_grid = params, 
                 verbose = 1)

gs.fit(X_train_sc, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.547328512364005
{}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [79]:
### RIDGE didn't perform as well once we numvotes and worldwide revenue.  We won't know these items at the time of release

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
# scoring_func = metrics.make_scorer(mean_squared_error)

# rdg = Ridge()
# # rand.fit(X_train, y_train)

# params = {
#     'alpha': [0, 0.2, 0.01, 1, 5, 10]  
# }
# gs = GridSearchCV(Ridge(), 
#                  param_grid = params, 
#                  verbose = 1)

# gs.fit(X_train, y_train)
# print(gs.best_score_)
# print(gs.best_params_)

### 12.06 Final Model and Evaluation


In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

lss = Lasso()

lss.fit(X_train_sc, y_train)
y_pred_train = lss.predict(X_train_sc)
y_pred_test = lss.predict(X_test_sc)
trainscore = lss.score(X_train_sc, y_train)
testscore = lss.score(X_test_sc, y_test)
crossval = cross_val_score(lss, X_train_sc, y_train).mean()
rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print (f'Train Score = {trainscore}')
print (f'Test Score = {testscore}')
print (f'Cross Val Score = {crossval}')
print (f'RMSE Train = {rmsetr}')
print (f'RMSE Test = {rmsete}')

Train Score = 0.6885725272796499
Test Score = 0.612656065281655
Cross Val Score = 0.5473284992496843
RMSE Train = 194509556.61072993
RMSE Test = 242770358.1193579


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
