In [153]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint
from collections import defaultdict
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, mean_squared_error, r2_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import  RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer
from math import sqrt
import scipy.stats as stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import the data and clean it up with all the functions from our EDA

We will be using the string hashing trick to attempt a better fit than our initial methods. For this reason we're going to conform as many of our columns to individual strings as possible, and then turn the row into 1 string for processing with the FeatureHasher.

In [13]:
_2019 = pd.read_csv('data/IMDB_mine_data_2019.csv',index_col=0)
_2018 = pd.read_csv('data/IMDB_mine_data_2018.csv',index_col=0)
_2017 = pd.read_csv('data/IMDB_mine_data_2017.csv',index_col=0)
_2016 = pd.read_csv('data/IMDB_mine_data_2016.csv',index_col=0)
_2015 = pd.read_csv('data/IMDB_mine_data_2015.csv',index_col=0)
#get all the films into one DF
films = pd.concat([_2019,_2018,_2017,_2016,_2015])
# remove the filler films we were using to start the mining bot
films = films[films['title_code'] != np.nan]
films = films[films['opening_wknd'] != np.nan]
films = films[films['release_date'] != '1980-05-16']
films.shape

(2965, 26)

In [95]:
#Reset the index now that all films are in 1 dataframe
#films = films.reset_index(drop=True)
films.head(3)

Unnamed: 0,title,runtime,release_date,rating,prod_co,metaScore,metaUserScore,imdb_rating,genre,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,directors,writers,budget,opening_wknd,gross_dom,gross_int,title_code,production,production_2,distribution,director,actor_1,actor_2,actor_3,actor_4,actor_5,actor_6,actor_7,actor_8,actor_9,actor_10,producer,executive,release_month,train_string,action,adventure,animated,biography,drama,documentary,comedy,crime,fantasy,family,musical,horror,war,mystery,sci-fi,thriller,romance,actor1_popularity,actor2_popularity,actor3_popularity,actor4_popularity,actor5_popularity,actor6_popularity,actor7_popularity,actor8_popularity,actor9_popularity,actor10_popularity,actor1_class,actor2_class,actor3_class,actor4_class,actor5_class,actor6_class,actor7_class,actor8_class,actor9_class,actor10_class,release_year
0,Motherless Brooklyn,0,2019-11-01,R,"[Class 5 Films, Warner Bros.]",0,0,0,"[Crime, Drama, Mystery]",/name/nm0001570/,/name/nm1813221/,/name/nm0000285/,/name/nm0134072/,/name/nm0000353/,/name/nm0000246/,/name/nm0839486/,/name/nm0427728/,/name/nm1316767/,/name/nm0656929/,[Edward Norton],"['Jonathan Lethem', 'Edward Norton']",26000000.0,3500454.0,9277736.0,18477736.0,tt0385887,Class5Films,,WarnerBros.,EdwardNorton,EdwardNorton,GuguMbatha-Raw,AlecBaldwin,BobbyCannavale,WillemDafoe,BruceWillis,EthanSuplee,CherryJones,DallasRoberts,JoshPais,AdrianAlperovich,AdrianAlperovich,November,Class5Films WarnerBros. EdwardNorton EdwardNor...,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,640,838,1846,543,1080,253,1122,1946,3791,6307,A-list,A-list,A-list,A-list,A-list,A-list,A-list,A-list,B-list,C-list,2019
1,Alita: Battle Angel,0,2019-02-14,PG-13,"[Twentieth Century Fox, Twentieth Century Fox]",0,0,0,"[Action, Adventure, Sci-Fi, Thriller]",/name/nm4023073/,/name/nm0910607/,/name/nm0000124/,/name/nm0991810/,/name/nm4534098/,/name/nm0355097/,/name/nm5277107/,/name/nm7449863/,/name/nm7093076/,/name/nm1701107/,[Robert Rodriguez],"['James Cameron', 'Laeta Kalogridis', 'Yukito ...",170000000.0,28525613.0,85710210.0,404852543.0,tt0437086,TwentiethCenturyFox,,TwentiethCenturyFox,RobertRodriguez,RosaSalazar,ChristophWaltz,JenniferConnelly,MahershalaAli,EdSkrein,JackieEarleHaley,KeeanJohnson,JorgeLendeborgJr.,LanaCondor,IdaraVictor,JamesCameron,DavidValdes,February,TwentiethCenturyFox TwentiethCenturyFox Robert...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,344,649,98,997,1279,1315,6480,13600,3190,19910,A-list,A-list,A-list,A-list,A-list,A-list,C-list,C-list,B-list,C-list,2019
2,Danger Close,0,2019-11-08,R,"[Deeper Water, Saboteur Media, Saban Films]",0,0,0,"[Action, Drama, War]",/name/nm1379938/,/name/nm9826817/,/name/nm1542397/,/name/nm2527406/,/name/nm5937328/,/name/nm9680111/,/name/nm3478396/,/name/nm7011217/,/name/nm2828232/,/name/nm7202582/,[Kriv Stenders],"['Stuart Beattie', 'James Nicholas', 'Karel Se...",35000000.0,2078370.0,,,tt0441881,DeeperWater,SaboteurMedia,SabanFilms,KrivStenders,TravisFimmel,TobyBlome,AlexanderEngland,AaronGlenane,UliLatukefu,RichardTeAre,LukeBracey,SeanMcCarthy,MojeanAria,RyanHance,StuartBeattie,JustinBoylson,November,DeeperWater SabanFilms KrivStenders TravisFimm...,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,86,94762,12814,25353,12440,66675,2127,157965,32369,266786,A-list,D-list,C-list,D-list,C-list,D-list,B-list,E-list,D-list,nobody,2019


#### Clean some of the columns and map the features to a cleaner structure

In [15]:
#clean the text in the production company column, and turn it into an accessable array
films['prod_co'] = films.prod_co.map(lambda x : re.findall(r"'(.*?)'",x, re.DOTALL))

#break production and distribution out into their own columns
films['production'] = films['prod_co'].map(lambda x : x[0].replace(" ", '') if len(x) >= 1 else np.nan)
films['production_2'] = films['prod_co'].map(lambda x : x[1].replace(" ", '') if len(x) >= 3 else np.nan)
films['distribution'] = films['prod_co'].map(lambda x : x[-1].replace(" ", '') if len(x) >= 2 else np.nan)

#convert the release date to a pandas datetime object
films['release_date'] = films['release_date'].map(lambda x : pd.to_datetime(x))

#Set the first director to their own column
films.directors = films.directors.map(lambda x : re.findall(r"'(.*?)'",x, re.DOTALL if isinstance(x, str) else np.nan))
films['director'] = films['directors'].map(lambda x: x[0].replace(" ", '') if len(x) >= 1 else 'none')

#### Add the Actor names from the key we scraped

In [16]:
# convert the actor codes to strings
actor_key = pd.read_csv('data/actor_key.csv', index_col=0).reset_index()

def get_actor_name(key):
    #print(key)
    if isinstance(key, float):
        return key
    row = actor_key.loc[actor_key['actor'] == key].index[0]
    #print(type(actor_key.iloc[row]['name']), actor_key.iloc[row]['name'])
    return actor_key.iloc[row]['name']

def get_actor_key(name):
    #print(key)
    row = actor_key.loc[actor_key['name'] == name].index[0]
    return(actor_key.iloc[row]['actor'])

In [17]:
films['actor_1'] = films['actor1'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_2'] = films['actor2'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_3'] = films['actor3'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_4'] = films['actor4'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_5'] = films['actor5'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_6'] = films['actor6'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_7'] = films['actor7'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_8'] = films['actor8'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_9'] = films['actor9'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")
films['actor_10'] = films['actor10'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else "")

#### Merge the producers with their films

In [18]:
#Add the main producer and executive producer to the dataframe
producer_key = pd.read_csv('data/producer_key.csv', index_col=0)
films = films.merge(producer_key, on='title_code', how='left')
films['producer'] = films['producer'].map(lambda x : x.replace(" ", '') if x!='[]' else '')
films['executive'] = films['executive'].map(lambda x : x.replace(" ", '') if x!='[]' else '')

#### Encode the release month as a string column

In [72]:
#reencode the release month as a string.
films['release_month'] = films['release_date'].map(lambda x : pd.to_datetime(x).month)
films['release_month'] = films['release_month'].map(lambda x : 'January' if x == 1 else ( 'February' if x==2 else ( 'March' if x==3 else ( 'April' if x==4 else ('May' if x==5 else ('June' if x==6 else ( 'July' if x==7 else ( 'August' if x==8 else ('September' if x==9 else ( 'October' if x==10 else ( 'November' if x==11 else ('December' if x==12 else 'unknown'))))))))) ) ))
#reencode the release year as a string
films['release_year'] = films['release_date'].map(lambda x : x.year)
films['release_year'] = films['release_year'].map(lambda x : '2015' if x == 2015 else ( '2016' if x==2016 else ( '2017' if x==2017 else ( '2018' if x==2018 else ('2019' if x==2019 else ('2020' if x==2020 else 'none'))))))

#### Add all the strings together to simplify our string hashing

In [114]:
films['train_string'] = films[['production','distribution','director','actor_1','actor_2','actor_3','actor_4',
                               'actor_5','actor_6','actor_7','actor_8','actor_9','actor_10',
                               'producer','executive']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

# Hashing Features
We are going to create a single matrix that we can train a model on that contains all of the strings in our films.

In [121]:
vectorizer = HashingVectorizer(n_features=2000)
vector = vectorizer.transform(films['train_string'].to_numpy())
vector

<2965x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 41424 stored elements in Compressed Sparse Row format>

#### Train Test split, and train on our string hashing.

In [122]:
y = films['opening_wknd']
X = vector

dum_guess = round(films['opening_wknd'].mean())
print("Base Model will guess: "+ str(dum_guess))
print("Our baseline RMSE is : " + str(round(sqrt(mean_squared_error(y, np.full(y.shape,dum_guess))))))

Base Model will guess: 6284064.0
Our baseline RMSE is : 21204937


In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [124]:
lin_model = LinearRegression()
lin_model.fit(np.nan_to_num(X_train), y_train)
lin_preds = lin_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, lin_preds)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, lin_preds))))

MSE : 1668356055969120.8
RMSE: 40845514.51468228


In [125]:
rf_model = RandomForestRegressor(n_estimators=550,
                           #max_features=3,
                            max_depth=1000,
                           #random_state=0,
                            min_samples_split=3,
                            min_samples_leaf=5)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, rf_pred)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, rf_pred))))

MSE : 353048100781394.0
RMSE: 18789574.257587478


In [135]:
model_g = GradientBoostingRegressor(learning_rate=0.025,
                                   n_estimators=430,
                                   min_samples_leaf=4,
                                   max_depth=1100)
model_g.fit(X_train,y_train)
preds_gb = model_g.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, preds_gb)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, preds_gb))))

MSE : 419384942992689.5
RMSE: 20478890.179711632


In [136]:
xgb_model = xgb.XGBRegressor(n_estimators=100,
                            max_depth=200).fit(X_train, y_train)
xg_pred = xgb_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, xg_pred)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, xg_pred))))

MSE : 469449120273668.9
RMSE: 21666774.57014931


### Great news!
This means that we have some signal from our string vectors. Now we will add our one-hot features, and the other data points that we discovered in our inital EDA, and see if we can improve on the model.

In [31]:
# Adding the genre OHE
# we need to extract the inner quotes from the strings into a list.
films['genre'] = films['genre'].map(lambda x : re.findall(r"'(.*?)'",x, re.DOTALL))
#we're going to do the OHE manually
films['action'] = films['genre'].map(lambda x : 1 if 'Action' in x else 0)
films['adventure']  = films['genre'].map(lambda x : 1 if 'Adventure' in x else 0)
films['animated'] = films['genre'].map(lambda x : 1 if 'Animation' in x else 0)
films['biography'] = films['genre'].map(lambda x : 1 if 'Biography' in x else 0)
films['drama'] = films['genre'].map(lambda x : 1 if 'Drama' in x else 0)
films['documentary'] = films['genre'].map(lambda x : 1 if 'Documentary' in x else 0)
films['comedy'] = films['genre'].map(lambda x : 1 if 'Comedy' in x else 0)
films['crime'] = films['genre'].map(lambda x : 1 if 'Crime' in x else 0)
films['fantasy'] = films['genre'].map(lambda x : 1 if 'Fantasy' in x else 0)
films['family'] = films['genre'].map(lambda x : 1 if 'Family' in x else 0)
films['musical'] = films['genre'].map(lambda x : 1 if 'Musical' in x else 0)
films['horror'] = films['genre'].map(lambda x : 1 if 'Horror' in x else 0)
films['war'] = films['genre'].map(lambda x : 1 if 'War' in x else 0)
films['mystery'] = films['genre'].map(lambda x : 1 if 'Mystery' in x else 0)
films['sci-fi'] = films['genre'].map(lambda x : 1 if 'Sci-Fi' in x else 0)
films['thriller'] = films['genre'].map(lambda x : 1 if 'Thriller' in x else 0)
films['romance'] = films['genre'].map(lambda x : 1 if 'Romance' in x else 0)

In [32]:
#add actor popularity scores
fake_popularity = 10071118 #instantiated as the lowest actor popularity +1
def get_act_pop(code, fake_pop):
    if code in act_pop_keys['actor'].unique():
        row = act_pop_keys.loc[act_pop_keys['actor'] == code].index[0]
        return act_pop_keys.iloc[row]['popularity']
    else:
        return fake_pop + randint(0,1000)

act_pop_keys = pd.read_csv('data/actor_popularity_out.csv', index_col=0)
act_pop_keys.reset_index(inplace=True, drop=True)

films['actor1_popularity'] = films['actor1'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor2_popularity'] = films['actor2'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor3_popularity'] = films['actor3'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor4_popularity'] = films['actor4'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor5_popularity'] = films['actor5'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor6_popularity'] = films['actor6'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor7_popularity'] = films['actor7'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor8_popularity'] = films['actor8'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor9_popularity'] = films['actor9'].map(lambda x : get_act_pop(x,fake_popularity))
films['actor10_popularity'] = films['actor10'].map(lambda x : get_act_pop(x,fake_popularity))

In [194]:
def get_celeb_class(pop):
    if pop <=2000:
        return 'A-list'
    elif pop <= 5000:
        return 'B-list'
    elif pop <= 20000:
        return 'C-list'
    elif pop <= 100000:
        return 'D-list'
    elif pop <= 250000:
        return 'E-list'
    else:
        return 'nobody'

In [57]:
#convert popularity scores to text columns 
films['actor1_class'] = films['actor1_popularity'].map(lambda x : get_celeb_class(x))
films['actor2_class'] = films['actor2_popularity'].map(lambda x : get_celeb_class(x))
films['actor3_class'] = films['actor3_popularity'].map(lambda x : get_celeb_class(x))
films['actor4_class'] = films['actor4_popularity'].map(lambda x : get_celeb_class(x))
films['actor5_class'] = films['actor5_popularity'].map(lambda x : get_celeb_class(x))
films['actor6_class'] = films['actor6_popularity'].map(lambda x : get_celeb_class(x))
films['actor7_class'] = films['actor7_popularity'].map(lambda x : get_celeb_class(x))
films['actor8_class'] = films['actor8_popularity'].map(lambda x : get_celeb_class(x))
films['actor9_class'] = films['actor9_popularity'].map(lambda x : get_celeb_class(x))
films['actor10_class'] = films['actor10_popularity'].map(lambda x : get_celeb_class(x))

In [58]:
#add the normalized budget
bud = films[['budget']].values
min_max_scaler = preprocessing.MinMaxScaler()
bud_scaled = min_max_scaler.fit_transform(bud)
films['budget_normalized']=bud_scaled

## Train new models with the full set of data

In [137]:
#make dummies from our curated columns
dum = pd.get_dummies(films[['release_month', 'release_year', 'rating', 'actor1_class', 'actor2_class', 'actor3_class', 'actor4_class',
                              'actor5_class', 'actor6_class', 'actor7_class', 'actor8_class', 'actor9_class','actor10_class']])
dum.head(1)

Unnamed: 0,release_month_April,release_month_August,release_month_December,release_month_February,release_month_January,release_month_July,release_month_June,release_month_March,release_month_May,release_month_November,release_month_October,release_month_September,release_month_unknown,release_year_2015,release_year_2016,release_year_2017,release_year_2018,release_year_2019,release_year_2020,release_year_none,rating_Approved,rating_G,rating_M,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_TV-14,rating_TV-G,rating_TV-MA,rating_TV-PG,rating_Unrated,actor1_class_A-list,actor1_class_B-list,actor1_class_C-list,actor1_class_D-list,actor1_class_E-list,actor1_class_nobody,actor2_class_A-list,actor2_class_B-list,actor2_class_C-list,actor2_class_D-list,actor2_class_E-list,actor2_class_nobody,actor3_class_A-list,actor3_class_B-list,actor3_class_C-list,actor3_class_D-list,actor3_class_E-list,actor3_class_nobody,actor4_class_A-list,actor4_class_B-list,actor4_class_C-list,actor4_class_D-list,actor4_class_E-list,actor4_class_nobody,actor5_class_A-list,actor5_class_B-list,actor5_class_C-list,actor5_class_D-list,actor5_class_E-list,actor5_class_nobody,actor6_class_A-list,actor6_class_B-list,actor6_class_C-list,actor6_class_D-list,actor6_class_E-list,actor6_class_nobody,actor7_class_A-list,actor7_class_B-list,actor7_class_C-list,actor7_class_D-list,actor7_class_E-list,actor7_class_nobody,actor8_class_A-list,actor8_class_B-list,actor8_class_C-list,actor8_class_D-list,actor8_class_E-list,actor8_class_nobody,actor9_class_A-list,actor9_class_B-list,actor9_class_C-list,actor9_class_D-list,actor9_class_E-list,actor9_class_nobody,actor10_class_A-list,actor10_class_B-list,actor10_class_C-list,actor10_class_D-list,actor10_class_E-list,actor10_class_nobody
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


In [138]:
#pull the columns we want from the main DF
use_cols = films[['budget','action','adventure','animated','biography','drama','documentary','comedy','crime','fantasy','family',
                 'musical','horror','war','mystery','sci-fi','thriller','romance']]
use_cols.head(1)

Unnamed: 0,budget,action,adventure,animated,biography,drama,documentary,comedy,crime,fantasy,family,musical,horror,war,mystery,sci-fi,thriller,romance
0,26000000.0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0


In [139]:
vectorizer = HashingVectorizer(n_features=400)
vector = vectorizer.transform(films['train_string'].to_numpy())
vec_df = pd.DataFrame.sparse.from_spmatrix(vector)
vec_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.235702,0.0,0.0,0.0,0.0,-0.471405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
y = films['opening_wknd']
X = pd.concat([use_cols, dum, vec_df], axis=1, sort=False)
X.shape

(2965, 511)

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape

(2372, 511)

In [142]:
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
lin_preds = lin_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, lin_preds)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, lin_preds))))

MSE : 291424263471242.5
RMSE: 17071152.962563556


In [143]:
rf_model = RandomForestRegressor(n_estimators=550,
                           #max_features=3,
                            max_depth=1000,
                           #random_state=0,
                            min_samples_split=3,
                            min_samples_leaf=5)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, rf_pred)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, rf_pred))))

MSE : 267388115885001.97
RMSE: 16352006.478869863


In [156]:
model_g = GradientBoostingRegressor(learning_rate=0.02,
                                   n_estimators=200,
                                   min_samples_leaf=15,
                                   max_depth=400)
model_g.fit(X_train,y_train)
preds_gb = model_g.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, preds_gb)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, preds_gb))))

MSE : 237379726245908.88
RMSE: 15407132.317401214


In [150]:
xgb_model = xgb.XGBRegressor(n_estimators=230,
                            max_depth=2000).fit(X_train, y_train)
xg_pred = xgb_model.predict(X_test)
print("MSE : " + str(mean_squared_error(y_test, xg_pred)))
print("RMSE: " + str(sqrt(mean_squared_error(y_test, xg_pred))))

MSE : 305538374929680.06
RMSE: 17479656.030073363


### Grid Search for best params

In [155]:
#gradient boost searching
clf = GridSearchCV(model_g,
                   {'max_depth': [200, 400, 600,1000,1500,2000],
                    'n_estimators': [50, 100, 200,300,400,500],
                   'min_samples_leaf':[4,8,10,15,18],
                   'learning_rate':[0.005,0.01,0.02,0.3]}, verbose=2, n_jobs=-1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 73.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 125.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 204.0min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 312.7min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 438.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 589.8min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 811.8min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 921.7min finished


0.5941903029498035
{'learning_rate': 0.02, 'max_depth': 400, 'min_samples_leaf': 15, 'n_estimators': 200}


In [157]:
clf = GridSearchCV(rf_model,
                   {'max_depth': [200, 400, 600,1000,1500,2000,3000],
                    'n_estimators': [50, 100, 200,300,400,500],
                   'min_samples_leaf':[4,8,10,15,18],
                   'min_samples_split':[3,6,10,12,15,18,20]}, verbose=3, n_jobs=-1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 1470 candidates, totalling 7350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 57.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 86.4min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 124.8min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 184.7min


KeyboardInterrupt: 