# Create a predictive model that will tell us if a stand-up comedy special will receive an above or below average IMDb rating

1) Train weak learners: Random Forrest, Stochastic Gradient Descent.

2) Perform a grid search to find optimal parameters for an XGBoost classifier.

3) Put all three models into an ensemble for a final accuracy of 0.76

By combining the power of three weaker models into an ensemble, it was possible to predict what the IMDb rating of a comedy special is with decent accuracy. The models would probably be improved by using more training data. The LDA model that produced these topic vectors (in topic_modeling_LDA.ipynb) could also be improved with more training data or perhaps by using different hyperparameter settings.

In [8]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
import pickle

df = pd.read_pickle('data/stand-up-data-w-clusters.pkl')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 0 to 329
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            322 non-null    object 
 1   date_posted      322 non-null    object 
 2   link             322 non-null    object 
 3   name             318 non-null    object 
 4   year             306 non-null    float64
 5   transcript       322 non-null    object 
 6   language         322 non-null    object 
 7   runtime          272 non-null    float64
 8   rating           272 non-null    float64
 9   rating_type      322 non-null    int64  
 10  words            322 non-null    object 
 11  word_count       322 non-null    int64  
 12  f_words          322 non-null    int64  
 13  s_words          322 non-null    int64  
 14  diversity        322 non-null    int64  
 15  diversity_ratio  322 non-null    float64
 16  police_AA        322 non-null    float64
 17  clean           

### Create one-hot features for cluster assignments and merge with dataframe

In [9]:
cluster_LDA_dummies = pd.get_dummies(df['cluster_LDA'])
LDA_columns = [str(column) + '_LDA' for column in cluster_LDA_dummies.columns]
cluster_LDA_dummies.columns = LDA_columns

cluster_tfidf_dummies = pd.get_dummies(df['cluster_tfidf'])
tfidf_columns = [str(column) + '_tfidf' for column in cluster_tfidf_dummies.columns]
cluster_tfidf_dummies.columns = tfidf_columns

cluster_df = pd.merge(cluster_LDA_dummies, cluster_tfidf_dummies, right_index=True, left_index=True)
cluster_df.head()

Unnamed: 0,0_LDA,1_LDA,2_LDA,3_LDA,4_LDA,5_LDA,6_LDA,0_tfidf,1_tfidf,2_tfidf,3_tfidf,4_tfidf,5_tfidf,6_tfidf
0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
df = pd.merge(df, cluster_df, right_index=True, left_index=True)
df.columns

Index(['title', 'date_posted', 'link', 'name', 'year', 'transcript',
       'language', 'runtime', 'rating', 'rating_type', 'words', 'word_count',
       'f_words', 's_words', 'diversity', 'diversity_ratio', 'police_AA',
       'clean', 'UK', 'relationships', 'animals', 'politics', 'big_picture',
       'cluster_LDA', 'cluster_tfidf', '0_LDA', '1_LDA', '2_LDA', '3_LDA',
       '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf', '3_tfidf',
       '4_tfidf', '5_tfidf', '6_tfidf'],
      dtype='object')

In [537]:
X = np.array(df[['police_AA', 'clean', 'UK', 'relationships', 'animals', 'politics', 'big_picture']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(272, 7)
(272,)


### Split data into training and testing sets and train models.

- Train Random Forrest model

- Train SGD model

- Perform grid search and train XGB model

- Create and ensemble of three classifiers

In [538]:
# Split the data training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [539]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.7073170731707317


In [541]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.7804878048780488


In [542]:
%%time
xgb = XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(xgb,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(X_train, y_train)

CPU times: user 19.1 s, sys: 472 ms, total: 19.6 s
Wall time: 1min 33s


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=4,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                        

In [543]:
best_xgb = grid.best_estimator_.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print(f'Best XGB score: {best_xgb.score(X_test, y_test)}')

Best params: {'colsample_bytree': 0.7, 'eta': 0.05, 'gamma': 0.4, 'max_depth': 3, 'min_child_weight': 3}
Best XGB score: 0.6097560975609756


In [558]:
# Ensemble
estimators = [('rf', rf), ('sgd', sgd), ('xgb', best_xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.8292682926829268


### The ensemble method performed the best at 0.82 accuracy when taking only the LDA topic probabilities as input. 
Save the model to a pickle

In [560]:
# # Save ensemble model
# pickle.dump(ensemble, open('models/rating_pred_ens_w_LDAprob_model.pkl', 'wb'))

# # Load ensemble model
# with open('models/rating_pred_ens_w_LDAprob_model.pkl','rb') as f:
#     ensemble = pickle.load(f)

### See what happens when we use cluster assignments only to train the model

In [564]:
X = np.array(df[['0_LDA', '1_LDA', '2_LDA', '3_LDA',
       '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf', '3_tfidf',
       '4_tfidf', '5_tfidf', '6_tfidf']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(272, 14)
(272,)


In [565]:
# Split the data training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [68]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.7317073170731707


In [102]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.8048780487804879


In [137]:
# XGBoosting
xgb = XGBClassifier().fit(X_train, y_train)
print(f'XGB score: {xgb.score(X_test, y_test)}')

XGB score: 0.6341463414634146


In [189]:
# Ensemble
estimators = [('rf', rf), ('sgd', sgd), ('xgb', xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.7073170731707317


### SGD performed the best at 0.80 test accuracy when taking only the cluster assignments as input. 
Save the model to a pickle

In [190]:
# # Save sgd model
# pickle.dump(sgd, open('models/rating_pred_sgd_w_clusters_model.pkl', 'wb'))

# # Load sgd model
# with open('models/rating_pred_sgd_w_clusters_model.pkl','rb') as f:
#     sgd = pickle.load(f)

### See what happens when we use cluster assignments along with LDA vectors

In [567]:
X = np.array(df[['police_AA', 'clean', 'UK', 'relationships', 'animals', 'politics', 
                 'big_picture', '0_LDA', '1_LDA', '2_LDA', '3_LDA',
                 '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf', '3_tfidf',
                 '4_tfidf', '5_tfidf', '6_tfidf']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(272, 21)
(272,)


In [568]:
# Split the data training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [456]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.6829268292682927


In [449]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.8536585365853658


In [414]:
%%time
xgb = XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(xgb,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(X_train, y_train)

CPU times: user 20.5 s, sys: 545 ms, total: 21 s
Wall time: 2min 6s


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=4,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                        

In [420]:
best_xgb = grid.best_estimator_.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print(f'Best XGB score: {best_xgb.score(X_test, y_test)}')

Best params: {'colsample_bytree': 0.5, 'eta': 0.05, 'gamma': 0.0, 'max_depth': 3, 'min_child_weight': 3}
Best XGB score: 0.6585365853658537


In [475]:
from sklearn.ensemble import VotingClassifier

# Ensemble
estimators = [('rf', rf1), ('sgd', sgd), ('xgb', best_xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.8780487804878049


### The ensemble method performed the best at 0.88 accuracy when taking cluster assignments and LDA probabilities as input. 
Save the model to a pickle

In [536]:
# # Save ensemble model
# pickle.dump(ensemble, open('models/rating_pred_ens_combined_model.pkl', 'wb'))

# # Load ensemble model
# with open('models/rating_pred_ens_combined_model.pkl','rb') as f: 
#     ensemble = pickle.load(f)