# 4.1 Develop the classification model

### Notebook goal

The aim of this notebook is to model the data to predict if a film makes a profit or not. A classification problem, the models considered in the first instance will be:

* Logistic Regression,
* Decision Tree Classifier,
* Random Forest Classifier and
* Naive Bayes Classifier.

In [4]:
# for the sake of development, use this magic command to solve slow suggestion
%config Completer.use_jedi = False

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('max_columns', 50)
from load_data import load_data
from preprocess_data import preprocess

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report, roc_curve, plot_confusion_matrix
from sklearn.metrics import r2_score, make_scorer, f1_score

#### Load in the dataset

In [6]:
df = preprocess(load_data())
df.head()

Unnamed: 0,revenue,runtime,num_prods,num_languages,num_writers,UNRATE,PCE,class,original_language_en,original_language_fr,original_language_hi,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,genres_Adventure,genres_Animation,...,prod_comp_names_Universal_Pictures,prod_comp_names_Columbia_Pictures,prod_comp_names_Paramount,prod_comp_names_20th_Century_Fox,prod_comp_names_New_Line_Cinema,prod_comp_names_Walt_Disney_Pictures,prod_comp_names_Canal+,prod_comp_names_Metro-Goldwyn-Mayer,prod_comp_names_Touchstone_Pictures,prod_comp_names_Relativity_Media,prod_comp_names_Miramax,prod_comp_cntry_US,prod_comp_cntry_GB,prod_comp_cntry_FR,num_top_100_actors,established_director,log10_budget,log10_director_pop,log10_avg_writer_pop,log10_max_writer_pop,log10_avg_actor_pop,log10_max_actor_pop,log10_min_actor_pop,log10_cast_crew_sum_pop,log10_cast_crew_product_pop
0,221546000.0,81.0,1,1,5,5.5,5013.9,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0.0,7.250232,0.525304,0.652182,0.822822,1.051268,1.42951,0.393048,1.280904,2.228754
1,156265000.0,104.0,4,2,3,5.6,5097.5,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0.0,7.587154,0.471732,0.260389,0.629817,0.876776,1.069668,0.318272,1.090399,1.608897
2,48433220.0,127.0,1,1,2,5.6,5097.5,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0.0,6.978361,0.72583,-0.175874,0.146438,0.524006,0.754578,0.199206,0.969789,1.073962
3,111454000.0,170.0,3,2,1,5.6,5097.5,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1.0,7.552392,0.962985,0.485863,0.962985,1.037811,1.211307,0.914079,1.36462,2.486659
4,31914590.0,127.0,7,2,5,5.6,5097.5,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1.0,7.537669,0.031408,0.337858,0.350829,0.858918,1.054498,0.697142,1.020292,1.228185


## 1. Classification

### 1.1. Preparing the data

Develop the classification model to determine if the films make a profit or loss.

The first steps are to split the data into the train and test set. Do this with the methods that come with sklearn and set the training set to 80% of the total. The data will then need to be scaled. A min-max scaler was chosen for this.

In [7]:
X = df.drop(['class', 'revenue'], axis=1)
y = df['class']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.8
)

In [9]:
X_train.head(3)

Unnamed: 0,runtime,num_prods,num_languages,num_writers,UNRATE,PCE,original_language_en,original_language_fr,original_language_hi,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,...,prod_comp_names_Universal_Pictures,prod_comp_names_Columbia_Pictures,prod_comp_names_Paramount,prod_comp_names_20th_Century_Fox,prod_comp_names_New_Line_Cinema,prod_comp_names_Walt_Disney_Pictures,prod_comp_names_Canal+,prod_comp_names_Metro-Goldwyn-Mayer,prod_comp_names_Touchstone_Pictures,prod_comp_names_Relativity_Media,prod_comp_names_Miramax,prod_comp_cntry_US,prod_comp_cntry_GB,prod_comp_cntry_FR,num_top_100_actors,established_director,log10_budget,log10_director_pop,log10_avg_writer_pop,log10_max_writer_pop,log10_avg_actor_pop,log10_max_actor_pop,log10_min_actor_pop,log10_cast_crew_sum_pop,log10_cast_crew_product_pop
3607,117.0,5,1,1,9.3,10392.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0.0,7.106447,-0.075721,-0.552842,-0.075721,0.603649,0.918816,-0.079355,0.710512,-0.024913
1754,93.0,3,1,1,5.7,7174.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0.0,6.73978,0.340047,-0.326058,0.151063,1.058805,1.13043,0.99127,1.149527,1.072795
2238,120.0,7,6,3,5.6,8147.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,7.904627,0.369401,-0.164521,-0.103474,1.05497,1.419923,0.563837,1.157608,1.259851


In [10]:
# scaler = StandardScaler()
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [13]:
pd.DataFrame(X_train
             , columns=df.drop(['class', 'revenue'], axis=1).columns).head()

Unnamed: 0,runtime,num_prods,num_languages,num_writers,UNRATE,PCE,original_language_en,original_language_fr,original_language_hi,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,...,prod_comp_names_Universal_Pictures,prod_comp_names_Columbia_Pictures,prod_comp_names_Paramount,prod_comp_names_20th_Century_Fox,prod_comp_names_New_Line_Cinema,prod_comp_names_Walt_Disney_Pictures,prod_comp_names_Canal+,prod_comp_names_Metro-Goldwyn-Mayer,prod_comp_names_Touchstone_Pictures,prod_comp_names_Relativity_Media,prod_comp_names_Miramax,prod_comp_cntry_US,prod_comp_cntry_GB,prod_comp_cntry_FR,num_top_100_actors,established_director,log10_budget,log10_director_pop,log10_avg_writer_pop,log10_max_writer_pop,log10_avg_actor_pop,log10_max_actor_pop,log10_min_actor_pop,log10_cast_crew_sum_pop,log10_cast_crew_product_pop
0,0.290735,0.16,0.0,0.0,0.797297,0.685946,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.0,0.59792,0.08849,0.084285,0.08849,0.63022,0.613863,0.099427,0.412307,0.303378
1,0.214058,0.08,0.0,0.0,0.310811,0.460796,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.493519,0.340263,0.21509,0.225821,0.850429,0.727745,0.846467,0.667065,0.512185
2,0.300319,0.24,0.555556,0.105263,0.297297,0.52887,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.825185,0.358039,0.308263,0.071683,0.848573,0.883539,0.548221,0.671754,0.547767
3,0.284345,0.04,0.111111,0.105263,0.216216,0.826502,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.636349,0.356574,0.55458,0.396436,0.656813,0.581649,0.493913,0.545623,0.553146
4,0.198083,0.08,0.0,0.052632,0.324324,0.470864,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.785597,0.279859,0.457847,0.418046,0.641512,0.565864,0.343179,0.498182,0.491131


### 1.2 Training the models

The four models that will be trained be trained are a Logistic Regression, a Support Vector Machine, a Decision Tree and a Random Forest. Default parameters are used for this first step of training, except for the SVM.

The test data is then scaled with a min-max scaler and used by the models to make predictions. Metrics are printed out for assessment.

In [16]:
log_clf = LogisticRegression()
svm_clf = SVC(kernel='rbf', class_weight='balanced', verbose=True, probability=True)
tree_clf = DecisionTreeClassifier()
forest_clf = RandomForestClassifier()

models = [log_clf, svm_clf, tree_clf, forest_clf]

In [17]:
scaler = MinMaxScaler()
X_test = scaler.fit_transform(X_test)

In [18]:
for clf in [log_clf, svm_clf, tree_clf, forest_clf]:
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print(type(clf))
    print(classification_report(y_test, preds))
    print(confusion_matrix(y_test, preds))
    print("F1-score: {:.2f}".format(f1_score(y_test, preds)))
    print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
    print("Precision: {:.2f}".format(precision_score(y_test, preds)))
    print("Recall: {:.2f}".format(recall_score(y_test, preds)))
    print()

<class 'sklearn.linear_model._logistic.LogisticRegression'>
              precision    recall  f1-score   support

           0       0.75      0.92      0.83       382
           1       0.61      0.29      0.40       163

    accuracy                           0.73       545
   macro avg       0.68      0.61      0.61       545
weighted avg       0.71      0.73      0.70       545

[[351  31]
 [115  48]]
F1-score: 0.40
Accuracy: 0.73
Precision: 0.61
Recall: 0.29

[LibSVM]<class 'sklearn.svm._classes.SVC'>
              precision    recall  f1-score   support

           0       0.82      0.66      0.73       382
           1       0.46      0.66      0.54       163

    accuracy                           0.66       545
   macro avg       0.64      0.66      0.64       545
weighted avg       0.71      0.66      0.68       545

[[254 128]
 [ 56 107]]
F1-score: 0.54
Accuracy: 0.66
Precision: 0.46
Recall: 0.66

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
              precisio

#### We can also look at the probability predictions of the classifiers to see the distributions

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12,12))
sns.histplot(svm_clf.predict_proba(X_test), ax=ax[0][0])
sns.histplot(forest_clf.predict_proba(X_test), ax=ax[0][1])
sns.histplot(tree_clf.predict_proba(X_test), ax=ax[1][0])
sns.histplot(log_clf.predict_proba(X_test), ax=ax[1][1])

sns.despine(right=True)
plt.show()

### 1.3 Taking models forward for tuning

The SVM and Decision Tree models are taken forward given they had the best mix of evaluation metrics, including better F1 scores.

#### 1.3.1 Decision Tree Tuning

In [99]:
param_grid = {
    'class_weight': ['balanced', {0:1, 1:2}, {0:1, 1:3}, {0:1, 1:4}
                     , {0:1, 1:5}, {0:1, 1:6}, {0:1, 1:7}, {0:1, 1:8}]
    , 'criterion': ['gini', 'entropy']
    , 'max_features': ['auto', 'sqrt', 'log2']
    , 'max_depth' : [100, 200, 500, 1000, None]
    , 'splitter': ['best', 'random']
}

tree_clf = DecisionTreeClassifier()

f1 = make_scorer(f1_score)  

grid_search = GridSearchCV(tree_clf, param_grid, cv=5
                          , return_train_score=True
                          , scoring=f1, verbose=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_estimator_)

print(grid_search.best_score_)

print(classification_report(y_test, grid_search.predict(X_test)))

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


DecisionTreeClassifier(class_weight={0: 1, 1: 6}, criterion='entropy',
                       max_features='auto')
0.4597375161359075
              precision    recall  f1-score   support

           0       0.76      0.77      0.76       382
           1       0.44      0.44      0.44       163

    accuracy                           0.67       545
   macro avg       0.60      0.60      0.60       545
weighted avg       0.67      0.67      0.67       545



[Parallel(n_jobs=1)]: Done 2400 out of 2400 | elapsed:   21.1s finished


#### 1.3.2 Support Vector Machine

In [13]:
svm_clf = SVC(kernel='rbf', class_weight='balanced', verbose=True)

In [14]:
svm_clf.fit(X_train, y_train)

[LibSVM]

SVC(class_weight='balanced', verbose=True)

In [17]:
preds = svm_clf.predict(X_test)

In [20]:
print(confusion_matrix(y_test, preds))
print(f"Accuracy: {accuracy_score(y_test, preds)}")
print(f"Precision: {precision_score(y_test, preds)}")
print(f"Recall: {recall_score(y_test, preds)}")

[[213 169]
 [ 81  82]]
Accuracy: 0.5412844036697247
Precision: 0.32669322709163345
Recall: 0.5030674846625767


In [21]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.72      0.56      0.63       382
           1       0.33      0.50      0.40       163

    accuracy                           0.54       545
   macro avg       0.53      0.53      0.51       545
weighted avg       0.61      0.54      0.56       545



#### Random Forest Tuning

In [147]:
# param_grid = [
#     {'n_estimators': [3, 10, 20, 100, 200], 'max_features': [2, 4, 6, 8, 10]}
#     , {'bootstrap': [False], 'n_estimators': [3, 10, 50, 200], 'max_features': [2, 3, 4, 12]}
# ]

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [8,10,12,16,20],
    'criterion' :['gini', 'entropy']
}

forest_clf = RandomForestClassifier()

# prec = make_scorer(precision_score)  

grid_search = GridSearchCV(forest_clf, param_grid, cv=5
                          , return_train_score=True)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

predictions = grid_search.predict(X_test)
print("Random Forest: Grid Search Params
print(confusion_matrix(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")
print(f"Recall: {recall_score(y_test, predictions)}")

Use the `RandomizedSearchCV` to do a random search for the best parameters.

In [66]:
forest_clf = RandomForestClassifier()

from scipy.stats import randint

random_grid = {'bootstrap': [True, False],
               'max_depth': randint(low=10, high=250),
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4, 8],
               'min_samples_split': [2, 5, 10, 20],
               'n_estimators': randint(low=100, high=500)}

random_search = RandomizedSearchCV(
    estimator=forest_clf
    , param_distributions=random_grid
    , n_iter=200
    , cv=5
    , verbose=1
    , random_state=42
)

import time
start_time = time.time()

random_search.fit(X_train, y_train)

print("Took {}mins {:.2f}s".format(
    math.floor( (time.time()-start_time) / 60)
    , (time.time()-start_time) % 60
))

print(random_search.best_estimator_)

print(random_search.best_score_)

predictions = random_search.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")
print(f"Recall: {recall_score(y_test, predictions)}")
print(classification_report(y_test, predictions))