# Binary Classification with fine-tuning

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

In [45]:
data = pd.read_csv('/Users/user/Downloads/AshokaUniversity/monsoon23-courses/IML/final_project/dataset/binary_success.csv')

In [47]:
X = data.filter(['budget', 'runtime', 'release_year', 'release_month', 'genre', 'production_company', 'production_country', 'main_cast', 'director'], axis=1)
Y = data['success_degree']

In [48]:
X.head(2)

Unnamed: 0,budget,runtime,release_year,release_month,genre,production_company,production_country,main_cast,director
0,2115000.0,92.0,1950,3,Music,Twentieth Century Fox Film Corporation,United States of America,Betty Grable,Henry Koster
1,3768785.0,107.0,1950,5,Action,Metro-Goldwyn-Mayer (MGM),United States of America,Betty Hutton,George Sidney


In [49]:
Y.head(2)

0    0
1    1
Name: success_degree, dtype: int64

# Standardising and one hot encoding

In [51]:
categorical_features = ['release_month', 'genre', 'production_company', 'production_country', 'main_cast', 'director']
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_features])

numerical_features = ['budget', 'runtime', 'release_year']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_features])

X_final = np.concatenate((X_scaled, X_encoded), axis=1)
X_scaled[0:10]


array([[-0.76168751, -0.86705799, -3.78711496],
       [-0.72131641, -0.16242984, -3.78711496],
       [-0.77914162,  1.29380167, -3.78711496],
       [-0.74252461, -1.71261176, -3.78711496],
       [-0.80721465, -1.05495883, -3.78711496],
       [-0.77054883, -0.02150421, -3.78711496],
       [-0.78158275, -0.25638026, -3.71271221],
       [-0.76937708,  0.68312394, -3.71271221],
       [-0.78402388, -0.4442811 , -3.71271221],
       [-0.75717141, -0.16242984, -3.71271221]])

In [52]:
X_trans = pd.DataFrame(X_final)

column_names = numerical_features + list(encoder.get_feature_names_out(categorical_features))
X_trans.columns = column_names

X_trans.head(10)

Unnamed: 0,budget,runtime,release_year,release_month_1,release_month_2,release_month_3,release_month_4,release_month_5,release_month_6,release_month_7,...,director_Yılmaz Erdoğan,director_Zach Braff,director_Zacharias Kunuk,director_Zack Snyder,director_Zal Batmanglij,director_Zana Briski,director_Zhang Yimou,director_Zoya Akhtar,director_Álex de la Iglesia,director_Émile Gaudreault
0,-0.761688,-0.867058,-3.787115,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.721316,-0.16243,-3.787115,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.779142,1.293802,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.742525,-1.712612,-3.787115,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.807215,-1.054959,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.770549,-0.021504,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-0.781583,-0.25638,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.769377,0.683124,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,-0.784024,-0.444281,-3.712712,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.757171,-0.16243,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X_trans.dtypes

budget                         float64
runtime                        float64
release_year                   float64
release_month_1                float64
release_month_2                float64
                                ...   
director_Zana Briski           float64
director_Zhang Yimou           float64
director_Zoya Akhtar           float64
director_Álex de la Iglesia    float64
director_Émile Gaudreault      float64
Length: 5474, dtype: object

In [54]:
X_trans.describe()

Unnamed: 0,budget,runtime,release_year,release_month_1,release_month_2,release_month_3,release_month_4,release_month_5,release_month_6,release_month_7,...,director_Yılmaz Erdoğan,director_Zach Braff,director_Zacharias Kunuk,director_Zack Snyder,director_Zal Batmanglij,director_Zana Briski,director_Zhang Yimou,director_Zoya Akhtar,director_Álex de la Iglesia,director_Émile Gaudreault
count,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,...,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0
mean,1.1483520000000002e-17,-2.87088e-17,4.501539e-15,0.062424,0.067071,0.07596,0.073939,0.078384,0.087879,0.081616,...,0.000202,0.000606,0.000202,0.001414,0.000202,0.000202,0.000808,0.000202,0.000202,0.000202
std,1.000101,1.000101,1.000101,0.241949,0.25017,0.26496,0.261699,0.268802,0.283147,0.273807,...,0.014213,0.024613,0.014213,0.037582,0.014213,0.014213,0.028418,0.014213,0.014213,0.014213
min,-0.8130734,-2.229339,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6424381,-0.6791571,-0.4389913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.3495021,-0.209405,0.3050362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.1875473,0.4482479,0.7514527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.46299,10.68884,1.197869,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X_trans.to_numpy(), Y, test_size=0.1, shuffle=False)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape


((4455, 5474), (495, 5474), (4455,), (495,))

In [56]:
Y_test.sum(), Y_test.shape

(299, (495,))

### We use a time series split that will be used in grid search later. This makes an expanding window for the training data

In [57]:
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)
for train_index, test_index in tscv.split(X_trans.to_numpy()):
    train = X_trans.iloc[train_index]
    valid = X_trans.iloc[test_index]
    print("TRAIN:", len(train_index), "TEST:", len(test_index))

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
TRAIN: 825 TEST: 825
TRAIN: 1650 TEST: 825
TRAIN: 2475 TEST: 825
TRAIN: 3300 TEST: 825
TRAIN: 4125 TEST: 825


### 1)i) Logistic Regression
        (the accuracy is given after the verbose output of the gridsearch, in the same output cell)

In [65]:
param_grid_logist = {'C': [0.1,1,10], 'solver': ['lbfgs', 'liblinear', 'sag', 'saga']}

logistic_regression = LogisticRegression(max_iter=1000)

grid_search = GridSearchCV(logistic_regression, param_grid_logist, scoring='accuracy', cv=tscv, verbose=2)

grid_search.fit(X_train, Y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_model = grid_search.best_estimator_
Y_pred_best = best_model.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, Y_pred_best))
print(pd.DataFrame(Y_pred_best).sum())


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.2s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.3s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.4s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.5s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.7s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.1s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.0s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.1s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.1s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.2s
[CV] END ..................................C=0.1, solver=sag; total time=   0.5s
[CV] END ..................................C=0.1



[CV] END ..................................C=10, solver=saga; total time= 1.6min
Best Hyperparameters: {'C': 0.1, 'solver': 'liblinear'}
Accuracy:  0.6646464646464646
0    321
dtype: int64


### 1)ii) KNN

In [81]:

param_grid_knn = {'n_neighbors': [3,5,10,50]} 

knn_tune = KNeighborsClassifier()

grid_search_knn = GridSearchCV(knn_tune, param_grid_knn, scoring='accuracy', cv=tscv, verbose=2)

grid_search_knn.fit(X_train, Y_train)

best_params_knn = grid_search_knn.best_params_
print("Best Hyperparameters:", best_params_knn)

best_model_knn = grid_search_knn.best_estimator_


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.4s
[CV] END ......................................n_neighbors=3; total time=   0.5s
[CV] END ......................................n_neighbors=3; total time=   0.6s
[CV] END ......................................n_neighbors=3; total time=   0.6s
[CV] END ......................................n_neighbors=5; total time=   0.2s
[CV] END ......................................n_neighbors=5; total time=   0.2s
[CV] END ......................................n_neighbors=5; total time=   0.5s
[CV] END ......................................n_neighbors=5; total time=   0.5s
[CV] END ......................................n_neighbors=5; total time=   0.5s
[CV] END .....................................n_neighbors=10; total time=   0.2s
[CV] END .....................................n_n

In [82]:
Y_pred_best_knn = best_model_knn.predict(X_test)
print("Testing accuracy: ", accuracy_score(Y_test, Y_pred_best_knn))

Y_pred_best_knn_train = best_model_knn.predict(X_train)
print("Training accuracy: ", accuracy_score(Y_train, Y_pred_best_knn_train))

Testing accuracy:  0.6505050505050505
Training accuracy:  0.660381593714927


### 1)iii) Decision Trees

In [97]:

param_grid_DT = {'max_depth': [3,5,10], 'min_samples_split': [2,5], 'min_samples_leaf': [2,5]} 

DT_tune = DecisionTreeClassifier()

grid_search_DT = GridSearchCV(DT_tune, param_grid_DT, scoring='accuracy', cv=tscv, verbose=2)

grid_search_DT.fit(X_train, Y_train)

best_params_DT = grid_search_DT.best_params_
print("Best Hyperparameters:", best_params_DT)

best_model_DT = grid_search_DT.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_lea

In [98]:
Y_pred_best_DT = best_model_DT.predict(X_test)
print("Testing accuracy: ", accuracy_score(Y_test, Y_pred_best_DT))

Y_pred_best_DT_train = best_model_DT.predict(X_train)
print("Training accuracy: ", accuracy_score(Y_train, Y_pred_best_DT_train))

Testing accuracy:  0.5797979797979798
Training accuracy:  0.6271604938271605


The gridsearch cross validation isn't working too well for the test data, so I will just use some nested for loops to figure out some better hyperparameters

In [107]:
hyperparams_DT = [1,2,3,4,5,6,7,8,9,10]

In [111]:
from tqdm import tqdm
scores = []
for i in tqdm(hyperparams_DT):
    for j in hyperparams_DT[1:]:  
        for k in hyperparams_DT:
            DT = DecisionTreeClassifier(max_depth=i, min_samples_split=j, min_samples_leaf=k)
            DT.fit(X_train, Y_train)
            Y_pred_DT = DT.predict(X_test)
            # print(f"Accuracy for max_depth = {i} and min_samples_split = {j} and min_samples_leaf = {k} ", accuracy_score(Y_test, Y_pred_DT))
            scores.append([i,j,k, accuracy_score(Y_test, Y_pred_DT)])
            # print(pd.DataFrame(Y_pred_DT).sum())




100%|██████████| 10/10 [09:22<00:00, 56.24s/it]

[10, 10, 10, 0.591919191919192]





In [112]:
max(scores, key=lambda x:x[3]) # the best hyperparameters for Decision Tree that give the best accuracy

[10, 9, 1, 0.6080808080808081]

### 1)iv) Random Forest

In [126]:

param_grid_RF = {'n_estimators': [100,200,300], 'max_depth': [3,5,10], 'min_samples_split': [2,9], 'min_samples_leaf': [2,5]}

RF_tune = RandomForestClassifier()

grid_search_RF = GridSearchCV(RF_tune, param_grid_RF, scoring='accuracy', cv=tscv, verbose=2)

grid_search_RF.fit(X_train, Y_train)

best_params_RF = grid_search_RF.best_params_
print("Best Hyperparameters:", best_params_RF)

best_model_RF = grid_search_RF.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf

In [128]:
from tqdm import tqdm
scores = []
for i in tqdm([100,200,300]):
    for k in [2,9]:
        for p in [1,5]:
            RF = RandomForestClassifier(n_estimators=i, min_samples_split=k, min_samples_leaf = p)
            RF.fit(X_train, Y_train)
            Y_pred_RF = RF.predict(X_test)
            # print(f"Accuracy for max_depth = {i} and min_samples_split = {j} and min_samples_leaf = {k} ", accuracy_score(Y_test, Y_pred_RF))
            scores.append([i,k,p, accuracy_score(Y_test, Y_pred_RF)])
            # print(pd.DataFrame(Y_pred_RF).sum())




100%|██████████| 3/3 [01:19<00:00, 26.60s/it]


In [129]:
max(scores, key=lambda x:x[3])

[100, 9, 1, 0.6565656565656566]

In [127]:
Y_pred_best_RF = best_model_RF.predict(X_test)
print("Testing accuracy: ", accuracy_score(Y_test, Y_pred_best_RF))

Y_pred_best_RF_train = best_model_RF.predict(X_train)
print("Training accuracy: ", accuracy_score(Y_train, Y_pred_best_RF_train))

Testing accuracy:  0.604040404040404
Training accuracy:  0.6047138047138048
