# Binary Classification without fine-tuning

In [32]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [33]:
data = pd.read_csv('/Users/user/Downloads/AshokaUniversity/monsoon23-courses/IML/final_project/dataset/binary_success.csv')

In [34]:
data.head(2)

Unnamed: 0,budget,id,imdb_id,original_language,original_title,popularity,revenue,runtime,title,vote_average,...,release_year,release_month,genre,production_company,production_country,spoken_language,main_cast,main_cast_id,director,success_degree
0,2115000.0,110669,tt0043116,en,Wabash Avenue,0.109913,2039000.0,92.0,Wabash Avenue,7.0,...,1950,3,Music,Twentieth Century Fox Film Corporation,United States of America,English,Betty Grable,64838.0,Henry Koster,0
1,3768785.0,25209,tt0042200,en,Annie Get Your Gun,2.090588,8000000.0,107.0,Annie Get Your Gun,7.3,...,1950,5,Action,Metro-Goldwyn-Mayer (MGM),United States of America,English,Betty Hutton,97182.0,George Sidney,1


Filtering based on data analysis and own intuitive understanding of what may lead to a movie's success

In [35]:
X = data.filter(['budget', 'runtime', 'release_year', 'release_month', 'genre', 'production_company', 'production_country', 'main_cast', 'director'], axis=1)
Y = data['success_degree']

In [36]:
X.head(2)

Unnamed: 0,budget,runtime,release_year,release_month,genre,production_company,production_country,main_cast,director
0,2115000.0,92.0,1950,3,Music,Twentieth Century Fox Film Corporation,United States of America,Betty Grable,Henry Koster
1,3768785.0,107.0,1950,5,Action,Metro-Goldwyn-Mayer (MGM),United States of America,Betty Hutton,George Sidney


In [37]:
Y.head(2)

0    0
1    1
Name: success_degree, dtype: int64

# Standardising and one hot encoding

In [38]:
categorical_features = ['release_month', 'genre', 'production_company', 'production_country', 'main_cast', 'director']
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_features])

numerical_features = ['budget', 'runtime', 'release_year']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_features])

X_final = np.concatenate((X_scaled, X_encoded), axis=1)
X_scaled[0:10]


array([[-0.76168751, -0.86705799, -3.78711496],
       [-0.72131641, -0.16242984, -3.78711496],
       [-0.77914162,  1.29380167, -3.78711496],
       [-0.74252461, -1.71261176, -3.78711496],
       [-0.80721465, -1.05495883, -3.78711496],
       [-0.77054883, -0.02150421, -3.78711496],
       [-0.78158275, -0.25638026, -3.71271221],
       [-0.76937708,  0.68312394, -3.71271221],
       [-0.78402388, -0.4442811 , -3.71271221],
       [-0.75717141, -0.16242984, -3.71271221]])

In [39]:
X_trans = pd.DataFrame(X_final)

column_names = numerical_features + list(encoder.get_feature_names_out(categorical_features))
X_trans.columns = column_names

X_trans.head(10)

Unnamed: 0,budget,runtime,release_year,release_month_1,release_month_2,release_month_3,release_month_4,release_month_5,release_month_6,release_month_7,...,director_Yılmaz Erdoğan,director_Zach Braff,director_Zacharias Kunuk,director_Zack Snyder,director_Zal Batmanglij,director_Zana Briski,director_Zhang Yimou,director_Zoya Akhtar,director_Álex de la Iglesia,director_Émile Gaudreault
0,-0.761688,-0.867058,-3.787115,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.721316,-0.16243,-3.787115,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.779142,1.293802,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.742525,-1.712612,-3.787115,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.807215,-1.054959,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.770549,-0.021504,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-0.781583,-0.25638,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.769377,0.683124,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,-0.784024,-0.444281,-3.712712,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.757171,-0.16243,-3.712712,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
X_trans.dtypes

budget                         float64
runtime                        float64
release_year                   float64
release_month_1                float64
release_month_2                float64
                                ...   
director_Zana Briski           float64
director_Zhang Yimou           float64
director_Zoya Akhtar           float64
director_Álex de la Iglesia    float64
director_Émile Gaudreault      float64
Length: 5474, dtype: object

In [41]:
X_trans.describe()

Unnamed: 0,budget,runtime,release_year,release_month_1,release_month_2,release_month_3,release_month_4,release_month_5,release_month_6,release_month_7,...,director_Yılmaz Erdoğan,director_Zach Braff,director_Zacharias Kunuk,director_Zack Snyder,director_Zal Batmanglij,director_Zana Briski,director_Zhang Yimou,director_Zoya Akhtar,director_Álex de la Iglesia,director_Émile Gaudreault
count,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,...,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0,4950.0
mean,1.1483520000000002e-17,-2.87088e-17,4.501539e-15,0.062424,0.067071,0.07596,0.073939,0.078384,0.087879,0.081616,...,0.000202,0.000606,0.000202,0.001414,0.000202,0.000202,0.000808,0.000202,0.000202,0.000202
std,1.000101,1.000101,1.000101,0.241949,0.25017,0.26496,0.261699,0.268802,0.283147,0.273807,...,0.014213,0.024613,0.014213,0.037582,0.014213,0.014213,0.028418,0.014213,0.014213,0.014213
min,-0.8130734,-2.229339,-3.787115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6424381,-0.6791571,-0.4389913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.3495021,-0.209405,0.3050362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.1875473,0.4482479,0.7514527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.46299,10.68884,1.197869,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X_trans.to_numpy(), Y, test_size=0.1, shuffle=False)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape


((4455, 5474), (495, 5474), (4455,), (495,))

In [43]:
Y_test.sum(), Y_test.shape

(299, (495,))

### 1)i) Logistic Regression

In [44]:
logist = LogisticRegression(max_iter=1000)
logist.fit(X_train, Y_train)

In [45]:
Y_pred_logist = logist.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred_logist))
print(pd.DataFrame(Y_pred_logist).sum())


Testing Accuracy:  0.6545454545454545
0    316
dtype: int64


In [46]:
Y_train_pred_logist = logist.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_train_pred_logist))
print(pd.DataFrame(Y_train_pred_logist).sum())


Training Accuracy:  0.8857463524130191
0    2987
dtype: int64


### 1)ii) KNN

In [47]:
knn = KNeighborsClassifier(n_neighbors=10) # default 5
knn.fit(X_train, Y_train)

In [48]:
Y_pred_knn = knn.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, Y_pred_knn))
print(pd.DataFrame(Y_pred_knn).sum())

Accuracy:  0.6505050505050505
0    342
dtype: int64


### 1)iii) Decision Trees

In [49]:
DT = DecisionTreeClassifier(max_depth=10)
DT.fit(X_train, Y_train)

In [50]:
Y_pred_DT = DT.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, Y_pred_DT))
print(pd.DataFrame(Y_pred_DT).sum())

Accuracy:  0.604040404040404
0    281
dtype: int64


### 1)iv) Random Forest

In [53]:
RF = RandomForestClassifier()
print(RF.get_params())
RF.fit(X_train, Y_train)

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [54]:
Y_pred_RF = RF.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, Y_pred_RF))
print(pd.DataFrame(Y_pred_RF).sum())

Accuracy:  0.6424242424242425
0    398
dtype: int64
