In [1]:
import pandas as pd
import numpy as np
# `plt` must alias the pyplot submodule; the bare `matplotlib` package has no plot()/subplots().
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:
# Load the train and test tables; the first CSV column becomes the row index.
train_df = pd.read_csv('../datasets/train_dataset.csv', index_col=0)
test_df = pd.read_csv('../datasets/test_data.csv', index_col=0)

In [17]:
# Training split: every column except the 'target' label forms the feature matrix.
X_train = train_df.drop(columns='target')
y_train = train_df['target']

# Test split, separated the same way.
X_test = test_df.drop(columns='target')
y_test = test_df['target']

Model Training

In [20]:
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, recall_score, f1_score, classification_report

In [21]:
def performance(model,X_test=X_test,y_test=y_test):
    """Print classification metrics for a fitted model on an evaluation set.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        An already-fitted classifier.
    X_test : features to predict on.
        Defaults to the notebook-level ``X_test`` as it existed when this
        cell was executed (default values are bound at definition time, so
        re-run this cell after rebuilding the split).
    y_test : true labels aligned with ``X_test``.
        Defaults to the notebook-level ``y_test``, bound the same way.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 and the full classification report
        are written to stdout; nothing is returned.
    """
    predictions = model.predict(X_test)

    # (label, scorer) pairs, evaluated and printed in this order.
    headline_metrics = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for label, scorer in headline_metrics:
        print(label, scorer(y_test, predictions))

    # Per-class breakdown, printed after the headline numbers.
    print("\nClassification Report:\n", classification_report(y_test, predictions))


Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Linear baseline. max_iter=1000 raises the solver's iteration cap
# (presumably to avoid convergence warnings on this data — confirm).
lr = LogisticRegression(
    max_iter=1000,
)
lr.fit(X_train, y_train)

# Evaluate on the default hold-out split bound into performance().
performance(lr)

Accuracy : 0.8532608695652174
Precision: 0.9
Recall   : 0.8411214953271028
F1 Score : 0.8695652173913043

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



Decision Tree 

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Single tree with depth capped at 5; a node needs at least 10 samples
# before it may be split. random_state pins the seed for reproducibility.
dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)
dt.fit(X_train, y_train)

# Evaluate on the default hold-out split bound into performance().
performance(dt)

Accuracy : 0.8641304347826086
Precision: 0.91
Recall   : 0.8504672897196262
F1 Score : 0.8792270531400966

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84        77
           1       0.91      0.85      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184



Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 200 trees, each limited to depth 7; seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the default hold-out split bound into performance().
performance(rf)

Accuracy : 0.8858695652173914
Precision: 0.9134615384615384
Recall   : 0.8878504672897196
F1 Score : 0.9004739336492891

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.87        77
           1       0.91      0.89      0.90       107

    accuracy                           0.89       184
   macro avg       0.88      0.89      0.88       184
weighted avg       0.89      0.89      0.89       184



AdaBoostClassifier

In [28]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 200 boosting rounds and a learning rate of 0.05; seeded.
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=42)
ada.fit(X_train, y_train)

# Evaluate on the default hold-out split bound into performance().
performance(ada)


Accuracy : 0.8478260869565217
Precision: 0.898989898989899
Recall   : 0.8317757009345794
F1 Score : 0.8640776699029126

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83        77
           1       0.90      0.83      0.86       107

    accuracy                           0.85       184
   macro avg       0.84      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



XGBClassifier

In [29]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 300 rounds of depth-4 trees at learning rate 0.05.
# subsample / colsample_bytree of 0.8 sample rows and columns per tree.
xgb = XGBClassifier(
    n_estimators=300, max_depth=4, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric="logloss", random_state=42,
)
xgb.fit(X_train, y_train)

# Evaluate on the default hold-out split bound into performance().
performance(xgb)

Accuracy : 0.8804347826086957
Precision: 0.912621359223301
Recall   : 0.8785046728971962
F1 Score : 0.8952380952380953

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86        77
           1       0.91      0.88      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



In [34]:
import warnings
# Suppress every Python warning from this point on (no category or module filter is given).
# NOTE(review): presumably added to quiet the LightGBM cell below — confirm, and consider narrowing the filter.
warnings.filterwarnings('ignore')

LGBMClassifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM: 300 boosting rounds at learning rate 0.05, trees capped at depth 5; seeded.
lgbm = LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)

# Left as the cell's final expression so the notebook still displays the fitted estimator.
lgbm.fit(X_train, y_train)


In [54]:
# Report hold-out metrics for the LightGBM model fitted in the previous cell.
# NOTE(review): this cell's execution count (54) is out of sequence and the fitting cell shows none —
# confirm that a fresh Restart & Run All reproduces the numbers below.
performance(lgbm)

Accuracy : 0.8586956521739131
Precision: 0.900990099009901
Recall   : 0.8504672897196262
F1 Score : 0.875

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.87      0.84        77
           1       0.90      0.85      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



CatBoostClassifier

In [37]:
from catboost import CatBoostClassifier

# CatBoost: 300 iterations of depth-5 trees at learning rate 0.05.
# verbose=0 turns off the per-iteration training log; random_seed pins the seed.
cat = CatBoostClassifier(iterations=300, learning_rate=0.05, depth=5, verbose=0, random_seed=42)
cat.fit(X_train, y_train)

# Evaluate on the current notebook-level hold-out split, passed explicitly.
performance(cat, X_test, y_test)


Accuracy : 0.8804347826086957
Precision: 0.9047619047619048
Recall   : 0.8878504672897196
F1 Score : 0.8962264150943396

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86        77
           1       0.90      0.89      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



VotingClassifier

In [39]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the decision tree, random forest and XGBoost models.
# NOTE(review): scikit-learn's VotingClassifier is documented to fit clones of the
# supplied estimators, so dt / rf / xgb themselves are presumably left as fitted above — confirm.
ensemble_members = [("dt", dt), ("rf", rf), ("xgb", xgb)]
voting = VotingClassifier(estimators=ensemble_members, voting="soft")
voting.fit(X_train, y_train)

# Evaluate on the current notebook-level hold-out split, passed explicitly.
performance(voting, X_test, y_test)


Accuracy : 0.8804347826086957
Precision: 0.912621359223301
Recall   : 0.8785046728971962
F1 Score : 0.8952380952380953

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86        77
           1       0.91      0.88      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



### Final Takeaway:
Random Forest gave the best performance on the test set (accuracy 0.886, F1 0.900).

In [47]:
# Re-print the Random Forest metrics next to the conclusion above.
performance(rf)

Accuracy : 0.8858695652173914
Precision: 0.9134615384615384
Recall   : 0.8878504672897196
F1 Score : 0.9004739336492891

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.87        77
           1       0.91      0.89      0.90       107

    accuracy                           0.89       184
   macro avg       0.88      0.89      0.88       184
weighted avg       0.89      0.89      0.89       184



Saving the model

In [50]:
import pickle as pkl
from pathlib import Path

# Persist the fitted Random Forest so it can be reloaded without retraining.
model_path = Path('../models/RandomForest.pkl')

# Create ../models if it is missing (e.g. on a fresh checkout); opening the
# file for writing would otherwise raise FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pkl.dump(rf, f)

-----------------------------------------------------------------