In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
%%capture
%run "./04_supervised_learning.ipynb"

In [3]:
# Prepare Data

X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

# Helper function

def accuracy_calc(model):
    """Print train & test accuracy for a fitted model"""
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    print(f"accuracy of training data = {train_acc*100:.2f}%")
    print(f"accuracy of testing data = {test_acc*100:.2f}%")
    return test_acc


In [4]:
# 1 - Logistic Regression

print("=== Logistic Regression ===")
lrmodel = LogisticRegression()

lr_param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500],
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
}

lr_grid_search = GridSearchCV(lrmodel, lr_param_grid, cv=5, scoring='accuracy', error_score=np.nan)
lr_grid_search.fit(X_train, y_train)

print("Best parameters (GridSearch):", lr_grid_search.best_params_)
acc_lr_grid = accuracy_calc(lr_grid_search)
print("\n")

lr_random_search = RandomizedSearchCV(lrmodel, lr_param_grid, cv=5, scoring='accuracy', n_iter=20, error_score=np.nan)
lr_random_search.fit(X_train, y_train)

print("Best parameters (RandomSearch):", lr_random_search.best_params_)
acc_lr_random = accuracy_calc(lr_random_search)
print("\n\n")

=== Logistic Regression ===
Best parameters (GridSearch): {'C': 1, 'l1_ratio': 0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
accuracy of training data = 59.49%
accuracy of testing data = 61.67%


Best parameters (RandomSearch): {'solver': 'saga', 'penalty': 'l1', 'max_iter': 200, 'l1_ratio': 0.25, 'C': 1}
accuracy of training data = 60.34%
accuracy of testing data = 61.67%





In [5]:
# 2 - Decision Tree

print("=== Decision Tree ===")
dtmodel = DecisionTreeClassifier()

dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

dt_grid_search = GridSearchCV(dtmodel, dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)

print("Best parameters (GridSearch):", dt_grid_search.best_params_)
acc_dt_grid = accuracy_calc(dt_grid_search)
print("\n")

dt_random_search = RandomizedSearchCV(dtmodel, dt_param_grid, cv=5, scoring='accuracy', n_iter=20)
dt_random_search.fit(X_train, y_train)

print("Best parameters (RandomSearch):", dt_random_search.best_params_)
acc_dt_random = accuracy_calc(dt_random_search)
print("\n\n")


=== Decision Tree ===
Best parameters (GridSearch): {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}
accuracy of training data = 73.84%
accuracy of testing data = 56.67%


Best parameters (RandomSearch): {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'criterion': 'entropy'}
accuracy of training data = 63.29%
accuracy of testing data = 68.33%





In [6]:
# 3 - Random Forest

print(" Random Forest ")
rfmodel = RandomForestClassifier()

rf_param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_grid_search = GridSearchCV(rfmodel, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

print("Best parameters (GridSearch):", rf_grid_search.best_params_)
acc_rf_grid = accuracy_calc(rf_grid_search)
print("\n")

rf_random_search = RandomizedSearchCV(rfmodel, rf_param_grid, cv=5, scoring='accuracy', n_iter=20, n_jobs=-1)
rf_random_search.fit(X_train, y_train)

print("Best parameters (RandomSearch):", rf_random_search.best_params_)
acc_rf_random = accuracy_calc(rf_random_search)
print("\n\n")

 Random Forest 
Best parameters (GridSearch): {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
accuracy of training data = 78.48%
accuracy of testing data = 65.00%


Best parameters (RandomSearch): {'n_estimators': 100, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
accuracy of training data = 77.64%
accuracy of testing data = 66.67%





In [7]:
# 4 - SVM

print(" Support Vector Machine (SVM) ")
svmmodel = SVC()

svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'degree': [2, 3, 4]
}

svm_grid_search = GridSearchCV(svmmodel, svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(X_train, y_train)

print("Best parameters (GridSearch):", svm_grid_search.best_params_)
acc_svm_grid = accuracy_calc(svm_grid_search)
print("\n")

svm_random_search = RandomizedSearchCV(svmmodel, svm_param_grid, cv=5, scoring='accuracy', n_iter=20, n_jobs=-1)
svm_random_search.fit(X_train, y_train)

print("Best parameters (RandomSearch):", svm_random_search.best_params_)
acc_svm_random = accuracy_calc(svm_random_search)
print("\n\n")


 Support Vector Machine (SVM) 
Best parameters (GridSearch): {'C': 10, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}
accuracy of training data = 86.08%
accuracy of testing data = 58.33%


Best parameters (RandomSearch): {'kernel': 'linear', 'gamma': 'scale', 'degree': 2, 'C': 0.1}
accuracy of training data = 61.18%
accuracy of testing data = 61.67%





In [8]:
# Compare best models
results = {
    "Logistic Regression (Grid)": acc_lr_grid,
    "Logistic Regression (Random)": acc_lr_random,
    "Decision Tree (Grid)": acc_dt_grid,
    "Decision Tree (Random)": acc_dt_random,
    "Random Forest (Grid)": acc_rf_grid,
    "Random Forest (Random)": acc_rf_random,
    "SVM (Grid)": acc_svm_grid,
    "SVM (Random)": acc_svm_random
}

best_model = max(results, key=results.get)

print(f"Best model is: {best_model} with accuracy = {results[best_model]*100:.2f}%")

Best model is: Decision Tree (Random) with accuracy = 68.33%
