Wrapper to train binary classifiers: Logistic Regression (LR), Decision Tree (DT), and Random Forest (RF)

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    matthews_corrcoef,
    precision_recall_curve, average_precision_score,
    classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
)

# ---- Step 1: Load stratified data ----
def _read_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Train/test features and targets of the 70-30 split (paths are relative to
# the working directory the notebook is run from).
X_train = _read_pickle('Data/Pre-processed_data/70-30/X_train_enc.pkl')
X_test = _read_pickle('Data/Pre-processed_data/70-30/X_test_enc.pkl')
y_train = _read_pickle('Data/Pre-processed_data/70-30/y_train.pkl')
y_test = _read_pickle('Data/Pre-processed_data/70-30/y_test.pkl')

# ---- Step 2: Convert target to binary ----
# 'No' -> 0 (negative class), 'Yes' -> 1 (positive class).
# Assumes y_train / y_test are pandas Series of 'No'/'Yes' strings — TODO confirm.
label_map = {'No': 0, 'Yes': 1}
y_train_bin = y_train.map(label_map)
y_test_bin = y_test.map(label_map)

# Series.map turns every value that is not a key of label_map into NaN.
# Fail here with a clear message instead of deep inside fit()/metrics.
for split_label, mapped in (('y_train', y_train_bin), ('y_test', y_test_bin)):
    if mapped.isna().any():
        raise ValueError(
            f"{split_label} contains labels other than 'No'/'Yes'; cannot binarize."
        )

# ---- Step 3: Check for NaNs ----
# Report only the columns that actually contain missing values.
for split_label, features in (('X_train', X_train), ('X_test', X_test)):
    print(f"\nMissing values in {split_label}:")
    na_counts = features.isna().sum()  # per-column NaN count, computed once
    print(na_counts[na_counts > 0])

# ---- Step 4: Define models with preprocessing pipeline ----
def make_preprocessing():
    """Return a fresh, unfitted preprocessing pipeline: mean-impute, then standardize.

    A new instance is built on every call so that each model pipeline owns
    its own imputer/scaler instead of all of them refitting one shared object.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Use 'most_frequent' for categorical
        ('scaler', StandardScaler())
    ])


# Kept under its original name for any code that still refers to it; the
# model pipelines below each get their own independent instance.
preprocessing = make_preprocessing()

# Define models wrapped in pipelines
models = {
    'LogisticRegression': Pipeline([
        ('preprocessing', make_preprocessing()),
        ('classifier', LogisticRegression(class_weight='balanced', random_state=42))
    ]),
    'RandomForest': Pipeline([
        ('preprocessing', make_preprocessing()),
        ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=42))
    ]),
    'Decision Trees': Pipeline([
        ('preprocessing', make_preprocessing()),
        # Seeded like the other models so repeated runs give the same tree.
        # NOTE(review): unlike the other two models, this one does not set
        # class_weight='balanced' — confirm that is intentional.
        ('classifier', DecisionTreeClassifier(criterion="gini", random_state=42))
    ])
}

# ---- Step 5: Train, predict, evaluate, visualize ----
# For every model: fit on the training split, score the test split, print the
# metrics, then show three figures (confusion matrix, PR curve, ROC curve).
for name, pipeline in models.items():
    print(f"\n{'='*25} {name} {'='*25}")
    pipeline.fit(X_train, y_train_bin)

    # Column 1 of predict_proba is the probability of class 1 ('Yes');
    # it feeds the ranking metrics and curves. Hard labels feed the rest.
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred  = pipeline.predict(X_test)

    # Metrics
    # zero_division=0: report 0 without a warning if a model predicts no positives.
    acc = accuracy_score(y_test_bin, y_pred)
    prec = precision_score(y_test_bin, y_pred, zero_division=0)
    rec  = recall_score(y_test_bin, y_pred, zero_division=0)
    f1   = f1_score(y_test_bin, y_pred, zero_division=0)
    mcc  = matthews_corrcoef(y_test_bin, y_pred)
    ap   = average_precision_score(y_test_bin, y_proba)
    roc_auc = roc_auc_score(y_test_bin, y_proba)
    fpr, tpr, thresholds = roc_curve(y_test_bin, y_proba)

    print(f"ROC-AUC:    {roc_auc:.3f}")
    print(f"Precision:  {prec:.3f}")
    print(f"Recall:     {rec:.3f}")
    print(f"F1-score:   {f1:.3f}")
    print(f"MCC:        {mcc:.3f}")
    print(f"AUC-PR:     {ap:.3f}")
    print(f"Accuracy:    {acc: .5f}")
    print("\nClassification Report:\n",
          classification_report(y_test_bin, y_pred, target_names=['No', 'Yes'], zero_division=0))

    # ---- Confusion Matrix ----
    cm = confusion_matrix(y_test_bin, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.title(f'{name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()

    # ---- Precision-Recall Curve ----
    precision_vals, recall_vals, _ = precision_recall_curve(y_test_bin, y_proba)
    plt.figure(figsize=(7, 5))
    plt.plot(recall_vals, precision_vals, label=f'{name} (AUC-PR = {ap:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{name} - Precision–Recall Curve')
    plt.grid()
    plt.legend(loc='lower left')

    # Annotate metrics on the PR plot. This must happen before plt.show():
    # once the figure has been shown, plt.text would draw on a new empty figure.
    plt.text(0.6, 0.2, f'F1-score: {f1:.2f}\nPrecision: {prec:.2f}\nRecall: {rec:.2f}',
             bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow', edgecolor='black'))
    plt.tight_layout()
    plt.show()

    # ---- AUC-ROC Curve ----
    plt.figure(figsize=(7, 5))
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Chance')  # diagonal = chance level
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{name} - ROC Curve')
    plt.legend(loc='lower right')
    plt.grid()
    plt.tight_layout()
    plt.show()

Wrapper to train a binary classifier: Support Vector Classifier (SVC)

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    matthews_corrcoef, average_precision_score,
    precision_recall_curve, classification_report,
    confusion_matrix, accuracy_score, roc_auc_score, roc_curve
)

# ---- Step 1: Load Data ----
# Files are listed in the order they are unpacked below:
# train features, test features, train target, test target.
_split_files = (
    'Data/Pre-processed_data/70-30/X_train_enc.pkl',
    'Data/Pre-processed_data/70-30/X_test_enc.pkl',
    'Data/Pre-processed_data/70-30/y_train.pkl',
    'Data/Pre-processed_data/70-30/y_test.pkl',
)
_loaded = []
for _path in _split_files:
    with open(_path, 'rb') as f:
        _loaded.append(pickle.load(f))
X_train, X_test, y_train, y_test = _loaded

# ---- Step 2: Impute Missing Values ----
# Column means are learned from the training split only and reused on the
# test split. X_train / X_test are rebound to the transformed output.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# ---- Step 3: Scale Features ----
# Standardization statistics also come from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ---- Step 4: Convert Target to Binary ----
# 'No' -> 0 (negative class), 'Yes' -> 1 (positive class).
# Assumes y_train / y_test are pandas Series of 'No'/'Yes' strings — TODO confirm.
label_map = {'No': 0, 'Yes': 1}
y_train_bin = y_train.map(label_map)
y_test_bin = y_test.map(label_map)

# Series.map turns every value that is not a key of label_map into NaN.
# Fail here with a clear message instead of deep inside fit()/metrics.
for split_label, mapped in (('y_train', y_train_bin), ('y_test', y_test_bin)):
    if mapped.isna().any():
        raise ValueError(
            f"{split_label} contains labels other than 'No'/'Yes'; cannot binarize."
        )

# ---- Step 5: Define Parameter Grid ----
# One sub-grid per kernel. `gamma` is the RBF kernel coefficient and is
# ignored by the linear kernel, so crossing it with 'linear' would fit every
# linear candidate three times with identical results.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1],
        'gamma': ['scale', 0.01, 0.1],
        'class_weight': ['balanced'],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1],
        'class_weight': ['balanced'],
    },
]

# probability=True enables predict_proba, which the evaluation step needs.
svc = SVC(probability=True, random_state=42)

# NOTE(review): X_train was imputed and scaled on the full training set before
# this search, so each CV validation fold has already influenced the
# preprocessing statistics. Wrapping imputer + scaler + SVC in a Pipeline
# would keep the cross-validation scores leakage-free.
grid_search = GridSearchCV(
    svc,
    param_grid,
    scoring='average_precision',  # Better for imbalanced data
    cv=5,
    n_jobs=-1,
    verbose=2
)

# ---- Step 6: Fit Model ----
grid_search.fit(X_train, y_train_bin)
best_model = grid_search.best_estimator_  # refit on the whole training set

print("\nBest Parameters from GridSearch:")
print(grid_search.best_params_)

# ---- Step 7: Evaluate on Test Data ----
# Hard labels feed the threshold-based metrics; column 1 of predict_proba
# (class 1 = 'Yes') feeds the ranking metrics and the curves.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Score-based metrics
roc_auc = roc_auc_score(y_test_bin, y_proba)
ap = average_precision_score(y_test_bin, y_proba)
fpr, tpr, thresholds = roc_curve(y_test_bin, y_proba)

# Label-based metrics (zero_division=0: report 0 instead of warning when
# there are no predicted positives)
acc = accuracy_score(y_test_bin, y_pred)
prec, rec, f1 = (
    metric(y_test_bin, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
mcc = matthews_corrcoef(y_test_bin, y_pred)

report = classification_report(
    y_test_bin, y_pred, target_names=['No', 'Yes'], zero_division=0
)

print("\n===== SVC with Grid Search Evaluation =====")
print(
    f'ROC_AUC :     {roc_auc: .3f}',
    f'Accuracy:    {acc: .5f}',
    f"Precision:  {prec:.3f}",
    f"Recall:     {rec:.3f}",
    f"F1-score:   {f1:.3f}",
    f"MCC:        {mcc:.3f}",
    f"AUC-PR:     {ap:.3f}",
    sep="\n",
)
print("\nClassification Report:\n", report)

# ---- Step 8: Confusion Matrix ----
# Rows are the actual classes, columns the predicted classes.
class_names = ['No', 'Yes']
cm = confusion_matrix(y_test_bin, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_title('SVC (GridSearch) - Confusion Matrix')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
fig_cm.tight_layout()
plt.show()
# ---- AUC-ROC Curve ----
# The model label is the literal 'SVC': no `name` variable is defined in this
# cell, so using one either raises NameError or picks up a stale value left
# behind by another cell.
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label=f'SVC (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Chance')  # diagonal = chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVC - ROC Curve')
plt.legend(loc='lower right')
plt.grid()
plt.tight_layout()
plt.show()

# ---- Step 9: Precision-Recall Curve ----
precision, recall, _ = precision_recall_curve(y_test_bin, y_proba)
fig_pr, ax_pr = plt.subplots(figsize=(7, 5))
ax_pr.plot(recall, precision, label=f'SVC (AUC-PR = {ap:.2f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('SVC - Precision–Recall Curve')
ax_pr.grid()
ax_pr.legend(loc='lower left')

# Summary box with the threshold-based metrics, positioned in data coordinates.
metrics_box = dict(boxstyle='round,pad=0.3', facecolor='lightyellow', edgecolor='black')
ax_pr.text(0.6, 0.2, f'F1-score: {f1:.2f}\nPrecision: {prec:.2f}\nRecall: {rec:.2f}',
           bbox=metrics_box)
fig_pr.tight_layout()
plt.show()