In [None]:
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
import random

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

# -----------------------------------------------------------------
# For Random Oversampling
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# NN
import tensorflow as tf
from tensorflow.keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load Sample of Data

In [68]:

# Define file path in Google Drive (Update the path as per your directory structure)
file_path = "BAN6025Project2Data.csv"
#file_path = "/content/drive/My Drive/Copy of BAN6025Project2Data.csv"

# Set sample size
sample_size = 100000  # 10% of 1,000,000 rows

# Step 1: Get total row count (without loading full dataset)
with open(file_path, 'r') as f:
    total_rows = sum(1 for line in f) - 1  # Subtract 1 for header/

# Step 2: Generate random row indices to skip (excluding header)
skip_rows = sorted(random.sample(range(1, total_rows + 1), total_rows - sample_size))

# Step 3: Load only the sampled rows
df_sample = pd.read_csv(file_path, skiprows=skip_rows)


print(f"Loaded a random 10% sample ({len(df_sample)} rows) successfully.")
df = df_sample


Loaded a random 10% sample (100000 rows) successfully.


# Pre Process

In [70]:
# Make X25 into 0,1 because it only has 2 values
df['X25'] = df['X25'].replace({'INTERNET': 0, 'TELEAPP': 1}).astype(int)

# Make into categories: Target_Y, X17, X19, X20, X22, X24, X25, X28
df['Target_Y'] = df['Target_Y'].astype('category')
'''
df['X17'] = df['X17'].astype('category')
df['X19'] = df['X19'].astype('category')
df['X20'] = df['X20'].astype('category')
df['X22'] = df['X22'].astype('category')
df['X24'] = df['X24'].astype('category')
df['X25'] = df['X25'].astype('category')
df['X28'] = df['X28'].astype('category')
'''
# Drop col X30 because it only has 0's
df = df.drop(['X30'], axis=1)


In [79]:
# Make into dummies: X8, X15, X18, X27 - (they are not numbers)
X = df.drop(['Target_Y'], axis=1)
X = pd.get_dummies(X, columns=['X8', 'X15', 'X18', 'X27', 'X1', 'X5', 'X14', 'X21', 'X23', 'X29', 'X31'], drop_first=True, dtype='float')


y = df['Target_Y']

# Apply SMOTE with Limited Oversampling (1% → 10% instead of full balance)
smote = SMOTE(sampling_strategy=0.1, random_state=42)  # Increase minority class to 10%
X, y = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest

In [84]:
# create an instance of a random forest classifier using default values
# n_estimators': 150, 'max_depth': 25, 'min_samples_leaf': 5, 'max_features': 'sqrt'
# 'n_estimators': 50, 'max_depth': 25, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'class_weight': 'balanced'
# n_estimators': 100, 'max_depth': 25, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 6, 'class_weight': 'balanced_subsample'
rf = RandomForestClassifier( n_estimators=100, max_depth=25, min_samples_split=20, min_samples_leaf=5, max_features=6, class_weight='balanced_subsample', n_jobs=-1, random_state=42 )

# fit the model to the training data
rf.fit(X_train, y_train)

# make predictions on the training and test data
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

y_prob_train_rf = rf.predict_proba(X_train)
y_prob_test_rf = rf.predict_proba(X_test)

# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y_train, y_pred_train_rf)
prec_train = precision_score(y_train, y_pred_train_rf)
rec_train = recall_score(y_train, y_pred_train_rf)
f1_train = f1_score(y_train, y_pred_train_rf)


# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("F1 Score.  : {:.4f}".format(f1_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y_test, y_pred_test_rf)
prec_test = precision_score(y_test, y_pred_test_rf)
rec_test = recall_score(y_test, y_pred_test_rf)
f1_test = f1_score(y_test, y_pred_test_rf)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
print("F1 Score.  : {:.4f}".format(f1_test))

 -- train set -- 
Accuracy : 0.9923
Precision: 0.9697
Recall.  : 0.9458
F1 Score.  : 0.9576

 -- test set -- 
Accuracy : 0.9872
Precision: 0.9548
Recall.  : 0.8984
F1 Score.  : 0.9257


# RF Bayesian Optimization

In [83]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function for Bayesian Optimization
def objective(trial):
    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 50, 300, step=50)
    max_depth = trial.suggest_int('max_depth', 5, 25, step=5)
    min_samples_split = trial.suggest_int('min_samples_split', 10, 50, step=10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30, step=5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 2, 4, 6])
    class_weight = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample'])

    # Initialize Random Forest with trial-selected hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight=class_weight,
        n_jobs=-1,
        random_state=42
    )

    # Perform Cross-Validation
    score = cross_val_score(rf, X_train, y_train, cv=3, scoring='f1', n_jobs=-1).mean()
    return score  # Maximizing F1-score

# Create and run Bayesian Optimization study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Run 30 trials to find best params

# Print Best Parameters & Score
print("Best Parameters:", study.best_params)
print("Best Cross-Validation F1 Score:", study.best_value)


[I 2025-02-06 21:59:37,557] A new study created in memory with name: no-name-8386660f-d072-4c37-9a3e-c52ea7acd821
[I 2025-02-06 21:59:46,215] Trial 0 finished with value: 0.7678430418624586 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 30, 'min_samples_leaf': 10, 'max_features': 6, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7678430418624586.
[I 2025-02-06 21:59:50,560] Trial 1 finished with value: 0.8501837042366572 and parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 20, 'min_samples_leaf': 15, 'max_features': 2, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.8501837042366572.
[I 2025-02-06 22:00:07,615] Trial 2 finished with value: 0.8876258458449917 and parameters: {'n_estimators': 250, 'max_depth': 20, 'min_samples_split': 40, 'min_samples_leaf': 25, 'max_features': 6, 'class_weight': 'balanced_subsample'}. Best is trial 2 with value: 0.8876258458449917.
[I 2025-02-06 22:00:19,198] Trial 3 fini

Best Parameters: {'n_estimators': 100, 'max_depth': 25, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 6, 'class_weight': 'balanced_subsample'}
Best Cross-Validation F1 Score: 0.92484073562495


# Gradient Boost

In [88]:
# create an instance of the Gradient Boosting Classifier using default values
# n_estimators': 150, 'learning_rate': 0.15000000000000002, 'max_depth': 9, 'min_samples_split': 20, 'min_samples_leaf': 15, 'subsample': 1.0, 'max_features': 'sqrt'
gb = GradientBoostingClassifier(n_estimators = 150, learning_rate = 0.15000000000000002, max_depth=9,min_samples_split=20, min_samples_leaf=15,subsample=1.0, max_features='sqrt', random_state=904)

# fit the model to the training data
gb.fit(X_train, y_train)

# make predictions on the training and test data
y_pred_train_gb = gb.predict(X_train)
y_pred_test_gb = gb.predict(X_test)

y_prob_train_gb = gb.predict_proba(X_train)
y_prob_test_gb = gb.predict_proba(X_test)

# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y_train, y_pred_train_gb)
prec_train = precision_score(y_train, y_pred_train_gb)
rec_train = recall_score(y_train, y_pred_train_gb)
f1_train = f1_score(y_train, y_pred_train_gb)

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("F1 Score.  : {:.4f}".format(f1_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y_test, y_pred_test_gb)
prec_test = precision_score(y_test, y_pred_test_gb)
rec_test = recall_score(y_test, y_pred_test_gb)
f1_test = f1_score(y_test, y_pred_test_gb)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
print("F1 Score.  : {:.4f}".format(f1_test))

 -- train set -- 
Accuracy : 0.9994
Precision: 1.0000
Recall.  : 0.9931
F1 Score.  : 0.9966

 -- test set -- 
Accuracy : 0.9898
Precision: 0.9908
Recall.  : 0.8928
F1 Score.  : 0.9393


# GB Bayesian Optimization

In [87]:
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function for Bayesian Optimization
def objective(trial):
    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 50, 150, step=50)  # Reduced upper limit
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.3, step=0.05)  # Skipped very low values
    max_depth = trial.suggest_int('max_depth', 3, 12, step=3)  # Capped at 12 to reduce complexity
    min_samples_split = trial.suggest_int('min_samples_split', 10, 40, step=10)  # Reduced range
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 20, step=5)  # Reduced range
    subsample = trial.suggest_float('subsample', 0.7, 1.0, step=0.1)  # Keeps randomness for generalization
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Initialize Gradient Boosting with trial-selected hyperparameters
    gb = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        max_features=max_features,
        random_state=42
    )

    # Perform Cross-Validation (cv=2 instead of cv=3 for speed)
    score = cross_val_score(gb, X_train, y_train, cv=2, scoring='f1', n_jobs=-1).mean()
    return score  # Maximizing F1-score

# Create and run Bayesian Optimization study (n_trials=15 instead of 30)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)  # Reduced trials for faster execution

# Print Best Parameters & Score
print("Best Parameters:", study.best_params)
print("Best Cross-Validation F1 Score:", study.best_value)


[I 2025-02-06 22:19:39,028] A new study created in memory with name: no-name-b9cce6c3-1db8-4e07-98a6-a601c42a43b4
[I 2025-02-06 22:19:46,968] Trial 0 finished with value: 0.9258368652719251 and parameters: {'n_estimators': 50, 'learning_rate': 0.2, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 15, 'subsample': 0.8999999999999999, 'max_features': 'log2'}. Best is trial 0 with value: 0.9258368652719251.
[I 2025-02-06 22:23:07,912] Trial 1 finished with value: 0.932949410744025 and parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 6, 'min_samples_split': 20, 'min_samples_leaf': 20, 'subsample': 0.8999999999999999, 'max_features': None}. Best is trial 1 with value: 0.932949410744025.
[I 2025-02-06 22:25:08,177] Trial 2 finished with value: 0.9260338114476264 and parameters: {'n_estimators': 100, 'learning_rate': 0.3, 'max_depth': 6, 'min_samples_split': 30, 'min_samples_leaf': 10, 'subsample': 0.7, 'max_features': None}. Best is trial 1 with value: 0.93294

Best Parameters: {'n_estimators': 150, 'learning_rate': 0.15000000000000002, 'max_depth': 9, 'min_samples_split': 20, 'min_samples_leaf': 15, 'subsample': 1.0, 'max_features': 'sqrt'}
Best Cross-Validation F1 Score: 0.9360475074873122
