In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import joblib
import os

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

import optuna
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Config
RANDOM_STATE = 42
N_FOLDS = 5
N_OPTUNA_TRIALS = 20 # Reduced for speed

os.makedirs('models', exist_ok=True)
os.makedirs('outputs', exist_ok=True)

print("✅ Libraries imported.")

✅ Libraries imported.


## 1. Load Data

In [44]:
data_path = 'data'
X_train = np.load(f'{data_path}/X_train_full.npy')
y_train = np.load(f'{data_path}/y_train_full.npy')
X_test = np.load(f'{data_path}/X_test.npy')
y_test = np.load(f'{data_path}/y_test.npy')
feature_names = np.load(f'{data_path}/feature_names.npy', allow_pickle=True)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (48601, 10), y_train: (48601,)
X_test: (12151, 10), y_test: (12151,)


In [45]:
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else None
    
    return {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_proba) if y_proba is not None else None
    }

def print_metrics(metrics, name):
    print(f"\n{'='*40}\n{name}\n{'='*40}")
    for k, v in metrics.items():
        if v: print(f"{k:<15}: {v:.4f}")

ensemble_results = {}
trained_ensembles = {}

def optimize(objective, n_trials=N_OPTUNA_TRIALS):
    sampler = TPESampler(seed=RANDOM_STATE)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study

## 2. Random Forest

In [46]:
def obj_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'random_state': RANDOM_STATE,
        'n_jobs': -1
    }
    model = RandomForestClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()

print("Tuning Random Forest...")
study_rf = optimize(obj_rf)
print(f"Best F1: {study_rf.best_value:.4f}")

rf_model = RandomForestClassifier(**study_rf.best_params, random_state=RANDOM_STATE, n_jobs=-1)
start = time.time()
rf_model.fit(X_train, y_train)
train_time = time.time() - start

metrics_rf = evaluate_model(rf_model, X_test, y_test)
metrics_rf['train_time'] = train_time
ensemble_results['RandomForest'] = metrics_rf
trained_ensembles['RandomForest'] = rf_model
print_metrics(metrics_rf, 'RandomForest')
joblib.dump(rf_model, 'models/ensemble_randomforest.pkl')

Tuning Random Forest...
Best F1: 0.8086

RandomForest
accuracy       : 0.8175
precision      : 0.8428
recall         : 0.7733
f1             : 0.8066
roc_auc        : 0.9090
train_time     : 0.5580


['models/ensemble_randomforest.pkl']

## 3. XGBoost

In [47]:
def obj_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': RANDOM_STATE,
        'eval_metric': 'logloss'
    }
    model = XGBClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()

print("Tuning XGBoost...")
study_xgb = optimize(obj_xgb)
print(f"Best F1: {study_xgb.best_value:.4f}")

xgb_model = XGBClassifier(**study_xgb.best_params, random_state=RANDOM_STATE, eval_metric='logloss')
start = time.time()
xgb_model.fit(X_train, y_train)
train_time = time.time() - start

metrics_xgb = evaluate_model(xgb_model, X_test, y_test)
metrics_xgb['train_time'] = train_time
ensemble_results['XGBoost'] = metrics_xgb
trained_ensembles['XGBoost'] = xgb_model
print_metrics(metrics_xgb, 'XGBoost')
joblib.dump(xgb_model, 'models/ensemble_xgboost.pkl')

Tuning XGBoost...
Best F1: 0.8091

XGBoost
accuracy       : 0.8195
precision      : 0.8472
recall         : 0.7725
f1             : 0.8081
roc_auc        : 0.9105
train_time     : 0.3707


['models/ensemble_xgboost.pkl']

## 4. LightGBM

In [48]:
def obj_lgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': RANDOM_STATE,
        'verbose': -1
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()

print("Tuning LightGBM...")
study_lgbm = optimize(obj_lgbm)
print(f"Best F1: {study_lgbm.best_value:.4f}")

lgbm_model = LGBMClassifier(**study_lgbm.best_params, random_state=RANDOM_STATE, verbose=-1)
start = time.time()
lgbm_model.fit(X_train, y_train)
train_time = time.time() - start

metrics_lgbm = evaluate_model(lgbm_model, X_test, y_test)
metrics_lgbm['train_time'] = train_time
ensemble_results['LightGBM'] = metrics_lgbm
trained_ensembles['LightGBM'] = lgbm_model
print_metrics(metrics_lgbm, 'LightGBM')
joblib.dump(lgbm_model, 'models/ensemble_lightgbm.pkl')

Tuning LightGBM...
Best F1: 0.8096

LightGBM
accuracy       : 0.8189
precision      : 0.8436
recall         : 0.7758
f1             : 0.8083
roc_auc        : 0.9109
train_time     : 0.7616


['models/ensemble_lightgbm.pkl']

## 5. Voting Classifier (Soft)

In [49]:
# Define estimators with pipelines where needed
estimators = [
    ('lr', Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression(solver='saga', max_iter=1000, random_state=RANDOM_STATE))])),
    ('knn', Pipeline([('scaler', StandardScaler()), ('model', KNeighborsClassifier())])),
    ('nb', GaussianNB()),
    ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')),
    ('lgbm', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
]

voting_model = VotingClassifier(estimators=estimators, voting='soft')
start = time.time()
voting_model.fit(X_train, y_train)
train_time = time.time() - start

metrics_voting = evaluate_model(voting_model, X_test, y_test)
metrics_voting['train_time'] = train_time
ensemble_results['Voting'] = metrics_voting
print_metrics(metrics_voting, 'Voting (Soft)')
joblib.dump(voting_model, 'models/ensemble_voting.pkl')


Voting (Soft)
accuracy       : 0.8148
precision      : 0.8362
recall         : 0.7755
f1             : 0.8047
roc_auc        : 0.9038
train_time     : 1.9885


['models/ensemble_voting.pkl']

## 6. Stacking Classifier

In [None]:
estimators = [
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')),
    ('lgbm', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=5
)

start = time.time()
stacking_model.fit(X_train, y_train)
train_time = time.time() - start

metrics_stacking = evaluate_model(stacking_model, X_test, y_test)
metrics_stacking['train_time'] = train_time
ensemble_results['Stacking'] = metrics_stacking
print_metrics(metrics_stacking, 'Stacking')
joblib.dump(stacking_model, 'models/ensemble_stacking.pkl')


Stacking
accuracy       : 0.8199
precision      : 0.8431
recall         : 0.7788
f1             : 0.8097
roc_auc        : 0.9104
train_time     : 12.8586


['models/ensemble_stacking.pkl']

In [51]:
df_results = pd.DataFrame(ensemble_results).T
df_results.to_csv('outputs/ensemble_models_results.csv')
print("✅ Results saved to outputs/ensemble_models_results.csv")
print(df_results)

✅ Results saved to outputs/ensemble_models_results.csv
              accuracy  precision    recall        f1   roc_auc  train_time
RandomForest  0.817546   0.842815  0.773298  0.806561  0.908971    0.558048
XGBoost       0.819521   0.847156  0.772461  0.808086  0.910483    0.370746
LightGBM      0.818945   0.843551  0.775807  0.808262  0.910866    0.761565
Voting        0.814830   0.836190  0.775473  0.804688  0.903806    1.988515
Stacking      0.819933   0.843144  0.778819  0.809706  0.910417   12.858622
