In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils.preprocess_data import preprocess_data


In [30]:
# Load the already-cleaned table from disk; every later cell starts from `df`.
df = pd.read_csv("../Data/cleaned/preprocessed.csv")

In [31]:
# ---- Data, target and feature typing ----
SEED = 42  # single source of truth for the CV split and LightGBM randomness

train = df

TARGET_COL = "y"
# Copy so the dtype casts below never write back into `df`.
X = train.drop(columns=[TARGET_COL]).copy()
y = train[TARGET_COL].astype(int)

categorical_cols = ["job","marital","housing","loan","contact","poutcome",'responsiveness']

# Keep only the categorical columns that really exist in X. The same filtered
# list is handed to lgb.Dataset below: LightGBM rejects unknown names in
# `categorical_feature`, so casting and declaration must agree.
cat_features = [c for c in categorical_cols if c in X.columns]
missing_cats = [c for c in categorical_cols if c not in X.columns]
if missing_cats:
    print(f"Warning: categorical columns not found in X, skipped: {missing_cats}")

for c in cat_features:
    X[c] = X[c].astype("category")

# Ratio of negatives to positives, used to re-weight the positive class.
# max(pos, 1) guards against division by zero when there are no positives.
neg, pos = int((y == 0).sum()), int((y == 1).sum())
scale_pos_weight = neg / max(pos, 1)
print(f"Class balance -> pos: {pos}, neg: {neg}, scale_pos_weight: {scale_pos_weight:.2f}")

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 127,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 1,
    "max_bin": 255,
    "scale_pos_weight": scale_pos_weight,
    "verbosity": -1,
    "seed": SEED,
}

# ---- Stratified K-Fold CV ----
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

oof = np.zeros(len(train))  # out-of-fold probability for every training row
auc_scores = []             # per-fold validation AUC
models = []                 # one fitted booster per fold

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    dtrain = lgb.Dataset(
        X_tr,
        label=y_tr,
        categorical_feature=cat_features,
        free_raw_data=False,
    )
    # Tie the validation set to the training set explicitly so both share the
    # same feature binning, as the LightGBM Dataset docs recommend.
    dvalid = lgb.Dataset(
        X_va,
        label=y_va,
        reference=dtrain,
        categorical_feature=cat_features,
        free_raw_data=False,
    )

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[
            # Stop once the validation AUC has not improved for 200 rounds.
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=200)
        ],
    )

    # Score the held-out fold with the best iteration found by early stopping.
    pred_va = model.predict(X_va, num_iteration=model.best_iteration)
    oof[va_idx] = pred_va
    auc = roc_auc_score(y_va, pred_va)
    auc_scores.append(auc)
    models.append(model)
    print(f"Fold {fold}: AUC = {auc:.5f} | iters = {model.best_iteration}")

print(f"\nCV AUC: {np.mean(auc_scores):.5f} ± {np.std(auc_scores):.5f}")
# Pooled AUC over all out-of-fold predictions (every row scored exactly once).
print(f"OOF AUC: {roc_auc_score(y, oof):.5f}")

# ---- Feature importance ----
# Average gain / split counts over the fold models, most important first.
# NOTE(review): confirm that the top-ranked features are actually available at
# prediction time before relying on this ranking.
importances = pd.DataFrame({
    "feature": X.columns,
    "gain": np.mean([m.feature_importance(importance_type="gain") for m in models], axis=0),
    "split": np.mean([m.feature_importance(importance_type="split") for m in models], axis=0),
}).sort_values("gain", ascending=False)
display(importances.head(20))


Class balance -> pos: 90488, neg: 659512, scale_pos_weight: 7.29
[200]	train's auc: 0.963786	valid's auc: 0.959101
[400]	train's auc: 0.968183	valid's auc: 0.960042
[600]	train's auc: 0.971316	valid's auc: 0.960301
[800]	train's auc: 0.973788	valid's auc: 0.960336
[1000]	train's auc: 0.975962	valid's auc: 0.960318
Fold 1: AUC = 0.96039 | iters = 877
[200]	train's auc: 0.964005	valid's auc: 0.95812
[400]	train's auc: 0.968462	valid's auc: 0.959003
[600]	train's auc: 0.971582	valid's auc: 0.95912
[800]	train's auc: 0.974112	valid's auc: 0.959152
Fold 2: AUC = 0.95916 | iters = 783
[200]	train's auc: 0.963918	valid's auc: 0.958426
[400]	train's auc: 0.96821	valid's auc: 0.959322
[600]	train's auc: 0.971362	valid's auc: 0.959593
[800]	train's auc: 0.973912	valid's auc: 0.959535
Fold 3: AUC = 0.95962 | iters = 639
[200]	train's auc: 0.963624	valid's auc: 0.959419
[400]	train's auc: 0.968024	valid's auc: 0.960222
[600]	train's auc: 0.971228	valid's auc: 0.960577
[800]	train's auc: 0.973776	v

Unnamed: 0,feature,gain,split
7,duration,5226026.0,23642.2
12,duration_log,875366.1,3070.8
11,log_balance,580485.1,21191.0
6,contact,410928.9,2274.0
4,housing,346100.0,2350.2
0,age,330968.1,15056.0
14,prev_campaign_engaged,327190.6,370.2
1,job,181623.7,6274.6
9,pdays,173852.4,4941.4
8,campaign,158006.5,5711.6


In [32]:
import joblib

# Persist one explicitly chosen booster. The leftover loop variable `model` is
# simply whichever fold happened to run last, so saving it depends on hidden
# kernel state. Here the fold with the highest validation AUC is selected from
# the `models` / `auc_scores` lists built by the CV cell.
best_fold = int(np.argmax(auc_scores))
print(f"Saving fold {best_fold + 1} model (AUC = {auc_scores[best_fold]:.5f})")

# Still a single Booster on disk, so joblib.load(...).predict(...) keeps working.
joblib.dump(models[best_fold], "Bank_subscription.pkl")

['Bank_subscription.pkl']

In [33]:
# Read the raw test file and run it through the project's `preprocess_data` helper.
# NOTE(review): despite its name, `raw_train` holds ../Data/raw/test.csv.
raw_train = pd.read_csv("../Data/raw/test.csv")
X_test = preprocess_data(raw_train)

categorical_cols = [
    "job",
    "marital",
    "housing",
    "loan",
    "contact",
    "poutcome",
    "responsiveness",
]

# Cast each listed column that exists in the test frame to pandas `category` dtype.
present_cols = [name for name in categorical_cols if name in X_test.columns]
for name in present_cols:
    X_test[name] = X_test[name].astype("category")

In [34]:
# Restore the booster that an earlier cell wrote to disk.
# joblib.load unpickles the file, so only load artifacts you produced yourself.
loaded_model = joblib.load(filename="Bank_subscription.pkl")

# Score the preprocessed test frame with the restored booster.
preds = loaded_model.predict(data=X_test)
