In [8]:
import sys
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import warnings
# Blanket filter: suppresses warnings of every category for the rest of the session.
# NOTE(review): this also hides library warnings raised by later cells
# (e.g. during model fitting) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [9]:
# Restore the five objects stored in "preprocessed.joblib": train/test feature
# sets, train/test labels and a scaler object.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artefacts that come from a trusted source.
X_train, X_test, y_train, y_test, scaler = joblib.load("preprocessed.joblib")

# Coerce everything to NumPy arrays; the label vectors are additionally cast to int.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = (np.array(labels).astype(int) for labels in (y_train, y_test))


In [10]:
try:
    import lightgbm as lgb
except:
    !{sys.executable} -m pip install lightgbm --no-warn-script-location
    import lightgbm as lgb
try:
    from imblearn.over_sampling import SMOTE
except:
    !{sys.executable} -m pip install imbalanced-learn --no-warn-script-location
    from imblearn.over_sampling import SMOTE


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.8 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0




Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0


In [11]:
def evaluate_model_probs(y_true, y_probs):
    """Score positive-class probabilities and report metrics at the F1-optimal threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_probs : array-like
        Predicted scores/probabilities for the positive class. Any array-like
        (list, pandas Series, ndarray) is accepted.

    Returns
    -------
    dict
        ``auc``              ROC AUC of ``y_probs`` against ``y_true``.
        ``best_threshold``   Threshold on the precision-recall curve with the highest F1
                             (0.5 if the best index has no associated threshold).
        ``best_f1``          F1 value at that point of the curve.
        ``report``           ``classification_report`` text for predictions at the threshold.
        ``confusion_matrix`` ``confusion_matrix`` for the same predictions.

    Notes
    -----
    The threshold is selected on the very labels passed in, so the reported F1
    is optimistic when ``y_true`` is a held-out test set.
    """
    # Coerce once so the element-wise comparison below also works for plain
    # Python lists (``list >= float`` raises TypeError).
    y_probs = np.asarray(y_probs)

    auc = roc_auc_score(y_true, y_probs)
    prec, rec, thr = precision_recall_curve(y_true, y_probs)

    # Small epsilon keeps the ratio finite where precision + recall == 0.
    f1s = 2 * (prec * rec) / (prec + rec + 1e-12)
    best_idx = np.nanargmax(f1s)

    # The curve has one more (precision, recall) point than thresholds; the
    # final point has no threshold, so fall back to 0.5 if it is selected.
    best_th = thr[best_idx] if best_idx < len(thr) else 0.5
    best_f1 = f1s[best_idx]

    preds = (y_probs >= best_th).astype(int)
    report = classification_report(y_true, preds, digits=4)
    cm = confusion_matrix(y_true, preds)
    return {
        "auc": auc,
        "best_threshold": float(best_th),
        "best_f1": float(best_f1),
        "report": report,
        "confusion_matrix": cm,
    }


In [12]:
# Sanity check: matrix shapes, then positive / negative label counts per split.
print("Train size:", X_train.shape, "Test size:", X_test.shape)
for caption, labels in (("Train pos/neg:", y_train), ("Test pos/neg:", y_test)):
    print(caption, int(labels.sum()), int((labels == 0).sum()))


Train size: (312931, 31) Test size: (78233, 31)
Train pos/neg: 63059 249872
Test pos/neg: 15765 62468


In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (fixed seed for reproducibility);
# the recorded output shows equal positive and negative counts afterwards.
# NOTE(review): X_res / y_res are later passed to RandomizedSearchCV with cv=3.
# Because resampling happens before the folds are drawn, synthetic points
# derived from a validation-fold sample can land in the training folds, so the
# CV scores are likely optimistic. Running SMOTE inside an imblearn Pipeline
# would confine it to each training fold.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

print("After SMOTE:", X_res.shape)
print("Pos:", int(np.sum(y_res)), "Neg:", int(np.sum(y_res == 0)))


After SMOTE: (499744, 31)
Pos: 249872 Neg: 249872


In [16]:
# Untuned base estimators; each one is handed to its own RandomizedSearchCV below.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=2000,
    n_jobs=-1,
)
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
lgbm = lgb.LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
)


In [None]:
# Random-forest search space: 12 random draws, scored by ROC AUC with 3-fold CV
# on the SMOTE-resampled training data.
param_dist_rf = dict(
    n_estimators=[200, 400, 800],
    max_depth=[6, 10, 16, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rs_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=12,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
# fit() returns the search object itself, so the refitted winner can be read off directly.
best_rf = rs_rf.fit(X_res, y_res).best_estimator_
print("Best RF params:", rs_rf.best_params_)


In [None]:
# LightGBM search space: 12 random draws, scored by ROC AUC with 3-fold CV
# on the SMOTE-resampled training data.
param_dist_lgb = dict(
    num_leaves=[31, 63, 127],
    n_estimators=[200, 400, 800],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    min_child_samples=[5, 10, 20, 50],
)

rs_lgb = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist_lgb,
    n_iter=12,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
# fit() returns the search object itself, so the refitted winner can be read off directly.
best_lgb = rs_lgb.fit(X_res, y_res).best_estimator_
print("Best LGB params:", rs_lgb.best_params_)


In [None]:
# Logistic-regression search: only the regularisation strength C varies, with
# the penalty and solver pinned to l2 / lbfgs.
# This cell previously appeared twice, byte-for-byte, which re-ran the whole
# search for no benefit; it is kept once.
param_dist_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}

# The space holds exactly len(C) combinations. Asking for more draws than that
# (the original n_iter=6) only makes scikit-learn warn and fall back to the
# full grid, so request the grid size directly.
n_lr_candidates = len(param_dist_lr["C"])

rs_lr = RandomizedSearchCV(
    lr,
    param_dist_lr,
    n_iter=n_lr_candidates,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
rs_lr.fit(X_res, y_res)
best_lr = rs_lr.best_estimator_
print("Best LR params:", rs_lr.best_params_)


In [None]:
# Score every tuned base model on the held-out test split.
# NOTE(review): evaluate_model_probs picks its threshold on the labels it is
# given, so best_f1 / best_th here are tuned on the test set itself.
models = {"lgb": best_lgb, "rf": best_rf, "lr": best_lr}
for model_name in models:
    estimator = models[model_name]
    probs = estimator.predict_proba(X_test)[:, 1]
    res = evaluate_model_probs(y_test, probs)
    print(
        model_name,
        "AUC:", res["auc"],
        "best_f1:", res["best_f1"],
        "best_th:", res["best_threshold"],
    )


In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegressionCV

# --- Meta-features for EVALUATION: test-set probabilities of the tuned base models.
l_probs = best_lgb.predict_proba(X_test)[:, 1]
r_probs = best_rf.predict_proba(X_test)[:, 1]
lr_probs = best_lr.predict_proba(X_test)[:, 1]
stack_input = np.vstack([l_probs, r_probs, lr_probs]).T

# --- Meta-features for TRAINING: out-of-fold probabilities on the training split.
# The stacker used to be fitted on (stack_input, y_test) and then scored on the
# same rows, so its reported metrics were computed on data it had been trained on.
# Here each base model is cloned by cross_val_predict and refitted per fold,
# with SMOTE applied to the training part of the fold only. This mirrors how the
# final base models were built (SMOTE on train, prediction on untouched data),
# and the already-fitted best_* estimators are left unmodified.
# Column order matches stack_input: lgb, rf, lr.
# NOTE: this refits every base model once per fold and is therefore slow.
oof_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stack_train = np.column_stack([
    cross_val_predict(
        ImbPipeline([("smote", SMOTE(random_state=42)), ("model", base_model)]),
        X_train,
        y_train,
        cv=oof_cv,
        method="predict_proba",
    )[:, 1]
    for base_model in (best_lgb, best_rf, best_lr)
])

# Fit the meta-learner on training data only; the test split is used solely for scoring.
stacker = LogisticRegressionCV(cv=5, max_iter=2000, scoring='roc_auc', n_jobs=-1, class_weight='balanced')
stacker.fit(stack_train, y_train)
stack_probs = stacker.predict_proba(stack_input)[:, 1]

# NOTE(review): evaluate_model_probs still selects the decision threshold on
# y_test, so best_f1 / best_threshold remain somewhat optimistic.
res_stack = evaluate_model_probs(y_test, stack_probs)
print("Stack AUC:", res_stack["auc"], "Stack best_f1:", res_stack["best_f1"], "th:", res_stack["best_threshold"])
print(res_stack["report"])
print("Confusion matrix:")
print(res_stack["confusion_matrix"])


In [None]:
import joblib, os
# Persist the tuned base models, the stacker, the scaler and a
# (auc, best_f1, best_threshold) summary tuple under ./models.
os.makedirs("models", exist_ok=True)

metrics_summary = (res_stack["auc"], res_stack["best_f1"], res_stack["best_threshold"])
for destination, artifact in (
    ("models/best_lgb.joblib", best_lgb),
    ("models/best_rf.joblib", best_rf),
    ("models/best_lr.joblib", best_lr),
    ("models/stacker.joblib", stacker),
    ("models/scaler.joblib", scaler),
    ("models/metrics_summary.joblib", metrics_summary),
):
    joblib.dump(artifact, destination)

print("Saved models and metrics.")
