## Cell 1 : Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Cell 2 : Load cleaned 15-variable dataset

In [2]:
df = pd.read_csv("../data/lung_15_variable_cleaned.csv")

TARGET = "lung_cancer"
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Dataset:", X.shape, "| Positives:", y.sum())


Dataset: (154887, 26) | Positives: 3723


## Cell 3 : 70 / 15 / 15 split

In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)


Train: (108420, 26)
Val:   (23233, 26)
Test:  (23234, 26)


## Cell 4 : Imbalance ratio (for XGBoost & CatBoost)

In [4]:
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos
print("scale_pos_weight =", scale_pos_weight)

class_weight_pos = (len(y_train) - y_train.sum()) / y_train.sum()
print("CatBoost positive class weight:", class_weight_pos)


scale_pos_weight = 40.6039907904835
CatBoost positive class weight: 40.6039907904835


## Cell 5 : Define base models

In [5]:
# 4.1 Logistic Regression
base_lr = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    solver="lbfgs"
)

# 4.2 Tuned XGBoost
base_xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=2,
    subsample=0.6,
    colsample_bytree=0.8,
    gamma=0.3,
    reg_lambda=0.5,
    reg_alpha=0,
    scale_pos_weight=scale_pos_weight,
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42
)

# 4.3 Tuned LightGBM
base_lgb = LGBMClassifier(
    n_estimators=700,
    learning_rate=0.02,
    num_leaves=31,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    reg_lambda=1.0,
    reg_alpha=0.0,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# 4.4 Random Forest
base_rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42
)

# 4.5 Gradient Boosting
base_gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    min_samples_split=20,
    subsample=0.8,
    random_state=42
)

# 4.6 CatBoost (quiet mode, fewer iterations because weâ€™re stacking)
base_cat = CatBoostClassifier(
    iterations=400,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[1.0, class_weight_pos],
    random_state=42,
    task_type="CPU",   # change to "GPU" if you have it
    verbose=False
)


## Cell 6 : Define and fit super stacking model

In [6]:
meta_lr = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    solver="lbfgs"
)

super_stack = StackingClassifier(
    estimators=[
        ("lr", base_lr),
        ("xgb", base_xgb),
        ("lgb", base_lgb),
        ("rf", base_rf),
        ("gb", base_gb),
        ("cat", base_cat),
    ],
    final_estimator=meta_lr,
    stack_method="predict_proba",
    n_jobs=-1,
    passthrough=False
)

print("\nFitting super-stacking ensemble...")
super_stack.fit(X_train, y_train)



Fitting super-stacking ensemble...


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 2606, number of negative: 105814
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 657
[LightGBM] [Info] Number of data points in the train set: 108420, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 2085, number of negative: 84651
[LightGBM] [Info] Number of positive: 2084, number of negative: 84652
[LightGBM] [Info] Number of positive: 2085, number of negative: 84651
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 651
[LightGBM] [Info] Number of data points in the train set: 86736, number of used features: 26
[LightGBM] [Info] Number of positive: 2085, number of negative: 84651
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

## Cell 7 : Get probabilities on val/test

In [7]:
val_prob = super_stack.predict_proba(X_val)[:, 1]
test_prob = super_stack.predict_proba(X_test)[:, 1]


## Cell 8 : Threshold search on validation (maximize F1)

In [8]:
from sklearn.metrics import f1_score

thresholds = np.linspace(0.01, 0.30, 300)
best_thr = 0.5
best_f1 = 0

for thr in thresholds:
    preds = (val_prob >= thr).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("\nBest Threshold (Super Stacking):", best_thr)
print("Best F1 (Validation):", best_f1)



Best Threshold (Super Stacking): 0.29903010033444816
Best F1 (Validation): 0.10115606936416185


## Cell 9 : Apply best threshold

In [9]:
val_pred = (val_prob >= best_thr).astype(int)
test_pred = (test_prob >= best_thr).astype(int)


## Cell 10 : Evaluation helper

In [10]:
def evaluate(name, y_true, y_pred, prob):
    print(f"\n{name}")
    print("AUC-ROC:", roc_auc_score(y_true, prob))
    print("AUC-PR :", average_precision_score(y_true, prob))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall  :", recall_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred))


## Cell 11 : Final results

In [11]:
evaluate("SUPER STACKING - VALIDATION", y_val, val_pred, val_prob)
evaluate("SUPER STACKING - TEST", y_test, test_pred, test_prob)



SUPER STACKING - VALIDATION
AUC-ROC: 0.836502037122658
AUC-PR : 0.1294874067047533
Accuracy: 0.6251883097318469
Precision: 0.05366922234392114
Recall  : 0.8781362007168458
F1      : 0.10115606936416185

SUPER STACKING - TEST
AUC-ROC: 0.8400655604491404
AUC-PR : 0.11137060327583898
Accuracy: 0.6258500473444091
Precision: 0.05415479061609296
Recall  : 0.8837209302325582
F1      : 0.10205557277140791
