In [3]:
# ====================================================================
# GRADIENT BOOSTING CLASSIFIER (GBM) — FIXED WITH BEST THRESHOLD
# ====================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier


# ---------------------------------------------------------
# 1. Load cleaned dataset
# ---------------------------------------------------------
df = pd.read_csv("lung_15_variable_cleaned.csv")

TARGET = "lung_cancer"
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Dataset:", X.shape, "| Positives:", y.sum())


# ---------------------------------------------------------
# 2. Train / Val / Test Split (70 / 15 / 15)
# ---------------------------------------------------------
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)


# ---------------------------------------------------------
# 3. Slight Oversampling (because GBM has no class_weight)
# ---------------------------------------------------------
pos_idx = y_train[y_train == 1].index
extra_pos_samples = np.random.choice(pos_idx, int(0.5 * len(pos_idx)), replace=True)

X_train_bal = pd.concat([X_train, X_train.loc[extra_pos_samples]])
y_train_bal = pd.concat([y_train, y_train.loc[extra_pos_samples]])

print("Balanced Train:", X_train_bal.shape, "| Positives:", y_train_bal.sum())


# ---------------------------------------------------------
# 4. Gradient Boosting Model
# ---------------------------------------------------------
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    min_samples_split=20,
    subsample=0.8
)

gb.fit(X_train_bal, y_train_bal)


# ---------------------------------------------------------
# 5. Get probabilities
# ---------------------------------------------------------
val_prob = gb.predict_proba(X_val)[:, 1]
test_prob = gb.predict_proba(X_test)[:, 1]


# ---------------------------------------------------------
# 6. Find best threshold (fixes 0 recall problem)
# ---------------------------------------------------------
thresholds = np.linspace(0, 0.20, 300)
best_f1 = 0
best_thr = 0

for thr in thresholds:
    preds = (val_prob >= thr).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("\nBest threshold:", best_thr)
print("Best F1:", best_f1)


# ---------------------------------------------------------
# 7. Apply best threshold
# ---------------------------------------------------------
val_pred = (val_prob >= best_thr).astype(int)
test_pred = (test_prob >= best_thr).astype(int)


# ---------------------------------------------------------
# 8. Evaluation Function
# ---------------------------------------------------------
def evaluate(name, y_true, y_pred, prob):
    print(f"\n{name}")
    print("AUC-ROC :", roc_auc_score(y_true, prob))
    print("AUC-PR  :", average_precision_score(y_true, prob))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall  :", recall_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred))


# ---------------------------------------------------------
# 9. Final Results (after threshold tuning)
# ---------------------------------------------------------
evaluate("GBM (OPTIMAL THRESHOLD) - VALIDATION", y_val, val_pred, val_prob)
evaluate("GBM (OPTIMAL THRESHOLD) - TEST", y_test, test_pred, test_prob)


Dataset: (154887, 26) | Positives: 3723
Train: (108420, 26)
Val: (23233, 26)
Test: (23234, 26)
Balanced Train: (109723, 26) | Positives: 3909

Best threshold: 0.1899665551839465
Best F1: 0.2108843537414966

GBM (OPTIMAL THRESHOLD) - VALIDATION
AUC-ROC : 0.8330965845099643
AUC-PR  : 0.12177911467175119
Accuracy: 0.9600568157362372
Precision: 0.20064724919093851
Recall  : 0.2222222222222222
F1      : 0.2108843537414966

GBM (OPTIMAL THRESHOLD) - TEST
AUC-ROC : 0.8365630072601689
AUC-PR  : 0.10665568684519991
Accuracy: 0.9567874666437118
Precision: 0.1473851030110935
Recall  : 0.16636851520572452
F1      : 0.15630252100840336


## Cell 1 : Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier


## Cell 2 : Load cleaned dataset

In [3]:
df = pd.read_csv("../data/lung_15_variable_cleaned.csv")

TARGET = "lung_cancer"
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Dataset:", X.shape, "| Positives:", y.sum())


Dataset: (154887, 26) | Positives: 3723


## Cell 3 : Train / Val / Test split (70 / 15 / 15)

In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)


Train: (108420, 26)
Val: (23233, 26)
Test: (23234, 26)


## Cell 4 : Slight oversampling (GBM has no class_weight)

In [5]:
pos_idx = y_train[y_train == 1].index
extra_pos_samples = np.random.choice(pos_idx, int(0.5 * len(pos_idx)), replace=True)

X_train_bal = pd.concat([X_train, X_train.loc[extra_pos_samples]])
y_train_bal = pd.concat([y_train, y_train.loc[extra_pos_samples]])

print("Balanced Train:", X_train_bal.shape, "| Positives:", y_train_bal.sum())


Balanced Train: (109723, 26) | Positives: 3909


## Cell 5 : Define and fit Gradient Boosting model

In [6]:
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    min_samples_split=20,
    subsample=0.8
)

gb.fit(X_train_bal, y_train_bal)


## Cell 6 : Get prediction probabilities

In [7]:
val_prob = gb.predict_proba(X_val)[:, 1]
test_prob = gb.predict_proba(X_test)[:, 1]


## Cell 7 : Find best threshold (fixes 0 recall problem)

In [8]:
thresholds = np.linspace(0, 0.20, 300)
best_f1 = 0
best_thr = 0

for thr in thresholds:
    preds = (val_prob >= thr).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("\nBest threshold:", best_thr)
print("Best F1:", best_f1)



Best threshold: 0.148494983277592
Best F1: 0.2112094395280236


## Cell 8 : Apply best threshold

In [9]:
val_pred = (val_prob >= best_thr).astype(int)
test_pred = (test_prob >= best_thr).astype(int)


## Cell 9 : Evaluation function

In [10]:
def evaluate(name, y_true, y_pred, prob):
    print(f"\n{name}")
    print("AUC-ROC :", roc_auc_score(y_true, prob))
    print("AUC-PR  :", average_precision_score(y_true, prob))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall  :", recall_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred))


## Cell 10 : Final results (after threshold tuning)

In [11]:
evaluate("GBM (OPTIMAL THRESHOLD) - VALIDATION", y_val, val_pred, val_prob)
evaluate("GBM (OPTIMAL THRESHOLD) - TEST", y_test, test_pred, test_prob)



GBM (OPTIMAL THRESHOLD) - VALIDATION
AUC-ROC : 0.8331378406894998
AUC-PR  : 0.122419802645025
Accuracy: 0.9424525459475745
Precision: 0.15743183817062445
Recall  : 0.3207885304659498
F1      : 0.2112094395280236

GBM (OPTIMAL THRESHOLD) - TEST
AUC-ROC : 0.8366088443491586
AUC-PR  : 0.10809701184789258
Accuracy: 0.9375053800464836
Precision: 0.1300745650372825
Recall  : 0.2808586762075134
F1      : 0.17780294450736125
