## Cell 1 : Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)
from xgboost import XGBClassifier
import numpy as np


## Cell 2 : Load cleaned dataset

In [2]:
df = pd.read_csv("../data/lung_15_variable_cleaned.csv")

TARGET = "lung_cancer"
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Dataset:", X.shape, "| Positives:", y.sum())


Dataset: (154887, 26) | Positives: 3723


## Cell 3 : Train / Val / Test split

In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)


Train: (108420, 26)
Val:   (23233, 26)
Test:  (23234, 26)


## Cell 4 : Imbalance handling (scale_pos_weight)

In [4]:
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos
print("scale_pos_weight =", scale_pos_weight)


scale_pos_weight = 40.6039907904835


## Cell 5 : Base XGBoost model

In [5]:
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",
    n_jobs=-1
)


## Cell 6 : Hyperparameter search space

In [6]:
param_grid = {
    "n_estimators": [300, 500, 700, 900, 1200],
    "learning_rate": [0.005, 0.01, 0.02, 0.03],
    "max_depth": [3, 4, 5, 6],
    "min_child_weight": [1, 2, 3],
    "subsample": [0.6, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.3],
    "reg_alpha": [0, 0.1, 0.2],
    "reg_lambda": [0.5, 1.0, 1.5, 2.0],
}


## Cell 7 : RandomizedSearchCV tuner

In [7]:
tuner = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    scoring="roc_auc",
    n_iter=30,           # Use 50 for even better tuning (slower)
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)


## Cell 8 : Train tuner and get best model

In [8]:
tuner.fit(X_train, y_train)

print("\nBest Parameters:")
print(tuner.best_params_)

# Final tuned model
best_xgb = tuner.best_estimator_


Fitting 3 folds for each of 30 candidates, totalling 90 fits

Best Parameters:
{'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.3, 'colsample_bytree': 0.8}


## Cell 9 : Predictions (probabilities + 0.5 threshold)

In [9]:
val_prob = best_xgb.predict_proba(X_val)[:, 1]
test_prob = best_xgb.predict_proba(X_test)[:, 1]

thr = 0.5
val_pred = (val_prob >= thr).astype(int)
test_pred = (test_prob >= thr).astype(int)


## Cell 10 : Evaluation function

In [10]:
def evaluate(name, y_true, pred, prob):
    print(f"\n{name}")
    print("AUC-ROC :", roc_auc_score(y_true, prob))
    print("AUC-PR  :", average_precision_score(y_true, prob))
    print("Recall  :", recall_score(y_true, pred))
    print("Precision:", precision_score(y_true, pred))
    print("F1      :", f1_score(y_true, pred))
    print("Accuracy:", accuracy_score(y_true, pred))


## Cell 11 : Print scores

In [11]:
evaluate("XGBOOST (TUNED) - VALIDATION", y_val, val_pred, val_prob)
evaluate("XGBOOST (TUNED) - TEST", y_test, test_pred, test_prob)



XGBOOST (TUNED) - VALIDATION
AUC-ROC : 0.8344114869217121
AUC-PR  : 0.12006145985494564
Recall  : 0.7992831541218638
Precision: 0.07304290861447757
F1      : 0.13385354141656663
Accuracy: 0.7515602806353032

XGBOOST (TUNED) - TEST
AUC-ROC : 0.8367278945510273
AUC-PR  : 0.11322140422942945
Recall  : 0.7835420393559929
Precision: 0.07118478790833739
F1      : 0.13051251489868892
Accuracy: 0.7488163897736076
