## Cell 1 : Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score,
    recall_score, f1_score
)
from lightgbm import LGBMClassifier
import numpy as np


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Cell 2 : Load cleaned data

In [2]:
df = pd.read_csv("../data/lung_15_variable_cleaned.csv")

TARGET = "lung_cancer"
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("Dataset:", X.shape, "| Positives:", y.sum())


Dataset: (154887, 26) | Positives: 3723


## Cell 3 : 70/15/15 split

In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)


Train: (108420, 26)
Val:   (23233, 26)
Test:  (23234, 26)


## Cell 4 : Imbalance handling (scale_pos_weight)

In [4]:
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos
print("scale_pos_weight =", scale_pos_weight)


scale_pos_weight = 40.6039907904835


## Cell 5 : Define LightGBM model

In [5]:
lgb = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.02,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    class_weight="balanced",     # handles imbalance
    objective="binary"
)


## Cell 6 : Train LightGBM

In [6]:
lgb.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 2606, number of negative: 105814
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 657
[LightGBM] [Info] Number of data points in the train set: 108420, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


## Cell 7 : Predictions (probs + 0.5 threshold)

In [7]:
val_prob = lgb.predict_proba(X_val)[:, 1]
test_prob = lgb.predict_proba(X_test)[:, 1]

thr = 0.5
val_pred = (val_prob >= thr).astype(int)
test_pred = (test_prob >= thr).astype(int)


## Cell 8 : Evaluation function

In [8]:
def evaluate(name, y_true, y_pred, prob):
    print(f"\n{name}")
    print("AUC-ROC :", roc_auc_score(y_true, prob))
    print("AUC-PR  :", average_precision_score(y_true, prob))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall  :", recall_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred))


## Cell 9 : Print final results

In [9]:
evaluate("LIGHTGBM - VALIDATION", y_val, val_pred, val_prob)
evaluate("LIGHTGBM - TEST", y_test, test_pred, test_prob)



LIGHTGBM - VALIDATION
AUC-ROC : 0.825718801990097
AUC-PR  : 0.11512896032325126
Accuracy: 0.7866827357637842
Precision: 0.07954110898661568
Recall  : 0.7455197132616488
F1      : 0.1437456807187284

LIGHTGBM - TEST
AUC-ROC : 0.8245525854366653
AUC-PR  : 0.10659671883209003
Accuracy: 0.7820865972282001
Precision: 0.07669172932330827
Recall  : 0.7298747763864043
F1      : 0.13879911549583263
