## Cell 1 : Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)


## Cell 2 : Load cleaned dataset

In [2]:
# Name of the label column; every other column is used as a feature.
TARGET = "lung_cancer"

# Read the cleaned table and separate the feature matrix from the label vector.
df = pd.read_csv("../data/lung_15_variable_cleaned.csv")
X, y = df.drop(columns=[TARGET]), df[TARGET]

# y.sum() is reported as the positive count (assumes a 0/1-encoded label -- TODO confirm).
print("Dataset:", X.shape, "| Positives:", y.sum())


Dataset: (154887, 26) | Positives: 3723


## Cell 3 : Train / Val / Test split (70/15/15)

In [3]:
# Stage 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the held-out 30% in half -> validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# Report the shape of each partition.
for split_label, split_frame in (("Train:", X_train), ("Val:", X_val), ("Test:", X_test)):
    print(split_label, split_frame.shape)


Train: (108420, 26)
Val: (23233, 26)
Test: (23234, 26)


## Cell 4 : Define and train Random Forest

In [4]:
# Hyper-parameters gathered in one mapping so they can be read and tuned at a glance.
rf_params = {
    "n_estimators": 800,
    "max_depth": None,  # no explicit depth cap
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "class_weight": "balanced_subsample",  # class re-weighting option of the estimator
    "n_jobs": -1,
    "random_state": 42,  # fixed seed for reproducible fits
}
rf = RandomForestClassifier(**rf_params)

# Fit on the training partition only; validation and test rows stay unseen.
rf.fit(X_train, y_train)


## Cell 5 : Get probabilities

In [5]:
# Column 1 of predict_proba is taken as the positive-class score
# (assumes the positive label is the second entry of rf.classes_ -- TODO confirm).
val_prob, test_prob = (
    rf.predict_proba(features)[:, 1] for features in (X_val, X_test)
)


## Cell 6 : Threshold optimization

In [6]:
# Sweep candidate decision thresholds on the validation scores and keep the
# one with the highest F1. f1_score is already imported in the imports cell
# at the top of the notebook, so it is not re-imported here.
THR_MIN, THR_MAX, THR_STEPS = 0.0, 0.25, 300  # bounds and resolution of the search grid
thresholds = np.linspace(THR_MIN, THR_MAX, THR_STEPS)

best_f1 = 0
best_thr = 0

for thr in thresholds:
    preds = (val_prob >= thr).astype(int)
    f1 = f1_score(y_val, preds)
    # Strict ">" keeps the lowest threshold when several candidates tie on F1.
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("\nBest Threshold:", best_thr)
print("Best F1:", best_f1)

# The grid is capped at THR_MAX; if the winner sits on that edge, the true
# optimum may lie beyond the searched range.
if best_thr == thresholds[-1]:
    print("NOTE: best threshold is at the upper edge of the search grid; consider widening it.")



Best Threshold: 0.18729096989966554
Best F1: 0.14970819588936818


## Cell 7 : Apply best threshold

In [7]:
# Turn scores into hard 0/1 predictions using the threshold chosen on validation.
val_pred, test_pred = (
    (scores >= best_thr).astype(int) for scores in (val_prob, test_prob)
)


## Cell 8 : Evaluation function

In [8]:
def evaluate(name, y_true, y_pred, prob):
    """Print a metric report for one data split.

    Parameters
    ----------
    name : str
        Header printed (after a blank line) above the metrics.
    y_true
        Ground-truth labels.
    y_pred
        Hard predictions; used for accuracy, precision, recall and F1.
    prob
        Scores; used for AUC-ROC and average precision (AUC-PR).

    Returns
    -------
    None
        The metrics are only printed.
    """
    print(f"\n{name}")

    # (label, metric function, second argument) -- evaluated and printed in order.
    report_rows = (
        ("AUC-ROC:", roc_auc_score, prob),
        ("AUC-PR :", average_precision_score, prob),
        ("Accuracy:", accuracy_score, y_pred),
        ("Precision:", precision_score, y_pred),
        ("Recall  :", recall_score, y_pred),
        ("F1      :", f1_score, y_pred),
    )
    for label, metric_fn, estimate in report_rows:
        print(label, metric_fn(y_true, estimate))


## Cell 9 : Final results

In [9]:
# Report validation first, then test, using the same thresholded predictions.
for split_title, truth, hard_pred, score in (
    ("RANDOM FOREST (BEST THRESHOLD) - VALIDATION", y_val, val_pred, val_prob),
    ("RANDOM FOREST (BEST THRESHOLD) - TEST", y_test, test_pred, test_prob),
):
    evaluate(split_title, truth, hard_pred, score)



RANDOM FOREST (BEST THRESHOLD) - VALIDATION
AUC-ROC: 0.7894851276214865
AUC-PR : 0.0739966696821295
Accuracy: 0.855765505961348
Precision: 0.08720070942950044
Recall  : 0.5286738351254481
F1      : 0.14970819588936818

RANDOM FOREST (BEST THRESHOLD) - TEST
AUC-ROC: 0.7791028632401931
AUC-PR : 0.06709496872148199
Accuracy: 0.851811999655677
Precision: 0.08130081300813008
Recall  : 0.5008944543828264
F1      : 0.13989507869098175
