In [None]:
#Import the necessary libraries for hyperparameter tuning
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [None]:
# Import necessary libraries for model evaluation and saving
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    RocCurveDisplay, PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [None]:
# Load the dataset
# The CSV location is taken from the LUNG_CANCER_CSV environment variable when
# it is set, so the notebook is not tied to a single machine; otherwise the
# original local path is used as the fallback.
DATA_PATH = Path(os.environ.get("LUNG_CANCER_CSV", r"C:\Users\abdul\Documents\Lung Cancer.csv"))
if not DATA_PATH.is_file():
    # Fail early with a message that tells the reader how to point at the file.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. "
        "Set the LUNG_CANCER_CSV environment variable to the CSV location."
    )
df = pd.read_csv(DATA_PATH)
# Display the first few rows of the dataset
df.head()

In [None]:
# Remove identifier / leakage columns so they cannot be used as predictors.
# Only the candidates actually present in the frame are dropped.
leak_candidates = ("id", "end_treatment_date")
drop_cols = [name for name in leak_candidates if name in df.columns]
df = df.drop(columns=drop_cols)

In [None]:
# Derive calendar features from the diagnosis date when that column exists.
DATE_COL = "diagnosis_date"
if DATE_COL in df.columns:
    # Unparseable values are coerced to NaT rather than raising.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")

    # Year and month are known at diagnosis time, so they are safe features.
    diagnosis_dt = df[DATE_COL].dt
    df["diagnosis_year"] = diagnosis_dt.year
    df["diagnosis_month"] = diagnosis_dt.month

    # The raw date is removed afterwards to avoid a high-cardinality column.
    df = df.drop(columns=[DATE_COL])

In [19]:
# -----------------------------
# 2) Target / features
# -----------------------------
TARGET = "survived"  # 1 = survived, 0 = not
assert TARGET in df.columns, f"'{TARGET}' not found in columns: {df.columns.tolist()}"

# Separate the integer-encoded label from the predictor columns.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# Split predictors by dtype: numeric columns versus everything else.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

In [None]:
# -----------------------------
# 3) Train/validation split
# -----------------------------
# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Class-imbalance weight (negatives / positives), computed on the training
# labels only. The denominator is floored at 1 to avoid dividing by zero.
neg = y_train.eq(0).sum()
pos = y_train.eq(1).sum()
scale_pos_weight = neg / max(pos, 1)

In [None]:
# -----------------------------
# 4) Preprocess + Model pipeline
# -----------------------------
# Numeric columns: fill missing values with the column median.
numeric_pre = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pre = Pipeline(categorical_steps)

# Route each column group to its sub-pipeline; unlisted columns are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pre, numeric_cols),
        ("cat", categorical_pre, categorical_cols),
    ],
    remainder="drop",
)

# XGBoost hyperparameters, collected in one place for readability.
xgb_params = {
    "n_estimators": 600,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
    # Class-imbalance weight computed from the training labels.
    "scale_pos_weight": scale_pos_weight,
}
model = XGBClassifier(**xgb_params)

# Full pipeline: preprocessing followed by the classifier.
pipe = Pipeline([("prep", preprocessor), ("xgb", model)])

# -----------------------------
# 5) Train
# -----------------------------
pipe.fit(X_train, y_train)

# -----------------------------
# 6) Evaluate
# -----------------------------
# Hard labels for the threshold metrics, positive-class probability for ROC AUC.
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

# Metrics computed from the hard predictions, printed in a fixed order.
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(metric_label, round(metric_fn(y_test, pred), 4))

print("ROC AUC:", round(roc_auc_score(y_test, proba), 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred, digits=4))

In [None]:
# Optional: stratified 5-fold cross-validated ROC AUC, using the training split
# only so the hold-out test set stays untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)
print(f"\nCV ROC AUC: mean={cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# -----------------------------
# 7) Curves
# -----------------------------
# ROC curve on the hold-out set; the title is set on the display's own axes.
roc_display = RocCurveDisplay.from_predictions(y_test, proba)
roc_display.ax_.set_title("XGBoost ROC Curve")
plt.show()

# Precision-recall curve on the same hold-out predictions.
pr_display = PrecisionRecallDisplay.from_predictions(y_test, proba)
pr_display.ax_.set_title("XGBoost Precision–Recall Curve")
plt.show()

In [None]:
# -----------------------------
# 8) Feature importance (global)
# -----------------------------
# Rebuild readable feature names in the order the ColumnTransformer emits them:
# numeric columns first ("num"), then the one-hot encoded columns ("cat").
oh = pipe.named_steps["prep"].named_transformers_["cat"].named_steps["onehot"]
cat_names = []
if categorical_cols:
    cat_names = oh.get_feature_names_out(categorical_cols).tolist()

feature_names = numeric_cols + cat_names

# Importances come from the fitted classifier, one value per transformed column.
# NOTE(review): the "by gain" label below assumes the classifier's default
# importance_type for tree boosters — confirm if importance_type is ever set.
importances = pipe.named_steps["xgb"].feature_importances_

# Guard against silent misalignment between names and importances (for example
# if a preprocessing step changed the number of output columns).
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of importances ({len(importances)}); names and model columns are misaligned."
    )

imp_df = (pd.DataFrame({"feature": feature_names, "importance": importances})
            .sort_values("importance", ascending=False)
            .head(25))
print("\nTop 25 features by gain:")
print(imp_df)

# Quick bar plot of top features.
# The axes are created explicitly and passed to DataFrame.plot: without ax=,
# pandas opens its own figure, leaving the sized figure blank and ignoring
# figsize for the real chart.
fig, ax = plt.subplots(figsize=(8, 6))
# Reverse the rows so the most important feature is drawn at the top.
imp_df[::-1].plot(kind="barh", x="feature", y="importance", legend=False, ax=ax)
ax.set_title("Top features (XGBoost)")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()