In [None]:
# --- CELL: IMPUTATION AUDIT & MODEL CHALLENGE (FAST VERSION) ---
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss
import sys
import time

# 1. Install XGBoost if missing
# Attempt the import first; pip is only invoked when the import fails, and the
# import is then retried so `xgb` is bound on both paths.
try:
    import xgboost as xgb
except ImportError:
    print("Installing XGBoost...")
    # IPython shell escape (not plain Python): runs pip through the interpreter
    # at sys.executable rather than whichever `pip` happens to be on PATH.
    !{sys.executable} -m pip install xgboost
    import xgboost as xgb

# Section 1: report how many values of each raw lab feature are missing, i.e.
# how much of the baseline matrix was filled in by imputation.
print("\n=== 1. DATA AUDIT: SCALE OF IMPUTATION ===")
raw_cols = ['baseline_egfr', 'baseline_creat']

# Every raw column this cell reads from `source_df`: the two audited below plus
# the dense features pulled when the challenger matrix is built further down.
required_cols = ['age', 'baseline_egfr', 'baseline_creat', 'site_contrast_rate']

# Prefer the aligned frame only when it carries *all* required columns.
# Checking just 'baseline_egfr' would pick the aligned frame and then hit a
# KeyError on any of the other columns it might lack. Otherwise fall back to
# the full cohort frame restricted to the aligned rows.
if all(col in df_cohort_aligned.columns for col in required_cols):
    source_df = df_cohort_aligned
else:
    source_df = df_cohort.loc[df_cohort_aligned.index]

n_total = len(source_df)
for col in raw_cols:
    n_missing = source_df[col].isna().sum()
    pct_missing = (n_missing / n_total) * 100
    print(f"Feature '{col}': {n_missing:,} missing values ({pct_missing:.1f}%)")

# Section 2: build the challenger design matrix = raw (un-imputed) dense
# features stacked in front of the existing sparse code columns.
print("\n=== 2. MATRIX CONSTRUCTION ===")
print("Building Challenger Matrix (Raw Data + Sparse Codes)...")

# A. Current Matrix (Imputed + One-Hot) -> Already exists as X_all
print(f"Baseline Matrix (X_all): {X_all.shape}")

# B. Challenger Matrix (Raw Floats + Sparse Codes)
# 1. Grab the raw dense columns and force numeric types; anything that cannot
#    be parsed becomes NaN (errors='coerce').
dense_raw = source_df[['age', 'baseline_egfr', 'baseline_creat', 'site_contrast_rate']].copy()
dense_raw = dense_raw.apply(pd.to_numeric, errors='coerce')

# 2. Convert to CSR with EVERY cell stored explicitly.
#    - The explicit float cast avoids the object-dtype ('dtype(O)') error, and
#      na_value maps pandas NA markers to NaN.
#    - sp.csr_matrix(dense_array) would drop true zeros, and XGBoost's tree
#      booster treats unstored sparse entries as missing. A genuine 0.0 (e.g. a
#      zero site rate) would then be indistinguishable from NaN. Passing
#      (data, indices, indptr) keeps zeros as stored values, so only NaN cells
#      are handled as missing.
dense_values = dense_raw.to_numpy(dtype=float, na_value=np.nan)
n_rows, n_dense = dense_values.shape
X_dense_raw = sp.csr_matrix(
    (
        dense_values.ravel(),                    # row-major cell values
        np.tile(np.arange(n_dense), n_rows),     # column index of each cell
        np.arange(n_rows + 1) * n_dense,         # each row holds n_dense cells
    ),
    shape=(n_rows, n_dense),
)

# 3. Stack with the existing sparse codes (dense columns first).
X_challenger = sp.hstack([X_dense_raw, X_sparse], format='csr')

print(f"Challenger Matrix (X_challenger): {X_challenger.shape} (Raw NaNs, No Manual Bins)")

# === 3. HEAD-TO-HEAD BATTLE ===
print("\n=== 3. MODEL PERFORMANCE TEST ===")

# Binary target, taken from the aligned cohort frame.
y = df_cohort_aligned['contrast_received'].values

# A single stratified 80/20 split applied to both design matrices and the
# labels at once, so the baseline and challenger models are trained and
# scored on exactly the same rows.
(
    X_train_base, X_test_base,
    X_train_chal, X_test_chal,
    y_train, y_test,
) = train_test_split(
    X_all, X_challenger, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- MODEL A: BASELINE (Lasso) ---
# L1-penalised logistic regression on the imputed/one-hot matrix.
# SPEED FIX: solver='liblinear' and tol=0.01 for a faster convergence check.
print("\nTraining Baseline: Logistic Regression (Lasso)...")
# perf_counter is a monotonic, high-resolution clock, so the elapsed time is
# not distorted by system clock adjustments the way time.time() can be.
t0 = time.perf_counter()
model_base = LogisticRegression(
    penalty='l1',
    solver='liblinear',  # Faster than 'saga' for single-core binary
    tol=0.01,            # Relax tolerance for speed (default is 0.0001)
    C=0.1,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,     # liblinear shuffles the data; pin the seed so the AUC is reproducible
)
model_base.fit(X_train_base, y_train)
# Column 1 of predict_proba is the probability of the positive class.
preds_base = model_base.predict_proba(X_test_base)[:, 1]
# Elapsed time covers both fitting and test-set scoring.
time_base = time.perf_counter() - t0
auc_base = roc_auc_score(y_test, preds_base)

# --- MODEL B: CHALLENGER (XGBoost) ---
# Gradient-boosted trees on the raw (un-imputed) matrix; NaNs are left for
# XGBoost to handle as missing values.
print("Training Challenger: XGBoost (Native Missing Handling)...")

# Class-imbalance weight = negatives / positives in the training labels.
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos

# perf_counter is a monotonic, high-resolution clock, so the elapsed time is
# not distorted by system clock adjustments the way time.time() can be.
t0 = time.perf_counter()
model_chal = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',  # Optimized for speed
    n_jobs=-1,           # XGBoost CAN multiprocess!
    missing=np.nan,      # Explicitly tell it to treat NaNs as missing
    scale_pos_weight=n_neg / n_pos,
    random_state=42,     # Explicit seed, matching the one used for the split
)
model_chal.fit(X_train_chal, y_train)
# Column 1 of predict_proba is the probability of the positive class.
preds_chal = model_chal.predict_proba(X_test_chal)[:, 1]
# Elapsed time covers both fitting and test-set scoring.
time_chal = time.perf_counter() - t0
auc_chal = roc_auc_score(y_test, preds_chal)

# Section 4: print both AUCs with their wall-clock times, then a verdict.
report_lines = [
    "\n=== 4. RESULTS REPORT ===",
    f"Baseline AUC (Imputed + Lasso):  {auc_base:.4f} (Time: {time_base:.1f}s)",
    f"Challenger AUC (Raw + XGBoost):  {auc_chal:.4f} (Time: {time_chal:.1f}s)",
]
print("\n".join(report_lines))

# The challenger "holds up" when its AUC is no more than this far below the
# baseline's; anything worse is reported as a loss for the challenger.
auc_tolerance = 0.01
challenger_holds_up = auc_chal >= auc_base - auc_tolerance

if not challenger_holds_up:
    print("\nCAUTION: Lasso performed significantly better.")
else:
    print("\nSUCCESS: XGBoost matches or beats Lasso using RAW data.")
    print("Action: You can safely delete the imputation code in Cell 6.")

result: 


=== 1. DATA AUDIT: SCALE OF IMPUTATION ===
Feature 'baseline_egfr': 47,285 missing values (75.0%)
Feature 'baseline_creat': 13,346 missing values (21.2%)

=== 2. MATRIX CONSTRUCTION ===
Building Challenger Matrix (Raw Data + Sparse Codes)...
Baseline Matrix (X_all): (63064, 69542)
Challenger Matrix (X_challenger): (63064, 69516) (Raw NaNs, No Manual Bins)

=== 3. MODEL PERFORMANCE TEST ===

Training Baseline: Logistic Regression (Lasso)...
Training Challenger: XGBoost (Native Missing Handling)...

=== 4. RESULTS REPORT ===
Baseline AUC (Imputed + Lasso):  0.7627 (Time: 3.2s)
Challenger AUC (Raw + XGBoost):  0.7854 (Time: 35.5s)

SUCCESS: XGBoost matches or beats Lasso using RAW data.
Action: You can safely delete the imputation code in Cell 6.