# Modeling


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Plot defaults: seaborn "whitegrid" style and a (10, 6) default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})


## 1. Load Data


In [None]:
# Load the CSV at data_path into a DataFrame, report its (rows, columns)
# shape, and leave the first five rows as the cell's display value.
data_path = 'processed_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Dataset Shape: {df.shape}")
df.head(n=5)

## 2. Modeling Preparation

### Train/Test Split (80:20)
Target variable: `LFS_10` (the `LFS_10`, `LFS_15`, and `LFS_20` columns are all excluded from the feature set)


In [None]:
# Build the feature matrix and the label.
# Every LFS_* target column that is present in df is removed from the
# features; LFS_10 alone is used as the label for the split below.
target_cols = ['LFS_10', 'LFS_15', 'LFS_20']
drop_cols = [col for col in target_cols if col in df.columns]

X = df.drop(drop_cols, axis=1)
y_10 = df['LFS_10']

# NOTE(review): the "28d" wording in the label below does not obviously match
# the LFS_10 target -- confirm the intended description.
print("Feature set shape:", X.shape)
print("Target 28d distribution:\n", y_10.value_counts(dropna=False))

# 80:20 train/test split with a fixed seed, stratified on y_10 so both
# partitions keep the label's class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_10,
    stratify=y_10,
    test_size=0.2,
    random_state=42,
)

for split_label, split_frame in (("Training Set: ", X_train), ("Testing Set:  ", X_test)):
    print(f"{split_label}{split_frame.shape}")

## Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    f1_score, accuracy_score, recall_score, roc_auc_score, 
    precision_recall_curve, classification_report
)
from imblearn.over_sampling import SMOTE,RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


# 4. Class imbalance ratio (negatives per positive) in the training labels;
#    handed to XGBoost as scale_pos_weight.
neg_count = y_train.eq(0).sum()
pos_count = y_train.eq(1).sum()
scale_pos_weight = neg_count / pos_count
print(f"\nClass ratio: {neg_count}:{pos_count} = {scale_pos_weight:.1f}:1")

# 5. Resampling strategies to compare, keyed by display name.
#    None means "train on the data as-is". Insertion order is the run order.
samplers = {"No Sampling": None}
samplers["SMOTE"] = SMOTE(random_state=42)
samplers["UnderSampler"] = RandomUnderSampler(random_state=42)
samplers["RandomOverSampler"] = RandomOverSampler(random_state=42)

# 6. Define models WITH class_weight / scale_pos_weight
def get_models(scale_weight):
    """Create a fresh set of unfitted classifiers keyed by display name.

    Args:
        scale_weight: Value forwarded to XGBoost's ``scale_pos_weight``.

    Returns:
        dict mapping "LogReg", "RandomForest" and "XGBoost" to new estimator
        instances. The two scikit-learn models use ``class_weight='balanced'``.
    """
    log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
    forest = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        random_state=42,
    )
    booster = XGBClassifier(
        scale_pos_weight=scale_weight,
        eval_metric="logloss",
        n_estimators=100,
        random_state=42,
    )
    return {"LogReg": log_reg, "RandomForest": forest, "XGBoost": booster}

# 7. Function to find optimal threshold
def find_best_threshold(y_true, y_proba):
    """Return the probability cutoff with the highest F1 on the PR curve.

    Args:
        y_true: True binary labels.
        y_proba: Predicted scores/probabilities for the positive class.

    Returns:
        The threshold from ``precision_recall_curve`` whose (precision,
        recall) pair gives the largest F1. Falls back to 0.5 when the best
        curve point is the trailing one that has no associated threshold.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, y_proba)

    # The small epsilon keeps the division defined where precision and
    # recall are both zero.
    numerator = 2 * (prec * rec)
    denominator = prec + rec + 1e-8
    top = np.argmax(numerator / denominator)

    if top >= len(cutoffs):
        return 0.5
    return cutoffs[top]

# 8. Train and evaluate
#
# For every (sampling strategy, model) pair:
#   1. resample the fitting data (if a sampler is given),
#   2. standardize features using statistics from the fitting data,
#   3. fit the model on the scaled data,
#   4. pick the F1-maximizing decision threshold on a held-out validation slice,
#   5. score the thresholded predictions on the untouched test set.
# One row per pair is collected in `all_results` / `results_df`.

# precision_score is not among the names imported from sklearn.metrics
# elsewhere in this notebook, so import it here.
from sklearn.metrics import precision_score

# Carve a stratified validation slice out of the training data. Thresholds are
# tuned on it so the test set is never used for model/threshold selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

all_results = []
# (sampling name, model name) -> thresholded test-set predictions, kept so the
# winning configuration's classification report can be printed at the end.
test_predictions = {}

for sampler_name, sampler in samplers.items():
    print(f"\n{'#'*60}")
    print(f"Sampling: {sampler_name}")
    print("#"*60)

    # Resample the fitting data only; validation and test keep their
    # original class balance.
    if sampler is not None:
        X_train_res, y_train_res = sampler.fit_resample(X_fit, y_fit)
        print(f"Resampled: {dict(pd.Series(y_train_res).value_counts())}")
    else:
        X_train_res, y_train_res = X_fit, y_fit

    # Scale: fit on the (resampled) fitting data, then apply the same
    # transform to validation and test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # NOTE(review): scale_pos_weight reflects the original class ratio and is
    # applied even when the sampler has already rebalanced the data -- confirm
    # that combining both corrections is intended.
    models = get_models(scale_pos_weight)

    for model_name, model in models.items():
        print(f"\n  {model_name}...")
        # Fit and predict on the scaled matrices (previously the scaler output
        # was computed but the unscaled data was used).
        model.fit(X_train_scaled, y_train_res)

        # Tune the decision threshold on validation data, then apply it to the
        # test-set probabilities. `>=` matches precision_recall_curve, which
        # counts a sample as positive when its score is >= the threshold.
        threshold = find_best_threshold(
            y_val, model.predict_proba(X_val_scaled)[:, 1]
        )
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)
        test_predictions[(sampler_name, model_name)] = y_pred

        # Metrics
        metrics = {
            "Sampling": sampler_name,
            "Model": model_name,
            "Threshold": float(threshold),
            "AUC": roc_auc_score(y_test, y_proba),
            "F1": f1_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            # Specificity is recall of the negative class.
            "Specificity": recall_score(y_test, y_pred, pos_label=0),
            # zero_division=0 keeps the previous "0 when nothing is predicted
            # positive" behavior without a warning.
            "Precision": precision_score(y_test, y_pred, zero_division=0),
        }
        all_results.append(metrics)
        print(f"  AUC: {metrics['AUC']:.4f} | F1: {metrics['F1']:.4f} | Recall: {metrics['Recall']:.4f} | Specificity: {metrics['Specificity']:.4f}")

print(f"\n{'='*60}")
print("FINAL SUMMARY (sorted by F1)")
print("="*60)
results_df = pd.DataFrame(all_results).sort_values("F1", ascending=False)
print(results_df.round(4).to_string(index=False))

print(f"\n{'='*60}")
print("BEST MODEL - Classification Report")
print("="*60)
best = results_df.iloc[0]
print(f"Best: {best['Sampling']} + {best['Model']} (Threshold={best['Threshold']:.3f})")
print(classification_report(y_test, test_predictions[(best['Sampling'], best['Model'])]))