In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import joblib
import shap
from datetime import datetime

import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, optimizers
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import time

# Ignore warnings
warnings.filterwarnings('ignore')

In [2]:
# Visualization settings\

plt.style.use('default')
sns.set_palette("husl")

In [3]:
# Configuration and random seeds

RANDOM_STATE = 42
TARGET_COLUMN = 'injury_next_14_days'
MODEL_NAME = 'nba_injury_predictor_v1'

# Random seeds for reproducibility
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

print(f"Configuration and etc:")
print(f"- Target: {TARGET_COLUMN}")
print(f"- Random State: {RANDOM_STATE}")
print(f"- Model Name: {MODEL_NAME}")
print(f"- TensorFlow version: {tf.__version__}")
print(f"- GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")

Configuration and etc:
- Target: injury_next_14_days
- Random State: 42
- Model Name: nba_injury_predictor_v1
- TensorFlow version: 2.19.0
- GPU available: False


In [4]:
# Loads all processed data
# Training data (SMOTE balanced + feature selected)
X_train = pd.read_csv('../data/processed/X_train_final.csv')
y_train = pd.read_csv('../data/processed/y_train_final.csv').squeeze()

# Validation data (feature selected)
X_val = pd.read_csv('../data/processed/X_validation_final.csv')
y_val = pd.read_csv('../data/processed/y_validation_final.csv').squeeze()

# Test data (feature selected)
X_test = pd.read_csv('../data/processed/X_test_final.csv')
y_test = pd.read_csv('../data/processed/y_test_final.csv').squeeze()

print(f"Data loaded successfully:")
print(f"- Training: {X_train.shape} features, {len(y_train)} samples")
print(f"- Validation: {X_val.shape} features, {len(y_val)} samples") 
print(f"- Test: {X_test.shape} features, {len(y_test)} samples")

# Loads metadata and configuration
# Selected features list
selected_features = joblib.load('../data/processed/selected_features.pkl')
print(f"- Selected features: {len(selected_features)}")

# Class weights for handling imbalance
class_weights = joblib.load('../data/processed/class_weights.pkl')
print(f"- Class weights: {class_weights}")

# Preprocessing configuration
preprocessing_config = joblib.load('../data/processed/preprocessing_config.pkl')
print(f"- Preprocessing config loaded")

# Feature selection results
feature_selection_results = joblib.load('../data/processed/feature_selection_results.pkl')
print(f"- Feature selection metadata loaded")

# Split information for validation
split_info = joblib.load('../data/processed/split_info.pkl')
print(f"- Data split validation loaded")

# Data validation and consistency checks
# Check feature consistency
assert list(X_train.columns) == selected_features, "Training features don't match selected features"
assert list(X_val.columns) == selected_features, "Validation features don't match selected features"  
assert list(X_test.columns) == selected_features, "Test features don't match selected features"
print("- Feature consistency across all splits")

# Checks target distributions
train_positive_rate = y_train.mean()
val_positive_rate = y_val.mean()
test_positive_rate = y_test.mean()

print(f"\nTarget distribution validation:")
print(f"- Training positive rate: {train_positive_rate:.1%} (after SMOTE)")
print(f"- Validation positive rate: {val_positive_rate:.1%}")
print(f"- Test positive rate: {test_positive_rate:.1%}")

# Checks for missing values
train_missing = X_train.isnull().sum().sum()
val_missing = X_val.isnull().sum().sum()
test_missing = X_test.isnull().sum().sum()

assert train_missing == 0, f"Training data has {train_missing} missing values"
assert val_missing == 0, f"Validation data has {val_missing} missing values"
assert test_missing == 0, f"Test data has {test_missing} missing values"
print("- No missing values in any split")

# Verifies data types
assert X_train.dtypes.apply(lambda x: x.kind in 'biufc').all(), "Non-numeric features in training"
assert X_val.dtypes.apply(lambda x: x.kind in 'biufc').all(), "Non-numeric features in validation"
assert X_test.dtypes.apply(lambda x: x.kind in 'biufc').all(), "Non-numeric features in test"
print("- All features are numeric")

print("\nAll data validation checks passed!")

# Feature statistics
print(f"\nFeature Statistics (Training Data):")
print(f"- Mean range: {X_train.mean().min():.3f} to {X_train.mean().max():.3f}")
print(f"- Std range: {X_train.std().min():.3f} to {X_train.std().max():.3f}")
print(f"- Min values: {X_train.min().min():.3f} to {X_train.min().max():.3f}")
print(f"- Max values: {X_train.max().min():.3f} to {X_train.max().max():.3f}")

# Checks for potential scaling issues
features_need_scaling = (X_train.std() > 10).sum()
print(f"- Features with std > 10: {features_need_scaling} (may need scaling)")

# Targets class balance verification
print(f"\nClass Balance Check:")
print(f"- Training: {y_train.value_counts().to_dict()}")
print(f"- Validation: {y_val.value_counts().to_dict()}")
print(f"- Test: {y_test.value_counts().to_dict()}")

# Sample feature names by category
print(f"\nSample Features by Type:")
workload_features = [f for f in selected_features if any(x in f for x in ['_7d', '_30d', 'load'])]
fatigue_features = [f for f in selected_features if any(x in f for x in ['fatigue', 'rest', 'back_to_back'])]
context_features = [f for f in selected_features if any(x in f for x in ['age', 'bmi', 'position'])]

print(f"- Workload features ({len(workload_features)}): {workload_features[:3]}...")
print(f"- Fatigue features ({len(fatigue_features)}): {fatigue_features[:3]}...")
print(f"- Context features ({len(context_features)}): {context_features[:3]}...")


# Data preprocessing for modeling
# Feature scaling
# RobustScaler to handle outliers better than StandardScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames for easier handling
X_train_scaled = pd.DataFrame(X_train_scaled, columns=selected_features, index=X_train.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=selected_features, index=X_val.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=selected_features, index=X_test.index)

print(f"- Features scaled using RobustScaler")
print(f"  - Training scaled shape: {X_train_scaled.shape}")
print(f"  - Scaled feature stats: mean≈{X_train_scaled.mean().mean():.3f}, std≈{X_train_scaled.std().mean():.3f}")

# Converts to numpy arrays for TensorFlow
X_train_tf = X_train_scaled.values.astype(np.float32)
X_val_tf = X_val_scaled.values.astype(np.float32)
X_test_tf = X_test_scaled.values.astype(np.float32)
y_train_tf = y_train.values.astype(np.float32)
y_val_tf = y_val.values.astype(np.float32)
y_test_tf = y_test.values.astype(np.float32)

print(f"- Data converted to TensorFlow format")
print(f"  - Input shape: {X_train_tf.shape}")
print(f"  - Target shape: {y_train_tf.shape}")
print(f"  - Data types: {X_train_tf.dtype}, {y_train_tf.dtype}")

# Stores scaler for later use
joblib.dump(scaler, f'../data/processed/{MODEL_NAME}_scaler.pkl')
print(f"- Scaler saved for deployment")

print("Data loaded and prepared for modeling stage")
print(f"Prepared to build TensorFlow model with {X_train_tf.shape[1]} features")

Data loaded successfully:
- Training: (8850, 40) features, 8850 samples
- Validation: (2567, 40) features, 2567 samples
- Test: (589, 40) features, 589 samples
- Selected features: 40
- Class weights: {0: 0.512264982373678, 1: 20.88323353293413}
- Preprocessing config loaded
- Feature selection metadata loaded
- Data split validation loaded
- Feature consistency across all splits

Target distribution validation:
- Training positive rate: 23.1% (after SMOTE)
- Validation positive rate: 3.0%
- Test positive rate: 1.0%
- No missing values in any split
- All features are numeric

All data validation checks passed!

Feature Statistics (Training Data):
- Mean range: -0.102 to 1124.981
- Std range: 0.028 to 271.169
- Min values: -11.500 to 20.074
- Max values: 0.170 to 1757.000
- Features with std > 10: 4 (may need scaling)

Class Balance Check:
- Training: {0: 6808, 1: 2042}
- Validation: {0: 2490, 1: 77}
- Test: {0: 583, 1: 6}

Sample Features by Type:
- Workload features (5): ['total_actio

In [5]:
# Validation

print("Data shapes after all preprocessing:")
print(f"  - X_train: {X_train_tf.shape}")
print(f"  - y_train: {y_train_tf.shape}")
print(f"  - X_val: {X_val_tf.shape}")
print(f"  - y_val: {y_val_tf.shape}")
print(f"  - X_test: {X_test_tf.shape}")
print(f"  - y_test: {y_test_tf.shape}")

print(f"\nClass distribution summary:")
print(f"  - Training: {np.bincount(y_train_tf.astype(int))} (ratio: {(y_train_tf == 0).sum()/(y_train_tf == 1).sum():.1f}:1)")
print(f"  - Validation: {np.bincount(y_val_tf.astype(int))} (ratio: {(y_val_tf == 0).sum()/(y_val_tf == 1).sum():.1f}:1)")
print(f"  - Test: {np.bincount(y_test_tf.astype(int))} (ratio: {(y_test_tf == 0).sum()/(y_test_tf == 1).sum():.1f}:1)")

print(f"\nClass weights for model: {class_weights}")

print(f"\nReady")
print(f"   - Features: {X_train_tf.shape[1]}")
print(f"   - Training samples: {X_train_tf.shape[0]:,}")
print(f"   - Target: {TARGET_COLUMN}")
print(f"   - Model: {MODEL_NAME}")

Data shapes after all preprocessing:
  - X_train: (8850, 40)
  - y_train: (8850,)
  - X_val: (2567, 40)
  - y_val: (2567,)
  - X_test: (589, 40)
  - y_test: (589,)

Class distribution summary:
  - Training: [6808 2042] (ratio: 3.3:1)
  - Validation: [2490   77] (ratio: 32.3:1)
  - Test: [583   6] (ratio: 97.2:1)

Class weights for model: {0: 0.512264982373678, 1: 20.88323353293413}

Ready
   - Features: 40
   - Training samples: 8,850
   - Target: injury_next_14_days
   - Model: nba_injury_predictor_v1


In [6]:
# Feature scaling

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"- Training scaled shape: {X_train_scaled.shape}")
print(f"- Feature stats after scaling: mean={X_train_scaled.mean():.3f}, std={X_train_scaled.std():.3f}")


- Training scaled shape: (8850, 40)
- Feature stats after scaling: mean=0.151, std=3.844


In [10]:
# Logisitc Regression
logistic_model = LogisticRegression(
    random_state=RANDOM_STATE,
    class_weight=class_weights,  # Handle class imbalance
    penalty='l2',                # L2 regularization
    C=1.0,                      # Regularization strength (will tune later)
    max_iter=1000,              # Ensure convergence
    solver='lbfgs'              # Good for small datasets
)

logistic_model.fit(X_train_scaled, y_train)

# Predictions

# Predictions on all sets
y_train_pred = logistic_model.predict(X_train_scaled)
y_val_pred = logistic_model.predict(X_val_scaled)
y_test_pred = logistic_model.predict(X_test_scaled)

# Prediction probabilities
y_train_prob = logistic_model.predict_proba(X_train_scaled)[:, 1]
y_val_prob = logistic_model.predict_proba(X_val_scaled)[:, 1]
y_test_prob = logistic_model.predict_proba(X_test_scaled)[:, 1]

def evaluate_model_performance(y_true, y_pred, y_prob, dataset_name):
    """
    Model evaluation
    """
    print(f"\n{dataset_name.upper()} Performance:")
    
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    # AUC metrics
    roc_auc = roc_auc_score(y_true, y_prob)
    pr_auc = average_precision_score(y_true, y_prob)
    
    print(f"- Accuracy: {accuracy:.3f}")
    print(f"- Precision: {precision:.3f}")
    print(f"- Recall: {recall:.3f}")
    print(f"- F1-Score: {f1:.3f}")
    print(f"- ROC AUC: {roc_auc:.3f}")
    print(f"- PR AUC: {pr_auc:.3f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print(f"Confusion Matrix:")
    print(f"    TN: {cm[0,0]:4d} | FP: {cm[0,1]:4d}")
    print(f"    FN: {cm[1,0]:4d} | TP: {cm[1,1]:4d}")
    
    return {
        'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1,
        'roc_auc': roc_auc, 'pr_auc': pr_auc, 'confusion_matrix': cm
    }

# Evaluates on all datasets
train_metrics = evaluate_model_performance(y_train, y_train_pred, y_train_prob, "Training")
val_metrics = evaluate_model_performance(y_val, y_val_pred, y_val_prob, "Validation")
test_metrics = evaluate_model_performance(y_test, y_test_pred, y_test_prob, "Test")


TRAINING Performance:
- Accuracy: 0.806
- Precision: 0.544
- Recall: 0.991
- F1-Score: 0.702
- ROC AUC: 0.982
- PR AUC: 0.944
Confusion Matrix:
    TN: 5109 | FP: 1699
    FN:   19 | TP: 2023

VALIDATION Performance:
- Accuracy: 0.676
- Precision: 0.065
- Recall: 0.727
- F1-Score: 0.119
- ROC AUC: 0.765
- PR AUC: 0.182
Confusion Matrix:
    TN: 1679 | FP:  811
    FN:   21 | TP:   56

TEST Performance:
- Accuracy: 0.506
- Precision: 0.017
- Recall: 0.833
- F1-Score: 0.033
- ROC AUC: 0.549
- PR AUC: 0.013
Confusion Matrix:
    TN:  293 | FP:  290
    FN:    1 | TP:    5


In [11]:
# Top K Risk Prediciton Analysis

def analyze_top_k_predictions(y_true, y_prob, k_values=[5, 10, 15, 20]):
    """
    Analyzes what percentage of actual injuries are captured in top K% predictions
    """
    
    print(f"Top K Risk Prediction Performance:")
    
    # Sorts by probability (highest risk first)
    sorted_indices = np.argsort(y_prob)[::-1]
    sorted_true = y_true[sorted_indices]
    
    total_positives = y_true.sum()
    n_samples = len(y_true)
    
    for k in k_values:
        # Top k% of predictions
        top_k_size = int(n_samples * k / 100)
        top_k_true = sorted_true[:top_k_size]
        
        # Calculates capture rate
        captured_positives = top_k_true.sum()
        capture_rate = captured_positives / total_positives if total_positives > 0 else 0
        precision_at_k = captured_positives / top_k_size if top_k_size > 0 else 0
        
        print(f"    Top {k:2d}%: {capture_rate*100:5.1f}% of injuries captured, "
              f"precision = {precision_at_k*100:5.1f}%")

# Analyzes on validation set
analyze_top_k_predictions(y_val.values, y_val_prob)

Top K Risk Prediction Performance:
    Top  5%:  32.5% of injuries captured, precision =  19.5%
    Top 10%:  45.5% of injuries captured, precision =  13.7%
    Top 15%:  55.8% of injuries captured, precision =  11.2%
    Top 20%:  59.7% of injuries captured, precision =   9.0%


In [13]:
# Feature importance Analysis

# Gets feature coefficients (weights)
feature_coefficients = logistic_model.coef_[0]
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'coefficient': feature_coefficients,
    'abs_coefficient': np.abs(feature_coefficients)
}).sort_values('abs_coefficient', ascending=False)

print(f"Top 15 Most Important Features:")
print(f"{'Feature':<25} {'Coefficient':<12} {'Impact':<15}")

for idx, row in feature_importance.head(15).iterrows():
    impact = "Increase Injury Risk" if row['coefficient'] > 0 else "Decrease Injury Risk"
    print(f"  {row['feature']:<25} {row['coefficient']:>10.3f}   {impact}")

Top 15 Most Important Features:
Feature                   Coefficient  Impact         
  season_fatigue                 7.406   Increase Injury Risk
  missed_shots                  -6.904   Decrease Injury Risk
  total_shot_attempts            5.936   Increase Injury Risk
  is_late_season                -5.876   Decrease Injury Risk
  is_mid_season                 -5.543   Decrease Injury Risk
  made_shots                    -5.418   Decrease Injury Risk
  rebounds                      -4.708   Decrease Injury Risk
  total_actions_7d               4.109   Increase Injury Risk
  actions_vs_career_avg          3.321   Increase Injury Risk
  game_month                     3.138   Increase Injury Risk
  is_early_season               -3.026   Decrease Injury Risk
  current_vs_14day_avg           2.291   Increase Injury Risk
  is_low_performance             1.630   Increase Injury Risk
  substitutions                 -1.590   Decrease Injury Risk
  total_season_games            -1.555   Decr

In [17]:
# Random Forest Model

# Hyperparameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 150], 
    'max_depth': [5, 10, 15],      
    'min_samples_split': [20, 50, 100], 
    'min_samples_leaf': [5, 10, 20],    
    'max_features': ['sqrt', 0.5],      
    'class_weight': ['balanced_subsample'],
    'random_state': [RANDOM_STATE]
}

rf_base = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1  # Use all available cores
)

# Grid Search with Cross Validation
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

rf_grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=rf_param_grid,
    cv=cv_folds,
    scoring='average_precision',  # PR-AUC for imbalanced data
    n_jobs=-1,
    verbose=1
)

start_time = time.time()
rf_grid_search.fit(X_train_scaled, y_train)
training_time = time.time() - start_time

print(f"Training completed in {training_time:.1f} seconds")
print(f"Best parameters: {rf_grid_search.best_params_}")
print(f"Best cross-validation PR-AUC: {rf_grid_search.best_score_:.4f}")

# Get the best model
rf_model = rf_grid_search.best_estimator_

# Make predictions on all sets
y_train_pred_rf = rf_model.predict(X_train_scaled)
y_val_pred_rf = rf_model.predict(X_val_scaled)
y_test_pred_rf = rf_model.predict(X_test_scaled)

# Prediction probabilities
y_train_prob_rf = rf_model.predict_proba(X_train_scaled)[:, 1]
y_val_prob_rf = rf_model.predict_proba(X_val_scaled)[:, 1]
y_test_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

# Evaluations
train_metrics_rf = evaluate_model_performance(y_train, y_train_pred_rf, y_train_prob_rf, "Training")
val_metrics_rf = evaluate_model_performance(y_val, y_val_pred_rf, y_val_prob_rf, "Validation")
test_metrics_rf = evaluate_model_performance(y_test, y_test_pred_rf, y_test_prob_rf, "Test")

# Top K Risk Analysis
analyze_top_k_predictions(y_val.values, y_val_prob_rf)

# Feature Importance Analysis

# Gets feature importances
feature_importance_rf = pd.DataFrame({
    'feature': selected_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features (Random Forest):")
print(f"{'Feature':<25} {'Importance':<12}")

for idx, row in feature_importance_rf.head(15).iterrows():
    print(f"{row['feature']:<25} {row['importance']:>10.4f}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Training completed in 67.3 seconds
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': 15, 'max_features': 0.5, 'min_samples_leaf': 5, 'min_samples_split': 20, 'n_estimators': 150, 'random_state': 42}
Best cross-validation PR-AUC: 0.9928

TRAINING Performance:
- Accuracy: 0.993
- Precision: 0.979
- Recall: 0.991
- F1-Score: 0.985
- ROC AUC: 1.000
- PR AUC: 0.999
Confusion Matrix:
    TN: 6765 | FP:   43
    FN:   18 | TP: 2024

VALIDATION Performance:
- Accuracy: 0.966
- Precision: 0.452
- Recall: 0.545
- F1-Score: 0.494
- ROC AUC: 0.832
- PR AUC: 0.586
Confusion Matrix:
    TN: 2439 | FP:   51
    FN:   35 | TP:   42

TEST Performance:
- Accuracy: 0.922
- Precision: 0.000
- Recall: 0.000
- F1-Score: 0.000
- ROC AUC: 0.491
- PR AUC: 0.012
Confusion Matrix:
    TN:  543 | FP:   40
    FN:    6 | TP:    0
Top K Risk Prediction Performance:
    Top  5%:  59.7% of injuries captured, precision =  35.9%
    Top 1

In [21]:
# XGBoost Model

# XGBoost Model 
def train_xgboost_model(X_train_scaled, y_train, X_val_scaled, y_val, class_weights, random_state=42):
    """
    Trains XGBoost model
    """
    # Calculates scale_pos_weight for XGBoost
    neg_count = (y_train == 0).sum()
    pos_count = (y_train == 1).sum()
    scale_pos_weight = neg_count / pos_count
    
    print(f"Class distribution in training:")
    print(f"- Negative class: {neg_count:,}")
    print(f"- Positive class: {pos_count:,}")
    print(f"- Scale pos weight: {scale_pos_weight:.2f}")
    
    # Base XGBoost model 
    xgb_base = xgb.XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,  # Handles imbalance
        random_state=random_state,
        eval_metric=['logloss', 'auc'],
        early_stopping_rounds=10,
        n_jobs=-1
    )
    
    # Hyperparameter grid
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1], 
        'n_estimators': [100, 200, 500],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1, 0.5],  # L1 regularization
        'reg_lambda': [1, 1.5, 2]   # L2 regularization
    }
    
    # Stratified CV for imbalanced data
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    
    # Grid search with PR-AUC scoring
    print("Performing hyperparameter tuning...")
    grid_search = GridSearchCV(
        estimator=xgb_base,
        param_grid=param_grid,
        scoring='average_precision',  # PR-AUC 
        cv=cv_strategy,
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )
    
    # Fit with validation set for early stopping
    grid_search.fit(
        X_train_scaled, y_train,
        eval_set=[(X_val_scaled, y_val)],
        verbose=False
    )
    
    best_model = grid_search.best_estimator_
    
    print(f"\nBest hyperparameters:")
    for param, value in grid_search.best_params_.items():
        print(f"- {param}: {value}")
    
    print(f"Best CV PR-AUC: {grid_search.best_score_:.3f}")
    
    return best_model, grid_search

# Train XGBoost model
xgb_model, xgb_grid_search = train_xgboost_model(
    X_train_scaled, y_train, 
    X_val_scaled, y_val, 
    class_weights
)

# Predictions

# Training predictions
y_train_pred_xgb = xgb_model.predict(X_train_scaled)
y_train_prob_xgb = xgb_model.predict_proba(X_train_scaled)[:, 1]

# Validation predictions  
y_val_pred_xgb = xgb_model.predict(X_val_scaled)
y_val_prob_xgb = xgb_model.predict_proba(X_val_scaled)[:, 1]

# Test predictions
y_test_pred_xgb = xgb_model.predict(X_test_scaled) 
y_test_prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluations
xgb_train_metrics = evaluate_model_performance(y_train, y_train_pred_xgb, y_train_prob_xgb, "XGBoost Training")
xgb_val_metrics = evaluate_model_performance(y_val, y_val_pred_xgb, y_val_prob_xgb, "XGBoost Validation") 
xgb_test_metrics = evaluate_model_performance(y_test, y_test_pred_xgb, y_test_prob_xgb, "XGBoost Test")

# Feature importance analysis

# Get feature importance
feature_importance_xgb = pd.DataFrame({
    'feature': selected_features,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features (XGBoost):")
print(f"{'Feature':<30} {'Importance':<12}")
for idx, row in feature_importance_xgb.head(20).iterrows():
    print(f"{row['feature']:<30} {row['importance']:>10.4f}")

# Top K Risk Analysis for XGBoost
# Analyze on validation set
analyze_top_k_predictions(y_val.values, y_val_prob_xgb)


Class distribution in training:
- Negative class: 6,808
- Positive class: 2,042
- Scale pos weight: 3.33
Performing hyperparameter tuning...
Fitting 3 folds for each of 2187 candidates, totalling 6561 fits

Best hyperparameters:
- colsample_bytree: 0.7
- learning_rate: 0.1
- max_depth: 7
- n_estimators: 100
- reg_alpha: 0.1
- reg_lambda: 1
- subsample: 0.8
Best CV PR-AUC: 0.988

XGBOOST TRAINING Performance:
- Accuracy: 0.980
- Precision: 0.939
- Recall: 0.977
- F1-Score: 0.958
- ROC AUC: 0.997
- PR AUC: 0.990
Confusion Matrix:
    TN: 6679 | FP:  129
    FN:   47 | TP: 1995

XGBOOST VALIDATION Performance:
- Accuracy: 0.915
- Precision: 0.207
- Recall: 0.649
- F1-Score: 0.314
- ROC AUC: 0.839
- PR AUC: 0.596
Confusion Matrix:
    TN: 2299 | FP:  191
    FN:   27 | TP:   50

XGBOOST TEST Performance:
- Accuracy: 0.893
- Precision: 0.000
- Recall: 0.000
- F1-Score: 0.000
- ROC AUC: 0.539
- PR AUC: 0.012
Confusion Matrix:
    TN:  526 | FP:   57
    FN:    6 | TP:    0
Top 20 Most Import