In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import joblib
import shap
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, optimizers
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)

# Ignore warnings
warnings.filterwarnings('ignore')

In [6]:
# Visualization settings
# Apply matplotlib's "default" style first, then seaborn's "husl" palette
# (same order as before, so the palette is set after the style is applied).
plt.style.use("default")
sns.set_palette(palette="husl")

In [7]:
# Configuration and random seeds
# Constants used throughout the notebook; edit them here rather than inline.

RANDOM_STATE = 42
TARGET_COLUMN = 'injury_next_14_days'
MODEL_NAME = 'nba_injury_predictor_v1'

# Random seeds for reproducibility
# keras.utils.set_random_seed seeds Python's `random`, NumPy's global RNG and
# TensorFlow in a single call. Previously only NumPy and TensorFlow were
# seeded, so anything drawing from `random` was not reproducible.
keras.utils.set_random_seed(RANDOM_STATE)

# Report the effective configuration and runtime so the run can be reproduced
print("Configuration and etc:")
print(f"- Target: {TARGET_COLUMN}")
print(f"- Random State: {RANDOM_STATE}")
print(f"- Model Name: {MODEL_NAME}")
print(f"- TensorFlow version: {tf.__version__}")
_gpu_devices = tf.config.list_physical_devices('GPU')
print(f"- GPU available: {len(_gpu_devices) > 0}")

Configuration and etc:
- Target: injury_next_14_days
- Random State: 42
- Model Name: nba_injury_predictor_v1
- TensorFlow version: 2.19.0
- GPU available: False


In [12]:
# Loads all processed data
# Each split is read from a feature CSV plus a target CSV.
# squeeze("columns") turns the single-column target frame into a Series while
# leaving the row axis alone; a bare .squeeze() would collapse a one-row file
# to a scalar and break the .mean()/.values calls further down.

# Training data (SMOTE balanced + feature selected)
X_train = pd.read_csv('../data/processed/X_train_final.csv')
y_train = pd.read_csv('../data/processed/y_train_final.csv').squeeze("columns")

# Validation data (feature selected)
X_val = pd.read_csv('../data/processed/X_validation_final.csv')
y_val = pd.read_csv('../data/processed/y_validation_final.csv').squeeze("columns")

# Test data (feature selected)
X_test = pd.read_csv('../data/processed/X_test_final.csv')
y_test = pd.read_csv('../data/processed/y_test_final.csv').squeeze("columns")

# Fail fast when a target file is not a single column or is out of step with
# its feature file, instead of surfacing as a shape error much later.
for _split_name, _X, _y in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    assert isinstance(_y, pd.Series), (
        f"{_split_name} target file must contain exactly one column"
    )
    assert len(_X) == len(_y), (
        f"{_split_name} features/target row mismatch: {len(_X)} vs {len(_y)}"
    )

print("Data loaded successfully:")
print(f"- Training: {X_train.shape} features, {len(y_train)} samples")
print(f"- Validation: {X_val.shape} features, {len(y_val)} samples")
print(f"- Test: {X_test.shape} features, {len(y_test)} samples")

# Loads metadata and configuration
# NOTE(review): joblib.load unpickles its input, so these .pkl artifacts must
# come from a trusted run of the preprocessing pipeline.

# Selected features list
selected_features = joblib.load("../data/processed/selected_features.pkl")
print("- Selected features:", len(selected_features))

# Class weights for handling imbalance
class_weights = joblib.load("../data/processed/class_weights.pkl")
print("- Class weights:", class_weights)

# Preprocessing configuration
preprocessing_config = joblib.load("../data/processed/preprocessing_config.pkl")
print("- Preprocessing config loaded")

# Feature selection results
feature_selection_results = joblib.load("../data/processed/feature_selection_results.pkl")
print("- Feature selection metadata loaded")

# Split information for validation
split_info = joblib.load("../data/processed/split_info.pkl")
print("- Data split validation loaded")

# Data validation and consistency checks

# Check feature consistency: every split must expose exactly the
# selected-feature columns, in the same order.
for _frame, _failure in (
    (X_train, "Training features don't match selected features"),
    (X_val, "Validation features don't match selected features"),
    (X_test, "Test features don't match selected features"),
):
    assert list(_frame.columns) == selected_features, _failure
print("- Feature consistency across all splits")

# Checks target distributions (the mean is the positive rate, assuming 0/1 labels)
train_positive_rate, val_positive_rate, test_positive_rate = (
    target.mean() for target in (y_train, y_val, y_test)
)

print("\nTarget distribution validation:")
print(f"- Training positive rate: {train_positive_rate:.1%} (after SMOTE)")
print(f"- Validation positive rate: {val_positive_rate:.1%}")
print(f"- Test positive rate: {test_positive_rate:.1%}")

# Checks for missing values: total count of null cells per split
train_missing, val_missing, test_missing = (
    frame.isnull().sum().sum() for frame in (X_train, X_val, X_test)
)

assert train_missing == 0, f"Training data has {train_missing} missing values"
assert val_missing == 0, f"Validation data has {val_missing} missing values"
assert test_missing == 0, f"Test data has {test_missing} missing values"
print("- No missing values in any split")

# Verifies data types: accepted NumPy kind codes are
# b (bool), i (signed int), u (unsigned int), f (float), c (complex)
for _frame, _failure in (
    (X_train, "Non-numeric features in training"),
    (X_val, "Non-numeric features in validation"),
    (X_test, "Non-numeric features in test"),
):
    assert all(dtype.kind in 'biufc' for dtype in _frame.dtypes), _failure
print("- All features are numeric")

print("\nAll data validation checks passed!")

# Feature statistics
# Per-column summaries of the unscaled training features, computed once and
# then reduced to the smallest/largest value across columns.
print("\nFeature Statistics (Training Data):")
_col_means = X_train.mean()
_col_stds = X_train.std()
_col_mins = X_train.min()
_col_maxs = X_train.max()
print(f"- Mean range: {_col_means.min():.3f} to {_col_means.max():.3f}")
print(f"- Std range: {_col_stds.min():.3f} to {_col_stds.max():.3f}")
print(f"- Min values: {_col_mins.min():.3f} to {_col_mins.max():.3f}")
print(f"- Max values: {_col_maxs.min():.3f} to {_col_maxs.max():.3f}")

# Checks for potential scaling issues: number of columns whose std exceeds 10
features_need_scaling = (_col_stds > 10).sum()
print(f"- Features with std > 10: {features_need_scaling} (may need scaling)")

# Targets class balance verification
print("\nClass Balance Check:")
print("- Training:", y_train.value_counts().to_dict())
print("- Validation:", y_val.value_counts().to_dict())
print("- Test:", y_test.value_counts().to_dict())


def _features_containing(*fragments):
    """Return the selected features whose name contains any of ``fragments``.

    Matching is a plain substring test, so a short fragment such as 'age'
    also matches names like 'average_...'.
    NOTE(review): confirm the groups against the real feature list.
    """
    return [
        name for name in selected_features
        if any(fragment in name for fragment in fragments)
    ]


# Sample feature names by category
print("\nSample Features by Type:")
workload_features = _features_containing('_7d', '_30d', 'load')
fatigue_features = _features_containing('fatigue', 'rest', 'back_to_back')
context_features = _features_containing('age', 'bmi', 'position')

print(f"- Workload features ({len(workload_features)}): {workload_features[:3]}...")
print(f"- Fatigue features ({len(fatigue_features)}): {fatigue_features[:3]}...")
print(f"- Context features ({len(context_features)}): {context_features[:3]}...")


# Data preprocessing for modeling
def _as_feature_frame(values, like):
    """Wrap a scaled array in a DataFrame with ``like``'s index and the selected-feature column names."""
    return pd.DataFrame(values, columns=selected_features, index=like.index)


# Feature scaling
# RobustScaler to handle outliers better than StandardScaler.
# The scaler is fitted on the training split only; validation and test are
# transformed with the statistics learned there.
scaler = RobustScaler()
X_train_scaled = _as_feature_frame(scaler.fit_transform(X_train), like=X_train)
X_val_scaled = _as_feature_frame(scaler.transform(X_val), like=X_val)
X_test_scaled = _as_feature_frame(scaler.transform(X_test), like=X_test)

print("- Features scaled using RobustScaler")
print(f"  - Training scaled shape: {X_train_scaled.shape}")
print(f"  - Scaled feature stats: mean≈{X_train_scaled.mean().mean():.3f}, std≈{X_train_scaled.std().mean():.3f}")

# Converts to numpy arrays for TensorFlow (features and targets both cast to float32)
X_train_tf, X_val_tf, X_test_tf = (
    frame.values.astype(np.float32)
    for frame in (X_train_scaled, X_val_scaled, X_test_scaled)
)
y_train_tf, y_val_tf, y_test_tf = (
    target.values.astype(np.float32)
    for target in (y_train, y_val, y_test)
)

print("- Data converted to TensorFlow format")
print(f"  - Input shape: {X_train_tf.shape}")
print(f"  - Target shape: {y_train_tf.shape}")
print(f"  - Data types: {X_train_tf.dtype}, {y_train_tf.dtype}")

# Stores scaler for later use; the file name is derived from MODEL_NAME
joblib.dump(scaler, f'../data/processed/{MODEL_NAME}_scaler.pkl')
print("- Scaler saved for deployment")

print("Data loaded and prepared for modeling stage")
print(f"Prepared to build TensorFlow model with {X_train_tf.shape[1]} features")

Data loaded successfully:
- Training: (8850, 40) features, 8850 samples
- Validation: (2567, 40) features, 2567 samples
- Test: (589, 40) features, 589 samples
- Selected features: 40
- Class weights: {0: 0.512264982373678, 1: 20.88323353293413}
- Preprocessing config loaded
- Feature selection metadata loaded
- Data split validation loaded
- Feature consistency across all splits

Target distribution validation:
- Training positive rate: 23.1% (after SMOTE)
- Validation positive rate: 3.0%
- Test positive rate: 1.0%
- No missing values in any split
- All features are numeric

All data validation checks passed!

Feature Statistics (Training Data):
- Mean range: -0.102 to 1124.981
- Std range: 0.028 to 271.169
- Min values: -11.500 to 20.074
- Max values: 0.170 to 1757.000
- Features with std > 10: 4 (may need scaling)

Class Balance Check:
- Training: {0: 6808, 1: 2042}
- Validation: {0: 2490, 1: 77}
- Test: {0: 583, 1: 6}

Sample Features by Type:
- Workload features (5): ['total_actio

In [13]:
# Validation
# Read-only recap of the arrays and settings prepared earlier; nothing is modified here.

def _class_summary(labels):
    """Return (per-class counts, class-0-to-class-1 ratio) for a label array."""
    counts = np.bincount(labels.astype(int))
    ratio = (labels == 0).sum() / (labels == 1).sum()
    return counts, ratio


print("Data shapes after all preprocessing:")
print("  - X_train:", X_train_tf.shape)
print("  - y_train:", y_train_tf.shape)
print("  - X_val:", X_val_tf.shape)
print("  - y_val:", y_val_tf.shape)
print("  - X_test:", X_test_tf.shape)
print("  - y_test:", y_test_tf.shape)

print("\nClass distribution summary:")
_train_counts, _train_ratio = _class_summary(y_train_tf)
print(f"  - Training: {_train_counts} (ratio: {_train_ratio:.1f}:1)")
_val_counts, _val_ratio = _class_summary(y_val_tf)
print(f"  - Validation: {_val_counts} (ratio: {_val_ratio:.1f}:1)")
_test_counts, _test_ratio = _class_summary(y_test_tf)
print(f"  - Test: {_test_counts} (ratio: {_test_ratio:.1f}:1)")

print(f"\nClass weights for model: {class_weights}")

print("\nReady")
print(f"   - Features: {X_train_tf.shape[1]}")
print(f"   - Training samples: {X_train_tf.shape[0]:,}")
print(f"   - Target: {TARGET_COLUMN}")
print(f"   - Model: {MODEL_NAME}")

Data shapes after all preprocessing:
  - X_train: (8850, 40)
  - y_train: (8850,)
  - X_val: (2567, 40)
  - y_val: (2567,)
  - X_test: (589, 40)
  - y_test: (589,)

Class distribution summary:
  - Training: [6808 2042] (ratio: 3.3:1)
  - Validation: [2490   77] (ratio: 32.3:1)
  - Test: [583   6] (ratio: 97.2:1)

Class weights for model: {0: 0.512264982373678, 1: 20.88323353293413}

Ready
   - Features: 40
   - Training samples: 8,850
   - Target: injury_next_14_days
   - Model: nba_injury_predictor_v1
