# NBA Props Model - Fixed Evaluation

## Critical Issues Addressed:
1. **Removed data leakage features** (Opportunity_Score, etc.)
2. **Simplified models** for 503 samples
3. **Fixed preprocessing** pipeline
4. **Realistic expectations** set

### ⚠️ IMPORTANT:
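The current `PRA_estimate` is calculated from the input features, not taken from real game data, so the scores below mostly measure how well each model recovers that calculation.
Section 7 explains what to expect once REAL PRA data is used.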

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries loaded successfully")

## 1. Load Cleaned Data (Without Leakage Features)

In [None]:
# Read the cleaned per-player feature table from the processed-data folder
data_path = Path('/Users/diyagamah/Documents/nba_props_model/data/processed')
df = pd.read_csv(data_path / 'player_features_2023_24_clean.csv')

print(f"Dataset shape: {df.shape}")
print(f"Players: {len(df)}")

# Every column except the two identifier columns and the target is a feature
non_feature_cols = ('Player', 'Team', 'PRA_estimate')
feature_cols = [name for name in df.columns if name not in non_feature_cols]
X = df.loc[:, feature_cols].copy()
y = df.loc[:, 'PRA_estimate'].copy()

# Numbered listing of the feature columns, in file order
print(f"\nFeatures ({len(feature_cols)}):")
for position, feature in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {feature}")

# Quick summary of the target distribution
target_mean, target_std = y.mean(), y.std()
target_low, target_high = y.min(), y.max()
print("\nTarget statistics:")
print(f"  Mean: {target_mean:.2f}")
print(f"  Std: {target_std:.2f}")
print(f"  Range: [{target_low:.2f}, {target_high:.2f}]")

## 2. Three-Way Split (Train/Validation/Test)

In [None]:
# Step 1: hold out 20% of all rows as the final test set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: take 25% of the remaining rows as validation (0.25 * 0.8 = 20% overall)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

print("Data Split:")
for split_label, split_frame in (("Training", X_train), ("Validation", X_val)):
    print(f"  {split_label}: {len(split_frame)} samples")
print(f"  Test: {len(X_test)} samples (NEVER touched during training)")

# Fit the scaler on the training rows only, then apply it to all three splits
scaler = RobustScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

print("\nFeatures scaled using RobustScaler")

## 3. Simple Models (Appropriate for 503 Samples)

In [None]:
# Low-capacity candidate estimators sized for a small dataset.
# Dict insertion order is kept as-is because code further down iterates over it.
xgb_settings = dict(
    n_estimators=50,     # cut down from 200
    max_depth=3,         # cut down from 6
    learning_rate=0.1,
    subsample=0.7,
    reg_alpha=1.0,       # L1 penalty
    reg_lambda=1.0,      # L2 penalty
    random_state=42,
)
forest_settings = dict(
    n_estimators=50,       # reduced tree count
    max_depth=5,           # capped depth
    min_samples_split=10,  # guards against overfitting
    min_samples_leaf=5,
    random_state=42,
)

models = {
    'Ridge (α=10)': Ridge(alpha=10.0, random_state=42),
    'Ridge (α=1)': Ridge(alpha=1.0, random_state=42),
    'Lasso (α=1)': Lasso(alpha=1.0, random_state=42),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42),
    'XGBoost (Simple)': xgb.XGBRegressor(**xgb_settings),
    'RandomForest (Simple)': RandomForestRegressor(**forest_settings),
}

print(f"Training {len(models)} simple models...")

In [None]:
# Train every candidate on the training split and score it on the VALIDATION split
def _mape_percent(y_true, y_pred):
    """Return the mean absolute percentage error (in %) over non-zero targets.

    Rows whose true value is exactly 0 are skipped: the percentage error is
    undefined there and a single such row would otherwise turn the whole
    metric into inf/NaN. Returns NaN when no row has a non-zero target.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    pct_errors = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
    return float(pct_errors.mean() * 100)


# Maps model name -> fitted estimator plus its validation metrics
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Tree-based models are fitted on raw features; linear models on the scaled ones.
    # The model family is recognised from its display name.
    if 'XGB' in name or 'Forest' in name:
        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
    else:
        model.fit(X_train_scaled, y_train)
        val_pred = model.predict(X_val_scaled)

    # Metrics on the validation split
    val_mae = mean_absolute_error(y_val, val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    val_r2 = r2_score(y_val, val_pred)
    val_mape = _mape_percent(y_val, val_pred)

    results[name] = {
        'model': model,
        'val_mae': val_mae,
        'val_rmse': val_rmse,
        'val_r2': val_r2,
        'val_mape': val_mape
    }

    print(f"  Validation MAE: {val_mae:.3f}")
    print(f"  Validation R²: {val_r2:.3f}")
    print(f"  Validation MAPE: {val_mape:.1f}%")

## 4. Cross-Validation (More Reliable)

In [None]:
# 5-fold cross-validation on the combined train+validation rows.
# The test split is not used here.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Stack the RAW (unscaled) splits once; any scaling happens inside each fold below
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

print("5-Fold Cross-Validation Results:")
print("="*50)

# Maps model name -> mean and std of the per-fold MAE
cv_results = {}
for name, configured_model in models.items():
    # clone() gives an unfitted copy with the same hyperparameters
    estimator = clone(configured_model)

    # Linear models need scaled inputs. Putting the scaler in a pipeline makes
    # cross_val_score re-fit it on each fold's training part only, so the
    # held-out part of a fold never influences the preprocessing.
    is_tree_model = 'XGB' in name or 'Forest' in name
    if not is_tree_model:
        estimator = make_pipeline(RobustScaler(), estimator)

    # The scorer returns negated MAE (higher is better); flip the sign back
    cv_scores = -cross_val_score(
        estimator, X_train_val, y_train_val,
        cv=kfold, scoring='neg_mean_absolute_error'
    )

    cv_results[name] = {
        'mean': cv_scores.mean(),
        'std': cv_scores.std()
    }

    print(f"\n{name}:")
    print(f"  Mean MAE: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

## 5. Select Best Model and Final Test

In [None]:
# Pick the model with the lowest validation MAE (first one wins on ties)
best_model_name = min(results, key=lambda name: results[name]['val_mae'])
best_model = results[best_model_name]['model']

print(f"Best Model: {best_model_name}")
print(f"Validation MAE: {results[best_model_name]['val_mae']:.3f}")

# FINAL TEST on the held-out test set.
# Tree-based models were fitted on raw features, linear models on scaled ones.
if 'XGB' in best_model_name or 'Forest' in best_model_name:
    test_pred = best_model.predict(X_test)
else:
    test_pred = best_model.predict(X_test_scaled)

test_mae = mean_absolute_error(y_test, test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)

# MAPE is undefined where the true value is 0, so those rows are skipped;
# a single zero target would otherwise make the metric inf/NaN.
y_test_values = np.asarray(y_test, dtype=float)
test_pred_values = np.asarray(test_pred, dtype=float)
nonzero_target = y_test_values != 0
if nonzero_target.any():
    test_mape = float(
        np.mean(
            np.abs(
                (y_test_values[nonzero_target] - test_pred_values[nonzero_target])
                / y_test_values[nonzero_target]
            )
        )
        * 100
    )
else:
    test_mape = float('nan')

print("\n" + "="*50)
print("FINAL TEST RESULTS (Never seen during training):")
print("="*50)
print(f"Test MAE: {test_mae:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test MAPE: {test_mape:.1f}%")

# Compare test error with validation error for the selected model
val_performance = results[best_model_name]['val_mae']
# The ratio is undefined when the validation MAE is exactly 0; report NaN
# instead of dividing by zero.
overfit_ratio = test_mae / val_performance if val_performance != 0 else float('nan')
print(f"\nOverfitting Check:")
print(f"  Val MAE: {val_performance:.3f}")
print(f"  Test MAE: {test_mae:.3f}")
print(f"  Ratio: {overfit_ratio:.2f} (close to 1.0 is good)")

## 6. Visualization

In [None]:
# 2x2 dashboard summarising the held-out test results
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax1, ax2, ax3, ax4 = axes.ravel()

# Shared styling for the small annotation boxes
note_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# Panel 1 (top-left): actual vs predicted, with the y = x reference line
actual_low, actual_high = y_test.min(), y_test.max()
ax1.scatter(y_test, test_pred, alpha=0.5)
ax1.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', lw=2)
ax1.set(
    xlabel='Actual PRA',
    ylabel='Predicted PRA',
    title=f'Test Predictions ({best_model_name})',
)
ax1.text(
    0.05, 0.95, f'R² = {test_r2:.3f}\nMAE = {test_mae:.3f}',
    transform=ax1.transAxes, verticalalignment='top', bbox=dict(note_box),
)

# Panel 2 (top-right): residuals against predictions, with a zero line
residuals = y_test - test_pred
ax2.scatter(test_pred, residuals, alpha=0.5)
ax2.axhline(y=0, color='r', linestyle='--')
ax2.set(xlabel='Predicted PRA', ylabel='Residuals', title='Residual Plot')
ax2.text(
    0.05, 0.95, f'Mean: {residuals.mean():.2f}\nStd: {residuals.std():.2f}',
    transform=ax2.transAxes, verticalalignment='top', bbox=dict(note_box),
)

# Panel 3 (bottom-left): histogram of the residuals
ax3.hist(residuals, bins=20, edgecolor='black', alpha=0.7)
ax3.axvline(x=0, color='r', linestyle='--')
ax3.set(xlabel='Prediction Error', ylabel='Frequency', title='Error Distribution')

# Panel 4 (bottom-right): validation MAE per model, selected model in green
model_names = list(results)
val_maes = [results[m]['val_mae'] for m in model_names]
colors = []
for m in model_names:
    if m == best_model_name:
        colors.append('green')
    else:
        colors.append('steelblue')
bar_positions = range(len(model_names))
bars = ax4.bar(bar_positions, val_maes, color=colors)
ax4.set_xticks(bar_positions)
ax4.set_xticklabels(model_names, rotation=45, ha='right')
ax4.set(ylabel='Validation MAE', title='Model Comparison')
ax4.grid(True, alpha=0.3, axis='y')

plt.suptitle('Fixed Model Evaluation Results', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 7. What These Results Mean

### Current Performance (with calculated PRA_estimate):
- The high R² values you're seeing are **not genuine** - the model is simply re-learning the formula that was used to calculate `PRA_estimate` from the features
- This is NOT predictive performance

### Expected Performance (with REAL PRA data):
- **R² = 0.35-0.50**: This is realistic for NBA predictions
- **MAE = 3-5 points**: Real prediction error for PRA
- **MAPE = 25-35%**: Typical percentage error

### Next Steps:
1. **Get real PRA data** from NBA games
2. **Add temporal features**: Recent game performance
3. **Include context**: Opponent, home/away, rest days
4. **Collect more data**: Need 2000+ player-games minimum