# Model Diagnostics & Performance Analysis

This notebook analyzes the current baseline model, looking at:
- Probability distribution vs true labels
- Where the model has strong vs weak separation
- Per-symbol performance breakdown
- Feature importance and correlations
- Opportunities for improvement


In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, confusion_matrix, classification_report

from src.models import load_model
from src.dataset import get_train_val_test_splits, load_labeled_dataset

# Render matplotlib figures inline, beneath the cell that produces them
%matplotlib inline
# Plot styling: dark-grid matplotlib style sheet plus seaborn's "husl" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Let pandas display up to 100 columns before truncating wide frames
pd.set_option('display.max_columns', 100)


## 1. Load Model and Data


In [None]:
# Restore the saved baseline along with its feature list and metadata
model, feature_names, metadata = load_model("lgbm_baseline")

# Fetch the train / validation / test feature sets and their targets
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = get_train_val_test_splits()

# Report the number of model features and the size of each split
print(f"Model: {len(feature_names)} features")
for split_label, split_features in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"{split_label}: {len(split_features):,} samples")


## 2. Probability Distribution Analysis

Key question: is the model confident or uncertain in its predictions?


In [None]:
# Get predictions: column 1 of predict_proba is used as the "long win" probability
y_pred_proba_test = model.predict_proba(X_test)[:, 1]
# Not used by the panels in this cell; kept in case later cells rely on it
y_pred_proba_train = model.predict_proba(X_train)[:, 1]

# One set of 50 bin edges spanning the full test-set probability range.
# With an integer `bins`, every hist() call derives its own range from the data
# it is given, so the two per-class histograms would end up with different bin
# widths and their counts could not be compared bar-for-bar.
shared_bin_edges = np.histogram_bin_edges(y_pred_proba_test, bins=50)

# Plot probability distributions (2x2 grid; the bottom row is filled in below)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Test set - all predictions
ax = axes[0, 0]
ax.hist(y_pred_proba_test, bins=shared_bin_edges, alpha=0.7, edgecolor='black')
ax.axvline(0.5, color='red', linestyle='--', label='Decision boundary')
ax.set_xlabel('Predicted Probability (Long Win)')
ax.set_ylabel('Count')
ax.set_title('Test Set: Probability Distribution (All Predictions)', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Test set - by actual outcome (label 1 vs label 0), drawn on identical bins
ax = axes[0, 1]
ax.hist(y_pred_proba_test[y_test == 1], bins=shared_bin_edges, alpha=0.6,
        label='Actual Long Wins', color='green')
ax.hist(y_pred_proba_test[y_test == 0], bins=shared_bin_edges, alpha=0.6,
        label='Actual Short Wins', color='red')
ax.axvline(0.5, color='black', linestyle='--', alpha=0.5)
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Count')
ax.set_title('Test Set: Probabilities by True Label', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Calibration plot: observed label mean per predicted-probability bin
ax = axes[1, 0]
bins = np.linspace(0, 1, 11)  # ten equal-width bins over [0, 1]
bin_centers = (bins[:-1] + bins[1:]) / 2

# Labels as a plain float array so they can serve as histogram weights
y_true = np.asarray(y_test, dtype=float)

# np.histogram treats every bin as [low, high) except the last, which is
# [low, high]. A prediction of exactly 1.0 is therefore counted in the top bin
# instead of being silently dropped by a strict `< 1.0` comparison.
bin_counts = np.histogram(y_pred_proba_test, bins=bins)[0]
bin_label_sums = np.histogram(y_pred_proba_test, bins=bins, weights=y_true)[0]

# Mean label per bin; bins without predictions stay NaN so the line shows a gap
true_probs = np.full(len(bin_counts), np.nan)
np.divide(bin_label_sums, bin_counts, out=true_probs, where=bin_counts > 0)

ax.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
ax.plot(bin_centers, true_probs, 'o-', linewidth=2, markersize=8, label='Model calibration')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Actual Fraction of Long Wins')
ax.set_title('Calibration Curve', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Win rate by confidence bucket
ax = axes[1, 1]
buckets = [(0, 0.4), (0.4, 0.45), (0.45, 0.5), (0.5, 0.55), (0.55, 0.6), (0.6, 1.0)]
y_true = np.asarray(y_test)  # positional array so the boolean masks below line up
bucket_names = []
win_rates = []
counts = []

for low, high in buckets:
    # Buckets are [low, high); the top bucket also includes its upper edge so a
    # prediction of exactly 1.0 is counted rather than dropped.
    mask = y_pred_proba_test >= low
    if high >= 1.0:
        mask &= y_pred_proba_test <= high
    else:
        mask &= y_pred_proba_test < high

    n_in_bucket = int(mask.sum())
    if n_in_bucket == 0:
        continue  # empty buckets are left out of the chart entirely

    bucket_names.append(f'{low:.2f}-{high:.2f}')
    win_rates.append(y_true[mask].mean() * 100)
    counts.append(n_in_bucket)

x_pos = range(len(bucket_names))
bars = ax.bar(x_pos, win_rates, alpha=0.7)
ax.axhline(50, color='red', linestyle='--', alpha=0.5, label='Random (50%)')
ax.set_xlabel('Probability Bucket')
ax.set_ylabel('Win Rate (%)')
ax.set_title('Win Rate by Confidence Bucket', fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(bucket_names, rotation=45)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Add count labels just above each bar
for bar, count in zip(bars, counts):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'n={count}', ha='center', fontsize=9)

plt.tight_layout()
plt.show()

# Headline statistics for the test-set probabilities
separator = "=" * 80
near_half = (y_pred_proba_test >= 0.45) & (y_pred_proba_test <= 0.55)
high_confidence = (y_pred_proba_test > 0.6) | (y_pred_proba_test < 0.4)

summary_lines = [
    "\n" + separator,
    "KEY INSIGHTS:",
    separator,
    f"Mean predicted probability: {y_pred_proba_test.mean():.3f}",
    f"Std predicted probability: {y_pred_proba_test.std():.3f}",
    f"% predictions near 0.5 (0.45-0.55): {near_half.mean()*100:.1f}%",
    f"% high confidence (>0.6 or <0.4): {high_confidence.mean()*100:.1f}%",
]
print("\n".join(summary_lines))

## 3. Feature Importance Analysis

# Top 20 features
# nlargest ranks by the 'importance' column explicitly, so the chart is right
# even when the stored table is not pre-sorted (head(20) alone relies on that
# ordering). Rows come back in descending order of importance.
importance_df = metadata['feature_importance'].nlargest(20, 'importance')

fig_imp, ax_imp = plt.subplots(figsize=(12, 8))
bar_positions = range(len(importance_df))
ax_imp.barh(bar_positions, importance_df['importance'])
ax_imp.set_yticks(bar_positions)
ax_imp.set_yticklabels(importance_df['feature'])
# barh draws position 0 at the bottom; flip the axis so the most important
# feature is at the top of the chart
ax_imp.invert_yaxis()
ax_imp.set_xlabel('Importance Score')
ax_imp.set_title('Top 20 Most Important Features', fontweight='bold')
ax_imp.grid(alpha=0.3, axis='x')
fig_imp.tight_layout()
plt.show()
