In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MLB Team Success Predictor - Model Development\n",
    "\n",
    "This notebook develops and trains machine learning models for predicting MLB team success.\n",
    "\n",
    "## Objectives:\n",
    "1. Prepare data for modeling\n",
    "2. Train classification models for division winners\n",
    "3. Train regression models for win totals\n",
    "4. Develop milestone prediction models\n",
    "5. Implement ensemble methods\n",
    "6. Perform hyperparameter tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "import json\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Add project root to path\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "# Import custom modules\n",
    "from src.training.train_classifier import ClassifierTrainer\n",
    "from src.training.train_regressor import RegressorTrainer\n",
    "from src.training.hyperparameter_tuning import HyperparameterTuner\n",
    "from src.training.cross_validation import CrossValidator\n",
    "from src.models.ensemble_models import MLBEnsembleModel\n",
    "from src.visualization.model_plots import ModelVisualizer\n",
    "\n",
    "# Set random seed\n",
    "from src.utils.helpers import set_random_seed\n",
    "set_random_seed(42)\n",
    "\n",
    "print(\"Libraries loaded successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Engineered Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the engineered dataset\n",
    "data_path = Path('../data/processed/mlb_data_engineered.csv')\n",
    "df = pd.read_csv(data_path)\n",
    "\n",
    "# Read the feature-name lists stored next to it\n",
    "with open('../data/processed/feature_lists.json', 'r') as feature_file:\n",
    "    feature_lists = json.load(feature_file)\n",
    "\n",
    "# Report the size of the frame and of each feature list\n",
    "print(f\"Data shape: {df.shape}\")\n",
    "n_class_features = len(feature_lists['classification_features'])\n",
    "print(f\"Classification features: {n_class_features}\")\n",
    "n_reg_features = len(feature_lists['regression_features'])\n",
    "print(f\"Regression features: {n_reg_features}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only seasons from 2006 onward for the initial models\n",
    "is_modern = df['year'] >= 2006\n",
    "modern_df = df.loc[is_modern].copy()\n",
    "\n",
    "print(f\"\\nModern era data shape: {modern_df.shape}\")\n",
    "first_year, last_year = modern_df['year'].min(), modern_df['year'].max()\n",
    "print(f\"Years: {first_year} - {last_year}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Division Winner Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize classifier trainer for the division-winner task\n",
    "classifier_trainer = ClassifierTrainer(\n",
    "    task_type='division_winner',\n",
    "    era_strategy='modern',\n",
    "    model_types=['logistic', 'random_forest', 'xgboost', 'lightgbm']\n",
    ")\n",
    "\n",
    "# Use the classification feature list loaded from feature_lists.json\n",
    "classification_features = feature_lists['classification_features']\n",
    "\n",
    "# Prepare train / test / validation splits\n",
    "# NOTE(review): the unpack order (train, test, ..., val last) is assumed to match\n",
    "# what prepare_data returns - confirm against ClassifierTrainer.prepare_data\n",
    "X_train, X_test, y_train, y_test, X_val, y_val = classifier_trainer.prepare_data(\n",
    "    custom_features=classification_features\n",
    ")\n",
    "\n",
    "print(f\"Training set shape: {X_train.shape}\")\n",
    "print(f\"Validation set shape: {X_val.shape}\")\n",
    "print(f\"Test set shape: {X_test.shape}\")\n",
    "print(f\"\\nClass distribution:\")\n",
    "print(f\"Training: {np.bincount(y_train)}\")\n",
    "print(f\"Test: {np.bincount(y_test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit all configured classifiers (training and validation splits are both supplied)\n",
    "classifier_trainer.train_models(X_train, y_train, X_val, y_val)\n",
    "\n",
    "# Pick the best model using ROC AUC as the selection metric\n",
    "best_classifier = classifier_trainer.select_best_model(metric='roc_auc')\n",
    "\n",
    "# Tabulate the recorded metrics, one row per model\n",
    "summary_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']\n",
    "results_df = pd.DataFrame(classifier_trainer.results).T\n",
    "print(\"\\nModel Performance Summary:\")\n",
    "print(results_df[summary_metrics].round(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the classifier trainer on the test split\n",
    "test_results = classifier_trainer.evaluate_on_test_set(X_test, y_test)\n",
    "\n",
    "# (display label, key in test_results) pairs, printed in this order\n",
    "classification_report_rows = [\n",
    "    ('Accuracy', 'accuracy'),\n",
    "    ('ROC AUC', 'roc_auc'),\n",
    "    ('Precision', 'precision'),\n",
    "    ('Recall', 'recall'),\n",
    "]\n",
    "\n",
    "print(\"\\nTest Set Performance:\")\n",
    "for label, key in classification_report_rows:\n",
    "    print(f\"{label}: {test_results[key]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Win Total Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up the trainer for the win-total regression task\n",
    "regressor_trainer = RegressorTrainer(\n",
    "    target_type='wins',\n",
    "    era_strategy='modern',\n",
    "    model_types=['ridge', 'random_forest', 'xgboost', 'lightgbm']\n",
    ")\n",
    "\n",
    "# Build the splits from the regression feature list\n",
    "regression_splits = regressor_trainer.prepare_data(\n",
    "    custom_features=feature_lists['regression_features']\n",
    ")\n",
    "X_train_r, X_test_r, y_train_r, y_test_r, X_val_r, y_val_r = regression_splits\n",
    "\n",
    "print(f\"Regression data prepared\")\n",
    "print(f\"Target statistics:\")\n",
    "for split_name, target in [('Train', y_train_r), ('Test', y_test_r)]:\n",
    "    print(f\"  {split_name} mean: {target.mean():.1f}, std: {target.std():.1f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit all configured regressors (training and validation splits are both supplied)\n",
    "regressor_trainer.train_models(X_train_r, y_train_r, X_val_r, y_val_r)\n",
    "\n",
    "# Pick the best model using RMSE, where lower is better\n",
    "best_regressor = regressor_trainer.select_best_model(metric='rmse', minimize=True)\n",
    "\n",
    "# Tabulate the recorded metrics, one row per model\n",
    "regression_metrics = ['rmse', 'mae', 'r2', 'within_5_wins', 'within_10_wins']\n",
    "reg_results_df = pd.DataFrame(regressor_trainer.results).T\n",
    "print(\"\\nRegression Model Performance:\")\n",
    "print(reg_results_df[regression_metrics].round(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the regression trainer on the test split\n",
    "test_results_r = regressor_trainer.evaluate_on_test_set(X_test_r, y_test_r)\n",
    "\n",
    "# (display label, key in test_results_r, format spec), printed in this order\n",
    "regression_report_rows = [\n",
    "    ('RMSE', 'rmse', '.2f'),\n",
    "    ('MAE', 'mae', '.2f'),\n",
    "    ('R²', 'r2', '.3f'),\n",
    "    ('Within 5 wins', 'within_5_wins', '.1%'),\n",
    "    ('Within 10 wins', 'within_10_wins', '.1%'),\n",
    "]\n",
    "\n",
    "print(\"\\nTest Set Performance:\")\n",
    "for label, key, spec in regression_report_rows:\n",
    "    print(f\"{label}: {test_results_r[key]:{spec}}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Cross-Validation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross-validate the selected classifier with 5 stratified folds\n",
    "cv = CrossValidator(cv_strategy='stratified', n_folds=5)\n",
    "class_scoring = ('accuracy', 'roc_auc', 'f1')\n",
    "\n",
    "print(\"Classification Model Cross-Validation:\")\n",
    "\n",
    "# Train and validation rows are stacked so every fold draws from both\n",
    "X_class_cv = np.vstack([X_train, X_val])\n",
    "y_class_cv = np.hstack([y_train, y_val])\n",
    "\n",
    "cv_results_class = cv.validate_model(\n",
    "    classifier_trainer.best_model.model,\n",
    "    X_class_cv,\n",
    "    y_class_cv,\n",
    "    scoring=list(class_scoring)\n",
    ")\n",
    "\n",
    "# Results are read back through '<metric>_mean' / '<metric>_std' keys\n",
    "for metric_name in class_scoring:\n",
    "    print(f\"  {metric_name}: {cv_results_class[f'{metric_name}_mean']:.3f} \"\n",
    "          f\"(+/- {cv_results_class[f'{metric_name}_std']:.3f})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression CV: 5-fold k-fold on the stacked train + validation rows\n",
    "cv_reg = CrossValidator(cv_strategy='kfold', n_folds=5)\n",
    "\n",
    "print(\"\\nRegression Model Cross-Validation:\")\n",
    "cv_results_reg = cv_reg.validate_model(\n",
    "    regressor_trainer.best_model.model,\n",
    "    np.vstack([X_train_r, X_val_r]),\n",
    "    np.hstack([y_train_r, y_val_r]),\n",
    "    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']\n",
    ")\n",
    "\n",
    "# The 'neg_*' scorers are negated errors, so flip the sign to recover MSE / MAE\n",
    "mse_mean = -cv_results_reg['neg_mean_squared_error_mean']\n",
    "mse_std = cv_results_reg['neg_mean_squared_error_std']\n",
    "rmse_mean = np.sqrt(mse_mean)\n",
    "\n",
    "# sqrt(std(MSE)) is not the spread of RMSE and overstates it. Propagate the MSE\n",
    "# spread through the square root instead (first-order approximation):\n",
    "#   std(RMSE) ~= std(MSE) / (2 * RMSE)\n",
    "rmse_std = mse_std / (2 * rmse_mean) if rmse_mean > 0 else 0.0\n",
    "\n",
    "print(f\"  RMSE: {rmse_mean:.2f} (+/- {rmse_std:.2f})\")\n",
    "print(f\"  MAE: {-cv_results_reg['neg_mean_absolute_error_mean']:.2f} \"\n",
    "      f\"(+/- {cv_results_reg['neg_mean_absolute_error_std']:.2f})\")\n",
    "print(f\"  R²: {cv_results_reg['r2_mean']:.3f} (+/- {cv_results_reg['r2_std']:.3f})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Hyperparameter Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Randomized hyperparameter search for the XGBoost classifier\n",
    "print(\"Hyperparameter tuning for XGBoost classifier...\")\n",
    "\n",
    "tuner_settings = {\n",
    "    'model_class': 'classification',\n",
    "    'model_type': 'xgboost',\n",
    "    'search_method': 'random',  # random rather than bayesian, which would need skopt\n",
    "    'n_iter': 20,\n",
    "    'cv_folds': 3,\n",
    "}\n",
    "tuner = HyperparameterTuner(**tuner_settings)\n",
    "\n",
    "# Tune on the stacked train + validation rows\n",
    "X_tune = np.vstack([X_train, X_val])\n",
    "y_tune = np.hstack([y_train, y_val])\n",
    "tuning_results = tuner.optimize(X_tune, y_tune)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the winning parameter set, one parameter per line\n",
    "print(\"\\nBest parameters found:\")\n",
    "best_params = tuning_results['best_params']\n",
    "for param_name, param_value in best_params.items():\n",
    "    print(f\"  {param_name}: {param_value}\")\n",
    "\n",
    "# ...followed by the score stored under 'best_score'\n",
    "best_score = tuning_results['best_score']\n",
    "print(f\"\\nBest cross-validation score: {best_score:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Ensemble Model Development"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create ensemble model for classification\n",
    "from sklearn.metrics import accuracy_score, roc_auc_score\n",
    "\n",
    "\n",
    "def _recorded_roc_auc(model_name):\n",
    "    \"\"\"Return the ROC AUC stored in classifier_trainer.results for a model.\n",
    "\n",
    "    Falls back to -inf when the entry is missing, non-numeric, or not finite,\n",
    "    so such models sort last.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        score = float(classifier_trainer.results.get(model_name, {}).get('roc_auc'))\n",
    "    except (TypeError, ValueError):\n",
    "        return float('-inf')\n",
    "    return score if np.isfinite(score) else float('-inf')\n",
    "\n",
    "\n",
    "# Rank the trained classifiers by recorded ROC AUC and keep the three best.\n",
    "# Previously the first three dict entries were taken regardless of performance.\n",
    "# sorted() is stable, so models without a usable score keep their original order.\n",
    "# NOTE(review): assumes classifier_trainer.models and .results share keys - confirm\n",
    "trained_models = {\n",
    "    name: model\n",
    "    for name, model in classifier_trainer.models.items()\n",
    "    if model is not None\n",
    "}\n",
    "ranked_names = sorted(trained_models, key=_recorded_roc_auc, reverse=True)\n",
    "top_models = [(name, trained_models[name].model) for name in ranked_names[:3]]\n",
    "print(f\"Ensemble members: {[name for name, _ in top_models]}\")\n",
    "\n",
    "# Create voting ensemble\n",
    "ensemble_classifier = MLBEnsembleModel(\n",
    "    ensemble_type='voting',\n",
    "    task_type='classification',\n",
    "    models=top_models\n",
    ")\n",
    "\n",
    "# Train ensemble\n",
    "ensemble_classifier.train(X_train, y_train, X_val, y_val)\n",
    "\n",
    "# Evaluate on the test split; column 1 of predict_proba is used as the positive-class score\n",
    "ensemble_pred = ensemble_classifier.predict(X_test)\n",
    "ensemble_proba = ensemble_classifier.predict_proba(X_test)\n",
    "\n",
    "print(\"Ensemble Classifier Performance:\")\n",
    "print(f\"  Accuracy: {accuracy_score(y_test, ensemble_pred):.3f}\")\n",
    "print(f\"  ROC AUC: {roc_auc_score(y_test, ensemble_proba[:, 1]):.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Milestone Prediction Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up the milestone predictor and its targets\n",
    "from src.models.milestone_predictor import MilestonePredictor\n",
    "\n",
    "milestone_predictor = MilestonePredictor()\n",
    "\n",
    "# Candidate milestone columns; only those present in modern_df are kept\n",
    "milestone_columns = ['achieved_90_wins', 'achieved_100_wins', 'scored_800_runs']\n",
    "available_milestones = [col for col in milestone_columns if col in modern_df.columns]\n",
    "print(f\"Available milestones: {available_milestones}\")\n",
    "\n",
    "if available_milestones:\n",
    "    # Inputs are the first 30 regression features; rows with any missing value are dropped\n",
    "    milestone_features = feature_lists['regression_features'][:30]\n",
    "    milestone_df = modern_df[milestone_features + available_milestones].dropna()\n",
    "\n",
    "    X_milestone = milestone_df[milestone_features].values\n",
    "    y_milestone = milestone_df[available_milestones].values\n",
    "\n",
    "    print(f\"Milestone data shape: {X_milestone.shape}\")\n",
    "    print(f\"Milestone achievement rates:\")\n",
    "    # Each column of y_milestone lines up with one entry of available_milestones\n",
    "    for milestone, achieved in zip(available_milestones, y_milestone.T):\n",
    "        print(f\"  {milestone}: {achieved.mean():.1%}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance for the selected classifier, when the model exposes it\n",
    "best_model = classifier_trainer.best_model\n",
    "\n",
    "if hasattr(best_model, 'get_feature_importance'):\n",
    "    importance_df = best_model.get_feature_importance()\n",
    "\n",
    "    if importance_df is not None:\n",
    "        visualizer = ModelVisualizer()\n",
    "        # NOTE(review): importance_df is not handed to the plot call - confirm that\n",
    "        # ModelVisualizer.plot_feature_importance can locate the importances itself\n",
    "        fig = visualizer.plot_feature_importance(top_n=20)\n",
    "        plt.show()\n",
    "\n",
    "        print(\"\\nTop 10 Most Important Features:\")\n",
    "        top_features = importance_df.head(10)\n",
    "        print(top_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare all models: one row per model whose results entry has no 'error' key\n",
    "comparison_columns = ['Model', 'Type', 'Accuracy', 'ROC_AUC', 'RMSE', 'R2', 'Train_Time']\n",
    "\n",
    "# Classification rows\n",
    "classification_rows = [\n",
    "    {\n",
    "        'Model': f'Class_{name}',\n",
    "        'Type': 'Classification',\n",
    "        'Accuracy': result.get('accuracy', 0),\n",
    "        'ROC_AUC': result.get('roc_auc', 0),\n",
    "        'Train_Time': result.get('train_time', 0),\n",
    "    }\n",
    "    for name, result in classifier_trainer.results.items()\n",
    "    if 'error' not in result\n",
    "]\n",
    "\n",
    "# Regression rows\n",
    "regression_rows = [\n",
    "    {\n",
    "        'Model': f'Reg_{name}',\n",
    "        'Type': 'Regression',\n",
    "        'RMSE': result.get('rmse', 0),\n",
    "        'R2': result.get('r2', 0),\n",
    "        'Train_Time': result.get('train_time', 0),\n",
    "    }\n",
    "    for name, result in regressor_trainer.results.items()\n",
    "    if 'error' not in result\n",
    "]\n",
    "\n",
    "comparison_data = classification_rows + regression_rows\n",
    "\n",
    "# Explicit columns keep the 'Type' filters below valid even when no model\n",
    "# produced results (an empty frame would otherwise have no 'Type' column).\n",
    "comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)\n",
    "\n",
    "# Visualize comparison: classification on the left, regression on the right\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "# Classification models: grouped bars for accuracy and ROC AUC\n",
    "class_df = comparison_df[comparison_df['Type'] == 'Classification']\n",
    "if len(class_df) > 0:\n",
    "    ax = axes[0]\n",
    "    x = np.arange(len(class_df))\n",
    "    width = 0.35\n",
    "\n",
    "    ax.bar(x - width/2, class_df['Accuracy'], width, label='Accuracy')\n",
    "    ax.bar(x + width/2, class_df['ROC_AUC'], width, label='ROC AUC')\n",
    "\n",
    "    ax.set_xlabel('Model')\n",
    "    ax.set_ylabel('Score')\n",
    "    ax.set_title('Classification Model Comparison')\n",
    "    ax.set_xticks(x)\n",
    "    ax.set_xticklabels([m.replace('Class_', '') for m in class_df['Model']], rotation=45)\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "# Regression models: RMSE bars with R² on a secondary y-axis\n",
    "reg_df = comparison_df[comparison_df['Type'] == 'Regression']\n",
    "if len(reg_df) > 0:\n",
    "    ax = axes[1]\n",
    "    x = np.arange(len(reg_df))\n",
    "\n",
    "    ax2 = ax.twinx()\n",
    "    ax.bar(x, reg_df['RMSE'], color='red', alpha=0.7, label='RMSE')\n",
    "    ax2.plot(x, reg_df['R2'], 'go-', linewidth=2, markersize=8, label='R²')\n",
    "\n",
    "    ax.set_xlabel('Model')\n",
    "    ax.set_ylabel('RMSE', color='red')\n",
    "    ax2.set_ylabel('R²', color='green')\n",
    "    ax.set_title('Regression Model Comparison')\n",
    "    ax.set_xticks(x)\n",
    "    ax.set_xticklabels([m.replace('Reg_', '') for m in reg_df['Model']], rotation=45)\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # Combine the legends of both y-axes into a single box\n",
    "    lines1, labels1 = ax.get_legend_handles_labels()\n",
    "    lines2, labels2 = ax2.get_legend_handles_labels()\n",
    "    ax.legend(lines1 + lines2, labels1 + labels2, loc='upper left')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Save Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save best models\n",
    "print(\"Saving models...\")\n",
    "\n",
    "# Save classification models (save_all=False is passed, i.e. not every trained model)\n",
    "classifier_trainer.save_models(save_all=False)\n",
    "\n",
    "# Save regression models\n",
    "regressor_trainer.save_models(save_all=False)\n",
    "\n",
    "# Build training reports\n",
    "class_report = classifier_trainer.generate_training_report()\n",
    "reg_report = regressor_trainer.generate_training_report()\n",
    "\n",
    "# Combine both reports with a timestamp\n",
    "combined_report = {\n",
    "    'classification': class_report,\n",
    "    'regression': reg_report,\n",
    "    'training_date': pd.Timestamp.now().isoformat()\n",
    "}\n",
    "\n",
    "\n",
    "def _to_jsonable(obj):\n",
    "    \"\"\"Fallback for json.dump: convert NumPy / pandas / Path objects to plain Python.\n",
    "\n",
    "    Only called for objects the json module cannot serialize on its own.\n",
    "    Raises TypeError for anything else, matching json's default behavior.\n",
    "    \"\"\"\n",
    "    if isinstance(obj, np.generic):\n",
    "        return obj.item()\n",
    "    if isinstance(obj, np.ndarray):\n",
    "        return obj.tolist()\n",
    "    if isinstance(obj, (pd.Timestamp, Path)):\n",
    "        return str(obj)\n",
    "    raise TypeError(f\"Object of type {type(obj).__name__} is not JSON serializable\")\n",
    "\n",
    "\n",
    "report_path = Path('../models/training_report.json')\n",
    "# Make sure the target directory exists before opening the file for writing\n",
    "report_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "with open(report_path, 'w') as f:\n",
    "    json.dump(combined_report, f, indent=2, default=_to_jsonable)\n",
    "\n",
    "print(f\"\\nModels and reports saved successfully!\")\n",
    "print(f\"Training report saved to: {report_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}