In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MLB Team Success Predictor - Feature Engineering\n",
    "\n",
    "This notebook creates engineered features to improve model performance.\n",
    "\n",
    "## Objectives:\n",
    "1. Create performance-based features\n",
    "2. Engineer historical/lag features\n",
    "3. Generate era-adjusted statistics\n",
    "4. Build milestone and target variables\n",
    "5. Validate and select features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Add project root to path\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "# Import custom modules\n",
    "from src.data.data_preprocessor import DataPreprocessor\n",
    "from src.data.feature_engineering import FeatureEngineer\n",
    "from src.utils.config import CLASSIFICATION_FEATURES, REGRESSION_FEATURES\n",
    "from src.utils.constants import ERA_DEFINITIONS\n",
    "\n",
    "# Set up visualization\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "%matplotlib inline\n",
    "\n",
    "print(\"Libraries loaded successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Preprocess Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the explored dataset saved by the previous notebook\n",
    "data_path = Path('../data/processed/mlb_data_explored.csv')\n",
    "df = pd.read_csv(data_path)\n",
    "\n",
    "# Report the table dimensions and the range of the 'year' column\n",
    "years = df['year']\n",
    "print(f\"Data loaded: {df.shape}\")\n",
    "print(f\"Years: {years.min()} - {years.max()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess data\n",
    "preprocessor = DataPreprocessor(era_strategy='all')  # Keep all eras for now\n",
    "df_processed = preprocessor.preprocess(df)\n",
    "\n",
    "print(f\"Processed data shape: {df_processed.shape}\")\n",
    "print(f\"\\nNew columns from preprocessing:\")\n",
    "# Collect the added columns in DataFrame order: a set difference has no stable\n",
    "# order, so the printed list could change from one kernel run to the next\n",
    "new_cols = [col for col in df_processed.columns if col not in df.columns]\n",
    "print(new_cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Feature Engineering and Feature Groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the feature engineer with era features enabled\n",
    "engineer = FeatureEngineer(include_era_features=True)\n",
    "\n",
    "# Run the feature-engineering pipeline on the preprocessed frame\n",
    "df_engineered = engineer.engineer_features(df_processed)\n",
    "\n",
    "# New features = columns present after engineering minus columns before it\n",
    "n_new_features = len(df_engineered.columns) - len(df_processed.columns)\n",
    "print(f\"Engineered data shape: {df_engineered.shape}\")\n",
    "print(f\"Total new features created: {n_new_features}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show every feature group with its size and a short preview\n",
    "feature_groups = engineer.get_feature_groups()\n",
    "\n",
    "for group_name, features in feature_groups.items():\n",
    "    print(f\"\\n{group_name.upper()} Features ({len(features)}):\")\n",
    "    # Long groups are abbreviated to their first five names\n",
    "    if len(features) > 5:\n",
    "        print(f\"  Sample: {features[:5]}...\")\n",
    "    else:\n",
    "        print(f\"  {features}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Feature Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze new features\n",
    "performance_features = feature_groups.get('performance', [])\n",
    "\n",
    "# Keep only names that exist as columns (at most four) so that no panel is\n",
    "# reserved for a feature that cannot be plotted\n",
    "plot_features = [name for name in performance_features if name in df_engineered.columns][:4]\n",
    "\n",
    "if len(plot_features) > 0:\n",
    "    fig, axes = plt.subplots(2, 2, figsize=(12, 10))\n",
    "    axes = axes.ravel()\n",
    "\n",
    "    for ax, feature in zip(axes, plot_features):\n",
    "        df_engineered[feature].hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)\n",
    "        ax.set_title(f'Distribution of {feature}')\n",
    "        ax.set_xlabel(feature)\n",
    "\n",
    "        # Add mean line, labelled near the top of the panel\n",
    "        mean_val = df_engineered[feature].mean()\n",
    "        ax.axvline(mean_val, color='red', linestyle='--', linewidth=2)\n",
    "        ax.text(mean_val, ax.get_ylim()[1]*0.9, f'Mean: {mean_val:.3f}',\n",
    "                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))\n",
    "\n",
    "    # Hide the panels left empty when fewer than four features are plotted\n",
    "    for ax in axes[len(plot_features):]:\n",
    "        ax.set_visible(False)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values in engineered features\n",
    "# Restrict to names that exist as columns: selecting an absent label raises KeyError\n",
    "engineered_cols = [name for name in engineer.feature_names if name in df_engineered.columns]\n",
    "missing_features = df_engineered[engineered_cols].isnull().sum()\n",
    "missing_features = missing_features[missing_features > 0].sort_values(ascending=False)\n",
    "\n",
    "if len(missing_features) > 0:\n",
    "    print(\"Features with missing values:\")\n",
    "    print(missing_features.head(10))\n",
    "\n",
    "    # Visualize missing pattern as a percentage of all rows\n",
    "    fig, ax = plt.subplots(figsize=(10, 6))\n",
    "    missing_pct = (missing_features / len(df_engineered)) * 100\n",
    "    missing_pct.head(20).plot(kind='bar', ax=ax)\n",
    "    ax.set_title('Top 20 Features with Missing Values')\n",
    "    ax.set_xlabel('Feature')\n",
    "    ax.set_ylabel('Missing Percentage')\n",
    "    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Historical Features Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick out the feature names that contain 'prev_' or '_avg'\n",
    "historical_features = [\n",
    "    name for name in engineer.feature_names\n",
    "    if any(marker in name for marker in ('prev_', '_avg'))\n",
    "]\n",
    "\n",
    "print(f\"Historical features created: {len(historical_features)}\")\n",
    "print(f\"Sample: {historical_features[:10]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize relationship between current and previous year performance\n",
    "# Require both columns up front: the scatter and the regression read 'wins' as well as 'prev_wins'\n",
    "if 'wins' in df_engineered.columns and 'prev_wins' in df_engineered.columns:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "    # Scatter plot of rows where both values are present\n",
    "    ax = axes[0]\n",
    "    valid_data = df_engineered.dropna(subset=['wins', 'prev_wins'])\n",
    "    ax.scatter(valid_data['prev_wins'], valid_data['wins'], alpha=0.5, s=10)\n",
    "\n",
    "    # Add regression line; skipped when there are not at least two distinct x values,\n",
    "    # because linregress cannot fit a line to such data\n",
    "    if valid_data['prev_wins'].nunique() > 1:\n",
    "        from scipy import stats\n",
    "        # Use the named result fields instead of unpacking into '_', which would\n",
    "        # overwrite IPython's last-output variable\n",
    "        fit = stats.linregress(valid_data['prev_wins'], valid_data['wins'])\n",
    "        x_line = np.linspace(valid_data['prev_wins'].min(), valid_data['prev_wins'].max(), 100)\n",
    "        y_line = fit.slope * x_line + fit.intercept\n",
    "        ax.plot(x_line, y_line, 'r-', linewidth=2, label=f'R² = {fit.rvalue**2:.3f}')\n",
    "        ax.legend()\n",
    "\n",
    "    ax.set_xlabel('Previous Year Wins')\n",
    "    ax.set_ylabel('Current Year Wins')\n",
    "    ax.set_title('Year-over-Year Win Correlation')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # Distribution of win changes\n",
    "    ax = axes[1]\n",
    "    if 'wins_change' in df_engineered.columns:\n",
    "        win_changes = df_engineered['wins_change'].dropna()\n",
    "        ax.hist(win_changes, bins=30, edgecolor='black', alpha=0.7)\n",
    "        ax.axvline(0, color='red', linestyle='--', linewidth=2)\n",
    "        ax.set_xlabel('Change in Wins')\n",
    "        ax.set_ylabel('Frequency')\n",
    "        ax.set_title('Distribution of Year-over-Year Win Changes')\n",
    "\n",
    "        # Add statistics in the top-left corner (axes coordinates)\n",
    "        mean_change = win_changes.mean()\n",
    "        std_change = win_changes.std()\n",
    "        ax.text(0.05, 0.95, f'Mean: {mean_change:.1f}\\nStd: {std_change:.1f}',\n",
    "                transform=ax.transAxes, verticalalignment='top',\n",
    "                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))\n",
    "    else:\n",
    "        # Nothing to draw here, so hide the otherwise blank panel\n",
    "        ax.set_visible(False)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Era-Adjusted Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze era-adjusted features\n",
    "# NOTE(review): this is a plain substring match, so any name that merely contains\n",
    "# 'era' (for example one containing 'average') is counted too - confirm against\n",
    "# the actual contents of engineer.feature_names\n",
    "era_features = [f for f in engineer.feature_names if 'era' in f]\n",
    "\n",
    "print(f\"Era-adjusted features: {len(era_features)}\")\n",
    "\n",
    "# Compare raw vs era-adjusted metrics\n",
    "# Both box plots group by 'era' and the first one reads 'wins', so require every\n",
    "# column they touch instead of only the z-score column\n",
    "required_cols = {'wins', 'era', 'wins_era_zscore'}\n",
    "if required_cols.issubset(df_engineered.columns):\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "    # Raw wins by era\n",
    "    ax = axes[0]\n",
    "    df_engineered.boxplot(column='wins', by='era', ax=ax)\n",
    "    ax.set_title('Raw Wins by Era')\n",
    "    ax.set_xlabel('Era')\n",
    "    ax.set_ylabel('Wins')\n",
    "\n",
    "    # Era-adjusted wins (z-score), with a reference line at zero\n",
    "    ax = axes[1]\n",
    "    df_engineered.boxplot(column='wins_era_zscore', by='era', ax=ax)\n",
    "    ax.set_title('Era-Adjusted Wins (Z-Score)')\n",
    "    ax.set_xlabel('Era')\n",
    "    ax.set_ylabel('Z-Score')\n",
    "    ax.axhline(0, color='red', linestyle='--', alpha=0.5)\n",
    "\n",
    "    # Remove the default figure-level title once, after both box plots are drawn\n",
    "    fig.suptitle('')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Target Variable Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise each target column that is present in the engineered data\n",
    "target_vars = ['is_division_winner', 'made_playoffs', 'achieved_90_wins', \n",
    "               'achieved_100_wins', 'scored_800_runs']\n",
    "\n",
    "# One record per available target: mean, sum and non-null count of the column\n",
    "target_summary = [\n",
    "    {\n",
    "        'Target': name,\n",
    "        'Positive_Rate': df_engineered[name].mean(),\n",
    "        'Positive_Count': df_engineered[name].sum(),\n",
    "        'Total_Count': df_engineered[name].count(),\n",
    "    }\n",
    "    for name in target_vars\n",
    "    if name in df_engineered.columns\n",
    "]\n",
    "\n",
    "target_df = pd.DataFrame(target_summary)\n",
    "print(\"Target Variable Summary:\")\n",
    "print(target_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize target distributions\n",
    "# The figure is created inside the guard so that an empty summary does not\n",
    "# leave a blank figure in the cell output\n",
    "if len(target_df) > 0:\n",
    "    fig, ax = plt.subplots(figsize=(10, 6))\n",
    "    target_df.plot(x='Target', y='Positive_Rate', kind='bar', ax=ax)\n",
    "    ax.set_title('Target Variable Positive Class Rates')\n",
    "    ax.set_xlabel('Target Variable')\n",
    "    ax.set_ylabel('Positive Rate')\n",
    "    # Rotate the existing tick labels in place rather than re-setting them\n",
    "    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
    "\n",
    "    # Add percentage labels just above each bar\n",
    "    for i, v in enumerate(target_df['Positive_Rate']):\n",
    "        ax.text(i, v + 0.01, f'{v:.1%}', ha='center')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Feature Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the engineer for the feature list of each modeling task\n",
    "classification_features = engineer.select_features_for_model('division_winner')\n",
    "regression_features = engineer.select_features_for_model('win_total')\n",
    "\n",
    "print(f\"Classification features selected: {len(classification_features)}\")\n",
    "print(f\"Regression features selected: {len(regression_features)}\")\n",
    "\n",
    "# Keep only the selected names that exist as columns in the engineered data\n",
    "engineered_columns = set(df_engineered.columns)\n",
    "available_class_features = [name for name in classification_features if name in engineered_columns]\n",
    "available_reg_features = [name for name in regression_features if name in engineered_columns]\n",
    "\n",
    "print(f\"\\nAvailable classification features: {len(available_class_features)}\")\n",
    "print(f\"Available regression features: {len(available_reg_features)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature correlation with targets\n",
    "if 'is_division_winner' in df_engineered.columns and len(available_class_features) > 0:\n",
    "    target_series = df_engineered['is_division_winner']\n",
    "\n",
    "    # Correlate the first 20 available features (list order, not a ranking) with the target\n",
    "    feature_corrs = [\n",
    "        {'Feature': name, 'Correlation': df_engineered[name].corr(target_series)}\n",
    "        for name in available_class_features[:20]\n",
    "        if name in df_engineered.columns\n",
    "    ]\n",
    "\n",
    "    # Order rows by absolute correlation, strongest first\n",
    "    corr_df = pd.DataFrame(feature_corrs).sort_values('Correlation', key=abs, ascending=False)\n",
    "\n",
    "    # Green bars for positive correlations, red for everything else\n",
    "    colors = ['green' if x > 0 else 'red' for x in corr_df['Correlation']]\n",
    "\n",
    "    # Plot\n",
    "    plt.figure(figsize=(10, 8))\n",
    "    plt.barh(corr_df['Feature'], corr_df['Correlation'], color=colors, alpha=0.7)\n",
    "    plt.xlabel('Correlation with Division Winner')\n",
    "    plt.title('Feature Correlations with Division Winner Target')\n",
    "    plt.axvline(0, color='black', linestyle='-', linewidth=0.5)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Feature Importance Preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick feature importance check using Random Forest\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Prepare data for quick test\n",
    "if 'is_division_winner' in df_engineered.columns and len(available_class_features) > 10:\n",
    "    # Limit features for quick test; never let the target itself be a predictor\n",
    "    test_features = [name for name in available_class_features if name != 'is_division_winner'][:30]\n",
    "    # Remove rows with missing values\n",
    "    test_df = df_engineered[test_features + ['is_division_winner']].dropna()\n",
    "\n",
    "    if len(test_df) > 100:\n",
    "        X = test_df[test_features]\n",
    "        y = test_df['is_division_winner']\n",
    "\n",
    "        # Stratify so the class ratio of the target is the same in train and test.\n",
    "        # Stratification needs at least two rows per class, so fall back to a\n",
    "        # plain random split when that does not hold.\n",
    "        class_counts = y.value_counts()\n",
    "        stratify_labels = y if len(class_counts) > 1 and class_counts.min() >= 2 else None\n",
    "        X_train, X_test, y_train, y_test = train_test_split(\n",
    "            X, y, test_size=0.2, random_state=42, stratify=stratify_labels\n",
    "        )\n",
    "\n",
    "        # Train simple model\n",
    "        rf = RandomForestClassifier(n_estimators=50, random_state=42)\n",
    "        rf.fit(X_train, y_train)\n",
    "\n",
    "        # Get feature importance, most important first\n",
    "        importance_df = pd.DataFrame({\n",
    "            'feature': test_features,\n",
    "            'importance': rf.feature_importances_\n",
    "        }).sort_values('importance', ascending=False)\n",
    "\n",
    "        # Plot top 15 features\n",
    "        plt.figure(figsize=(10, 8))\n",
    "        top_features = importance_df.head(15)\n",
    "        plt.barh(range(len(top_features)), top_features['importance'])\n",
    "        plt.yticks(range(len(top_features)), top_features['feature'])\n",
    "        plt.xlabel('Importance')\n",
    "        plt.title('Preliminary Feature Importance (Random Forest)')\n",
    "        plt.gca().invert_yaxis()  # largest importance at the top\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "\n",
    "        print(f\"\\nQuick model accuracy: {rf.score(X_test, y_test):.3f}\")\n",
    "        # Accuracy alone is hard to read on an unbalanced target, so also show\n",
    "        # what always predicting the most frequent test class would score\n",
    "        baseline_accuracy = y_test.value_counts(normalize=True).max()\n",
    "        print(f\"Majority-class baseline accuracy: {baseline_accuracy:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Feature Engineering Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate feature engineering summary\n",
    "# Column-name prefixes that mark target variables. 'scored_' is included so that\n",
    "# 'scored_800_runs', which the target analysis lists as a target, is counted too.\n",
    "TARGET_PREFIXES = ('is_', 'achieved_', 'made_', 'scored_')\n",
    "target_columns = [col for col in df_engineered.columns if col.startswith(TARGET_PREFIXES)]\n",
    "\n",
    "summary = {\n",
    "    'Total Features Created': len(engineer.feature_names),\n",
    "    'Performance Features': len(feature_groups.get('performance', [])),\n",
    "    'Efficiency Features': len(feature_groups.get('efficiency', [])),\n",
    "    'Historical Features': len(feature_groups.get('historical', [])),\n",
    "    'Era-Adjusted Features': len(feature_groups.get('era_adjusted', [])),\n",
    "    'Target Variables': len(target_columns)\n",
    "}\n",
    "\n",
    "print(\"\\nFeature Engineering Summary:\")\n",
    "for key, value in summary.items():\n",
    "    print(f\"- {key}: {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the engineered dataset to CSV without the index column\n",
    "output_path = Path('../data/processed/mlb_data_engineered.csv')\n",
    "df_engineered.to_csv(output_path, index=False)\n",
    "print(f\"\\nEngineered data saved to {output_path}\")\n",
    "\n",
    "# Write the feature lists to a JSON file\n",
    "import json\n",
    "\n",
    "feature_path = Path('../data/processed/feature_lists.json')\n",
    "feature_lists = {\n",
    "    'all_features': engineer.feature_names,\n",
    "    'classification_features': available_class_features,\n",
    "    'regression_features': available_reg_features,\n",
    "    'feature_groups': dict(feature_groups.items())\n",
    "}\n",
    "\n",
    "with feature_path.open('w') as handle:\n",
    "    json.dump(feature_lists, handle, indent=2)\n",
    "print(f\"Feature lists saved to {feature_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}