In [None]:
# notebooks/04_hyperparameter_tuning.ipynb
# Run in: VS Code or Colab

{
 "cells": [
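  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyperparameter Tuning for Stock Prediction Models\n",
    "Tune the Random Forest, XGBoost, LSTM, and ensemble-weight parameters for best performance, then compare the default and tuned models on the held-out test split."
   ]
  },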
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import TimeSeriesSplit, GridSearchCV\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Locate the repository root by walking up from the working directory until a\n",
    "# folder holding `src/` is found. Front ends differ on whether the kernel starts\n",
    "# in notebooks/ or in the repo root; fall back to the parent directory (the\n",
    "# original assumption) when nothing matches.\n",
    "_cwd = Path.cwd()\n",
    "project_root = next(\n",
    "    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / 'src').is_dir()),\n",
    "    _cwd.parent,\n",
    ")\n",
    "# Skip the insert when the path is already registered so that re-running this\n",
    "# cell does not stack duplicate sys.path entries.\n",
    "if str(project_root) not in sys.path:\n",
    "    sys.path.insert(0, str(project_root))\n",
    "\n",
    "from src.data.data_loader import DataLoader\n",
    "from src.models.hyperparameter_tuner import HyperparameterTuner\n",
    "from src.models.random_forest import RandomForestModel\n",
    "from src.models.xgboost_model import XGBoostModel\n",
    "from src.models.base_model import BaseModel\n",
    "\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loader = DataLoader()\n",
    "tuner = HyperparameterTuner()\n",
    "\n",
    "symbol = 'AAPL'\n",
    "df = data_loader.load_stock_data(symbol, period='2y')\n",
    "\n",
    "# Every later cell depends on this frame, so stop here with a readable message\n",
    "# instead of failing somewhere inside split_data().\n",
    "if df is None or len(df) == 0:\n",
    "    raise ValueError(f\"No data returned for {symbol}; check the symbol and the data source\")\n",
    "\n",
    "# The model instance is only needed here for its split_data() helper.\n",
    "base_model = RandomForestModel()\n",
    "X_train, X_test, y_train, y_test = base_model.split_data(df)\n",
    "\n",
    "print(f\"Training samples: {len(X_train)}\")\n",
    "print(f\"Testing samples: {len(X_test)}\")\n",
    "print(f\"Features: {X_train.shape[1]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Random Forest Hyperparameter Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Tuning Random Forest ===\")\n",
    "print(\"This may take several minutes...\")\n",
    "\n",
    "rf_best_params, rf_best_score = tuner.tune_random_forest(X_train, y_train, cv=3)\n",
    "\n",
    "if rf_best_params:\n",
    "    print(\"\\nBest Parameters:\")\n",
    "    for param, value in rf_best_params.items():\n",
    "        print(f\"  {param}: {value}\")\n",
    "    print(f\"\\nBest Cross-Validation R² Score: {rf_best_score:.4f}\")\n",
    "else:\n",
    "    # Mirrors the falsy-result guard used by the XGBoost and LSTM cells. Later\n",
    "    # cells unpack rf_best_params, so fall back to the model defaults rather\n",
    "    # than crashing on `.items()` here.\n",
    "    print(\"Random Forest tuning failed or not available; using default parameters\")\n",
    "    rf_best_params = {}\n",
    "\n",
    "# Baseline: constructor defaults, scored on the held-out test split.\n",
    "rf_default = RandomForestModel()\n",
    "rf_default.build()\n",
    "rf_default.train(X_train, y_train)\n",
    "default_metrics = rf_default.evaluate(X_test, y_test)\n",
    "\n",
    "# Same pipeline with the tuned parameters.\n",
    "rf_tuned = RandomForestModel(**rf_best_params)\n",
    "rf_tuned.build()\n",
    "rf_tuned.train(X_train, y_train)\n",
    "tuned_metrics = rf_tuned.evaluate(X_test, y_test)\n",
    "\n",
    "print(f\"\\nDefault Model - R²: {default_metrics['r2_score']:.4f}, RMSE: {default_metrics['rmse']:.4f}\")\n",
    "print(f\"Tuned Model   - R²: {tuned_metrics['r2_score']:.4f}, RMSE: {tuned_metrics['rmse']:.4f}\")\n",
    "print(f\"Improvement   - R²: {(tuned_metrics['r2_score'] - default_metrics['r2_score']):.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. XGBoost Hyperparameter Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Tuning XGBoost ===\")\n",
    "print(\"This may take several minutes...\")\n",
    "\n",
    "# Ask the tuner for the best XGBoost settings it finds on the training split.\n",
    "xgb_best_params, xgb_best_score = tuner.tune_xgboost(X_train, y_train, cv=3)\n",
    "\n",
    "if not xgb_best_params:\n",
    "    print(\"XGBoost tuning failed or not available\")\n",
    "else:\n",
    "    print(\"\\nBest Parameters:\")\n",
    "    for name, setting in xgb_best_params.items():\n",
    "        print(f\"  {name}: {setting}\")\n",
    "    print(f\"\\nBest Cross-Validation R² Score: {xgb_best_score:.4f}\")\n",
    "\n",
    "    # Baseline: constructor defaults, scored on the held-out test split.\n",
    "    xgb_default = XGBoostModel()\n",
    "    xgb_default.build()\n",
    "    xgb_default.train(X_train, y_train)\n",
    "    xgb_default_metrics = xgb_default.evaluate(X_test, y_test)\n",
    "\n",
    "    # Same pipeline with the tuned parameters.\n",
    "    xgb_tuned = XGBoostModel(**xgb_best_params)\n",
    "    xgb_tuned.build()\n",
    "    xgb_tuned.train(X_train, y_train)\n",
    "    xgb_tuned_metrics = xgb_tuned.evaluate(X_test, y_test)\n",
    "\n",
    "    xgb_r2_gain = xgb_tuned_metrics['r2_score'] - xgb_default_metrics['r2_score']\n",
    "    print(f\"\\nDefault Model - R²: {xgb_default_metrics['r2_score']:.4f}, RMSE: {xgb_default_metrics['rmse']:.4f}\")\n",
    "    print(f\"Tuned Model   - R²: {xgb_tuned_metrics['r2_score']:.4f}, RMSE: {xgb_tuned_metrics['rmse']:.4f}\")\n",
    "    print(f\"Improvement   - R²: {xgb_r2_gain:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. LSTM Lookback Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Tuning LSTM Lookback Period ===\")\n",
    "print(\"This may take several minutes...\")\n",
    "\n",
    "# NOTE(review): the Close column of the whole frame is passed together with\n",
    "# y_train, so the tuner presumably sees test-period prices as well - confirm\n",
    "# that tune_lstm does its own train/validation split.\n",
    "close_prices = df['Close'].values\n",
    "lstm_best_params, lstm_best_score = tuner.tune_lstm(\n",
    "    close_prices,\n",
    "    y_train,\n",
    "    lookback_values=[30, 60, 90],\n",
    ")\n",
    "\n",
    "if not lstm_best_params:\n",
    "    print(\"LSTM tuning failed or not available\")\n",
    "else:\n",
    "    print(f\"\\nBest Lookback Period: {lstm_best_params['lookback']}\")\n",
    "    print(f\"Best R² Score: {lstm_best_score:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Parameter Sensitivity Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_estimators_range = [50, 100, 150, 200, 250]\n",
    "max_depth_range = [5, 10, 15, 20, 25, 30, None]\n",
    "\n",
    "\n",
    "def rf_test_r2(**rf_kwargs):\n",
    "    \"\"\"Fit a RandomForestModel with the given settings and return its test-split R².\"\"\"\n",
    "    candidate = RandomForestModel(**rf_kwargs)\n",
    "    candidate.build()\n",
    "    candidate.train(X_train, y_train)\n",
    "    return candidate.evaluate(X_test, y_test)['r2_score']\n",
    "\n",
    "\n",
    "# Sweep one parameter at a time; everything else stays at the model defaults.\n",
    "n_estimators_scores = [rf_test_r2(n_estimators=n) for n in n_estimators_range]\n",
    "max_depth_scores = [rf_test_r2(max_depth=depth) for depth in max_depth_range]\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# max_depth includes None, so that panel is plotted against string labels.\n",
    "depth_labels = ['None' if d is None else str(d) for d in max_depth_range]\n",
    "\n",
    "sweep_panels = [\n",
    "    (n_estimators_range, n_estimators_scores,\n",
    "     'Effect of n_estimators on Model Performance', 'Number of Estimators'),\n",
    "    (depth_labels, max_depth_scores,\n",
    "     'Effect of max_depth on Model Performance', 'Max Depth'),\n",
    "]\n",
    "for ax, (x_values, scores, title, x_label) in zip(axes, sweep_panels):\n",
    "    ax.plot(x_values, scores, marker='o', linewidth=2)\n",
    "    ax.set_title(title, fontweight='bold')\n",
    "    ax.set_xlabel(x_label)\n",
    "    ax.set_ylabel('R² Score')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Cross-Validation with Best Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_results = tuner.cross_validate_with_tuning('random_forest', df, n_splits=5)\n",
    "\n",
    "if cv_results:\n",
    "    print(\"\\nCross-Validation Results:\")\n",
    "    print(f\"Average R²: {cv_results['avg_r2']:.4f}\")\n",
    "    print(f\"Average RMSE: {cv_results['avg_rmse']:.4f}\")\n",
    "\n",
    "    # Pull the per-fold metrics out of the tuner's result records.\n",
    "    fold_results = cv_results['fold_results']\n",
    "    r2_scores = [fold['metrics']['r2_score'] for fold in fold_results]\n",
    "    rmse_scores = [fold['metrics']['rmse'] for fold in fold_results]\n",
    "\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "    # One entry per panel: scores, average line, title, y label, extra bar kwargs.\n",
    "    fold_panels = [\n",
    "        (r2_scores, cv_results['avg_r2'], 'R² Score by Fold', 'R² Score', {}),\n",
    "        (rmse_scores, cv_results['avg_rmse'], 'RMSE by Fold', 'RMSE', {'color': 'orange'}),\n",
    "    ]\n",
    "    for ax, (scores, average, title, y_label, bar_kwargs) in zip(axes, fold_panels):\n",
    "        fold_numbers = range(1, len(scores) + 1)\n",
    "        ax.bar(fold_numbers, scores, **bar_kwargs)\n",
    "        ax.axhline(y=average, color='r', linestyle='--', label='Average')\n",
    "        ax.set_title(title, fontweight='bold')\n",
    "        ax.set_xlabel('Fold')\n",
    "        ax.set_ylabel(y_label)\n",
    "        ax.legend()\n",
    "        ax.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Ensemble Weight Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Optimizing Ensemble Weights ===\")\n",
    "\n",
    "best_weights, best_r2, all_results = tuner.grid_search_ensemble(df)\n",
    "\n",
    "if best_weights:\n",
    "    print(\"\\nBest Weights:\")\n",
    "    for model_name, weight in best_weights.items():\n",
    "        print(f\"  {model_name}: {weight:.3f}\")\n",
    "    print(f\"\\nBest R² Score: {best_r2:.4f}\")\n",
    "\n",
    "    if all_results:\n",
    "        # Number the combinations from 1 so the x axis reads naturally.\n",
    "        numbered_rows = [\n",
    "            {'combination': position, **result}\n",
    "            for position, result in enumerate(all_results, start=1)\n",
    "        ]\n",
    "        results_df = pd.DataFrame(numbered_rows)\n",
    "\n",
    "        _, weights_ax = plt.subplots(figsize=(12, 6))\n",
    "        weights_ax.bar(results_df['combination'], results_df['r2_score'])\n",
    "        weights_ax.axhline(y=best_r2, color='r', linestyle='--', linewidth=2, label='Best')\n",
    "        weights_ax.set_title('Ensemble Performance with Different Weight Combinations', fontweight='bold')\n",
    "        weights_ax.set_xlabel('Weight Combination')\n",
    "        weights_ax.set_ylabel('R² Score')\n",
    "        weights_ax.legend()\n",
    "        weights_ax.grid(True, alpha=0.3)\n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Learning Curve Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import learning_curve\n",
    "\n",
    "train_sizes = np.linspace(0.1, 1.0, 10)\n",
    "\n",
    "# learning_curve needs a bare scikit-learn estimator, so build one directly from\n",
    "# the tuned parameters. Merging through a dict keeps a tuned random_state (if the\n",
    "# tuner returned one) from colliding with the default seed as a duplicate keyword.\n",
    "rf_curve_params = {'random_state': 42, **rf_best_params}\n",
    "base_rf = RandomForestRegressor(**rf_curve_params)\n",
    "\n",
    "# TimeSeriesSplit keeps every validation fold after its training fold in row order.\n",
    "train_sizes_abs, train_scores, val_scores = learning_curve(\n",
    "    base_rf, X_train, y_train,\n",
    "    train_sizes=train_sizes,\n",
    "    cv=TimeSeriesSplit(n_splits=3),\n",
    "    scoring='r2',\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "# Scores come back as (n_train_sizes, n_folds); aggregate across folds.\n",
    "train_mean = np.mean(train_scores, axis=1)\n",
    "train_std = np.std(train_scores, axis=1)\n",
    "val_mean = np.mean(val_scores, axis=1)\n",
    "val_std = np.std(val_scores, axis=1)\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.plot(train_sizes_abs, train_mean, label='Training Score', marker='o', linewidth=2)\n",
    "plt.fill_between(train_sizes_abs, train_mean - train_std, train_mean + train_std, alpha=0.2)\n",
    "\n",
    "plt.plot(train_sizes_abs, val_mean, label='Validation Score', marker='o', linewidth=2)\n",
    "plt.fill_between(train_sizes_abs, val_mean - val_std, val_mean + val_std, alpha=0.2)\n",
    "\n",
    "plt.title('Learning Curve - Random Forest (Tuned)', fontweight='bold', fontsize=14)\n",
    "plt.xlabel('Training Set Size')\n",
    "plt.ylabel('R² Score')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Model Comparison: Before and After Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per model variant; the metric dicts supply the remaining columns.\n",
    "comparison_data = [\n",
    "    {'Model': 'Random Forest (Default)', **default_metrics},\n",
    "    {'Model': 'Random Forest (Tuned)', **tuned_metrics},\n",
    "]\n",
    "\n",
    "# The XGBoost metrics only exist when its tuning cell produced parameters.\n",
    "if xgb_best_params:\n",
    "    comparison_data.append({'Model': 'XGBoost (Default)', **xgb_default_metrics})\n",
    "    comparison_data.append({'Model': 'XGBoost (Tuned)', **xgb_tuned_metrics})\n",
    "\n",
    "comparison_df = pd.DataFrame(comparison_data)\n",
    "\n",
    "print(\"\\nModel Performance Comparison:\")\n",
    "print(comparison_df[['Model', 'r2_score', 'rmse', 'mae', 'mape']])\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "# (column, title, y label, bar colour) for each panel, in row-major order.\n",
    "metric_panels = [\n",
    "    ('r2_score', 'R² Score Comparison', 'R² Score', 'steelblue'),\n",
    "    ('rmse', 'RMSE Comparison', 'RMSE', 'coral'),\n",
    "    ('mae', 'MAE Comparison', 'MAE', 'lightgreen'),\n",
    "    ('mape', 'MAPE Comparison', 'MAPE (%)', 'gold'),\n",
    "]\n",
    "for ax, (column, title, y_label, bar_color) in zip(axes.flat, metric_panels):\n",
    "    comparison_df.plot(x='Model', y=column, kind='bar', ax=ax, legend=False, color=bar_color)\n",
    "    ax.set_title(title, fontweight='bold')\n",
    "    ax.set_ylabel(y_label)\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Summary and Recommendations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"HYPERPARAMETER TUNING SUMMARY\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# The gain is an absolute difference between two R² scores, so it is reported as\n",
    "# a signed R² change rather than scaled by 100 and labelled as a percentage.\n",
    "rf_r2_change = tuned_metrics['r2_score'] - default_metrics['r2_score']\n",
    "print(\"\\n1. Random Forest Optimization:\")\n",
    "print(f\"   Best Parameters: {rf_best_params}\")\n",
    "print(f\"   Performance Improvement (R² change): {rf_r2_change:+.4f}\")\n",
    "\n",
    "# The XGBoost metrics only exist when its tuning cell produced parameters.\n",
    "if xgb_best_params:\n",
    "    xgb_r2_change = xgb_tuned_metrics['r2_score'] - xgb_default_metrics['r2_score']\n",
    "    print(\"\\n2. XGBoost Optimization:\")\n",
    "    print(f\"   Best Parameters: {xgb_best_params}\")\n",
    "    print(f\"   Performance Improvement (R² change): {xgb_r2_change:+.4f}\")\n",
    "\n",
    "if best_weights:\n",
    "    print(\"\\n3. Ensemble Weight Optimization:\")\n",
    "    print(f\"   Optimal Weights: {best_weights}\")\n",
    "    print(f\"   Best R² Score: {best_r2:.4f}\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"RECOMMENDATIONS:\")\n",
    "print(\"=\"*70)\n",
    "print(\"1. Use tuned hyperparameters for production models\")\n",
    "print(\"2. Re-tune periodically as new data becomes available\")\n",
    "print(\"3. Monitor performance on out-of-sample data\")\n",
    "print(\"4. Consider ensemble methods for optimal results\")\n",
    "print(\"=\"*70)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}