In [None]:
# notebooks/02_model_development.ipynb
# Run in: VS Code or Colab

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stock Price Prediction Model Development\n",
    "Development and comparison of different machine learning models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "project_root = Path.cwd().parent\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "from src.data.data_loader import DataLoader\n",
    "from src.models.linear_regression import LinearRegressionModel\n",
    "from src.models.random_forest import RandomForestModel\n",
    "from src.models.xgboost_model import XGBoostModel\n",
    "from src.models.ensemble import EnsembleModel\n",
    "from src.models.model_evaluator import ModelEvaluator\n",
    "\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loader = DataLoader()\n",
    "symbol = 'AAPL'\n",
    "period = '2y'\n",
    "\n",
    "print(f\"Loading {symbol} data...\")\n",
    "df = data_loader.load_stock_data(symbol, period=period)\n",
    "\n",
    "print(f\"Data shape: {df.shape}\")\n",
    "print(f\"Date range: {df.index.min()} to {df.index.max()}\")\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "plt.plot(df.index, df['Close'], linewidth=2)\n",
    "plt.title(f'{symbol} Historical Price', fontweight='bold', fontsize=14)\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Linear Regression Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Training Linear Regression Model ===\")\n",
    "\n",
    "lr_model = LinearRegressionModel()\n",
    "X_train, X_test, y_train, y_test = lr_model.split_data(df)\n",
    "\n",
    "print(f\"Training samples: {len(X_train)}\")\n",
    "print(f\"Testing samples: {len(X_test)}\")\n",
    "\n",
    "lr_model.build()\n",
    "lr_model.train(X_train, y_train, X_test, y_test)\n",
    "\n",
    "lr_metrics = lr_model.evaluate(X_test, y_test)\n",
    "print(f\"\\nLinear Regression Metrics:\")\n",
    "for key, value in lr_metrics.items():\n",
    "    print(f\"  {key}: {value:.4f}\")\n",
    "\n",
    "y_pred_lr = lr_model.predict(X_test)\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "plt.plot(y_test.values, label='Actual', linewidth=2)\n",
    "plt.plot(y_pred_lr, label='Predicted', linewidth=2, alpha=0.7)\n",
    "plt.title('Linear Regression: Actual vs Predicted', fontweight='bold')\n",
    "plt.xlabel('Sample')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Random Forest Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Training Random Forest Model ===\")\n",
    "\n",
    "rf_model = RandomForestModel(n_estimators=100)\n",
    "rf_model.build()\n",
    "rf_model.train(X_train, y_train, X_test, y_test)\n",
    "\n",
    "rf_metrics = rf_model.evaluate(X_test, y_test)\n",
    "print(f\"\\nRandom Forest Metrics:\")\n",
    "for key, value in rf_metrics.items():\n",
    "    print(f\"  {key}: {value:.4f}\")\n",
    "\n",
    "y_pred_rf = rf_model.predict(X_test)\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "plt.plot(y_test.values, label='Actual', linewidth=2)\n",
    "plt.plot(y_pred_rf, label='Predicted', linewidth=2, alpha=0.7)\n",
    "plt.title('Random Forest: Actual vs Predicted', fontweight='bold')\n",
    "plt.xlabel('Sample')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "feature_importance = rf_model.get_feature_importance()\n",
    "print(f\"\\nTop 10 Features:\")\n",
    "print(feature_importance.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. XGBoost Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Training XGBoost Model ===\")\n",
    "\n",
    "xgb_model = XGBoostModel(n_estimators=100)\n",
    "xgb_model.build()\n",
    "xgb_model.train(X_train, y_train, X_test, y_test)\n",
    "\n",
    "xgb_metrics = xgb_model.evaluate(X_test, y_test)\n",
    "print(f\"\\nXGBoost Metrics:\")\n",
    "for key, value in xgb_metrics.items():\n",
    "    print(f\"  {key}: {value:.4f}\")\n",
    "\n",
    "y_pred_xgb = xgb_model.predict(X_test)\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "plt.plot(y_test.values, label='Actual', linewidth=2)\n",
    "plt.plot(y_pred_xgb, label='Predicted', linewidth=2, alpha=0.7)\n",
    "plt.title('XGBoost: Actual vs Predicted', fontweight='bold')\n",
    "plt.xlabel('Sample')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Ensemble Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n=== Training Ensemble Model ===\")\n",
    "\n",
    "models_dict = {\n",
    "    'linear_regression': lr_model,\n",
    "    'random_forest': rf_model,\n",
    "    'xgboost': xgb_model\n",
    "}\n",
    "\n",
    "ensemble_model = EnsembleModel(models=models_dict)\n",
    "ensemble_model.feature_names = lr_model.feature_names\n",
    "ensemble_model.is_trained = True\n",
    "\n",
    "ensemble_model.optimize_weights(X_test, y_test)\n",
    "\n",
    "print(f\"\\nOptimized Weights:\")\n",
    "print(ensemble_model.get_model_contributions())\n",
    "\n",
    "y_pred_ensemble = ensemble_model.predict(X_test)\n",
    "\n",
    "ensemble_metrics = ensemble_model.evaluate(X_test, y_test)\n",
    "print(f\"\\nEnsemble Metrics:\")\n",
    "for key, value in ensemble_metrics.items():\n",
    "    print(f\"  {key}: {value:.4f}\")\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "plt.plot(y_test.values, label='Actual', linewidth=2)\n",
    "plt.plot(y_pred_ensemble, label='Ensemble', linewidth=2, alpha=0.7)\n",
    "plt.title('Ensemble: Actual vs Predicted', fontweight='bold')\n",
    "plt.xlabel('Sample')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comparison_df = pd.DataFrame([\n",
    "    {'Model': 'Linear Regression', **lr_metrics},\n",
    "    {'Model': 'Random Forest', **rf_metrics},\n",
    "    {'Model': 'XGBoost', **xgb_metrics},\n",
    "    {'Model': 'Ensemble', **ensemble_metrics}\n",
    "])\n",
    "\n",
    "print(\"\\nModel Comparison:\")\n",
    "print(comparison_df[['Model', 'r2_score', 'rmse', 'mae', 'mape']])\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "comparison_df.plot(x='Model', y='r2_score', kind='bar', ax=axes[0, 0], legend=False)\n",
    "axes[0, 0].set_title('R² Score', fontweight='bold')\n",
    "axes[0, 0].set_ylabel('Score')\n",
    "axes[0, 0].grid(True, alpha=0.3)\n",
    "\n",
    "comparison_df.plot(x='Model', y='rmse', kind='bar', axes[0, 1].set_title('RMSE', fontweight='bold')\n",
    "axes[0, 1].set_ylabel('Error')\n",
    "axes[0, 1].grid(True, alpha=0.3)\n",
    "\n",
    "comparison_df.plot(x='Model', y='mae', kind='bar', ax=axes[1, 0], legend=False, color='orange')\n",
    "axes[1, 0].set_title('MAE', fontweight='bold')\n",
    "axes[1, 0].set_ylabel('Error')\n",
    "axes[1, 0].grid(True, alpha=0.3)\n",
    "\n",
    "comparison_df.plot(x='Model', y='mape', kind='bar', ax=axes[1, 1], legend=False, color='green')\n",
    "axes[1, 1].set_title('MAPE (%)', fontweight='bold')\n",
    "axes[1, 1].set_ylabel('Error (%)')\n",
    "axes[1, 1].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Prediction Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 8))\n",
    "plt.plot(y_test.values, label='Actual', linewidth=3, color='black')\n",
    "plt.plot(y_pred_lr, label='Linear Regression', linewidth=2, alpha=0.7)\n",
    "plt.plot(y_pred_rf, label='Random Forest', linewidth=2, alpha=0.7)\n",
    "plt.plot(y_pred_xgb, label='XGBoost', linewidth=2, alpha=0.7)\n",
    "plt.plot(y_pred_ensemble, label='Ensemble', linewidth=2, alpha=0.7)\n",
    "\n",
    "plt.title('All Models: Actual vs Predicted Prices', fontweight='bold', fontsize=14)\n",
    "plt.xlabel('Sample')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Future Price Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_days = 30\n",
    "\n",
    "print(f\"\\nGenerating {forecast_days}-day forecast...\")\n",
    "\n",
    "forecast_lr = lr_model.predict_future(df, days=forecast_days)\n",
    "forecast_rf = rf_model.predict_future(df, days=forecast_days)\n",
    "forecast_xgb = xgb_model.predict_future(df, days=forecast_days)\n",
    "forecast_ensemble = ensemble_model.predict_future(df, days=forecast_days)\n",
    "\n",
    "forecast_dates = pd.date_range(start=df.index[-1] + pd.Timedelta(days=1), periods=forecast_days)\n",
    "\n",
    "plt.figure(figsize=(16, 8))\n",
    "\n",
    "plt.plot(df.index[-60:], df['Close'].values[-60:], label='Historical', linewidth=3, color='black')\n",
    "plt.plot(forecast_dates, forecast_lr, label='LR Forecast', linewidth=2, linestyle='--', alpha=0.7)\n",
    "plt.plot(forecast_dates, forecast_rf, label='RF Forecast', linewidth=2, linestyle='--', alpha=0.7)\n",
    "plt.plot(forecast_dates, forecast_xgb, label='XGB Forecast', linewidth=2, linestyle='--', alpha=0.7)\n",
    "plt.plot(forecast_dates, forecast_ensemble, label='Ensemble Forecast', linewidth=3, linestyle='--', alpha=0.9)\n",
    "\n",
    "plt.axvline(x=df.index[-1], color='red', linestyle=':', linewidth=2, label='Forecast Start')\n",
    "\n",
    "plt.title(f'{symbol} Price Forecast ({forecast_days} Days)', fontweight='bold', fontsize=14)\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Price ($)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(f\"\\nForecast Summary:\")\n",
    "print(f\"Current Price: ${df['Close'].iloc[-1]:.2f}\")\n",
    "print(f\"Ensemble 30D Forecast: ${forecast_ensemble[-1]:.2f}\")\n",
    "print(f\"Expected Change: {((forecast_ensemble[-1] - df['Close'].iloc[-1]) / df['Close'].iloc[-1] * 100):.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Residual Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "residuals_ensemble = y_test.values - y_pred_ensemble\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "axes[0, 0].scatter(y_pred_ensemble, residuals_ensemble, alpha=0.5)\n",
    "axes[0, 0].axhline(y=0, color='r', linestyle='--')\n",
    "axes[0, 0].set_title('Residuals vs Predicted', fontweight='bold')\n",
    "axes[0, 0].set_xlabel('Predicted Values')\n",
    "axes[0, 0].set_ylabel('Residuals')\n",
    "axes[0, 0].grid(True, alpha=0.3)\n",
    "\n",
    "axes[0, 1].hist(residuals_ensemble, bins=30, edgecolor='black', alpha=0.7)\n",
    "axes[0, 1].set_title('Residuals Distribution', fontweight='bold')\n",
    "axes[0, 1].set_xlabel('Residuals')\n",
    "axes[0, 1].set_ylabel('Frequency')\n",
    "axes[0, 1].grid(True, alpha=0.3)\n",
    "\n",
    "axes[1, 0].plot(residuals_ensemble)\n",
    "axes[1, 0].axhline(y=0, color='r', linestyle='--')\n",
    "axes[1, 0].set_title('Residuals Over Time', fontweight='bold')\n",
    "axes[1, 0].set_xlabel('Sample')\n",
    "axes[1, 0].set_ylabel('Residuals')\n",
    "axes[1, 0].grid(True, alpha=0.3)\n",
    "\n",
    "from scipy import stats\n",
    "stats.probplot(residuals_ensemble, dist=\"norm\", plot=axes[1, 1])\n",
    "axes[1, 1].set_title('Q-Q Plot', fontweight='bold')\n",
    "axes[1, 1].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(f\"\\nResidual Statistics:\")\n",
    "print(f\"Mean: {np.mean(residuals_ensemble):.4f}\")\n",
    "print(f\"Std Dev: {np.std(residuals_ensemble):.4f}\")\n",
    "print(f\"Min: {np.min(residuals_ensemble):.4f}\")\n",
    "print(f\"Max: {np.max(residuals_ensemble):.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Conclusion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"MODEL DEVELOPMENT SUMMARY\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "best_model = comparison_df.loc[comparison_df['r2_score'].idxmax()]\n",
    "print(f\"\\nBest Model: {best_model['Model']}\")\n",
    "print(f\"R² Score: {best_model['r2_score']:.4f}\")\n",
    "print(f\"RMSE: {best_model['rmse']:.4f}\")\n",
    "print(f\"MAE: {best_model['mae']:.4f}\")\n",
    "print(f\"MAPE: {best_model['mape']:.2f}%\")\n",
    "\n",
    "print(f\"\\nKey Findings:\")\n",
    "print(f\"1. Ensemble model achieves best overall performance\")\n",
    "print(f\"2. Random Forest provides good feature importance insights\")\n",
    "print(f\"3. XGBoost offers fast training with competitive accuracy\")\n",
    "print(f\"4. Linear Regression serves as a solid baseline\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}