In [None]:
# notebooks/03_feature_engineering.ipynb
# Run in: VS Code or Colab

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering for Stock Prediction\n",
    "Creating and evaluating new features to improve model performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.feature_selection import SelectKBest, f_regression\n",
    "from sklearn.feature_selection import mutual_info_regression\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "project_root = Path.cwd().parent\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "from src.data.data_loader import DataLoader\n",
    "from src.data.technical_indicators import TechnicalIndicators\n",
    "\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Base Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loader = DataLoader()\n",
    "tech_indicators = TechnicalIndicators()\n",
    "\n",
    "symbol = 'AAPL'\n",
    "df = data_loader.load_stock_data(symbol, period='2y')\n",
    "\n",
    "# Fail fast with a readable message: the feature cells below index these\n",
    "# OHLCV columns by name, so a missing column would otherwise surface as a\n",
    "# bare KeyError far from the load step.\n",
    "required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']\n",
    "if df is None or df.empty:\n",
    "    raise ValueError(f\"No data returned for {symbol}\")\n",
    "missing_columns = [col for col in required_columns if col not in df.columns]\n",
    "if missing_columns:\n",
    "    raise KeyError(f\"Loaded data is missing required columns: {missing_columns}\")\n",
    "\n",
    "print(f\"Base features: {df.columns.tolist()}\")\n",
    "print(f\"Data shape: {df.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Price-Based Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_price_features(df):\n",
    "    \"\"\"Append candlestick-geometry and rolling high/low columns.\n",
    "\n",
    "    The input frame is copied first, so the caller's object is not modified.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with 'Open', 'High', 'Low' and 'Close' columns.\n",
    "\n",
    "    Returns:\n",
    "        A copy of ``df`` with range, gap, body, shadow and 5/10/20/50-row\n",
    "        rolling high/low/range columns added.\n",
    "    \"\"\"\n",
    "    out = df.copy()\n",
    "    prev_close = out['Close'].shift(1)\n",
    "    candle_top = out[['Open', 'Close']].max(axis=1)\n",
    "    candle_bottom = out[['Open', 'Close']].min(axis=1)\n",
    "\n",
    "    # High-to-low range of each row, absolute and as a percentage of the close.\n",
    "    out['Price_Range'] = out['High'] - out['Low']\n",
    "    out['Price_Range_Pct'] = (out['Price_Range'] / out['Close']) * 100\n",
    "\n",
    "    # Gap between this row's open and the previous row's close (NaN on row 0).\n",
    "    out['Gap'] = out['Open'] - prev_close\n",
    "    out['Gap_Pct'] = (out['Gap'] / prev_close) * 100\n",
    "\n",
    "    # Signed candle body: positive when the close is above the open.\n",
    "    out['Body'] = out['Close'] - out['Open']\n",
    "    out['Body_Pct'] = (out['Body'] / out['Open']) * 100\n",
    "\n",
    "    # Wicks above and below the candle body.\n",
    "    out['Upper_Shadow'] = out['High'] - candle_top\n",
    "    out['Lower_Shadow'] = candle_bottom - out['Low']\n",
    "\n",
    "    # Rolling extremes; the first (window - 1) rows of each column are NaN.\n",
    "    for window in (5, 10, 20, 50):\n",
    "        rolling_high = out['High'].rolling(window=window).max()\n",
    "        rolling_low = out['Low'].rolling(window=window).min()\n",
    "        out[f'High_{window}D'] = rolling_high\n",
    "        out[f'Low_{window}D'] = rolling_low\n",
    "        out[f'Range_{window}D'] = rolling_high - rolling_low\n",
    "\n",
    "    return out\n",
    "\n",
    "# Build the price features from the raw frame, then report the new shape and\n",
    "# the names of the columns this step added.\n",
    "df_with_price_features = create_price_features(df)\n",
    "print(f\"\\nAfter price features: {df_with_price_features.shape}\")\n",
    "print(f\"New features: {[col for col in df_with_price_features.columns if col not in df.columns]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Returns and Momentum Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_returns_features(df):\n",
    "    \"\"\"Append percentage/log returns, return volatility, momentum and ROC.\n",
    "\n",
    "    Works on a copy of ``df``; only the 'Close' column is read.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a 'Close' column.\n",
    "\n",
    "    Returns:\n",
    "        A copy of ``df`` with Return_*, LogReturn_*, Return_Std_*,\n",
    "        Momentum_* and ROC_* columns added (all percentages are scaled by 100).\n",
    "    \"\"\"\n",
    "    out = df.copy()\n",
    "    close = out['Close']\n",
    "\n",
    "    # Simple and log returns over several look-back horizons.\n",
    "    for horizon in (1, 5, 10, 20, 50):\n",
    "        out[f'Return_{horizon}D'] = close.pct_change(horizon) * 100\n",
    "        out[f'LogReturn_{horizon}D'] = np.log(close / close.shift(horizon)) * 100\n",
    "\n",
    "    # Rolling standard deviation of the 1-row return.\n",
    "    one_step_return = out['Return_1D']\n",
    "    for window in (5, 10, 20):\n",
    "        out[f'Return_Std_{window}D'] = one_step_return.rolling(window=window).std()\n",
    "\n",
    "    # Absolute (Momentum) and relative (ROC) price change versus `lag` rows ago.\n",
    "    # ROC uses the same formula as Return for the shared horizons, so those\n",
    "    # column pairs are expected to be (near-)identical.\n",
    "    for lag in (10, 20, 50):\n",
    "        lagged_close = close.shift(lag)\n",
    "        out[f'Momentum_{lag}D'] = close - lagged_close\n",
    "        out[f'ROC_{lag}D'] = ((close - lagged_close) / lagged_close) * 100\n",
    "\n",
    "    return out\n",
    "\n",
    "# Stack the return/momentum columns on top of the price-feature frame.\n",
    "df_with_returns = create_returns_features(df_with_price_features)\n",
    "print(f\"\\nAfter returns features: {df_with_returns.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Volume Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_volume_features(df):\n",
    "    \"\"\"Append volume averages, ratios and price/volume interaction columns.\n",
    "\n",
    "    Works on a copy of ``df``; reads the 'Close' and 'Volume' columns.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with 'Close' and 'Volume' columns.\n",
    "\n",
    "    Returns:\n",
    "        A copy of ``df`` with Volume_SMA_*, Volume_Ratio_*, Volume_Change,\n",
    "        Price_Volume, OBV and Volume_Price_Trend columns added.\n",
    "    \"\"\"\n",
    "    out = df.copy()\n",
    "    close = out['Close']\n",
    "    volume = out['Volume']\n",
    "\n",
    "    # Rolling mean volume and the current volume relative to that mean.\n",
    "    for window in (5, 10, 20, 50):\n",
    "        volume_sma = volume.rolling(window=window).mean()\n",
    "        out[f'Volume_SMA_{window}'] = volume_sma\n",
    "        out[f'Volume_Ratio_{window}'] = volume / volume_sma\n",
    "\n",
    "    out['Volume_Change'] = volume.pct_change() * 100\n",
    "    out['Price_Volume'] = close * volume\n",
    "\n",
    "    # On-balance volume: running sum of volume signed by the close-to-close\n",
    "    # direction; the first row has no direction and contributes 0.\n",
    "    direction = np.sign(close.diff())\n",
    "    out['OBV'] = (direction * volume).fillna(0).cumsum()\n",
    "\n",
    "    # Per-row fractional price change times volume. Unlike OBV above, this is\n",
    "    # not accumulated.\n",
    "    prev_close = close.shift(1)\n",
    "    out['Volume_Price_Trend'] = ((close - prev_close) / prev_close) * volume\n",
    "\n",
    "    return out\n",
    "\n",
    "# Add the volume-derived columns to the frame that already holds the return features.\n",
    "df_with_volume = create_volume_features(df_with_returns)\n",
    "print(f\"\\nAfter volume features: {df_with_volume.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Technical Indicators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from the indicator set added by add_all_indicators\n",
    "# (presumably the project's default bundle - confirm in TechnicalIndicators).\n",
    "df_with_indicators = tech_indicators.add_all_indicators(df_with_volume)\n",
    "\n",
    "# Then apply the remaining indicators one at a time; each step receives the\n",
    "# current frame and its return value is fed into the next step.\n",
    "extra_indicator_steps = (\n",
    "    tech_indicators.add_stochastic_oscillator,\n",
    "    tech_indicators.add_cci,\n",
    "    tech_indicators.add_adx,\n",
    "    tech_indicators.add_williams_r,\n",
    ")\n",
    "for add_indicator in extra_indicator_steps:\n",
    "    df_with_indicators = add_indicator(df_with_indicators)\n",
    "\n",
    "print(f\"\\nAfter technical indicators: {df_with_indicators.shape}\")\n",
    "print(f\"Total features: {len(df_with_indicators.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Time-Based Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_time_features(df):\n",
    "    \"\"\"Append calendar fields derived from the frame's index.\n",
    "\n",
    "    Works on a copy of ``df``. The index must expose datetime accessors\n",
    "    (``dayofweek``, ``is_month_start``, ...), i.e. be a DatetimeIndex.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame indexed by timestamps.\n",
    "\n",
    "    Returns:\n",
    "        A copy of ``df`` with DayOfWeek, DayOfMonth, Month, Quarter, DayOfYear\n",
    "        and 0/1 month/quarter boundary flag columns added.\n",
    "    \"\"\"\n",
    "    out = df.copy()\n",
    "    stamps = out.index\n",
    "\n",
    "    # Plain calendar components of each timestamp.\n",
    "    calendar_parts = {\n",
    "        'DayOfWeek': stamps.dayofweek,\n",
    "        'DayOfMonth': stamps.day,\n",
    "        'Month': stamps.month,\n",
    "        'Quarter': stamps.quarter,\n",
    "        'DayOfYear': stamps.dayofyear,\n",
    "    }\n",
    "    for column, values in calendar_parts.items():\n",
    "        out[column] = values\n",
    "\n",
    "    # Boundary flags are taken from the index's own is_* attributes and stored\n",
    "    # as integers (1 = True, 0 = False).\n",
    "    boundary_flags = {\n",
    "        'IsMonthStart': stamps.is_month_start,\n",
    "        'IsMonthEnd': stamps.is_month_end,\n",
    "        'IsQuarterStart': stamps.is_quarter_start,\n",
    "        'IsQuarterEnd': stamps.is_quarter_end,\n",
    "    }\n",
    "    for column, flags in boundary_flags.items():\n",
    "        out[column] = flags.astype(int)\n",
    "\n",
    "    return out\n",
    "\n",
    "df_complete = create_time_features(df_with_indicators)\n",
    "\n",
    "# Forward-fill first so gaps inside the series are filled from earlier rows\n",
    "# only; the trailing bfill then covers the leading rows that have no earlier\n",
    "# value (e.g. rolling-window warm-up). Those leading rows are still filled\n",
    "# from later data.\n",
    "# ffill()/bfill() replace fillna(method=...), which is deprecated in recent pandas.\n",
    "df_complete = df_complete.ffill().bfill()\n",
    "\n",
    "print(f\"\\nComplete feature set: {df_complete.shape}\")\n",
    "print(f\"Total features: {len(df_complete.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature matrix: everything except the raw OHLCV columns, with infinities\n",
    "# and missing values replaced by 0 so the estimator accepts it.\n",
    "# NOTE(review): the target is the same-row Close, not a future value, so these\n",
    "# scores measure association with the current price level - confirm this is intended.\n",
    "raw_ohlcv_columns = ['Close', 'Open', 'High', 'Low', 'Volume']\n",
    "X = (\n",
    "    df_complete\n",
    "    .drop(columns=raw_ohlcv_columns)\n",
    "    .replace([np.inf, -np.inf], np.nan)\n",
    "    .fillna(0)\n",
    ")\n",
    "y = df_complete['Close']\n",
    "\n",
    "mi_scores = mutual_info_regression(X, y, random_state=42)\n",
    "\n",
    "# One row per feature, highest mutual-information score first.\n",
    "feature_importance = pd.DataFrame({\n",
    "    'feature': X.columns,\n",
    "    'importance': mi_scores\n",
    "})\n",
    "feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
    "\n",
    "top20 = feature_importance.head(20)\n",
    "print(\"\\nTop 20 Most Important Features:\")\n",
    "print(top20)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 8))\n",
    "ax.barh(top20['feature'], top20['importance'])\n",
    "ax.set_xlabel('Mutual Information Score')\n",
    "ax.set_title('Top 20 Feature Importance', fontweight='bold')\n",
    "ax.invert_yaxis()  # highest-scoring feature at the top\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Feature Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlate the 15 highest-ranked features with each other and with Close.\n",
    "top_features = feature_importance['feature'].head(15).tolist()\n",
    "top_features_df = df_complete[[*top_features, 'Close']]\n",
    "\n",
    "correlation_matrix = top_features_df.corr()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(14, 12))\n",
    "sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    annot=True,\n",
    "    fmt='.2f',\n",
    "    cmap='coolwarm',\n",
    "    center=0,\n",
    "    square=True,\n",
    "    linewidths=1,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_title('Top Features Correlation Matrix', fontweight='bold', fontsize=14)\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Close column of the matrix without its self-correlation entry, strongest positive first.\n",
    "close_column = correlation_matrix['Close']\n",
    "close_correlations = close_column.drop('Close').sort_values(ascending=False)\n",
    "print(\"\\nFeature Correlation with Close Price:\")\n",
    "print(close_correlations)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SelectKBest and f_regression are imported in the notebook's import cell.\n",
    "\n",
    "# Never ask for more features than X has: depending on the scikit-learn\n",
    "# version, k > n_features either raises a ValueError or only warns.\n",
    "n_features_to_select = min(30, X.shape[1])\n",
    "selector = SelectKBest(score_func=f_regression, k=n_features_to_select)\n",
    "X_selected = selector.fit_transform(X, y)\n",
    "\n",
    "# get_support() is a boolean mask over X's columns; keep the names it marks.\n",
    "selected_features = X.columns[selector.get_support()].tolist()\n",
    "\n",
    "print(f\"\\nSelected {len(selected_features)} features:\")\n",
    "for i, feature in enumerate(selected_features, 1):\n",
    "    print(f\"{i}. {feature}\")\n",
    "\n",
    "# Final frame: the selected feature columns plus the Close target.\n",
    "df_selected = df_complete[selected_features + ['Close']]\n",
    "print(f\"\\nFinal dataset shape: {df_selected.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Save Engineered Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = project_root / 'data' / 'engineered_features.csv'\n",
    "# Neither to_csv nor open() creates missing directories, so make sure the\n",
    "# data/ folder exists before writing (no-op when it is already there).\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "df_selected.to_csv(output_path)\n",
    "print(f\"\\nEngineered features saved to: {output_path}\")\n",
    "\n",
    "# One selected feature name per line, next to the CSV.\n",
    "feature_list_path = project_root / 'data' / 'selected_features.txt'\n",
    "with open(feature_list_path, 'w', encoding='utf-8') as f:\n",
    "    for feature in selected_features:\n",
    "        f.write(f\"{feature}\\n\")\n",
    "print(f\"Feature list saved to: {feature_list_path}\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"FEATURE ENGINEERING COMPLETE\")\n",
    "print(\"=\"*60)\n",
    "print(f\"Original features: {len(df.columns)}\")\n",
    "print(f\"Engineered features: {len(df_complete.columns)}\")\n",
    "print(f\"Selected features: {len(selected_features)}\")\n",
    "print(\"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}