In [None]:
# notebooks/01_data_exploration.ipynb
# Run in: VS Code or Colab

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stock Data Exploration and Analysis\n",
    "This notebook performs exploratory data analysis on stock market data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Locate the project root: the nearest directory, starting at the current\n",
    "# working directory and walking upwards, that contains the `src/data` package\n",
    "# imported below. This works whether the kernel starts in `notebooks/` or in\n",
    "# the project root. If nothing matches, fall back to the parent of the working\n",
    "# directory.\n",
    "cwd = Path.cwd()\n",
    "project_root = next(\n",
    "    (candidate for candidate in (cwd, *cwd.parents) if (candidate / 'src' / 'data').is_dir()),\n",
    "    cwd.parent,\n",
    ")\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "from src.data.data_loader import DataLoader\n",
    "from src.data.technical_indicators import TechnicalIndicators\n",
    "from src.data.data_validator import DataValidator\n",
    "\n",
    "# The bundled seaborn styles carry a 'seaborn-v0_8-' prefix from matplotlib 3.6\n",
    "# onwards; fall back to the unprefixed name on earlier releases.\n",
    "style_name = 'seaborn-v0_8-darkgrid'\n",
    "if style_name not in plt.style.available:\n",
    "    style_name = 'seaborn-darkgrid'\n",
    "plt.style.use(style_name)\n",
    "sns.set_palette('husl')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Stock Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project helpers (imported in the setup cell) used throughout the notebook.\n",
    "data_loader = DataLoader()\n",
    "tech_indicators = TechnicalIndicators()\n",
    "validator = DataValidator()\n",
    "\n",
    "symbols = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN']\n",
    "period = '2y'\n",
    "\n",
    "# Load every symbol, keeping only non-empty results. Symbols that fail are\n",
    "# recorded and reported here, so missing data is visible at load time instead\n",
    "# of surfacing later as a KeyError or IndexError in an analysis cell.\n",
    "stock_data = {}\n",
    "failed_symbols = []\n",
    "for symbol in symbols:\n",
    "    print(f\"Loading {symbol}...\")\n",
    "    df = data_loader.load_stock_data(symbol, period=period)\n",
    "    if df is not None and len(df) > 0:\n",
    "        stock_data[symbol] = df\n",
    "        print(f\"{symbol}: {len(df)} rows loaded\")\n",
    "    else:\n",
    "        failed_symbols.append(symbol)\n",
    "        print(f\"{symbol}: no data returned, skipping\")\n",
    "\n",
    "if failed_symbols:\n",
    "    print(f\"\\nWarning: no data loaded for {', '.join(failed_symbols)}\")\n",
    "\n",
    "# Nothing below can run without data, so stop with an explicit message.\n",
    "if not stock_data:\n",
    "    raise RuntimeError('No stock data could be loaded; check the data source and symbols.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "symbol = 'AAPL'\n",
    "df = stock_data[symbol]\n",
    "\n",
    "# Headline facts first, then each table printed under its own heading.\n",
    "print(f\"\\n{symbol} Data Overview:\")\n",
    "print(f\"Shape: {df.shape}\")\n",
    "print(f\"Date Range: {df.index.min()} to {df.index.max()}\")\n",
    "\n",
    "# (heading, table builder) pairs; builders are called lazily, in order.\n",
    "sections = (\n",
    "    ('First 5 rows', df.head),\n",
    "    ('Last 5 rows', df.tail),\n",
    "    ('Statistical Summary', df.describe),\n",
    ")\n",
    "for heading, build_table in sections:\n",
    "    print(f\"\\n{heading}:\")\n",
    "    print(build_table())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Data Quality Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the stock explicitly so this cell does not depend on whichever `df`\n",
    "# another cell last left in the notebook namespace.\n",
    "symbol = 'AAPL'\n",
    "df = stock_data[symbol]\n",
    "\n",
    "quality_report = validator.check_data_quality(df)\n",
    "\n",
    "print(\"Data Quality Report:\")\n",
    "print(f\"Total Rows: {quality_report['total_rows']}\")\n",
    "print(f\"Date Range: {quality_report['date_range']}\")\n",
    "print(f\"Quality Score: {quality_report['quality_score']:.2f}/100\")\n",
    "\n",
    "# Per-column missing-value counts and percentages, as reported by the validator.\n",
    "print(f\"\\nMissing Values:\")\n",
    "for col, info in quality_report['missing_values'].items():\n",
    "    print(f\"  {col}: {info['count']} ({info['percentage']:.2f}%)\")\n",
    "\n",
    "# Per-column outlier counts, as reported by the validator.\n",
    "print(f\"\\nOutliers Detected:\")\n",
    "for col, count in quality_report['outliers'].items():\n",
    "    print(f\"  {col}: {count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Price Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size the two-column grid from the number of loaded symbols so that no stock\n",
    "# is dropped; panels left over are hidden after plotting.\n",
    "n_cols = 2\n",
    "n_rows = max(1, (len(stock_data) + n_cols - 1) // n_cols)\n",
    "\n",
    "# squeeze=False always returns a 2-D array of axes, even for a single row.\n",
    "fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)\n",
    "fig.suptitle('Stock Price Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "panels = axes.ravel()\n",
    "for ax, (symbol, df) in zip(panels, stock_data.items()):\n",
    "    ax.plot(df.index, df['Close'], linewidth=2)\n",
    "    ax.set_title(f'{symbol} Closing Price', fontweight='bold')\n",
    "    ax.set_xlabel('Date')\n",
    "    ax.set_ylabel('Price ($)')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "# Hide grid slots that have no stock to show (e.g. the 6th panel for 5 symbols).\n",
    "for ax in panels[len(stock_data):]:\n",
    "    ax.set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Volume Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 1, figsize=(16, 10))\n",
    "price_ax, volume_ax = axes\n",
    "\n",
    "symbol = 'AAPL'\n",
    "df = stock_data[symbol]\n",
    "\n",
    "# Top panel: closing price.\n",
    "price_ax.plot(df.index, df['Close'], linewidth=2, label='Close Price')\n",
    "price_ax.set_title(f'{symbol} Price', fontweight='bold')\n",
    "price_ax.set_ylabel('Price ($)')\n",
    "\n",
    "# Bottom panel: traded volume.\n",
    "volume_ax.bar(df.index, df['Volume'], alpha=0.7, label='Volume')\n",
    "volume_ax.set_title(f'{symbol} Trading Volume', fontweight='bold')\n",
    "volume_ax.set_xlabel('Date')\n",
    "volume_ax.set_ylabel('Volume')\n",
    "\n",
    "# Shared decoration for both panels.\n",
    "for ax in axes:\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Returns Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Daily percentage change of the closing price, one series per symbol.\n",
    "returns_data = {\n",
    "    symbol: df['Close'].pct_change().dropna()\n",
    "    for symbol, df in stock_data.items()\n",
    "}\n",
    "returns_df = pd.DataFrame(returns_data)\n",
    "\n",
    "print(\"Returns Statistics:\")\n",
    "print(returns_df.describe())\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(16, 5))\n",
    "hist_ax, box_ax = axes\n",
    "\n",
    "# Left: overlaid histograms of daily returns.\n",
    "returns_df.plot.hist(bins=50, alpha=0.6, ax=hist_ax)\n",
    "hist_ax.set_title('Daily Returns Distribution', fontweight='bold')\n",
    "hist_ax.set_xlabel('Returns')\n",
    "hist_ax.set_ylabel('Frequency')\n",
    "hist_ax.legend()\n",
    "hist_ax.grid(True, alpha=0.3)\n",
    "\n",
    "# Right: one box per symbol.\n",
    "returns_df.plot.box(ax=box_ax)\n",
    "box_ax.set_title('Returns Box Plot', fontweight='bold')\n",
    "box_ax.set_ylabel('Returns')\n",
    "box_ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _show_corr_heatmap(matrix, title):\n",
    "    \"\"\"Draw an annotated heatmap of `matrix` under `title` and show it.\"\"\"\n",
    "    plt.figure(figsize=(10, 8))\n",
    "    sns.heatmap(\n",
    "        matrix, annot=True, cmap='coolwarm', center=0,\n",
    "        square=True, linewidths=1, cbar_kws={\"shrink\": 0.8},\n",
    "    )\n",
    "    plt.title(title, fontweight='bold', fontsize=14)\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "# Correlation between closing-price levels.\n",
    "close_prices = pd.DataFrame({symbol: df['Close'] for symbol, df in stock_data.items()})\n",
    "correlation_matrix = close_prices.corr()\n",
    "_show_corr_heatmap(correlation_matrix, 'Stock Price Correlation Matrix')\n",
    "\n",
    "# Correlation between daily returns (`returns_df` comes from the returns analysis cell).\n",
    "returns_correlation = returns_df.corr()\n",
    "_show_corr_heatmap(returns_correlation, 'Returns Correlation Matrix')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Technical Indicators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "symbol = 'AAPL'\n",
    "df = stock_data[symbol].copy()\n",
    "df_with_indicators = tech_indicators.add_all_indicators(df)\n",
    "\n",
    "fig, axes = plt.subplots(3, 1, figsize=(16, 12))\n",
    "price_ax, rsi_ax, macd_ax = axes\n",
    "dates = df_with_indicators.index\n",
    "\n",
    "# Panel 1: closing price with the two simple moving averages.\n",
    "price_ax.plot(dates, df_with_indicators['Close'], label='Close Price', linewidth=2)\n",
    "for column, label in (('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50')):\n",
    "    price_ax.plot(dates, df_with_indicators[column], label=label, linestyle='--')\n",
    "price_ax.set_title(f'{symbol} Price with Moving Averages', fontweight='bold')\n",
    "price_ax.set_ylabel('Price ($)')\n",
    "\n",
    "# Panel 2: RSI with reference lines at 70 and 30.\n",
    "rsi_ax.plot(dates, df_with_indicators['RSI'], linewidth=2, color='purple')\n",
    "for level, line_color, label in ((70, 'r', 'Overbought'), (30, 'g', 'Oversold')):\n",
    "    rsi_ax.axhline(y=level, color=line_color, linestyle='--', label=label)\n",
    "rsi_ax.set_title('RSI Indicator', fontweight='bold')\n",
    "rsi_ax.set_ylabel('RSI')\n",
    "\n",
    "# Panel 3: MACD line, signal line and histogram.\n",
    "for column, label in (('MACD', 'MACD'), ('MACD_Signal', 'Signal')):\n",
    "    macd_ax.plot(dates, df_with_indicators[column], label=label, linewidth=2)\n",
    "macd_ax.bar(dates, df_with_indicators['MACD_Histogram'], label='Histogram', alpha=0.3)\n",
    "macd_ax.set_title('MACD Indicator', fontweight='bold')\n",
    "macd_ax.set_xlabel('Date')\n",
    "macd_ax.set_ylabel('MACD')\n",
    "\n",
    "# Shared decoration for all three panels.\n",
    "for ax in axes:\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Volatility Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rolling 30-row standard deviation of daily returns, scaled by sqrt(252)\n",
    "# and expressed in percent.\n",
    "volatility_data = {}\n",
    "for symbol, df in stock_data.items():\n",
    "    returns = df['Close'].pct_change()\n",
    "    volatility_data[symbol] = returns.rolling(window=30).std() * np.sqrt(252) * 100\n",
    "\n",
    "volatility_df = pd.DataFrame(volatility_data)\n",
    "\n",
    "plt.figure(figsize=(16, 6))\n",
    "for column, series in volatility_df.items():\n",
    "    plt.plot(volatility_df.index, series, label=column, linewidth=2)\n",
    "\n",
    "plt.title('30-Day Rolling Volatility (Annualized)', fontweight='bold', fontsize=14)\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Volatility (%)')\n",
    "plt.legend()\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Mean of each symbol's rolling volatility, highest first.\n",
    "print(\"\\nAverage Volatility:\")\n",
    "ranked_volatility = volatility_df.mean().sort_values(ascending=False)\n",
    "print(ranked_volatility)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Summary and Key Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "summary = []\n",
    "\n",
    "for symbol, df in stock_data.items():\n",
    "    # Use valid closes only, so a missing first or last value cannot turn the\n",
    "    # price-based metrics into NaN, and an empty frame cannot raise IndexError.\n",
    "    close = df['Close'].dropna()\n",
    "    if len(close) < 2:\n",
    "        print(f\"Skipping {symbol}: not enough price data for a summary\")\n",
    "        continue\n",
    "\n",
    "    returns = close.pct_change().dropna()\n",
    "    start_price = close.iloc[0]\n",
    "    end_price = close.iloc[-1]\n",
    "    daily_std = returns.std()\n",
    "\n",
    "    # Annualized Sharpe ratio with the risk-free rate taken as zero; reported\n",
    "    # as NaN when the returns have no measurable variance.\n",
    "    if daily_std > 0:\n",
    "        sharpe_ratio = (returns.mean() / daily_std) * np.sqrt(252)\n",
    "    else:\n",
    "        sharpe_ratio = np.nan\n",
    "\n",
    "    summary.append({\n",
    "        'Symbol': symbol,\n",
    "        'Start Price': start_price,\n",
    "        'End Price': end_price,\n",
    "        'Total Return (%)': ((end_price - start_price) / start_price) * 100,\n",
    "        'Avg Daily Return (%)': returns.mean() * 100,\n",
    "        'Volatility (%)': daily_std * np.sqrt(252) * 100,\n",
    "        # Largest peak-to-trough decline relative to the running maximum close.\n",
    "        'Max Drawdown (%)': ((close / close.cummax()) - 1).min() * 100,\n",
    "        'Sharpe Ratio': sharpe_ratio\n",
    "    })\n",
    "\n",
    "summary_df = pd.DataFrame(summary)\n",
    "print(\"\\nStock Performance Summary:\")\n",
    "print(summary_df.to_string(index=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}