# Exploratory Data Analysis
## Stock Price Prediction with LSTM & Random Forest

This notebook covers:
1. **Data fetching** via yfinance
2. **Stationarity testing** (ADF test, differencing)
3. **Feature engineering** – technical indicators & sentiment scores
4. **Visualisations** – price history, distributions, correlations
5. **Export** – saving the resulting feature table to `../data/<TICKER>_features.csv`


In [None]:
# Notebook setup: import path, warning filter, libraries, plotting defaults.
# Statement order matters here – the sys.path entry must exist before the
# `src.*` imports further down are executed.
import sys, os
sys.path.insert(0, '..')  # make the parent directory importable (presumably where `src/` lives – confirm)

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and numerical warnings raised by the libraries below.
import warnings; warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Project-local helpers; `time_series_split` is not used in the cells visible here.
from src.data_loader import fetch_stock_data, adf_test, time_series_split
from src.sentiment_analyzer import add_sentiment_to_df

# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
# Style applied to all matplotlib figures created below.
plt.style.use('seaborn-v0_8-darkgrid')
print('Setup complete')

In [None]:
# ── 1. Fetch Data ─────────────────────────────────────────────────────────────
# Symbol and date window. TICKER is reused by later cells for chart titles and
# for the name of the exported CSV.
TICKER, START, END = 'AAPL', '2015-01-01', '2024-12-31'

# Both helpers come from the project's `src` package. The indicator and
# sentiment columns used further down (SMA_*, RSI_14, Sentiment, ...) are
# presumably added by them – confirm in `src`.
df = add_sentiment_to_df(
    fetch_stock_data(TICKER, START, END),
    TICKER, START, END,
)

print(df.shape)  # (rows, columns) of the combined frame
df.head()

In [None]:
# ── 2. Stationarity ───────────────────────────────────────────────────────────
# A raw close price is expected to be non-stationary (unit root), so the ADF
# helper is run on the price level and on two differenced forms of it.
# Each entry is (column in df, label handed to adf_test).
adf_inputs = (
    ('Close',        'Close (raw)'),
    ('Price_Change', 'Close (1st diff)'),
    ('Log_Return',   'Log Return'),
)

print('=== ADF Test Results ===')
# Evaluated in the order listed above; results are kept as r1, r2, r3.
r1, r2, r3 = [adf_test(df[column], label) for column, label in adf_inputs]

# Working conclusion (verify against the output above): log returns are
# stationary – we'll use them as model features.

In [None]:
# ── 3. Price History ──────────────────────────────────────────────────────────
# Three stacked panels on a shared date axis: price (half the height),
# volume and RSI (a quarter each).
panel_titles = ['Close Price', 'Volume', 'RSI (14)']
fig = make_subplots(
    rows=3, cols=1,
    shared_xaxes=True,
    subplot_titles=panel_titles,
    row_heights=[0.5, 0.25, 0.25],
)

# Row 1 – OHLC candles, then the moving averages (SMA 50 first, SMA 20 second).
candles = go.Candlestick(
    x=df.index,
    open=df['Open'], high=df['High'], low=df['Low'], close=df['Close'],
    name='OHLC',
)
fig.add_trace(candles, row=1, col=1)

sma_overlays = (
    ('SMA_50', 'SMA 50', 'orange'),
    ('SMA_20', 'SMA 20', 'cyan'),
)
for sma_col, sma_name, sma_color in sma_overlays:
    sma_line = go.Scatter(x=df.index, y=df[sma_col],
                          name=sma_name, line=dict(color=sma_color))
    fig.add_trace(sma_line, row=1, col=1)

# Row 2 – traded volume as bars.
volume_bars = go.Bar(x=df.index, y=df['Volume'], name='Volume',
                     marker_color='lightblue')
fig.add_trace(volume_bars, row=2, col=1)

# Row 3 – RSI line with dashed reference lines at 70 (red) and 30 (green).
rsi_line = go.Scatter(x=df.index, y=df['RSI_14'],
                      name='RSI 14', line=dict(color='purple'))
fig.add_trace(rsi_line, row=3, col=1)
for rsi_level, level_color in ((70, 'red'), (30, 'green')):
    fig.add_hline(y=rsi_level, row=3, col=1,
                  line_dash='dash', line_color=level_color)

# Overall size and title; the x-axis range slider is switched off.
fig.update_layout(
    height=700,
    title=f'{TICKER} – Interactive Price Chart',
    xaxis_rangeslider_visible=False,
)
fig.show()

In [None]:
# ── 4. Return Distribution ────────────────────────────────────────────────────
# Left panel: histogram of log returns. Right panel: their running sum over time.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, cum_ax = axes
log_ret = df['Log_Return']

# Missing values are dropped before binning; the dashed red line marks zero.
hist_ax.hist(log_ret.dropna(), bins=100,
             color='steelblue', edgecolor='k', alpha=0.7)
hist_ax.axvline(0, color='red', linestyle='--')
hist_ax.set_title('Log Return Distribution')

cum_ax.plot(df.index, log_ret.cumsum(), color='darkorange')
cum_ax.set_title('Cumulative Log Return')

plt.tight_layout()
plt.show()

In [None]:
# ── 5. Correlation Heatmap ────────────────────────────────────────────────────
# Columns included in the pairwise correlation matrix.
feature_cols = [
    'Close', 'SMA_20', 'SMA_50', 'RSI_14', 'MACD',
    'BB_Width', 'ATR_14', 'Log_Return', 'Sentiment', 'Vol_Change',
]
corr = df[feature_cols].corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
plt.figure(figsize=(10, 8))
heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm',
                     center=0, linewidths=0.5)
heat_ax = sns.heatmap(corr, **heatmap_style)  # draws on the current axes and returns them
heat_ax.set_title('Feature Correlation Heatmap')

plt.tight_layout()
plt.show()

In [None]:
# ── 6. Bollinger Bands ────────────────────────────────────────────────────────
# Only the final 200 rows of the frame are plotted.
recent = df.tail(200)

# (column, plot keyword arguments) for each line, in drawing/legend order.
bollinger_lines = (
    ('Close',    dict(label='Close',    color='steelblue')),
    ('BB_Upper', dict(label='BB Upper', color='red',   linestyle='--')),
    ('BB_Mid',   dict(label='BB Mid',   color='grey',  linestyle=':')),
    ('BB_Lower', dict(label='BB Lower', color='green', linestyle='--')),
)

plt.figure(figsize=(14, 5))
for band_col, line_kwargs in bollinger_lines:
    plt.plot(recent.index, recent[band_col], **line_kwargs)

# Light grey shading between the lower and upper band.
plt.fill_between(recent.index, recent['BB_Lower'], recent['BB_Upper'],
                 alpha=0.08, color='grey')

plt.title(f'{TICKER} – Bollinger Bands (last 200 days)')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# ── 7. Sentiment Overview ─────────────────────────────────────────────────────
# Price on the left y-axis, sentiment on a twin right y-axis sharing the dates.
fig, ax1 = plt.subplots(figsize=(14, 4))
ax2 = ax1.twinx()

# Left axis: close price.
ax1.plot(df.index, df['Close'], color='steelblue', label='Close', alpha=0.7)
ax1.set_ylabel('Price (USD)')

# Right axis: sentiment score with a dashed zero line for reference.
ax2.plot(df.index, df['Sentiment'], color='darkorange', label='Sentiment', alpha=0.5)
ax2.axhline(0, color='red', linestyle='--', linewidth=0.8)
ax2.set_ylabel('Sentiment Score')

plt.title(f'{TICKER} Close Price vs Sentiment')
# Figure-level legend so the labelled lines from both axes share one box.
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
plt.tight_layout()
plt.show()

# shift(-1) aligns each row's sentiment with the following row's log return.
next_day_return = df['Log_Return'].shift(-1)
sentiment_corr = df['Sentiment'].corr(next_day_return)
print('\nCorrelation between Sentiment and next-day return:',
      round(sentiment_corr, 4))

In [None]:
# ── 8. Save cleaned dataset ───────────────────────────────────────────────────
# Create the output directory if it is missing, then write the full frame
# (index included) to a per-ticker CSV.
os.makedirs('../data', exist_ok=True)

features_csv = f'../data/{TICKER}_features.csv'
df.to_csv(features_csv)

print(f'Saved {len(df)} rows → ../data/{TICKER}_features.csv')