# 📊 Notebook 1 — Exploratory Data Analysis (EDA)

**Project:** Stock Trend Predictor with Sentiment Analysis  
**Goal:** Understand the raw stock data before building any models

### What we cover:
1. Fetch & inspect raw stock data
2. Plot price history & volume
3. Analyze distributions & correlations
4. Visualize technical indicators
5. Check class imbalance in labels
6. Sentiment score distribution

## 1. Imports & Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import yfinance as yf
import ta
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display configuration: show every DataFrame column and
# render floats with four decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('seaborn-v0_8-whitegrid')

# Ticker symbol -> display name for every stock analysed in this notebook.
# Insertion order is preserved and drives the order of later comparisons.
STOCKS = dict([
    ('RELIANCE.NS', 'Reliance Industries'),
    ('TCS.NS', 'TCS'),
    ('INFY.NS', 'Infosys'),
    ('HDFCBANK.NS', 'HDFC Bank'),
    ('WIPRO.NS', 'Wipro'),
])

print('‚úÖ Imports done!')

## 2. Fetch Raw Stock Data

In [None]:
# Fetch daily price data for Reliance (our primary analysis stock).
ticker = 'RELIANCE.NS'
df_raw = yf.download(ticker, start='2020-01-01', end='2024-12-31', progress=False)

# yf.download can hand back an empty frame instead of raising (e.g. a failed
# request or unknown symbol). Without this guard the cell would die further
# down with an opaque IndexError on df_raw.index[0].
if df_raw.empty:
    raise RuntimeError(
        f'No data returned for {ticker}; check the ticker symbol and network connection.'
    )

# Flatten multi-level columns if present, keeping only the first level so
# columns can be addressed as df_raw['Close'], df_raw['Open'], ...
if isinstance(df_raw.columns, pd.MultiIndex):
    df_raw.columns = df_raw.columns.get_level_values(0)

print(f'Shape: {df_raw.shape}')
print(f'Date range: {df_raw.index[0].date()} to {df_raw.index[-1].date()}')
print('\nFirst 5 rows:')
# Last expression of the cell: rendered by the notebook as the cell output.
df_raw.head()

In [None]:
# Summary statistics for every column, rounded to two decimals.
print('üìã Basic Statistics:')
summary_stats = df_raw.describe()
# Last expression of the cell: rendered by the notebook as the cell output.
summary_stats.round(2)

In [None]:
# Count NaNs per column, then express the grand total as a share of all cells.
print('üîç Missing Values:')
missing_per_column = df_raw.isnull().sum()
print(missing_per_column)

n_rows = len(df_raw)
n_cols = len(df_raw.columns)
print(f'\nTotal rows: {n_rows}')

missing_pct = missing_per_column.sum() / (n_rows * n_cols) * 100
print(f'Missing %: {missing_pct:.2f}%')

## 3. Price History & Volume Chart

In [None]:
# Two stacked panels: closing price on top (three times the height of the
# lower panel) and traded volume underneath.
fig, (ax1, ax2) = plt.subplots(
    nrows=2, ncols=1, figsize=(14, 8),
    gridspec_kw={'height_ratios': [3, 1]},
)
line_color = '#2563EB'

# Upper panel: closing price with a light fill under the curve
ax1.plot(df_raw.index, df_raw['Close'], color=line_color, linewidth=1.5, label='Close Price')
ax1.fill_between(df_raw.index, df_raw['Close'], alpha=0.1, color=line_color)
ax1.set_title(f'{STOCKS[ticker]} ‚Äî Price History (2020-2024)', fontsize=14, fontweight='bold')
ax1.set_ylabel('Price (‚Çπ)')
ax1.legend()

# Lower panel: volume bars, green when the day closed at or above its open,
# red otherwise
closed_up = df_raw['Close'] >= df_raw['Open']
colors = ['#059669' if is_up else '#DC2626' for is_up in closed_up]
ax2.bar(df_raw.index, df_raw['Volume'], color=colors, alpha=0.7, width=1)
ax2.set_ylabel('Volume')
ax2.set_xlabel('Date')

# Each axis gets its own formatter instance for year-month tick labels
for axis in (ax1, ax2):
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.tight_layout()
plt.savefig('../data/price_history.png', dpi=150, bbox_inches='tight')
plt.show()
print('Chart saved!')

## 4. Candlestick Chart (Interactive)

In [None]:
# Last 6 months candlestick.
# DataFrame.last() is deprecated since pandas 2.1, so select the trailing
# 180 calendar days with an explicit mask instead. The strict '>' mirrors
# what .last('180D') selected: rows after (latest timestamp - 180 days).
window_start = df_raw.index.max() - pd.Timedelta(days=180)
df_6m = df_raw.loc[df_raw.index > window_start]

fig = go.Figure(go.Candlestick(
    x=df_6m.index,
    open=df_6m['Open'], high=df_6m['High'],
    low=df_6m['Low'],   close=df_6m['Close'],
    increasing_line_color='#059669',
    decreasing_line_color='#DC2626'
))

fig.update_layout(
    title=f'{STOCKS[ticker]} ‚Äî Last 6 Months Candlestick',
    # Hide plotly's range slider to keep the figure compact
    xaxis_rangeslider_visible=False,
    template='plotly_white', height=400
)
fig.show()

## 5. Daily Returns Distribution

In [None]:
# Day-over-day percentage change of the close; the first row has no
# predecessor and is dropped.
daily_returns = df_raw['Close'].pct_change().dropna() * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, vol_ax = axes

# Left panel: histogram of returns with the mean and the zero line marked
hist_ax.hist(daily_returns, bins=80, color='#2563EB', alpha=0.7, edgecolor='white')
hist_ax.axvline(
    daily_returns.mean(), color='red', linestyle='--',
    label=f'Mean: {daily_returns.mean():.2f}%',
)
hist_ax.axvline(0, color='black', linestyle='-', alpha=0.3)
hist_ax.set_title('Daily Returns Distribution')
hist_ax.set_xlabel('Daily Return (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

# Right panel: rolling volatility (30-day std of returns)
rolling_vol = daily_returns.rolling(30).std()
vol_ax.plot(rolling_vol.index, rolling_vol, color='#D97706', linewidth=1.5)
vol_ax.set_title('30-Day Rolling Volatility')
vol_ax.set_xlabel('Date')
vol_ax.set_ylabel('Std Dev of Returns (%)')
vol_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.tight_layout()
plt.show()

# Headline moments of the return distribution
print(f'üìä Returns Stats:')
print(f'  Mean:     {daily_returns.mean():.3f}%')
print(f'  Std Dev:  {daily_returns.std():.3f}%')
print(f'  Skewness: {daily_returns.skew():.3f}')
print(f'  Kurtosis: {daily_returns.kurtosis():.3f}')

## 6. All Stocks Price Comparison

In [None]:
# Fetch all stocks and normalize to 100 (% change from start)
all_data = {}
for t, name in STOCKS.items():
    d = yf.download(t, start='2020-01-01', end='2024-12-31', progress=False)
    # A failed download comes back empty. Keeping it would add an all-NaN
    # column, and the dropna() below would then discard every row for every
    # stock, so skip it and say so.
    if d.empty:
        print(f'  Skipping {name} ({t}): no data returned')
        continue
    if isinstance(d.columns, pd.MultiIndex):
        d.columns = d.columns.get_level_values(0)
    all_data[name] = d['Close']

# Keep only dates on which every fetched stock has a close price
prices_df = pd.DataFrame(all_data).dropna()
if prices_df.empty:
    raise RuntimeError('No overlapping price data available for the stock comparison.')

# Normalize: base 100 (each series divided by its own first value)
normalized = prices_df / prices_df.iloc[0] * 100

fig = px.line(normalized, title='Normalized Price Performance (Base=100, Jan 2020)',
              labels={'value': 'Normalized Price', 'variable': 'Stock'},
              template='plotly_white', height=400)
fig.update_layout(legend=dict(orientation='h', y=1.05))
fig.show()

# Total return = final normalized value minus the base of 100
print('\nüìà Total Returns (2020-2024):')
for col in normalized.columns:
    ret = normalized[col].iloc[-1] - 100
    print(f'  {col:<25} {ret:+.1f}%')

## 7. Technical Indicators Visualization

In [None]:
# Add indicators on a copy so df_raw stays untouched
df = df_raw.copy()
close = df['Close']

df['RSI']      = ta.momentum.RSIIndicator(close=close, window=14).rsi()
# Build the MACD indicator once and read both lines from the same instance
macd_indicator = ta.trend.MACD(close=close)
df['MACD']     = macd_indicator.macd()
df['MACD_Sig'] = macd_indicator.macd_signal()
df['EMA_9']    = ta.trend.EMAIndicator(close=close, window=9).ema_indicator()
df['EMA_21']   = ta.trend.EMAIndicator(close=close, window=21).ema_indicator()

bb = ta.volatility.BollingerBands(close=close, window=20)
df['BB_High'] = bb.bollinger_hband()
df['BB_Low']  = bb.bollinger_lband()

# Plot only the trailing 365 calendar days. DataFrame.last() is deprecated
# since pandas 2.1; the strict '>' mask selects the same rows as .last('365D').
plot_start = df.index.max() - pd.Timedelta(days=365)
df_plot = df.loc[df.index > plot_start].copy()

fig, axes = plt.subplots(3, 1, figsize=(14, 12),
                         gridspec_kw={'height_ratios': [3, 1.5, 1.5]})

# --- Plot 1: Price + EMA + Bollinger ---
axes[0].plot(df_plot.index, df_plot['Close'],  color='#1B3A6B', lw=1.5, label='Close')
axes[0].plot(df_plot.index, df_plot['EMA_9'],  color='#2563EB', lw=1,   label='EMA 9',  linestyle='--')
axes[0].plot(df_plot.index, df_plot['EMA_21'], color='#F59E0B', lw=1,   label='EMA 21', linestyle='--')
axes[0].fill_between(df_plot.index, df_plot['BB_High'], df_plot['BB_Low'],
                     alpha=0.12, color='#2563EB', label='Bollinger Bands')
axes[0].set_title(f'{STOCKS[ticker]} ‚Äî Technical Indicators (Last 1 Year)', fontsize=13, fontweight='bold')
axes[0].set_ylabel('Price (‚Çπ)')
axes[0].legend(loc='upper left', fontsize=9)

# --- Plot 2: RSI ---
axes[1].plot(df_plot.index, df_plot['RSI'], color='#7C3AED', lw=1.5)
axes[1].axhline(70, color='#DC2626', linestyle='--', alpha=0.7, label='Overbought (70)')
axes[1].axhline(30, color='#059669', linestyle='--', alpha=0.7, label='Oversold (30)')
# Shade the area between RSI and the 50 midline: green above, red below
axes[1].fill_between(df_plot.index, df_plot['RSI'], 50,
                     where=df_plot['RSI'] > 50, alpha=0.1, color='#059669')
axes[1].fill_between(df_plot.index, df_plot['RSI'], 50,
                     where=df_plot['RSI'] < 50, alpha=0.1, color='#DC2626')
axes[1].set_ylim(0, 100)
axes[1].set_ylabel('RSI')
axes[1].legend(fontsize=9)

# --- Plot 3: MACD ---
# Histogram = MACD line minus signal line, computed once and reused for both
# the bar heights and the bar colours
macd_hist = df_plot['MACD'] - df_plot['MACD_Sig']
axes[2].plot(df_plot.index, df_plot['MACD'],     color='#2563EB', lw=1.5, label='MACD')
axes[2].plot(df_plot.index, df_plot['MACD_Sig'], color='#F59E0B', lw=1.5, label='Signal')
axes[2].bar(df_plot.index,
            macd_hist,
            color=['#059669' if v >= 0 else '#DC2626' for v in macd_hist],
            alpha=0.5, width=1, label='Histogram')
axes[2].axhline(0, color='black', lw=0.8)
axes[2].set_ylabel('MACD')
axes[2].set_xlabel('Date')
axes[2].legend(fontsize=9)

for ax in axes:
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.tight_layout()
plt.savefig('../data/technical_indicators.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Correlation Heatmap of Features

In [None]:
# Make the project's src/ directory importable from the notebook folder
import sys
sys.path.append('../src')
from features import add_technical_indicators, create_labels, prepare_feature_matrix

# Run the project feature pipeline on a copy of the raw data.
# NOTE(review): the helpers live in ../src/features.py and are not visible
# here; the code below relies on the result containing a 'Label' column.
df_feat = prepare_feature_matrix(
    create_labels(
        add_technical_indicators(df_raw.copy())
    )
)

# Correlation matrix
corr = df_feat.corr()
feature_names = corr.columns
tick_positions = range(len(feature_names))

fig, ax = plt.subplots(figsize=(14, 10))
# Fix the colour scale to [-1, 1], the range a correlation can take
im = ax.imshow(corr, cmap='RdYlBu', aspect='auto', vmin=-1, vmax=1)
plt.colorbar(im, ax=ax)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(feature_names, fontsize=9)
ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../data/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

# Top correlations with Label: drop the self-correlation, then rank the
# remaining features by absolute correlation
corr_with_label = corr['Label'].drop('Label')
label_corr = corr_with_label.abs().sort_values(ascending=False)
print('\nüîç Top 10 features correlated with Label:')
print(label_corr.head(10).to_string())

## 9. Class Label Distribution

In [None]:
# Count samples per class, ordered by label value
label_counts = df_feat['Label'].value_counts().sort_index()
label_names  = {-1: 'DOWN', 0: 'NEUTRAL', 1: 'UP'}
label_colors = {-1: '#DC2626', 0: '#D97706', 1: '#059669'}

# Look up names and colours by label value rather than by position, so a class
# that is absent from the data cannot shift the colours onto the wrong labels.
tick_labels = [label_names[lbl] for lbl in label_counts.index]
colors      = [label_colors[lbl] for lbl in label_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
bars = axes[0].bar(
    tick_labels,
    label_counts.values,
    color=colors, edgecolor='white', linewidth=1.5
)
# Annotate each bar with its count and its share of all rows in df_feat
for bar, val in zip(bars, label_counts.values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                 f'{val}\n({val/len(df_feat)*100:.1f}%)',
                 ha='center', va='bottom', fontsize=11)
axes[0].set_title('Label Distribution (Bar)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count')

# Pie chart
axes[1].pie(
    label_counts.values,
    labels=tick_labels,
    colors=colors, autopct='%1.1f%%',
    startangle=90, pctdistance=0.75,
    wedgeprops={'edgecolor': 'white', 'linewidth': 2}
)
axes[1].set_title('Label Distribution (Pie)', fontsize=12, fontweight='bold')

plt.suptitle('Class Imbalance Check', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../data/label_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚ö†Ô∏è  If classes are very imbalanced, we may need class_weight="balanced" in models!')

## 10. EDA Summary

In [None]:
# Final recap of the dataset built in this notebook.
# The missing-value count is measured on df_feat instead of being hard-coded,
# so the summary cannot claim a clean dataset when NaNs are still present.
missing_total = int(df_feat.isnull().sum().sum())
missing_note = '(cleaned)' if missing_total == 0 else '(NOT cleaned - handle before training)'

print('='*50)
print('     üìã EDA SUMMARY')
print('='*50)
print(f'  Total samples (Reliance):  {len(df_feat)}')
# Every column except 'Label' counts as a feature
print(f'  Total features:            {len(df_feat.columns) - 1}')
print(f'  Date range:                2020 - 2024')
print(f'  Missing values:            {missing_total} {missing_note}')
print()
print('  Label Distribution:')
for label, name in {-1: 'DOWN', 0: 'NEUTRAL', 1: 'UP'}.items():
    count = (df_feat['Label'] == label).sum()
    pct   = count / len(df_feat) * 100
    print(f'    {name:<10} {count:>5} samples  ({pct:.1f}%)')
print()
print('  Key Observations:')
print('  ‚úÖ Data fetched successfully from yfinance')
print('  ‚úÖ Technical indicators computed (RSI, MACD, BB, EMA, etc.)')
print('  ‚úÖ Labels created using 5-day forward returns')
print('  ‚ö†Ô∏è  Check class imbalance before training')
print('  ‚û°Ô∏è  Next: Feature Engineering (Notebook 2)')
print('='*50)