# Exploratory Data Analysis for Stock Price Prediction

This notebook explores the stock price dataset and performs initial analysis to understand the data characteristics.

In [None]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project root importable so that `src.*` modules resolve.
# A notebook has no `__file__`, so the root is taken explicitly as the parent of
# the current working directory (the directory the common quoted-'__file__'
# trick also ends up resolving to).
project_root = os.path.dirname(os.path.abspath(os.getcwd()))
if project_root not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(project_root)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Set pandas display options: show every column and use a wide console width
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

## 1. Load the Dataset

First, let's load the stock price dataset.

In [None]:
# Define paths.
# A notebook has no `__file__`, so the data directory is located relative to the
# current working directory: <parent of cwd>/data/raw. This is the location the
# common quoted-'__file__' trick resolves to, stated explicitly.
data_dir = os.path.join(os.path.dirname(os.path.abspath(os.getcwd())), 'data')
raw_dir = os.path.join(data_dir, 'raw')

# Load stock data (project-local loader; requires the project root on sys.path)
from src.data_acquisition import load_stock_data

stock_data_path = os.path.join(raw_dir, 'stock_data.csv')
stock_data = load_stock_data(stock_data_path)

# Display the first few rows
stock_data.head()

## 2. Basic Data Exploration

Let's explore the basic characteristics of the dataset.

In [None]:
# Dimensions of the dataset
print(f"Dataset shape: {stock_data.shape}")

# Column dtypes and per-column missing-value counts, each under its own heading
text_reports = (
    ("\nData types:", stock_data.dtypes),
    ("\nMissing values:", stock_data.isnull().sum()),
)
for heading, report in text_reports:
    print(heading)
    print(report)

# The descriptive statistics are left as the cell's last expression so they
# render as a rich table instead of plain text.
print("\nSummary statistics:")
stock_data.describe()

## 3. Time Series Visualization

Let's visualize the stock prices over time.

In [None]:
# Draw every column of the price table as its own line on one shared axis
fig, price_ax = plt.subplots(figsize=(14, 8))

for ticker in stock_data.columns:
    price_ax.plot(stock_data.index, stock_data[ticker], label=ticker)

price_ax.set_title('Stock Prices Over Time')
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Price')
price_ax.legend()
price_ax.grid(True)
plt.show()

## 4. Correlation Analysis

Let's analyze the correlation between different stocks.

In [None]:
# Pairwise correlation between the columns of the price table
correlation_matrix = stock_data.corr()

# Render the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.figure(figsize=(10, 8))
corr_ax = sns.heatmap(correlation_matrix, **heatmap_options)
corr_ax.set_title('Correlation Matrix of Stock Prices')
plt.show()

## 5. Return Analysis

Let's calculate and analyze daily returns.

In [None]:
# Percentage change between consecutive rows; rows containing any NaN
# (including the first row, which has no predecessor) are dropped.
price_changes = stock_data.pct_change()
returns = price_changes.dropna()

# One semi-transparent line per column so overlapping series stay readable
fig, returns_ax = plt.subplots(figsize=(14, 8))

for ticker in returns.columns:
    returns_ax.plot(returns.index, returns[ticker], label=ticker, alpha=0.7)

returns_ax.set_title('Daily Returns')
returns_ax.set_xlabel('Date')
returns_ax.set_ylabel('Return')
returns_ax.legend()
returns_ax.grid(True)
plt.show()

## 6. Return Distribution

Let's analyze the distribution of returns.

In [None]:
# Plot return distributions, one histogram (with KDE overlay) per column.
# The grid is sized from the number of columns so that datasets with more than
# six stocks do not overflow a fixed 2x3 layout.
n_panels = len(returns.columns)
grid_cols = 3
grid_rows = max(1, int(np.ceil(n_panels / grid_cols)))

# 4 inches of height per row keeps the two-row case at a 14x8 figure.
# squeeze=False guarantees a 2-D axes array even for a single row.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(14, 4 * grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, returns.columns):
    sns.histplot(returns[column], kde=True, ax=ax)
    ax.set_title(f'{column} Return Distribution')
    ax.set_xlabel('Return')
    ax.set_ylabel('Frequency')

# Hide the unused panels left over in the last row of the grid.
for ax in flat_axes[n_panels:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

## 7. Volatility Analysis

Let's calculate and visualize the volatility of each stock.

In [None]:
# Calculate rolling volatility (standard deviation of daily returns)
window_size = 20  # 20-day rolling window
trading_days_per_year = 252  # conventional trading-day count used for annualization

# Annualize by scaling the daily standard deviation with sqrt(trading days per
# year). Scaling by sqrt(window_size) would express volatility over the 20-day
# window instead of per year.
volatility = returns.rolling(window=window_size).std() * np.sqrt(trading_days_per_year)

# Plot volatility; the first window_size - 1 rows are NaN and are simply not drawn
plt.figure(figsize=(14, 8))

for column in volatility.columns:
    plt.plot(volatility.index, volatility[column], label=column)

plt.title(f'{window_size}-Day Rolling Volatility')
plt.xlabel('Date')
plt.ylabel('Volatility')
plt.legend()
plt.grid(True)
plt.show()

## 8. Autocorrelation Analysis

Let's analyze the autocorrelation of stock prices and returns.

In [None]:
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the first stock: once for its prices, once for its returns.
# Each entry pairs the series to analyse with the title of its figure.
acf_panels = [
    (stock_data.iloc[:, 0], f'Autocorrelation of {stock_data.columns[0]} Prices'),
    (returns.iloc[:, 0], f'Autocorrelation of {returns.columns[0]} Returns'),
]

for series, panel_title in acf_panels:
    plt.figure(figsize=(14, 6))
    autocorrelation_plot(series)
    plt.title(panel_title)
    plt.grid(True)
    plt.show()

## 9. Feature Engineering Preview

Let's preview some technical indicators that we'll use for feature engineering.

In [None]:
from src.feature_engineering import add_technical_indicators

# For this example, let's create a dataframe with OHLCV structure.
# The first stock is used as the Close price; every other column is synthetic.
close_prices = stock_data.iloc[:, 0].values

# Seeded generator so the synthetic volume (and every indicator derived from it)
# is identical on each Restart & Run All.
rng = np.random.default_rng(42)

ohlcv_data = pd.DataFrame({
    'Open': close_prices * 0.99,   # Slightly lower than Close
    'High': close_prices * 1.02,   # Slightly higher than Close
    'Low': close_prices * 0.98,    # Slightly lower than Close
    'Close': close_prices,
    'Volume': rng.normal(1000000, 200000, len(stock_data))  # Random volume
}, index=stock_data.index)

# Add technical indicators (project-local helper)
tech_data = add_technical_indicators(ohlcv_data, include_all=True)

# Display the first few rows with technical indicators
tech_data.head()

## 10. Visualize Technical Indicators

Let's visualize some of the technical indicators.

In [None]:
# Close price together with the SMA_20 and SMA_50 columns
fig, sma_ax = plt.subplots(figsize=(14, 6))
sma_lines = (('Close', 'Close Price'), ('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50'))
for column_name, legend_label in sma_lines:
    sma_ax.plot(tech_data.index, tech_data[column_name], label=legend_label)
sma_ax.set_title('Price with Simple Moving Averages')
sma_ax.set_xlabel('Date')
sma_ax.set_ylabel('Price')
sma_ax.legend()
sma_ax.grid(True)
plt.show()

# RSI_14 column with dashed reference lines at 70 (red) and 30 (green)
fig, rsi_ax = plt.subplots(figsize=(14, 6))
rsi_ax.plot(tech_data.index, tech_data['RSI_14'])
for level, line_color in ((70, 'r'), (30, 'g')):
    rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5)
rsi_ax.set_title('Relative Strength Index (RSI)')
rsi_ax.set_xlabel('Date')
rsi_ax.set_ylabel('RSI')
rsi_ax.grid(True)
plt.show()

# MACD line and signal line as curves, histogram column as translucent bars
fig, macd_ax = plt.subplots(figsize=(14, 6))
macd_ax.plot(tech_data.index, tech_data['MACD_12_26_9'], label='MACD')
macd_ax.plot(tech_data.index, tech_data['MACDs_12_26_9'], label='Signal')
macd_ax.bar(tech_data.index, tech_data['MACDh_12_26_9'], label='Histogram', alpha=0.5)
macd_ax.set_title('Moving Average Convergence Divergence (MACD)')
macd_ax.set_xlabel('Date')
macd_ax.set_ylabel('MACD')
macd_ax.legend()
macd_ax.grid(True)
plt.show()

## 11. Stationarity Analysis

Let's check if the time series is stationary using the Augmented Dickey-Fuller test.

In [None]:
from statsmodels.tsa.stattools import adfuller

def adf_test(series, title=''):
    """Run the Augmented Dickey-Fuller test on ``series`` and print a report.

    Args:
        series: Object exposing ``dropna()`` (e.g. a pandas Series); missing
            values are dropped before the test is run.
        title: Label printed in the report header.

    Prints the test statistic, the p-value, the critical values and a
    conclusion based on a 0.05 p-value threshold. Returns None.
    """
    outcome = adfuller(series.dropna())
    # Positions 0, 1 and 4 of the result hold the statistic, the p-value and
    # the critical-value mapping respectively.
    statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]

    print(f'ADF Test for {title}')
    print('ADF Statistic: %f' % statistic)
    print('p-value: %f' % p_value)
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print('\t%s: %.3f' % (level, threshold))

    # Interpret the result
    is_stationary = p_value <= 0.05
    verdict = (
        "Conclusion: The series is stationary (reject the null hypothesis)"
        if is_stationary
        else "Conclusion: The series is non-stationary (fail to reject the null hypothesis)"
    )
    print(verdict)
    print('\n')

# Run the stationarity report for the first stock twice: on its price series
# and on its return series. Each entry pairs a series with its report label.
adf_inputs = (
    (stock_data.iloc[:, 0], f'{stock_data.columns[0]} Price'),
    (returns.iloc[:, 0], f'{returns.columns[0]} Returns'),
)
for series, report_label in adf_inputs:
    adf_test(series, title=report_label)

## 12. Seasonal Decomposition

Let's decompose the time series into trend, seasonal, and residual components.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the first stock (missing values dropped), 30-observation period
first_stock_prices = stock_data.iloc[:, 0].dropna()
decomposition = seasonal_decompose(first_stock_prices, model='additive', period=30)

# One stacked panel per component, in top-to-bottom display order
component_panels = [
    ('Observed', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonal', decomposition.seasonal),
    ('Residual', decomposition.resid),
]

plt.figure(figsize=(14, 10))

for position, (panel_title, component) in enumerate(component_panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(component)
    plt.title(panel_title)
    plt.grid(True)

plt.tight_layout()
plt.show()

## 13. Feature Importance Preview

Let's preview feature importance using a simple model.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Prepare features and target
from src.feature_engineering import prepare_features

# Prepare features for the first stock
target_col = stock_data.columns[0]
forecast_horizon = 5  # passed to prepare_features; also determines the target column name below

processed_data, _ = prepare_features(
    stock_data,
    target_col=target_col,
    include_technical=False,  # Set to False since we don't have OHLCV data
    include_statistical=True,
    include_lags=True,
    normalize=False,  # No normalization for feature importance
    reduce_dim=False,
    forecast_horizon=forecast_horizon
)

# Drop rows with NaN values
processed_data = processed_data.dropna()

# Split features and target.
# NOTE(review): assumes prepare_features names the target column
# 'Target_<forecast_horizon>' ('Target_5' for the horizon used here) - confirm
# against src.feature_engineering before changing the horizon.
target_name = f'Target_{forecast_horizon}'
X = processed_data.drop(columns=[target_name])
y = processed_data[target_name]

# Train a Random Forest model (fixed random_state so importances are reproducible)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importance, most important first
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
plt.title('Top 15 Feature Importance')
plt.grid(True)
plt.show()

## 14. Conclusion

In this notebook, we've explored the stock price dataset and performed initial analysis to understand the data characteristics. We've also previewed some of the technical indicators and feature engineering techniques that we'll use for our LSTM model.

Key findings:
- The dataset contains multiple stock prices over time
- We've analyzed the correlation between different stocks
- We've calculated and visualized daily returns and volatility
- We've checked for stationarity and performed seasonal decomposition
- We've previewed technical indicators and feature importance

Next steps:
- Implement the full feature engineering pipeline
- Prepare sequences for LSTM model
- Train and evaluate the LSTM model
- Implement the attention mechanism
- Evaluate model performance and interpretability