# Data Exploration and Preprocessing

This notebook explores market data, performs preprocessing, and validates data quality.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import DataLoader
from src.data.data_preprocessor import DataPreprocessor
from src.data.feature_engineer import FeatureEngineer

# Configure plotting: apply a matplotlib style sheet, set seaborn's default
# colour palette, and render figures inline in the notebook output.
# NOTE(review): the 'seaborn-v0_8-*' style names were introduced in
# matplotlib 3.6 — confirm the environment pins a compatible version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load Market Data

Load historical price data for analysis.

In [None]:
# Initialize data loader (use_cache=True is forwarded to DataLoader)
loader = DataLoader(use_cache=True)

# Load the same date range for three ETFs
symbols = ['SPY', 'QQQ', 'IWM']  # Large cap, Tech, Small cap
start_date = '2015-01-01'
end_date = '2023-12-31'

# symbol -> frame returned by loader.load_data
data = {}
for symbol in symbols:
    print(f"Loading {symbol}...")
    data[symbol] = loader.load_data(symbol, start_date=start_date, end_date=end_date)
    print(f"Loaded {len(data[symbol])} rows")

# Focus on SPY for detailed analysis.
# Take an independent copy: later cells add columns to `spy_data`, and without
# the copy those columns would also show up on data['SPY'] (the same object),
# silently changing the frame used by the multi-asset comparison.
spy_data = data['SPY'].copy()

## 2. Data Quality Check

In [None]:
# Overall shape, then a preview of both ends of the SPY frame
print("Data shape:", spy_data.shape)
for heading, preview in (
    ("First few rows:", spy_data.head()),
    ("Last few rows:", spy_data.tail()),
):
    print(f"\n{heading}")
    display(preview)

# Per-column count of missing values
missing_counts = spy_data.isnull().sum()
print("\nMissing values:")
print(missing_counts)

# Descriptive statistics as produced by DataFrame.describe()
summary_stats = spy_data.describe()
print("\nSummary statistics:")
display(summary_stats)

## 3. Visualize Price Data

In [None]:
# Two stacked panels: closing price on top, traded volume underneath
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 8))
price_ax, volume_ax = axes
dates = spy_data.index

# Top panel: close price line
price_ax.plot(dates, spy_data['close'], label='Close Price', linewidth=1.5)
price_ax.set_title('SPY Price History', fontsize=14, fontweight='bold')
price_ax.set_ylabel('Price ($)', fontsize=12)
price_ax.legend()

# Bottom panel: one bar per row of the index
volume_ax.bar(dates, spy_data['volume'], width=1, alpha=0.7, color='steelblue')
volume_ax.set_title('Trading Volume', fontsize=14, fontweight='bold')
volume_ax.set_ylabel('Volume', fontsize=12)
volume_ax.set_xlabel('Date', fontsize=12)

# Same faint grid on both panels
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Returns Analysis

In [None]:
# Calculate simple and log daily returns from the close price.
# .assign() builds a new frame instead of writing columns into the existing
# object, so the frame loaded earlier is not modified by this cell and the
# cell gives the same result every time it is re-run.
close = spy_data['close']
spy_data = spy_data.assign(
    returns=close.pct_change(),
    log_returns=np.log(close / close.shift(1)),
)

# The first return is NaN (no previous close); drop it once and reuse the
# series for the histogram and every statistic below.
spy_returns = spy_data['returns'].dropna()
mean_return = spy_returns.mean()

# Plot returns distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Returns time series
axes[0].plot(spy_data.index, spy_data['returns'], alpha=0.6, linewidth=0.8)
axes[0].axhline(y=0, color='r', linestyle='--', alpha=0.3)
axes[0].set_title('Daily Returns', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Return', fontsize=12)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].grid(True, alpha=0.3)

# Returns distribution with the sample mean marked
axes[1].hist(spy_returns, bins=100, alpha=0.7, edgecolor='black')
axes[1].axvline(x=mean_return, color='r', linestyle='--',
                label=f'Mean: {mean_return:.4f}')
axes[1].set_title('Returns Distribution', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Return', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print statistics.
# Note: pandas' kurtosis() uses Fisher's definition (normal distribution == 0),
# i.e. the value printed is excess kurtosis.
print("Returns Statistics:")
print(f"Mean:       {mean_return:.4f}")
print(f"Std Dev:    {spy_returns.std():.4f}")
print(f"Skewness:   {spy_returns.skew():.4f}")
print(f"Kurtosis:   {spy_returns.kurtosis():.4f}")
print(f"Min:        {spy_returns.min():.4f}")
print(f"Max:        {spy_returns.max():.4f}")

## 5. Data Preprocessing

In [None]:
# Run the project's cleaning step on the SPY frame
preprocessor = DataPreprocessor()
clean_data = preprocessor.clean_data(spy_data)

# Report how many rows the cleaning step removed
rows_before, rows_after = len(spy_data), len(clean_data)
print(f"Original data: {rows_before} rows")
print(f"Clean data: {rows_after} rows")
print(f"Removed: {rows_before - rows_after} rows")

# Confirm what is still missing, column by column
remaining_nulls = clean_data.isnull().sum()
print("\nMissing values after cleaning:")
print(remaining_nulls)

## 6. Feature Engineering Preview

In [None]:
# Build the feature set from the cleaned SPY data
engineer = FeatureEngineer(windows=[10, 20, 50])
features = engineer.create_features(clean_data)

# List every generated column
feature_names = list(features.columns)
print(f"Number of features created: {len(feature_names)}")
print("\nFeature columns:")
for name in feature_names:
    print(f"  - {name}")

# Correlation heatmap over a hand-picked subset of features;
# rows with any missing value in the subset are dropped first.
preview_columns = ['returns', 'volatility_20', 'rsi_14', 'macd', 'atr_14']
sample_features = features[preview_columns].dropna()
correlation_matrix = sample_features.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix (Sample)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Multi-Asset Comparison

In [None]:
# Compare performance across assets: rebased price level (left) and
# cumulative return (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for symbol, df in data.items():
    close = df['close']

    # Price rebased so that each asset's first close equals 100
    normalized = close / close.iloc[0] * 100
    axes[0].plot(df.index, normalized, label=symbol, linewidth=1.5)

    # Cumulative return: compound the daily returns, then subtract 1 so the
    # series is a return (0.25 == +25%) rather than a growth factor (1.25).
    # The first value is NaN because there is no prior close.
    returns = close.pct_change()
    cumulative = (1 + returns).cumprod() - 1
    axes[1].plot(df.index, cumulative, label=symbol, linewidth=1.5)

axes[0].set_title('Normalized Price Performance', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Normalized Price (Base=100)', fontsize=12)
axes[0].set_xlabel('Date', fontsize=12)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].set_title('Cumulative Returns', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Cumulative Return', fontsize=12)
axes[1].set_xlabel('Date', fontsize=12)
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Next Steps

- Proceed to `02_regime_analysis.ipynb` for regime detection
- Features are ready for machine learning models
- Data quality has been validated