# Data Exploration and Analysis

This notebook provides comprehensive data exploration for algorithmic trading:

1. **Data Collection** - Load market data from various sources
2. **Price Analysis** - Visualize price movements and patterns
3. **Technical Indicators** - Calculate and analyze indicators
4. **Correlation Analysis** - Study relationships between assets
5. **Volatility Analysis** - Examine risk characteristics
6. **Data Quality** - Check for issues and anomalies

In [None]:
# Import required libraries
import sys
import os
# Extend the import search path with '../src' (resolved relative to the
# working directory the notebook runs in).
# NOTE(review): presumably this is where the `api` and `indicators` packages
# imported below live — confirm against the repository layout.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silences every warning category for the rest of the session, including
# deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Import our custom modules
from api.data_collector import DataCollector
from api.data_preprocessor import DataPreprocessor
from indicators.technical_indicators import TechnicalIndicators

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline

## 1. Data Collection

Let's start by collecting market data for multiple assets.

In [None]:
# Helpers used by the following cells: one object fetches raw market data,
# the other cleans it.
collector = DataCollector()
preprocessor = DataPreprocessor()

# Universe of tickers and the date window requested for each of them
symbols = [
    'AAPL',
    'GOOGL',
    'MSFT',
    'TSLA',
    'AMZN',
    'NVDA',
]
start_date, end_date = '2022-01-01', '2024-01-01'

print(f"Collecting data for {len(symbols)} symbols from {start_date} to {end_date}")

In [None]:
# Fetch, clean and enrich the history of every requested symbol.
# Only symbols that returned at least one row end up in `data_dict`.
data_dict = {}

for symbol in symbols:
    print(f"Fetching data for {symbol}...")
    data = collector.get_historical_data(symbol, start_date, end_date, 'yahoo')

    # Nothing came back for this ticker: report it and move on
    if data.empty:
        print(f"  ✗ No data found for {symbol}")
        continue

    # Clean first, then add the indicator columns to the cleaned frame
    data = preprocessor.clean_data(data)
    data = TechnicalIndicators.add_all_indicators(data)

    data_dict[symbol] = data
    print(f"  ✓ {len(data)} records collected for {symbol}")

print(f"\nSuccessfully collected data for {len(data_dict)} symbols")

In [None]:
# Print a short profile of each collected frame: dimensions, covered dates,
# traded price range and the total number of missing cells.
for symbol, data in data_dict.items():
    print(f"\n{symbol}:")
    print(f"  Shape: {data.shape}")

    timestamps = data['datetime']
    print(f"  Date range: {timestamps.min()} to {timestamps.max()}")

    lowest_low = data['low'].min()
    highest_high = data['high'].max()
    print(f"  Price range: ${lowest_low:.2f} - ${highest_high:.2f}")

    # Sum over columns, then over the per-column counts
    missing_cells = data.isnull().sum().sum()
    print(f"  Missing values: {missing_cells}")

## 2. Price Analysis and Visualization

In [None]:
# Create a combined price dataframe for easier analysis.
#
# All close series are handed to the DataFrame constructor at once so that
# rows are aligned on the union of every symbol's dates. Assigning the series
# one column at a time into an empty frame would pin the index to the first
# symbol's dates and silently drop any date that only other symbols have.
# Dates a symbol has no price for show up as NaN in that symbol's column.
price_data = pd.DataFrame({
    symbol: data.set_index('datetime')['close']
    for symbol, data in data_dict.items()
})

# Display first few rows
print("Combined Price Data:")
print(price_data.head())
print(f"\nShape: {price_data.shape}")

In [None]:
# Interactive Plotly line chart of the closing price of every collected symbol
fig = go.Figure()

# Iterate over `symbols` (not the dict) so traces follow the requested order;
# tickers without collected data are skipped.
for symbol in symbols:
    if symbol not in data_dict:
        continue

    data = data_dict[symbol]
    close_line = go.Scatter(
        x=data['datetime'],
        y=data['close'],
        mode='lines',
        name=symbol,
        line={'width': 2},
    )
    fig.add_trace(close_line)

layout_options = {
    'title': 'Stock Price Evolution',
    'xaxis_title': 'Date',
    'yaxis_title': 'Price ($)',
    'hovermode': 'x unified',
    'height': 600,
}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Normalized price chart (starting from 100)
#
# Each column is rebased on its own first valid price rather than on the first
# row of the frame: if a symbol has no price on the first date, dividing by
# `price_data.iloc[0]` would turn its whole column into NaN. Back-filling and
# taking the first row yields exactly that first valid price per column, and
# is identical to `price_data.iloc[0]` when the first row is complete.
if price_data.empty:
    # Nothing to rebase; `.iloc[0]` would raise IndexError on an empty frame.
    print("No price data available to normalize")
    normalized_prices = price_data.copy()
else:
    base_prices = price_data.bfill().iloc[0]
    normalized_prices = (price_data / base_prices) * 100

fig = go.Figure()

for symbol in normalized_prices.columns:
    fig.add_trace(go.Scatter(
        x=normalized_prices.index,
        y=normalized_prices[symbol],
        mode='lines',
        name=symbol,
        line=dict(width=2)
    ))

fig.update_layout(
    title='Normalized Price Performance (Base=100)',
    xaxis_title='Date',
    yaxis_title='Normalized Price',
    hovermode='x unified',
    height=600
)

fig.show()

In [None]:
# Candlestick chart with a volume panel for a single symbol
symbol_to_plot = 'AAPL'  # Change this to analyze different symbols

if symbol_to_plot not in data_dict:
    print(f"No data available for {symbol_to_plot}")
else:
    data = data_dict[symbol_to_plot]

    # Two stacked panels sharing the date axis: row 1 holds the candlesticks,
    # row 2 the volume bars.
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        subplot_titles=(f'{symbol_to_plot} Price', 'Volume'),
        row_width=[0.2, 0.7]
    )

    price_candles = go.Candlestick(
        x=data['datetime'],
        open=data['open'],
        high=data['high'],
        low=data['low'],
        close=data['close'],
        name='Price',
    )
    volume_bars = go.Bar(
        x=data['datetime'],
        y=data['volume'],
        name='Volume',
        marker_color='rgba(158,202,225,0.8)',
    )

    fig.add_trace(price_candles, row=1, col=1)
    fig.add_trace(volume_bars, row=2, col=1)

    fig.update_layout(
        title=f'{symbol_to_plot} Price and Volume Analysis',
        xaxis_rangeslider_visible=False,
        height=700,
    )

    fig.show()

## 3. Technical Indicators Analysis

In [None]:
# Four-panel indicator dashboard for a single symbol
symbol_to_analyze = 'AAPL'

if symbol_to_analyze not in data_dict:
    print(f"No data available for {symbol_to_analyze}")
else:
    data = data_dict[symbol_to_analyze]

    # One subplot per indicator family, all sharing the date axis
    fig = make_subplots(
        rows=4, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.02,
        subplot_titles=(
            f'{symbol_to_analyze} Price with Moving Averages',
            'RSI',
            'MACD',
            'Bollinger Bands Position'
        ),
        row_heights=[0.4, 0.2, 0.2, 0.2]
    )

    # (column, legend name, line colour, subplot row), listed in drawing order
    line_specs = [
        ('close', 'Close Price', 'black', 1),
        ('sma_20', 'SMA 20', 'blue', 1),
        ('sma_50', 'SMA 50', 'red', 1),
        ('rsi', 'RSI', 'purple', 2),
        ('macd', 'MACD', 'blue', 3),
        ('macd_signal', 'Signal', 'red', 3),
        ('bb_position', 'BB Position', 'orange', 4),
    ]
    for column, label, colour, row in line_specs:
        fig.add_trace(
            go.Scatter(x=data['datetime'], y=data[column], name=label, line=dict(color=colour)),
            row=row, col=1
        )

    # Dashed reference levels: (y value, line colour, subplot row).
    # They are drawn after the traces so each target subplot already has data.
    level_specs = [
        (70, "red", 2),
        (30, "green", 2),
        (0.8, "red", 4),
        (0.2, "green", 4),
    ]
    for level, colour, row in level_specs:
        fig.add_hline(y=level, line_dash="dash", line_color=colour, row=row, col=1)

    fig.update_layout(
        height=1000,
        title=f'Technical Analysis for {symbol_to_analyze}',
        showlegend=True
    )

    fig.show()

In [None]:
# Descriptive statistics and latest reading of the indicators that are present.
# Relies on `symbol_to_analyze` set in the indicator chart cell.
if symbol_to_analyze in data_dict:
    data = data_dict[symbol_to_analyze]

    indicators = ['rsi', 'macd', 'bb_position', 'stoch_k', 'williams_r', 'cci']
    # Keep only the indicator columns that actually exist in this frame
    available_indicators = []
    for ind in indicators:
        if ind in data.columns:
            available_indicators.append(ind)

    if not available_indicators:
        print("No technical indicators found in the data")
    else:
        print(f"Technical Indicators Summary for {symbol_to_analyze}:")
        print("=" * 60)
        summary = data[available_indicators].describe()
        print(summary.round(2))

        # Most recent row; a missing reading is reported as "N/A"
        print("\nCurrent Indicator Values:")
        print("-" * 30)
        for ind in available_indicators:
            latest = data[ind].iloc[-1]
            current_val = "N/A" if pd.isna(latest) else latest
            print(f"{ind.upper()}: {current_val}")

## 4. Returns and Correlation Analysis

In [None]:
# Daily simple returns per symbol; rows with a missing value in any column
# (including the first row, which has no previous price) are dropped.
returns_data = price_data.pct_change().dropna()

# Display basic statistics
print("Daily Returns Statistics:")
print("=" * 50)
print(returns_data.describe())

# Annualized statistics: mean scaled by 252, standard deviation scaled by
# sqrt(252). The ratio below uses the raw annual return (no risk-free rate).
print("\nAnnualized Statistics:")
print("-" * 30)
for symbol, daily_returns in returns_data.items():
    ann_return = daily_returns.mean() * 252
    ann_volatility = daily_returns.std() * np.sqrt(252)

    # Report 0 rather than dividing by a zero volatility
    if ann_volatility != 0:
        sharpe_ratio = ann_return / ann_volatility
    else:
        sharpe_ratio = 0

    print(f"{symbol}:")
    print(f"  Annual Return: {ann_return:.2%}")
    print(f"  Annual Volatility: {ann_volatility:.2%}")
    print(f"  Sharpe Ratio: {sharpe_ratio:.2f}")
    print()

In [None]:
# Pairwise correlation of daily returns between all symbols
correlation_matrix = returns_data.corr()

# Annotated heatmap; the colour scale is centred on zero correlation
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
}
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Daily Returns Correlation Matrix')
plt.tight_layout()
plt.show()

# Same numbers as text, rounded to three decimals
print("\nCorrelation Matrix:")
print(correlation_matrix.round(3))

In [None]:
# Rolling correlation of every other symbol's returns against a reference symbol
window = 60  # 60-day rolling window
base_symbol = 'AAPL'  # Reference symbol

if base_symbol not in returns_data.columns:
    print(f"No data available for {base_symbol}")
else:
    fig = go.Figure()
    base_returns = returns_data[base_symbol]

    for symbol in returns_data.columns:
        # The reference symbol is not compared with itself
        if symbol == base_symbol:
            continue

        rolling_corr = base_returns.rolling(window).corr(returns_data[symbol])
        pair_line = go.Scatter(
            x=rolling_corr.index,
            y=rolling_corr,
            mode='lines',
            name=f'{base_symbol}-{symbol}',
        )
        fig.add_trace(pair_line)

    fig.update_layout(
        title=f'{window}-Day Rolling Correlation with {base_symbol}',
        xaxis_title='Date',
        yaxis_title='Correlation',
        height=500,
    )

    fig.show()

## 5. Volatility Analysis

In [None]:
# Rolling standard deviation of daily returns, scaled by sqrt(252) to annualize
volatility_window = 30  # 30-day rolling volatility
rolling_std = returns_data.rolling(volatility_window).std()
rolling_volatility = rolling_std * np.sqrt(252)

# One line per symbol
fig = go.Figure()

for symbol in rolling_volatility.columns:
    volatility_line = go.Scatter(
        x=rolling_volatility.index,
        y=rolling_volatility[symbol],
        mode='lines',
        name=symbol,
    )
    fig.add_trace(volatility_line)

layout_options = {
    'title': f'{volatility_window}-Day Rolling Volatility (Annualized)',
    'xaxis_title': 'Date',
    'yaxis_title': 'Volatility',
    'height': 500,
}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Distribution of daily returns: one histogram per symbol on a 2x3 grid,
# each overlaid with a normal density using that symbol's mean and variance.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# The grid has six panels, so at most the first six symbols are drawn
for ax, symbol in zip(axes, returns_data.columns[:6]):
    symbol_returns = returns_data[symbol]

    ax.hist(symbol_returns, bins=50, alpha=0.7, density=True)
    ax.set_title(f'{symbol} Returns Distribution')
    ax.set_xlabel('Daily Return')
    ax.set_ylabel('Density')

    # Normal density evaluated on 100 points spanning the observed range
    x = np.linspace(symbol_returns.min(), symbol_returns.max(), 100)
    scale = 1/np.sqrt(2*np.pi*symbol_returns.var())
    z_scores = (x-symbol_returns.mean())/symbol_returns.std()
    normal_dist = scale * np.exp(-0.5*z_scores**2)
    ax.plot(x, normal_dist, 'r-', linewidth=2, label='Normal')
    ax.legend()

plt.tight_layout()
plt.show()

## 6. Data Quality Analysis

In [None]:
# Per-symbol data quality report: size, gaps, duplicate timestamps,
# large daily moves, zero-volume rows and OHLC ordering violations.
print("DATA QUALITY REPORT")
print("=" * 50)

for symbol, data in data_dict.items():
    print(f"\n{symbol}:")
    print(f"  Total records: {len(data)}")
    print(f"  Missing values: {data.isnull().sum().sum()}")
    print(f"  Duplicate dates: {data['datetime'].duplicated().sum()}")

    # Absolute day-over-day change of the close; more than 20% counts as extreme
    price_changes = data['close'].pct_change().abs()
    extreme_changes = price_changes.gt(0.2).sum()
    print(f"  Extreme price changes (>20%): {extreme_changes}")

    # Volume is checked only when the column exists
    if 'volume' in data.columns:
        zero_volume_days = data['volume'].eq(0).sum()
        print(f"  Zero volume days: {zero_volume_days}")

    # A row is inconsistent when high is below low, or when open or close
    # falls outside the [low, high] range
    high_below_low = data['high'] < data['low']
    close_above_high = data['close'] > data['high']
    close_below_low = data['close'] < data['low']
    open_above_high = data['open'] > data['high']
    open_below_low = data['open'] < data['low']
    inconsistent_rows = (high_below_low |
                         close_above_high |
                         close_below_low |
                         open_above_high |
                         open_below_low)
    ohlc_issues = inconsistent_rows.sum()
    print(f"  OHLC inconsistencies: {ohlc_issues}")

In [None]:
# Bar chart and table of the share of missing cells per symbol
if data_dict:
    # One summary row per symbol
    missing_data_summary = []

    for symbol, data in data_dict.items():
        missing_cells = data.isnull().sum().sum()
        total_cells = len(data) * len(data.columns)
        missing_data_summary.append({
            'Symbol': symbol,
            'Total Records': len(data),
            'Missing Values': missing_cells,
            'Missing %': (missing_cells / total_cells) * 100,
        })

    missing_df = pd.DataFrame(missing_data_summary)

    # Plot missing data summary
    fig = px.bar(
        missing_df,
        x='Symbol',
        y='Missing %',
        title='Missing Data Percentage by Symbol',
        labels={'Missing %': 'Missing Data (%)'},
    )
    fig.show()

    print("\nMissing Data Summary:")
    print(missing_df)

## Summary and Insights

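Based on the analysis above, we can draw several insights (check each one against the actual outputs of your run):

1. **Price Trends**: Examine the normalized price chart (base = 100) to compare the relative performance of the symbols
2. **Correlation Patterns**: High correlation between tech stocks suggests sector risk, i.e. limited diversification benefit from holding them together
3. **Volatility Regimes**: Identify periods of high and low volatility in the rolling volatility chart
4. **Technical Indicators**: RSI, MACD, and Bollinger Bands provide trading signals
5. **Data Quality**: Most data appears clean with minimal missing values; verify this against the data quality report and the missing data summary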

This analysis forms the foundation for strategy development and backtesting.

In [None]:
# Save processed data for further analysis.
#
# Writes one CSV per symbol (without the row index) plus the combined price
# and return frames (with their index) into `output_dir`.
print("Saving processed data...")

output_dir = '../data/processed'
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists; `exist_ok=True` keeps re-runs of this cell from failing.
os.makedirs(output_dir, exist_ok=True)

for symbol, data in data_dict.items():
    filename = f'{output_dir}/{symbol}_processed.csv'
    data.to_csv(filename, index=False)
    print(f"  ✓ Saved {symbol} data to {filename}")

# Save combined price data
price_data.to_csv(f'{output_dir}/combined_prices.csv')
returns_data.to_csv(f'{output_dir}/combined_returns.csv')

print("\n✓ All data saved successfully!")