# Data Loading and Analysis

This notebook provides a comprehensive guide to loading, cleaning, and analyzing financial data using the Volatility Forecasting toolkit.

## Topics Covered
1. Loading data from multiple sources
2. Data cleaning and validation
3. Handling missing values and outliers
4. Returns calculation and analysis
5. Statistical tests and diagnostics

Let's dive in! 📊

## Setup

In [None]:
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as sp_stats
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from data_loader import DataLoader, fetch_prices, load_prices
from returns import ReturnsCalculator, compute_log_returns, compute_simple_returns
from utils import setup_plot_style, validate_dataframe

# Run the project's plot-style helper (imported from utils), then set the
# default figure size for plots that do not pass their own figsize.
setup_plot_style()
plt.rcParams.update({'figure.figsize': (14, 6)})

print("‚úÖ Setup complete!")

## 1. Loading Data from Yahoo Finance

The easiest way to get started is loading data from Yahoo Finance.

In [None]:
# Tickers and look-back period requested from the loader.
tickers = ['AAPL', 'MSFT', 'GOOGL']
period = '1y'

ticker_list = ', '.join(tickers)
print(f"üì• Loading data for {ticker_list}...")
print(f"üìÖ Period: {period}\n")

# Fetch prices through the project's DataLoader using a '1d' interval.
loader = DataLoader()
prices = loader.load_from_yfinance(tickers=tickers, period=period, interval='1d')

print(f"\n‚úÖ Data loaded!")
print(f"üìä Shape: {prices.shape}")

# First and last index entries give the covered date range.
first_date = prices.index[0].date()
last_date = prices.index[-1].date()
print(f"üìÖ Date range: {first_date} to {last_date}")

In [None]:
# Quick look at the data: head, tail and a structural summary.
print("üìä First few rows:")
print(prices.head())

print("\nüìä Last few rows:")
print(prices.tail())

print("\nüìä Data info:")
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
prices.info()

In [None]:
# Visualize all price series, one stacked panel per ticker.
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so a
# single-ticker list needs no special-casing.
fig, axes = plt.subplots(len(tickers), 1, figsize=(14, 4*len(tickers)), squeeze=False)

# Base palette; later plotting cells read this same `colors` list.
colors = ['#2E86AB', '#A23B72', '#F18F01']

for idx, (ax, ticker) in enumerate(zip(axes.ravel(), tickers)):
    # Cycle through the palette so a ticker list longer than the palette
    # cannot raise IndexError.
    line_color = colors[idx % len(colors)]
    ax.plot(prices.index, prices[ticker], linewidth=2, color=line_color, label=ticker)
    ax.set_title(f'{ticker} Price History', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date', fontsize=11)
    ax.set_ylabel('Price ($)', fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.legend(fontsize=10)

plt.tight_layout()
plt.show()

## 2. Data Cleaning

Real-world data often contains missing values, outliers, and other issues. Let's clean it up!

In [None]:
# Count NaNs per column, both as raw counts and as a share of all rows.
print("üîç Checking for missing values...\n")

n_rows = len(prices)
missing_count = prices.isna().sum()
print("Missing values per ticker:")
print(missing_count)

missing_pct = (missing_count / n_rows) * 100
print("\nMissing percentage:")
print(missing_pct.round(2))

# Exact-zero entries are counted separately from NaNs.
zero_count = prices.eq(0).sum()
print("\nZero values per ticker:")
print(zero_count)

In [None]:
# Clean the data held by the loader.
print("üßπ Cleaning data...\n")

# Options forwarded to loader.clean_data; the notes restate the intent given
# in the original cell.
cleaning_options = {
    'handle_missing': 'ffill',   # forward fill missing values
    'handle_zeros': 'ffill',     # forward fill zeros
    'drop_na_threshold': 0.5,    # drop columns with >50% missing
}
clean_prices = loader.clean_data(**cleaning_options)

print("‚úÖ Data cleaned!")
print(f"üìä Final shape: {clean_prices.shape}")

# Total NaN count across all columns after cleaning.
remaining_missing = clean_prices.isna().sum().sum()
print(f"\n‚úì Missing values remaining: {remaining_missing}")

## 3. Data Validation

Always validate your data before analysis!

In [None]:
# Run the loader's validation and report its verdict, issues and statistics.
print("‚úÖ Validating data...\n")

validation = loader.validate_data()

status_icon = '‚úÖ' if validation['is_valid'] else '‚ùå'
print(f"Data valid: {status_icon}")

issues = validation['issues']
if not issues:
    print("\n‚úÖ No issues found!")
else:
    print("\n‚ö†Ô∏è  Issues found:")
    for issue in issues:
        print(f"   - {issue}")

print(f"\nüìä Data Statistics:")
stats = validation['stats']
print(f"   Rows: {stats['n_rows']}")
print(f"   Columns: {stats['n_cols']}")
print(f"   Tickers: {', '.join(stats['tickers'])}")

# date_range is read as a (start, end) pair of timestamp-like values.
range_start = stats['date_range'][0].date()
range_end = stats['date_range'][1].date()
print(f"   Date range: {range_start} to {range_end}")

In [None]:
# Look for unusually long gaps between consecutive index entries.
print("üìÖ Checking for date gaps...\n")

# Positional Series of spacings: entry i is index[i] - index[i-1].
date_diffs = pd.Series(prices.index).diff()
gap_threshold = pd.Timedelta(days=7)
large_gaps = date_diffs[date_diffs > gap_threshold]

if large_gaps.empty:
    print("‚úÖ No large gaps found!")
else:
    print(f"‚ö†Ô∏è  Found {len(large_gaps)} gaps > 7 days:")
    for pos, gap in large_gaps.items():
        if pos <= 0:
            continue  # position 0 has no earlier entry to pair with
        gap_start = prices.index[pos - 1].date()
        gap_end = prices.index[pos].date()
        print(f"   {gap_start} -> {gap_end} ({gap.days} days)")

## 4. Returns Calculation

Calculate returns using different methods and compare them.

In [None]:
print("üßÆ Calculating returns...\n")

# Log returns; the original notes mark these as preferred for volatility work.
log_returns = compute_log_returns(clean_prices)
print(f"‚úÖ Log returns calculated: {log_returns.shape}")

# Simple returns, kept for comparison with the log returns.
simple_returns = compute_simple_returns(clean_prices)
print(f"‚úÖ Simple returns calculated: {simple_returns.shape}")

print(f"\nüìä Returns summary (log returns):")
log_summary = log_returns.describe().round(6)
print(log_summary)

In [None]:
# Compare log and simple returns for one ticker: scatter on the left,
# difference over time on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, diff_ax = axes

ticker = 'AAPL'
simple_pct = simple_returns[ticker] * 100
log_pct = log_returns[ticker] * 100

# Left panel: one point per observation plus a dashed y=x reference line.
scatter_ax.scatter(simple_pct, log_pct, alpha=0.5, s=10, color='#2E86AB')
scatter_ax.plot([-5, 5], [-5, 5], 'r--', alpha=0.5, label='y=x')
scatter_ax.set_xlabel('Simple Returns (%)', fontsize=11)
scatter_ax.set_ylabel('Log Returns (%)', fontsize=11)
scatter_ax.set_title('Log vs Simple Returns', fontsize=13, fontweight='bold')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Right panel: simple minus log return, in percentage points.
diff = (simple_returns[ticker] - log_returns[ticker]) * 100
diff_ax.plot(diff.index, diff, linewidth=1, alpha=0.7, color='#A23B72')
diff_ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)
diff_ax.set_xlabel('Date', fontsize=11)
diff_ax.set_ylabel('Difference (%)', fontsize=11)
diff_ax.set_title('Difference (Simple - Log)', fontsize=13, fontweight='bold')
diff_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

mean_gap = diff.mean()
largest_gap = diff.abs().max()
print(f"\nüìä For small returns, log ‚âà simple returns")
print(f"   Mean difference: {mean_gap:.6f}%")
print(f"   Max difference:  {largest_gap:.6f}%")

## 5. Returns Analysis

Perform comprehensive statistical analysis on returns.

In [None]:
# Use ReturnsCalculator for detailed analysis.
# The table is bound to `return_stats`, not `stats`: this notebook's outlier
# cells use `stats` for the scipy.stats module (`from scipy import stats`), so
# rebinding that name here would break them if cells are re-run out of order.
calc = ReturnsCalculator()
return_stats = calc.get_statistics(log_returns)

print("üìä Detailed Return Statistics:\n")
print(return_stats.round(6))

# Annualization factor: number of periods per year used to scale the returns.
TRADING_DAYS_PER_YEAR = 252

# Calculate annualized metrics
print("\nüìä Annualized Metrics:")
for ticker in log_returns.columns:
    mean_daily = log_returns[ticker].mean()
    std_daily = log_returns[ticker].std()

    # Mean scales linearly with the horizon, standard deviation with its square root.
    mean_annual = mean_daily * TRADING_DAYS_PER_YEAR
    std_annual = std_daily * np.sqrt(TRADING_DAYS_PER_YEAR)
    # Ratio of annualized mean to volatility (no risk-free rate subtracted);
    # reported as 0 when volatility is not positive.
    sharpe = mean_annual / std_annual if std_annual > 0 else 0

    print(f"\n{ticker}:")
    print(f"   Mean Return: {mean_annual*100:7.2f}%")
    print(f"   Volatility:  {std_annual*100:7.2f}%")
    print(f"   Sharpe:      {sharpe:7.2f}")

In [None]:
# Distribution analysis: histogram (top row) and normal Q-Q plot (bottom row)
# for each ticker.
# squeeze=False keeps `axes` two-dimensional even for a single ticker, so the
# axes[row, col] indexing below always works.
fig, axes = plt.subplots(2, len(tickers), figsize=(14, 8), squeeze=False)

for idx, ticker in enumerate(tickers):
    # Drop NaNs first so they cannot reach the Q-Q fit
    # (assumes log_returns may hold NaN rows, e.g. the first one — TODO confirm).
    returns_pct = (log_returns[ticker] * 100).dropna()
    hist_ax, qq_ax = axes[0, idx], axes[1, idx]

    # Histogram; the palette is cycled so more tickers than colors cannot
    # raise IndexError.
    hist_ax.hist(returns_pct, bins=50, alpha=0.7,
                 color=colors[idx % len(colors)], edgecolor='black')
    hist_ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
    hist_ax.set_title(f'{ticker} Returns Distribution', fontsize=12, fontweight='bold')
    hist_ax.set_xlabel('Returns (%)', fontsize=10)
    hist_ax.set_ylabel('Frequency', fontsize=10)
    hist_ax.grid(True, alpha=0.3, axis='y')

    # Q-Q plot against the normal distribution; sp_stats is scipy.stats,
    # imported once in the setup cell instead of on every loop iteration.
    sp_stats.probplot(returns_pct, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{ticker} Q-Q Plot', fontsize=12, fontweight='bold')
    qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nüìä Normality Assessment:")
for ticker in tickers:
    skew = log_returns[ticker].skew()
    # pandas reports excess kurtosis (normal distribution = 0), hence the
    # comparison against 0 below.
    kurt = log_returns[ticker].kurtosis()
    print(f"\n{ticker}:")
    print(f"   Skewness: {skew:7.3f} ({'left-skewed' if skew < 0 else 'right-skewed'})")
    print(f"   Kurtosis: {kurt:7.3f} ({'fat tails' if kurt > 0 else 'thin tails'})")

## 6. Correlation Analysis

Analyze correlations between different assets.

In [None]:
# Pairwise correlation of the log-return columns.
correlation_matrix = log_returns.corr()

print("üìä Return Correlation Matrix:\n")
print(correlation_matrix.round(3))

# Heatmap styling collected in one place; the color scale is pinned to
# [-1, 1] and centered on 0.
heatmap_options = dict(
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    fmt='.3f',
    vmin=-1,
    vmax=1,
)

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Return Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rolling correlation between the first two tickers; skipped when fewer than
# two tickers are configured.
if len(tickers) >= 2:
    ticker1, ticker2 = tickers[:2]
    first_leg = log_returns[ticker1]
    second_leg = log_returns[ticker2]

    # 60-observation rolling window.
    rolling_corr = first_leg.rolling(window=60).corr(second_leg)

    plt.figure(figsize=(14, 6))
    plt.plot(rolling_corr.index, rolling_corr, linewidth=2, color='#2E86AB')
    plt.axhline(y=0, color='black', linestyle='--', alpha=0.3)
    plt.fill_between(rolling_corr.index, 0, rolling_corr, alpha=0.3, color='#2E86AB')
    plt.title(
        f'60-Day Rolling Correlation: {ticker1} vs {ticker2}',
        fontsize=14,
        fontweight='bold',
    )
    plt.xlabel('Date', fontsize=11)
    plt.ylabel('Correlation', fontsize=11)
    plt.ylim(-1, 1)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Summary statistics of the rolling series.
    print(f"\nüìä Correlation Statistics ({ticker1} vs {ticker2}):")
    print(f"   Mean: {rolling_corr.mean():.3f}")
    print(f"   Std:  {rolling_corr.std():.3f}")
    print(f"   Min:  {rolling_corr.min():.3f}")
    print(f"   Max:  {rolling_corr.max():.3f}")

## 7. Outlier Detection

Identify and analyze outliers in the returns data.

In [None]:
# Flag observations whose absolute z-score exceeds 3, ticker by ticker.
from scipy import stats

print("üîç Detecting outliers (|z-score| > 3)...\n")

for ticker in tickers:
    # NaNs are removed first so the z-scores line up with returns_series.
    returns_series = log_returns[ticker].dropna()
    z_scores = np.abs(stats.zscore(returns_series))
    outliers = returns_series[z_scores > 3]
    n_outliers = len(outliers)

    print(f"{ticker}:")
    print(f"   Total outliers: {n_outliers}")
    outlier_share = n_outliers / len(returns_series) * 100
    print(f"   Percentage:     {outlier_share:.2f}%")

    if n_outliers > 0:
        # Most extreme flagged move in each direction, with its date.
        top_value, top_date = outliers.max(), outliers.idxmax().date()
        print(f"   Largest positive: {top_value*100:7.2f}% on {top_date}")
        bottom_value, bottom_date = outliers.min(), outliers.idxmin().date()
        print(f"   Largest negative: {bottom_value*100:7.2f}% on {bottom_date}")
    print()

In [None]:
# Visualize outliers: every return as a faint dot, with |z-score| > 3
# observations overlaid as red crosses.
# squeeze=False keeps `axes` two-dimensional so a single ticker needs no
# special-casing.
fig, axes = plt.subplots(len(tickers), 1, figsize=(14, 4*len(tickers)), squeeze=False)

for idx, (ax, ticker) in enumerate(zip(axes.ravel(), tickers)):
    # Use one NaN-free series for both the z-scores and the plotted values.
    # Previously the mask came from the dropna() series but was applied to the
    # undropped one, so any NaN row made the lengths disagree.
    raw_returns = log_returns[ticker].dropna()
    returns_series = raw_returns * 100
    # sp_stats is scipy.stats from the setup cell; using the alias avoids
    # depending on the global name `stats`, which other cells rebind.
    z_scores = np.abs(sp_stats.zscore(raw_returns))

    # Plot all returns
    ax.scatter(returns_series.index, returns_series, alpha=0.4, s=10,
               color=colors[idx % len(colors)], label='Normal')

    # Highlight outliers
    outlier_mask = np.asarray(z_scores > 3)
    outlier_values = returns_series[outlier_mask]
    outlier_dates = outlier_values.index
    ax.scatter(outlier_dates, outlier_values, color='red', s=50,
               marker='x', label='Outliers', zorder=5)

    ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)
    ax.set_title(f'{ticker} Returns with Outliers', fontsize=13, fontweight='bold')
    ax.set_xlabel('Date', fontsize=11)
    ax.set_ylabel('Returns (%)', fontsize=11)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Summary

Let's summarize our data analysis findings.

In [None]:
# Final report: dataset extent, data-quality counts, annualized return
# statistics and distribution shape for every ticker.
print("=" * 70)
print(f"{'DATA ANALYSIS SUMMARY':^70}")
print("=" * 70)

print(f"\nüìä Dataset Info:")
print(f"   Tickers: {', '.join(tickers)}")
print(f"   Period:  {clean_prices.index[0].date()} to {clean_prices.index[-1].date()}")
print(f"   Days:    {len(clean_prices)}")

print(f"\nüìä Data Quality:")
print(f"   Missing values: {clean_prices.isna().sum().sum()}")
print(f"   Zero values:    {(clean_prices == 0).sum().sum()}")
# `large_gaps` only exists if the date-gap cell was run; report 0 otherwise.
print(f"   Date gaps (>7d): {len(large_gaps) if 'large_gaps' in locals() else 0}")

print(f"\nüìä Returns Statistics (Annualized):")
for ticker in tickers:
    # Annualize with 252 periods per year.
    mean_ret = log_returns[ticker].mean() * 252
    std_ret = log_returns[ticker].std() * np.sqrt(252)
    # Same guard as the annualized-metrics cell: report 0 instead of inf/nan
    # when volatility is zero or undefined.
    sharpe = mean_ret / std_ret if std_ret > 0 else 0

    print(f"\n   {ticker}:")
    print(f"      Return:  {mean_ret*100:6.2f}%")
    print(f"      Vol:     {std_ret*100:6.2f}%")
    print(f"      Sharpe:  {sharpe:6.2f}")

print(f"\nüìä Distribution Properties:")
for ticker in tickers:
    skew = log_returns[ticker].skew()
    kurt = log_returns[ticker].kurtosis()
    print(f"\n   {ticker}:")
    print(f"      Skewness: {skew:6.3f}")
    print(f"      Kurtosis: {kurt:6.3f}")

print("\n" + "=" * 70)
print("‚úÖ Data Analysis Complete!")
print("=" * 70)

## Next Steps

- **03_volatility_models.ipynb**: Deep dive into volatility modeling
- **04_regime_analysis.ipynb**: Advanced regime classification
- **05_strategy_integration.ipynb**: Integrate with trading strategies

---

**Continue exploring! 📊🔍**