# Cryptocurrency Data Exploration

This notebook explores cryptocurrency data to understand its characteristics and patterns.

In [None]:
# Import necessary libraries
import sys
import os
# Extend the import path so the project-local `data` and `utils` packages imported
# at the bottom of this cell can be found (presumably they live under ../src,
# relative to the directory the kernel was started in — confirm).
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings from the libraries above.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Import our modules (resolved through the sys.path entry added above)
from data.data_loader import CryptoDataLoader
from data.feature_engineering import FeatureEngineer
from utils.helpers import calculate_returns, calculate_volatility

## 1. Load Cryptocurrency Data

In [None]:
# Initialize data loader
loader = CryptoDataLoader('../data/raw')

# Define cryptocurrencies to analyze
tickers = ['BTC-USD', 'ETH-USD', 'BNB-USD', 'ADA-USD', 'SOL-USD']

# Load data for the last 2 years.
# Both bounds are derived from one timestamp: two separate datetime.now() calls
# can fall on either side of midnight and shift the window by a day.
LOOKBACK_DAYS = 730
now = datetime.now()
end_date = now.strftime('%Y-%m-%d')
start_date = (now - timedelta(days=LOOKBACK_DAYS)).strftime('%Y-%m-%d')

print(f"Loading data from {start_date} to {end_date}")

# Fetch data
crypto_data = loader.fetch_multiple_tickers(tickers, start_date, end_date)

print(f"Loaded data for {len(crypto_data)} cryptocurrencies")
for ticker, data in crypto_data.items():
    # An empty frame has no first/last index entry; report it instead of
    # failing with IndexError on data.index[0].
    if len(data) == 0:
        print(f"{ticker}: no records returned")
        continue
    print(f"{ticker}: {len(data)} records from {data.index[0].date()} to {data.index[-1].date()}")

## 2. Basic Data Analysis

In [None]:
# Summarise the Bitcoin frame: descriptive statistics and per-column null counts
btc_data = crypto_data['BTC-USD']
btc_summary = btc_data.describe()
btc_missing = btc_data.isnull().sum()

print("Bitcoin Data Statistics:")
print(btc_summary)

# Check for missing values
print("\nMissing Values:")
print(btc_missing)

In [None]:
# Plot price trends for all cryptocurrencies on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# zip stops at the shorter input, so at most len(axes) tickers are drawn
for ax, (ticker, data) in zip(axes, crypto_data.items()):
    ax.plot(data.index, data['Close'], linewidth=2)
    ax.set_title(f'{ticker} Price')
    ax.set_ylabel('Price (USD)')
    ax.grid(True, alpha=0.3)

    # Format x-axis
    ax.tick_params(axis='x', rotation=45)

# Hide every panel that received no ticker. Hiding only the last panel leaves
# empty axes on screen whenever fewer than five tickers were loaded.
for unused_ax in axes[len(crypto_data):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3. Price Analysis

In [None]:
# Calculate returns for each cryptocurrency, labelled by date
returns_data = {}
for ticker, data in crypto_data.items():
    ticker_returns = np.asarray(calculate_returns(data['Close'].values))
    # Label the returns with the trailing dates of this ticker's own history.
    # Assumes calculate_returns only drops leading observations (one return per
    # consecutive price pair) — TODO confirm against utils.helpers.
    return_dates = data.index[len(data) - len(ticker_returns):]
    returns_data[ticker] = pd.Series(ticker_returns, index=return_dates)

# Create returns DataFrame.
# Building it from date-indexed Series aligns tickers on calendar dates and pads
# shorter histories with NaN; raw arrays of unequal length would raise instead.
returns_df = pd.DataFrame(returns_data)

# Plot returns distribution
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# Iterate over the columns that were actually loaded, not the requested ticker
# list, so a ticker that failed to download cannot cause a KeyError.
for ax, ticker in zip(axes, returns_df.columns):
    ticker_series = returns_df[ticker].dropna()
    ax.hist(ticker_series, bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f'{ticker} Returns Distribution')
    ax.set_xlabel('Returns')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

    # Add vertical line at mean
    mean_return = ticker_series.mean()
    ax.axvline(mean_return, color='red', linestyle='--',
               label=f'Mean: {mean_return:.4f}')
    ax.legend()

# Hide every panel that has no ticker, not just the last one
for unused_ax in axes[len(returns_df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Calculate and plot volatility
VOLATILITY_WINDOW = 30  # rolling window length, in observations

volatility_data = {}
for ticker, data in crypto_data.items():
    returns = calculate_returns(data['Close'].values)
    rolling_vol = np.asarray(
        calculate_volatility(returns, window_size=VOLATILITY_WINDOW)
    )
    # Align each result with the tail of its own ticker's dates rather than with
    # BTC-USD's index, so tickers with different history lengths still line up.
    # Assumes the helper's output covers the most recent observations — TODO
    # confirm against utils.helpers.calculate_volatility.
    vol_dates = data.index[len(data) - len(rolling_vol):]
    volatility_data[ticker] = pd.Series(rolling_vol, index=vol_dates)

# Create volatility DataFrame (date-aligned; shorter histories are NaN-padded)
volatility_df = pd.DataFrame(volatility_data)

# Plot volatility
plt.figure(figsize=(14, 7))
for ticker in volatility_df.columns:
    plt.plot(volatility_df.index, volatility_df[ticker], label=ticker, linewidth=2)

plt.title(f'Cryptocurrency Volatility ({VOLATILITY_WINDOW}-day Rolling)', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Volatility', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlation between the return series of all tickers
correlation_matrix = returns_df.corr()

# Render the matrix as an annotated heatmap centred on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=corr_ax, **heatmap_options)
corr_ax.set_title('Cryptocurrency Returns Correlation Matrix', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of returns, with histograms on the diagonal
from pandas.plotting import scatter_matrix

# Draw at most 500 rows (fixed seed) so the grid renders quickly
row_count = len(returns_df)
sample_size = row_count if row_count < 500 else 500
sampled_returns = returns_df.sample(n=sample_size, random_state=42)

scatter_matrix(
    sampled_returns,
    alpha=0.6,
    figsize=(15, 15),
    diagonal='hist',
    grid=True,
)
plt.suptitle('Cryptocurrency Returns Scatter Plot Matrix', fontsize=16)
plt.tight_layout()
plt.show()

## 5. Volume Analysis

In [None]:
# Plot price (left axis) and traded volume (right axis) for each cryptocurrency
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# zip stops at the shorter input, so at most len(axes) tickers are drawn
for ax1, (ticker, data) in zip(axes, crypto_data.items()):
    # Plot price on primary axis
    ax1.plot(data.index, data['Close'], color='blue', linewidth=2, label='Price')
    ax1.set_title(f'{ticker} Price and Volume')
    ax1.set_ylabel('Price (USD)', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    ax1.grid(True, alpha=0.3)

    # Plot volume on secondary axis sharing the same x-axis
    ax2 = ax1.twinx()
    ax2.bar(data.index, data['Volume'], alpha=0.3, color='orange', label='Volume')
    ax2.set_ylabel('Volume', color='orange')
    ax2.tick_params(axis='y', labelcolor='orange')

    # Format x-axis
    ax1.tick_params(axis='x', rotation=45)

# Hide every panel that received no ticker. Hiding only the last panel leaves
# empty axes on screen whenever fewer than five tickers were loaded.
for unused_ax in axes[len(crypto_data):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Seasonal and Pattern Analysis

In [None]:
# Analyze day-of-week patterns
DAY_LABELS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

day_of_week_returns = {}
for ticker, data in crypto_data.items():
    returns = np.asarray(calculate_returns(data['Close'].values))
    # Each return is labelled with the date of the later price in its pair
    return_dates = data.index[1:]
    day_of_week_returns[ticker] = (
        pd.Series(returns, index=return_dates)
        .groupby(return_dates.dayofweek)
        .mean()
    )

# Create DataFrame for plotting.
# reindex forces exactly one row per weekday code (0=Mon ... 6=Sun) in order, so
# the labels assigned below always match their rows; a weekday absent from the
# data becomes a NaN row instead of causing a length mismatch or a mislabel.
day_of_week_df = pd.DataFrame(day_of_week_returns).reindex(range(7))
day_of_week_df.index = DAY_LABELS

# Plot day-of-week patterns
plt.figure(figsize=(12, 6))
for ticker in day_of_week_df.columns:
    plt.plot(day_of_week_df.index, day_of_week_df[ticker], marker='o', label=ticker, linewidth=2)

plt.title('Average Returns by Day of Week', fontsize=16)
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Average Return', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Analyze monthly patterns
MONTH_LABELS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

monthly_returns = {}
for ticker, data in crypto_data.items():
    returns = np.asarray(calculate_returns(data['Close'].values))
    # Each return is labelled with the date of the later price in its pair
    return_dates = data.index[1:]
    monthly_returns[ticker] = (
        pd.Series(returns, index=return_dates)
        .groupby(return_dates.month)
        .mean()
    )

# Create DataFrame for plotting.
# reindex forces exactly one row per calendar month (1..12) in order, so the
# labels assigned below always match their rows; a month absent from the data
# becomes a NaN row instead of causing a length mismatch or a mislabel.
monthly_df = pd.DataFrame(monthly_returns).reindex(range(1, 13))
monthly_df.index = MONTH_LABELS

# Plot monthly patterns
plt.figure(figsize=(12, 6))
for ticker in monthly_df.columns:
    plt.plot(monthly_df.index, monthly_df[ticker], marker='o', label=ticker, linewidth=2)

plt.title('Average Returns by Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Average Return', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

## 7. Statistical Tests

In [None]:
# Run the Augmented Dickey-Fuller unit-root test on each ticker's closing prices
from statsmodels.tsa.stattools import adfuller

print("Augmented Dickey-Fuller Test Results:")
print("-" * 50)
for ticker, data in crypto_data.items():
    result = adfuller(data['Close'])
    # Positions read from the result: 0 = test statistic, 1 = p-value,
    # 4 = mapping of critical values
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print(f"\n{ticker}:")
    print(f'ADF Statistic: {adf_statistic:.4f}')
    print(f'p-value: {p_value:.4f}')
    print('Critical Values:')
    for level in critical_values:
        print(f'\t{level}: {critical_values[level]:.4f}')

    # A p-value below 5% rejects the unit-root (non-stationary) null hypothesis
    verdict = (
        "Result: Reject null hypothesis - Series is stationary"
        if p_value < 0.05
        else "Result: Fail to reject null hypothesis - Series is non-stationary"
    )
    print(verdict)

In [None]:
# Calculate Hurst exponent to measure long-term memory
def hurst_exponent(ts, max_lag=100):
    """Estimate the Hurst exponent of a time series.

    The standard deviation of the lagged differences ``ts[t + lag] - ts[t]``
    scales as ``lag ** H``. H is therefore the slope of a straight-line fit of
    ``log(std)`` against ``log(lag)``.

    Args:
        ts: One-dimensional sequence of values (e.g. closing prices).
        max_lag: Exclusive upper bound on the lags used; lags run from 2 to
            ``max_lag - 1``. The range is shortened for short series so every
            lagged difference has at least two points.

    Returns:
        The estimated exponent as a float: about 0.5 for a random walk, below
        0.5 for mean-reverting and above 0.5 for trending series. Returns NaN
        when fewer than two usable lags exist (series too short or constant).
    """
    series = np.asarray(ts, dtype=float)

    # Keep at least two observations in every lagged difference
    upper = min(max_lag, series.size - 1)
    lags = np.arange(2, upper)
    if lags.size < 2:
        return float('nan')

    tau = np.array([np.std(series[lag:] - series[:-lag]) for lag in lags])

    # log(0) is -inf and would poison the fit; drop lags with zero dispersion
    usable = tau > 0
    if usable.sum() < 2:
        return float('nan')

    # Use a linear fit to estimate the Hurst Exponent.
    # The slope is H itself. A factor of 2 applies only when tau is the square
    # root of the standard deviation, which is not the case here.
    slope, _intercept = np.polyfit(np.log(lags[usable]), np.log(tau[usable]), 1)
    return float(slope)

# Half-width of the band around 0.5 that is reported as a random walk. An exact
# float comparison (hurst == 0.5) is essentially never true for an estimate,
# which would make the "Random walk" branch unreachable.
RANDOM_WALK_TOLERANCE = 0.05

print("Hurst Exponent Results:")
print("-" * 30)
for ticker, data in crypto_data.items():
    hurst = hurst_exponent(data['Close'].values)
    print(f"{ticker}: {hurst:.4f}")

    if np.isnan(hurst):
        # NaN compares False with everything; do not let it fall into "Trending"
        print("  -> Not enough data to classify")
    elif abs(hurst - 0.5) <= RANDOM_WALK_TOLERANCE:
        print("  -> Random walk")
    elif hurst < 0.5:
        print("  -> Mean-reverting series")
    else:
        print("  -> Trending series")

## 8. Summary Statistics

In [None]:
# Create summary statistics table
def calculate_max_drawdown(prices):
    """Calculate maximum drawdown.

    Args:
        prices: NumPy array of prices in chronological order.

    Returns:
        The most negative value of ``(price - running_peak) / running_peak``;
        0 when the series never falls below an earlier peak.
    """
    peak = np.maximum.accumulate(prices)
    drawdown = (prices - peak) / peak
    return drawdown.min()


# The helper above is defined before the loop that calls it, so this cell runs
# on a fresh kernel (Restart & Run All) without a NameError.
summary_columns = {}
for ticker, data in crypto_data.items():
    closes = data['Close'].values
    returns = calculate_returns(closes)
    returns_series = pd.Series(returns)

    stats = {
        'Mean Return': returns.mean(),
        'Std Return': returns.std(),
        'Min Return': returns.min(),
        'Max Return': returns.max(),
        'Skewness': returns_series.skew(),
        'Kurtosis': returns_series.kurtosis(),
        # Annualised with 365 periods per year
        'Sharpe Ratio': returns.mean() / returns.std() * np.sqrt(365),
        'Max Drawdown': calculate_max_drawdown(closes),
        'Volatility (30d)': calculate_volatility(returns, 30).mean(),
        'Hurst Exponent': hurst_exponent(closes)
    }

    summary_columns[ticker] = pd.Series(stats)

# One column per ticker, one row per statistic; built once rather than grown
# column by column
summary_stats = pd.DataFrame(summary_columns)

# Display summary statistics (transposed: one row per ticker)
pd.set_option('display.float_format', '{:.4f}'.format)
display(summary_stats.T)

## 9. Key Insights

Based on our exploration, we can observe:

1. **Price Trends**: Most cryptocurrencies show similar overall trends with periods of high volatility.
2. **Return Distributions**: Returns are typically leptokurtic (fat tails) and often skewed.
3. **Correlations**: Strong positive correlations between major cryptocurrencies, especially during market stress.
4. **Volatility Clustering**: Periods of high volatility tend to cluster together.
5. **Seasonal Patterns**: Average returns differ somewhat by day of week and by month, but no significance test is run in this notebook, so these effects should be treated as tentative.
6. **Non-Stationarity**: Price series are generally non-stationary, requiring differencing for modeling.
7. **Long-term Memory**: Hurst exponents suggest varying degrees of mean-reverting or trending behavior.

These insights will guide our feature engineering and model selection process.