# Phase 0: Data Exploration & Baseline Evaluation

**Objective:** Understand the data before any modeling. This is like calibrating instruments before an experiment.

## What we'll do:
1. Load and inspect all datasets
2. Check for data quality issues (missing values, alignment, etc.)
3. Visualize prices, signals, and on-chain metrics
4. Compute and understand baseline strategy performance

---

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including any numeric or deprecation warnings raised while exploring the
# data - confirm that is intended for a data-quality notebook.
warnings.filterwarnings('ignore')

# Set style
# Shared plotting defaults; cells that pass an explicit figsize override these.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11

# Add src to path
# The path is derived from the current working directory, so the kernel must
# be started from a directory that sits next to `src/`.
import sys
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Import our utilities
# Project-local modules, resolved through the sys.path entry added above.
from data import load_all_data, get_data_summary, compute_returns
from metrics import compute_all_metrics, compute_sharpe_ratio, compute_max_drawdown
from backtesting import compute_strategy_returns, compute_portfolio_returns, compute_equity_curve

## 1. Load the Data

We have three datasets:
- **trade_log.csv**: Binary signals (0=cash, 1=long) for 6 crypto assets
- **price_data.csv**: Asset prices at 3-hour intervals
- **glassnode_metrics.csv**: Bitcoin on-chain features (regime indicators)

In [None]:
# Fetch the trade log, price table, and on-chain metrics in a single call
trade_log, prices, glassnode = load_all_data()

# Confirmation banner framed by two 60-character rules
banner_rule = "=" * 60
print(banner_rule, "DATA LOADED SUCCESSFULLY", banner_rule, sep="\n")

### 1.1 Trade Log (Baseline Signals)

In [None]:
# Size, time span, and asset columns of the baseline signal table
signal_start, signal_end = trade_log.index.min(), trade_log.index.max()
signal_assets = list(trade_log.columns)

print(f"Shape: {trade_log.shape}")
print(f"Date range: {signal_start} to {signal_end}")
print(f"Assets: {signal_assets}")
print("\nFirst 5 rows:")
# Left as a bare expression so the notebook renders the preview table
trade_log.head()

In [None]:
# Share of periods each asset spends long; the remainder is reported as cash.
# The column mean of a 0/1 signal is the fraction of periods spent long.
print("Signal Distribution (percentage of time in 'long' position):")
print("-" * 50)
for asset in trade_log.columns:
    long_pct = trade_log[asset].mean() * 100
    cash_pct = 100 - long_pct
    print(f"{asset:6s}: {long_pct:5.1f}% long, {cash_pct:5.1f}% cash")

In [None]:
# Tally position changes per asset: every non-zero step between consecutive
# signal values (in either direction) counts as one trade.
print("\nNumber of signal changes (trades) per asset:")
print("-" * 50)
for asset in trade_log.columns:
    step_sizes = trade_log[asset].diff().abs()
    trade_count = int(step_sizes.sum())
    print(f"{asset:6s}: {trade_count:4d} trades")

### 1.2 Price Data

In [None]:
# Size, time span, and asset columns of the price table
price_start, price_end = prices.index.min(), prices.index.max()
price_assets = list(prices.columns)

print(f"Shape: {prices.shape}")
print(f"Date range: {price_start} to {price_end}")
print(f"Assets: {price_assets}")
print("\nFirst 5 rows:")
# Left as a bare expression so the notebook renders the preview table
prices.head()

In [None]:
# Per-asset count of NaN prices, also expressed as a share of all rows
print("Missing values per asset:")
print("-" * 50)
missing = prices.isna().sum()
total_rows = len(prices)
for asset in prices.columns:
    n_missing = missing[asset]
    pct = n_missing / total_rows * 100
    print(f"{asset:6s}: {n_missing:5d} ({pct:5.2f}%)")

In [None]:
# Descriptive statistics for every price column
print("\nPrice Statistics:")
price_summary = prices.describe()
# Bare expression so the notebook renders the summary table
price_summary

In [None]:
# Report the first timestamp at which each asset has a non-missing price
print("\nFirst valid data point per asset:")
print("-" * 50)
for asset in prices.columns:
    print(f"{asset:6s}: {prices[asset].first_valid_index()}")

### 1.3 Glassnode On-Chain Metrics

In [None]:
# Size and time span of the on-chain table, then a numbered feature listing
gn_start, gn_end = glassnode.index.min(), glassnode.index.max()

print(f"Shape: {glassnode.shape}")
print(f"Date range: {gn_start} to {gn_end}")
print(f"Number of features: {len(glassnode.columns)}")
print("\nFeature names:")
# Number the features from 1 for readability
for position, feature in enumerate(glassnode.columns, start=1):
    print(f"  {position:2d}. {feature}")

In [None]:
# On-chain features earmarked for later use; confirm each one exists and
# report how many non-missing observations it has.
key_features = [
    'btc_mvrv_z_score',
    'btc_adjusted_sopr',
    'btc_fear_greed_index',
    'reserve_risk',
    'btc_puell_multiple',
    'btc_futures_perpetual_funding_rate_mean',
]

print("Key Glassnode features:")
for feature in key_features:
    # Flag absent columns first so the happy path stays unindented
    if feature not in glassnode.columns:
        print(f"  {feature}: NOT FOUND")
        continue
    n_valid = glassnode[feature].notna().sum()
    print(f"  {feature}: {n_valid} valid values")

---

## 2. Data Alignment Check

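**CRITICAL**: Ensure timestamps align across datasets. If they do not, a signal can be paired with a price from a different bar, which introduces lookahead bias into the backtest.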

In [None]:
# Side-by-side time span of each dataset
print("Date Ranges:")
print("=" * 60)
labelled_frames = (
    ("Trade Log:", trade_log),
    ("Prices:", prices),
    ("Glassnode:", glassnode),
)
for label, frame in labelled_frames:
    # Labels are left-padded to 12 characters so the dates line up
    print(f"{label:<12}{frame.index.min()} to {frame.index.max()}")

# Window shared by signals and prices: latest start through earliest end
start = max(trade_log.index.min(), prices.index.min())
end = min(trade_log.index.max(), prices.index.max())
print(f"\nOverlapping period: {start} to {end}")

In [None]:
# Check time frequency
# Spacing between consecutive timestamps in each dataset. The most common
# interval shows the nominal bar size, while the min and max intervals show
# the tightest and widest spacing (the max is where gaps show up).
print("\nTime frequency analysis:")
print("-" * 50)

# Trade log frequency
tl_diff = trade_log.index.to_series().diff().dropna()
print(f"Trade Log - most common interval: {tl_diff.mode().iloc[0]}")
print(f"Trade Log - min interval: {tl_diff.min()}")
print(f"Trade Log - max interval: {tl_diff.max()}")

# Price frequency
p_diff = prices.index.to_series().diff().dropna()
print(f"\nPrices - most common interval: {p_diff.mode().iloc[0]}")
print(f"Prices - min interval: {p_diff.min()}")
# Report the widest spacing for prices too, mirroring the trade-log summary;
# without it a hole in the price history would go unnoticed here.
print(f"Prices - max interval: {p_diff.max()}")

In [None]:
# Boolean mask over the trade-log index: True where the timestamp also
# appears in the price index.
tl_in_prices = trade_log.index.isin(prices.index)
n_matched = tl_in_prices.sum()
n_unmatched = (~tl_in_prices).sum()
print(f"\nTrade log timestamps found in prices: {n_matched} / {len(trade_log)}")
print(f"Trade log timestamps NOT in prices: {n_unmatched}")

---

## 3. Visualizations

### 3.1 Price Time Series

In [None]:
# Rebase every asset to 100. Back-filling before taking the first row means
# each column is divided by its own first available price, so assets with
# leading NaNs are still rebased.
first_available = prices.bfill().iloc[0]
normalized_prices = prices.divide(first_available) * 100

fig, ax = plt.subplots(figsize=(14, 7))
for asset in prices.columns:
    ax.plot(normalized_prices.index, normalized_prices[asset], label=asset, alpha=0.8)

# Log scale keeps assets with very different growth readable on one axis
ax.set_yscale('log')
ax.set_title('Normalized Price Performance (Starting at 100)', fontsize=14)
ax.set_ylabel('Normalized Price')
ax.set_xlabel('Date')
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

### 3.2 Trading Signals Over Time

In [None]:
# Heatmap of the baseline signals, one row per asset.
# Downsample to one value per day (the last signal of each day) so the
# heatmap has far fewer columns to draw.
signals_daily = trade_log.resample('D').last()

fig, ax = plt.subplots(figsize=(14, 4))
sns.heatmap(
    signals_daily.T,  # transpose: assets on the y-axis, time on the x-axis
    cmap='RdYlGn',
    cbar_kws={'label': '0=Cash, 1=Long'},
    yticklabels=True,
    xticklabels=False,
    ax=ax,
)
ax.set_title('Baseline Strategy Signals Over Time (Green=Long, Red=Cash)', fontsize=14)
ax.set_ylabel('Asset')
ax.set_xlabel('Date')
plt.tight_layout()
plt.show()

### 3.3 Key Glassnode Metrics

In [None]:
# Restrict the on-chain data to the time span covered by the trade log
window_start = trade_log.index.min()
window_end = trade_log.index.max()
gn_filtered = glassnode.loc[window_start:window_end]

# (column name, panel title) pairs, one per subplot
metrics_to_plot = [
    ('btc_mvrv_z_score', 'MVRV Z-Score (Over/Undervalued)'),
    ('btc_fear_greed_index', 'Fear & Greed Index (0-100)'),
    ('btc_adjusted_sopr', 'Adjusted SOPR (Profit Taking)'),
    ('reserve_risk', 'Reserve Risk'),
    ('btc_puell_multiple', 'Puell Multiple (Miner Revenue)'),
    ('btc_futures_perpetual_funding_rate_mean', 'Funding Rate Mean'),
]

# 3x2 grid, filled row by row
fig, axes = plt.subplots(3, 2, figsize=(14, 10))

for panel, (metric, title) in zip(axes.flat, metrics_to_plot):
    # Missing column: show a placeholder message instead of an empty plot
    if metric not in gn_filtered.columns:
        panel.text(0.5, 0.5, 'Data not available', ha='center', va='center')
        panel.set_title(title)
        continue

    series = gn_filtered[metric].dropna()
    panel.plot(series.index, series.values, alpha=0.8)
    panel.set_title(title, fontsize=11)
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

---

## 4. Baseline Strategy Performance

**Key Question**: How well does the baseline strategy perform? This is our benchmark.

In [None]:
# Put signals and prices on an identical grid: same rows (timestamps present
# in both) and same columns (assets present in both).
common_idx = trade_log.index.intersection(prices.index)
print(f"Common timestamps: {len(common_idx)}")

# Row selection does not change the columns, so the shared assets can be
# read straight from the source frames.
common_assets = trade_log.columns.intersection(prices.columns)
print(f"Common assets: {list(common_assets)}")

# Select shared rows and shared columns in one step per frame
signals_aligned = trade_log.loc[common_idx, common_assets]
prices_aligned = prices.loc[common_idx, common_assets]

In [None]:
# Per-asset strategy returns with costs switched off, to see the raw signal
# quality before friction is applied.
baseline_returns = compute_strategy_returns(
    signals_aligned, prices_aligned, transaction_cost=0.0
)

# Combine the per-asset returns into one portfolio series
# (equal weight, per the original note - confirm against the helper)
portfolio_returns = compute_portfolio_returns(baseline_returns)

n_periods = len(portfolio_returns)
mean_return = portfolio_returns.mean()
std_return = portfolio_returns.std()

print("Baseline Strategy Returns (No Costs):")
print(f"Total periods: {n_periods}")
print(f"Mean return per period: {mean_return:.6f}")
print(f"Std return per period: {std_return:.6f}")

In [None]:
# Full metric set for the cost-free portfolio; NaN periods are dropped first
baseline_metrics = compute_all_metrics(portfolio_returns.dropna())

print("\n" + "=" * 60)
print("BASELINE STRATEGY METRICS (No Transaction Costs)")
print("=" * 60)
for name, value in baseline_metrics.items():
    # Floats get a fixed-width 4-decimal rendering; anything else prints as-is
    rendered = f"{value:12.4f}" if isinstance(value, float) else f"{value}"
    print(f"{name:25s}: {rendered}")

In [None]:
# Repeat the baseline evaluation with a 0.1% (10 bps) transaction cost
baseline_returns_with_costs = compute_strategy_returns(
    signals_aligned, prices_aligned, transaction_cost=0.001
)
portfolio_returns_with_costs = compute_portfolio_returns(baseline_returns_with_costs)

# NaN periods are removed before the metrics are computed
baseline_metrics_costs = compute_all_metrics(portfolio_returns_with_costs.dropna())

print("\n" + "=" * 60)
print("BASELINE STRATEGY METRICS (With 0.1% Transaction Costs)")
print("=" * 60)
for name, value in baseline_metrics_costs.items():
    # Floats get a fixed-width 4-decimal rendering; anything else prints as-is
    rendered = f"{value:12.4f}" if isinstance(value, float) else f"{value}"
    print(f"{name:25s}: {rendered}")

In [None]:
# Cost-adjusted metrics for each asset on its own
print("\n" + "=" * 60)
print("PER-ASSET PERFORMANCE (With Costs)")
print("=" * 60)

# asset name -> metrics dict, kept for later inspection
asset_metrics = {}
for asset in common_assets:
    asset_returns = baseline_returns_with_costs[asset].dropna()
    # Nothing to evaluate if the asset has no valid return observations
    if not len(asset_returns):
        continue

    stats = compute_all_metrics(asset_returns)
    asset_metrics[asset] = stats

    total_return_pct = stats['total_return'] * 100
    max_drawdown_pct = stats['max_drawdown'] * 100
    win_rate_pct = stats['win_rate'] * 100

    print(f"\n{asset}:")
    print(f"  Total Return:  {total_return_pct:7.2f}%")
    print(f"  Sharpe Ratio:  {stats['sharpe_ratio']:7.2f}")
    print(f"  Max Drawdown:  {max_drawdown_pct:7.2f}%")
    print(f"  Win Rate:      {win_rate_pct:7.2f}%")

### 4.1 Equity Curve

In [None]:
# Two stacked panels: the combined portfolio on top, individual assets below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
portfolio_ax, assets_ax = axes

# --- Top panel: cost-adjusted portfolio equity ---
equity = compute_equity_curve(portfolio_returns_with_costs.dropna())
portfolio_ax.plot(equity.index, equity.values, 'b-', linewidth=1.5, label='Baseline Strategy')
# Dashed reference line at the starting value of 1.0
portfolio_ax.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
portfolio_ax.set_title('Portfolio Equity Curve (With 0.1% Transaction Costs)', fontsize=14)
portfolio_ax.set_ylabel('Portfolio Value (Starting at 1.0)')
portfolio_ax.legend()
portfolio_ax.grid(True, alpha=0.3)

# --- Bottom panel: one cost-adjusted equity curve per asset ---
for asset in common_assets:
    single_asset_returns = baseline_returns_with_costs[asset].dropna()
    asset_equity = compute_equity_curve(single_asset_returns)
    assets_ax.plot(asset_equity.index, asset_equity.values, label=asset, alpha=0.7)

assets_ax.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
assets_ax.set_title('Per-Asset Equity Curves', fontsize=14)
assets_ax.set_xlabel('Date')
assets_ax.set_ylabel('Portfolio Value')
assets_ax.legend(loc='upper left')
assets_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 4.2 Drawdown Analysis

In [None]:
from backtesting import compute_drawdown_series

# Drawdown series derived from the cost-adjusted portfolio equity curve
equity = compute_equity_curve(portfolio_returns_with_costs.dropna())
drawdowns = compute_drawdown_series(equity)


def _as_percent(value, _position):
    """Render a fractional axis value as a whole-number percentage label."""
    return f'{value*100:.0f}%'


fig, ax = plt.subplots(figsize=(14, 5))
# Shaded area between the drawdown line and zero, with the line drawn on top
ax.fill_between(drawdowns.index, drawdowns.values, 0, color='red', alpha=0.3)
ax.plot(drawdowns.index, drawdowns.values, 'r-', linewidth=1)
ax.set_title('Portfolio Drawdown Over Time', fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Drawdown (%)')
ax.yaxis.set_major_formatter(plt.FuncFormatter(_as_percent))
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# The minimum of the series is reported as the maximum drawdown
worst_drawdown = drawdowns.min()
worst_drawdown_at = drawdowns.idxmin()
print(f"Maximum Drawdown: {worst_drawdown*100:.2f}%")
print(f"Max Drawdown Date: {worst_drawdown_at}")

### 4.3 Compare to Buy & Hold

In [None]:
# Buy & hold benchmark: a constant long signal (1) on every aligned
# timestamp for every common asset, evaluated without transaction costs.
buy_hold_signals = pd.DataFrame(1, index=signals_aligned.index, columns=common_assets)
buy_hold_returns = compute_strategy_returns(
    buy_hold_signals, prices_aligned, transaction_cost=0.0
)
buy_hold_portfolio = compute_portfolio_returns(buy_hold_returns)
buy_hold_metrics = compute_all_metrics(buy_hold_portfolio.dropna())

# Side-by-side table. The baseline column uses the cost-adjusted metrics,
# while the buy & hold column is cost-free.
print("\n" + "=" * 60)
print("COMPARISON: BASELINE vs BUY & HOLD")
print("=" * 60)
print(f"{'Metric':<25} {'Baseline':>12} {'Buy & Hold':>12}")
print("-" * 60)

# Metrics stored as fractions that should be displayed as percentages
shown_as_percent = {'total_return', 'max_drawdown', 'win_rate'}
for metric in ('total_return', 'sharpe_ratio', 'max_drawdown', 'win_rate'):
    b_val = baseline_metrics_costs[metric]
    bh_val = buy_hold_metrics[metric]
    if metric not in shown_as_percent:
        print(f"{metric:<25} {b_val:>12.2f} {bh_val:>12.2f}")
    else:
        print(f"{metric:<25} {b_val*100:>11.2f}% {bh_val*100:>11.2f}%")

In [None]:
# Equity curves for the cost-adjusted baseline and the buy & hold benchmark
baseline_equity = compute_equity_curve(portfolio_returns_with_costs.dropna())
buy_hold_equity = compute_equity_curve(buy_hold_portfolio.dropna())

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    baseline_equity.index, baseline_equity.values,
    'b-', linewidth=2, label='Baseline Strategy',
)
ax.plot(
    buy_hold_equity.index, buy_hold_equity.values,
    'gray', linewidth=2, alpha=0.7, label='Buy & Hold',
)
# Dashed reference line at a portfolio value of 1
ax.axhline(y=1, color='black', linestyle='--', alpha=0.3)

ax.set_title('Baseline Strategy vs Buy & Hold', fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Portfolio Value')
ax.legend()
ax.grid(True, alpha=0.3)
# Log scale so both curves stay readable if they diverge by multiples
ax.set_yscale('log')
plt.tight_layout()
plt.show()

---

## 5. Key Observations

Document your findings here after running the notebook:

### Data Quality
- [ ] Trade log has X timestamps, covering period Y to Z
- [ ] Some assets (SOL, DOGE, ADA) have missing price data early in the period
- [ ] Glassnode data covers full period but some features have gaps

### Baseline Performance
- [ ] Sharpe ratio: X.XX
- [ ] Max drawdown: XX%
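- [ ] Compared to buy & hold: Better/Worse? (Note: section 4.3 compares the baseline with 0.1% transaction costs against buy & hold without costs.)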

### Next Steps
- [ ] Create labels for ML model
- [ ] Engineer features from price and Glassnode data
- [ ] Implement walk-forward validation

---

## 6. Save Processed Data

In [None]:
# Persist the aligned signal and price tables so later notebooks can reuse them
output_dir = Path.cwd().parent / 'data' / 'processed'
# Create the folder (and any missing parents); no error if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# file name -> frame to write
exports = {
    'signals_aligned.csv': signals_aligned,
    'prices_aligned.csv': prices_aligned,
}
for filename, frame in exports.items():
    frame.to_csv(output_dir / filename)

print(f"Saved aligned data to {output_dir}")
for filename, frame in exports.items():
    print(f"  - {filename}: {frame.shape}")