# Phase 2B: Statistical Analysis by Asset Class

Comprehensive statistical analysis of DeFi vs TradFi trading for all asset classes:
- Traditional Commodities (Gold, Silver, Oil, Natural Gas)
- Traditional Equities (AAPL, GOOGL, MSFT, NVDA, TSLA)
- Crypto Coins (BTC, ETH, SOL)
- Crypto Memecoins (DOGE, PEPE, SHIB, WIF)

**Analyses:**
1. Volume statistics and comparisons (using notional volumes)
2. Rolling z-score pattern detection (3-day window)
3. Cross-correlation with lag shifts (-7 to +7 days)
4. Rolling average volumes (1d / 3d / 5d / 7d)
5. Price correlation and tracking error

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Relative location of the Phase 1B workbooks, kept as a plain string.
PHASE_1B_DIR = str(Path("output") / "Phase 1B")

## Load All Assets from Phase 1B

In [None]:
def load_all_assets(base_dir=None):
    """Load every asset workbook found under the Phase 1B output directory.

    The directory is scanned for sub-directories (one per asset type) and each
    ``*.xlsx`` file inside them is read as one asset. Workbooks that cannot be
    read, or that lack the ``time`` / notional-volume columns, are reported
    and skipped so one bad file does not abort the whole load.

    Args:
        base_dir: Directory to scan. ``None`` (the default) uses
            ``PHASE_1B_DIR``.

    Returns:
        dict: Maps the asset name (workbook file stem) to a dict with keys
        ``"data"`` (DataFrame sorted by ``time``), ``"asset_type"``
        (sub-directory name) and ``"overlap"`` (number of rows where both
        notional-volume columns are present). Empty when the directory is
        missing or holds no readable workbook.
    """
    root = Path(PHASE_1B_DIR if base_dir is None else base_dir)
    assets = {}

    # A missing directory would otherwise surface as a bare FileNotFoundError
    # from iterdir(); report it in the same style as per-file problems.
    if not root.is_dir():
        print(f"Phase 1B output directory not found: {root}")
        return assets

    # Sorted traversal keeps the load order (and the printed log) deterministic.
    for asset_type_dir in sorted(p for p in root.iterdir() if p.is_dir()):
        asset_type = asset_type_dir.name

        for excel_file in sorted(asset_type_dir.glob("*.xlsx")):
            # Excel writes "~$<name>.xlsx" lock files next to open workbooks;
            # they match the glob but are not real workbooks.
            if excel_file.name.startswith("~$"):
                continue

            asset_name = excel_file.stem  # Filename without extension

            try:
                df = pd.read_excel(excel_file)
                df["time"] = pd.to_datetime(df["time"])
                # The rolling / shift based analyses treat row order as time
                # order, so enforce it here once.
                df = df.sort_values("time", kind="stable").reset_index(drop=True)

                # Count rows where both venues report a notional volume.
                overlap = len(
                    df.dropna(subset=["defi_notional_volume", "tradfi_notional_volume"])
                )
            except Exception as e:
                # Best-effort load: report the failing workbook and carry on.
                print(f"Error loading {excel_file}: {e}")
                continue

            if asset_name in assets:
                # Same file stem under two asset types: the later one wins,
                # as before, but no longer silently.
                print(
                    f"Warning: duplicate asset name {asset_name!r}; "
                    f"replacing entry from {assets[asset_name]['asset_type']!r} "
                    f"with {asset_type!r}"
                )

            assets[asset_name] = {
                "data": df,
                "asset_type": asset_type,
                "overlap": overlap,
            }

            print(f"{asset_name:<15} ({asset_type:<30}) {len(df):>4} rows, {overlap:>3} overlapping")

    return assets

# Load every Phase 1B workbook once; the analysis cells below iterate over this mapping.
assets = load_all_assets()
print(f"\nTotal assets loaded: {len(assets)}")

## 1. Volume Statistics and Comparisons

Calculate summary statistics for notional volumes across all assets.

In [None]:
# Report header.
print("\n" + "="*100)
print("VOLUME STATISTICS (USD NOTIONAL)")
print("="*100)

# One summary per asset, in alphabetical order of asset name.
for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    # Rows on which both venues report a notional volume.
    overlap_mask = df[["defi_notional_volume", "tradfi_notional_volume"]].notna().all(axis=1)
    if not overlap_mask.any():
        continue

    overlap_df = df.loc[overlap_mask]
    defi_mean = overlap_df["defi_notional_volume"].mean()
    tradfi_mean = overlap_df["tradfi_notional_volume"].mean()

    # The ratio is reported as 0 when the DeFi mean is not positive.
    ratio = 0
    if defi_mean > 0:
        ratio = tradfi_mean / defi_mean

    print(f"\n{asset_name} ({asset_type})")
    print(f"  DeFi:    ${defi_mean:>15,.0f}/day")
    print(f"  TradFi:  ${tradfi_mean:>15,.0f}/day")
    print(f"  Ratio:   {ratio:>15.1f}x (TradFi/DeFi)")
    print(f"  Days:    {overlap_mask.sum()} overlapping")

## 2. Daily Volume Z-Score Analysis — Rolling 3-Day Window

Compute z-scores within a rolling 3-day window to detect unusual volume patterns.  
Colored bands indicate severity thresholds:
- 🟢 Normal: |z| < 1.0
- 🟡 Moderate: 1.0 ≤ |z| < 1.5  
- 🟠 Elevated: 1.5 ≤ |z| < 2.0
- 🔴 Significant: |z| ≥ 2.0

In [None]:
WINDOW_3D = 3  # 3 days
MIN_PERIODS_3D = 2  # require at least 2 days


def _trailing_zscore(series, window, min_periods):
    """Return each value's z-score relative to the ``window`` rows before it.

    The baseline deliberately excludes the value being scored. A value can be
    at most (n - 1) / sqrt(n) sample standard deviations away from the mean of
    a window that contains it (about 1.155 for n = 3), so a baseline that
    includes the current row could never reach the 1.5 and 2.0 bands drawn
    below.

    Args:
        series: Numeric pandas Series; rows are assumed to be in time order.
        window: Number of preceding rows forming the baseline.
        min_periods: Minimum non-missing baseline rows required for a score.

    Returns:
        pandas Series aligned with ``series``. NaN where the baseline is too
        short or has zero variance (the score is undefined there).
    """
    prior = series.shift(1).rolling(window, min_periods=min_periods)
    z = (series - prior.mean()) / prior.std()
    # A flat baseline followed by a different value divides by zero; mark it
    # as undefined instead of 0, which would be drawn as a "normal" day.
    return z.replace([np.inf, -np.inf], np.nan)


# Shaded severity bands: (y0, y1, fill colour, opacity).
_Z_BANDS = [
    (2.0, 10, "red", 0.1),
    (-10, -2.0, "red", 0.1),
    (1.5, 2.0, "orange", 0.15),
    (-2.0, -1.5, "orange", 0.15),
    (1.0, 1.5, "yellow", 0.1),
    (-1.5, -1.0, "yellow", 0.1),
]
# Reference lines drawn at +/- threshold: (threshold, colour, dash style).
_Z_LINES = [(2.0, "red", "dash"), (1.5, "orange", "dot"), (1.0, "gold", "dot")]

for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    # Skip if insufficient overlapping data
    overlap_mask = df[["defi_notional_volume", "tradfi_notional_volume"]].notna().all(axis=1)
    if overlap_mask.sum() < 5:
        print(f"Skipping {asset_name}: insufficient overlapping data")
        continue

    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.08,
        subplot_titles=("DeFi Volume Z-Score (Daily)", "TradFi Volume Z-Score (Daily)")
    )

    for i, (col, label) in enumerate([("defi_notional_volume", "DeFi"), ("tradfi_notional_volume", "TradFi")], 1):
        # Score each day against the preceding 3-day baseline.
        z = _trailing_zscore(df[col], WINDOW_3D, MIN_PERIODS_3D)

        # Add colored threshold bands
        for y0, y1, fill, alpha in _Z_BANDS:
            fig.add_hrect(y0=y0, y1=y1, fillcolor=fill, opacity=alpha, line_width=0, row=i, col=1)

        # Add reference lines
        for threshold, color, dash in _Z_LINES:
            fig.add_hline(y=threshold, line_dash=dash, line_color=color, opacity=0.6, line_width=1, row=i, col=1)
            fig.add_hline(y=-threshold, line_dash=dash, line_color=color, opacity=0.6, line_width=1, row=i, col=1)
        fig.add_hline(y=0, line_dash="solid", line_color="gray", opacity=0.3, line_width=1, row=i, col=1)

        # Plot as bar chart
        bar_color = "darkblue" if i == 1 else "darkgreen"
        fig.add_trace(
            go.Bar(x=df["time"], y=z, name=f"{label} Z-Score",
                   marker_color=bar_color, opacity=0.7),
            row=i, col=1
        )

    fig.update_layout(
        title=f"Daily Volume Z-Scores (3-Day Rolling Window) — {asset_name} ({asset_type})",
        height=600,
        hovermode="x unified",
        template="plotly_white",
        showlegend=False,
    )
    # Fixed axis range; scores beyond +/-3 are cut off at the plot edge.
    fig.update_yaxes(range=[-3, 3], row=1, col=1)
    fig.update_yaxes(range=[-3, 3], row=2, col=1)
    fig.show()

## 3. Cross-Correlation — DeFi vs TradFi Volume

### 3A. Summary Curve
Pearson correlation at each lag from **-7 to +7 days**.  
Negative lag = DeFi leads, positive lag = TradFi leads.

In [None]:
LAGS = range(-7, 8)  # -7 to +7 days

# Lagged Pearson correlation between DeFi and TradFi notional volume.
# Convention (matches the axis label): negative lag = DeFi leads, positive
# lag = TradFi leads. Lags are counted in rows of the overlapping data, i.e.
# in days on which both venues report a volume.
for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    overlap = df.dropna(subset=["defi_notional_volume", "tradfi_notional_volume"]).copy()

    if len(overlap) < 10:
        print(f"{asset_name}: only {len(overlap)} overlapping days — skipping cross-correlation")
        continue

    defi = overlap["defi_notional_volume"]
    tradfi = overlap["tradfi_notional_volume"]

    lags, corrs = [], []
    for lag in LAGS:
        # shift(-lag) pairs TradFi[t] with DeFi[t + lag]. For a negative lag
        # that is an earlier DeFi value, so a peak there means DeFi leads.
        # (shift(lag) would produce the mirror image of the labelled axis.)
        shifted = defi.shift(-lag)
        valid = shifted.notna()  # tradfi has no gaps after the dropna above
        if valid.sum() < 5:
            continue
        r = shifted[valid].corr(tradfi[valid])
        # A constant series has an undefined correlation; leaving the NaN in
        # would make argmax below report it as the peak.
        if pd.notna(r):
            lags.append(lag)
            corrs.append(r)

    if not corrs:
        print(f"{asset_name}: insufficient data for cross-correlation")
        continue

    # Strongest relationship regardless of sign.
    peak_idx = int(np.argmax(np.abs(corrs)))
    peak_lag, peak_corr = lags[peak_idx], corrs[peak_idx]

    # Color bars based on correlation value
    colors = ['crimson' if c < 0 else 'steelblue' for c in corrs]

    fig = go.Figure()
    fig.add_trace(go.Bar(x=lags, y=corrs, marker_color=colors, name="Correlation"))
    fig.add_hline(y=0, line_dash="solid", line_color="gray", opacity=0.5)
    fig.add_annotation(
        x=peak_lag, y=peak_corr,
        text=f"Peak: lag={peak_lag}d, r={peak_corr:.3f}",
        showarrow=True, arrowhead=2, bgcolor="white"
    )
    fig.update_layout(
        title=f"Cross-Correlation: DeFi vs TradFi Volume — {asset_name} ({asset_type})",
        xaxis_title="Lag (days, negative = DeFi leads)",
        yaxis_title="Pearson Correlation",
        hovermode="x unified",
        template="plotly_white",
        showlegend=False,
    )
    fig.show()

### 3B. Time-Varying Heatmap

Rolling 5-day correlation between DeFi and TradFi volumes at each lag,  
showing how the lead/lag relationship evolves over time.

In [None]:
WINDOW_5D = 5  # 5 days
MIN_PERIODS_5D = 3  # require at least 3 days

# Rolling 5-row correlation between DeFi and TradFi notional volume at every
# lag in LAGS, drawn as a lag-by-date heatmap.
# Convention: negative lag = DeFi leads, positive lag = TradFi leads.
for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    overlap = df.dropna(subset=["defi_notional_volume", "tradfi_notional_volume"]).copy().reset_index(drop=True)

    if len(overlap) < 10:
        print(f"{asset_name}: only {len(overlap)} overlapping days — skipping heatmap")
        continue

    tradfi = overlap["tradfi_notional_volume"]

    corr_matrix = pd.DataFrame(index=overlap["time"])
    for lag in LAGS:
        # shift(-lag) pairs TradFi[t] with DeFi[t + lag], so a negative lag
        # compares today's TradFi volume with an earlier DeFi volume.
        shifted = overlap["defi_notional_volume"].shift(-lag)
        rolling_corr = shifted.rolling(WINDOW_5D, min_periods=MIN_PERIODS_5D).corr(tradfi)
        # Assign by position: corr_matrix is indexed by time, rolling_corr by row number.
        corr_matrix[lag] = rolling_corr.values

    # Drop rows where all lags are NaN (early window warmup)
    corr_matrix = corr_matrix.dropna(how="all")

    if corr_matrix.empty:
        print(f"{asset_name}: insufficient data for heatmap")
        continue

    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values.T,  # rows = lags, columns = dates
        x=corr_matrix.index,
        y=list(LAGS),
        colorscale="RdBu",
        zmin=-1, zmax=1,
        colorbar_title="Correlation",
    ))
    fig.update_layout(
        title=f"Time-Varying Cross-Correlation (5-Day Rolling) — {asset_name} ({asset_type})",
        xaxis_title="Date",
        yaxis_title="Lag (days)",
        template="plotly_white",
        height=500,
    )
    fig.show()

## 4. Rolling Average Volumes

Moving averages over 1-day, 3-day, 5-day, and 7-day windows for both DeFi and TradFi volumes.

In [None]:
ROLLING_WINDOWS = {"1d": 1, "3d": 3, "5d": 5, "7d": 7}
COLORS = {"1d": "#1f77b4", "3d": "#ff7f0e", "5d": "#2ca02c", "7d": "#d62728"}

# Moving averages of DeFi (top panel) and TradFi (bottom panel) notional
# volume, one line per window length.
for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    # Skip if insufficient data
    overlap_mask = df[["defi_notional_volume", "tradfi_notional_volume"]].notna().all(axis=1)
    if overlap_mask.sum() < 5:
        print(f"Skipping {asset_name}: insufficient overlapping data")
        continue

    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.08,
        subplot_titles=("DeFi Rolling Avg Volume (USD)", "TradFi Rolling Avg Volume (USD)")
    )

    for label, days in ROLLING_WINDOWS.items():
        # Require a majority of the window: 1, 2, 3 and 4 observations for the
        # 1d / 3d / 5d / 7d windows. The previous int(days * 0.66) truncated
        # the 3-day case to a single observation, unlike MIN_PERIODS_3D = 2.
        min_p = (days + 1) // 2

        for row, col in enumerate(("defi_notional_volume", "tradfi_notional_volume"), 1):
            moving_avg = df[col].rolling(days, min_periods=min_p).mean()

            # Both panels share a legend group, so only the top trace needs
            # a legend entry.
            fig.add_trace(
                go.Scatter(x=df["time"], y=moving_avg, name=label, mode="lines",
                           line=dict(color=COLORS[label], width=2), legendgroup=label,
                           showlegend=(row == 1)),
                row=row, col=1
            )

    fig.update_layout(
        title=f"Rolling Average Volumes (USD Notional) — {asset_name} ({asset_type})",
        height=600,
        hovermode="x unified",
        template="plotly_white",
    )
    fig.show()

## 5. Price Correlation and Tracking Error

Analyze how well DeFi prices track TradFi prices.

In [None]:
# Report header.
print("\n" + "="*100)
print("PRICE CORRELATION AND TRACKING ERROR")
print("="*100)

# One report per asset, in alphabetical order of asset name.
for asset_name, asset_info in sorted(assets.items()):
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    # Rows on which both a DeFi and a TradFi close are available.
    overlap_mask = df["defi_close"].notna() & df["tradfi_close"].notna()
    n_days = overlap_mask.sum()

    if n_days < 5:
        print(f"\nSkipping {asset_name}: insufficient overlapping data")
        continue

    overlap_df = df.loc[overlap_mask]
    defi_px = overlap_df["defi_close"]
    tradfi_px = overlap_df["tradfi_close"]

    # Pearson correlation of the two close series.
    price_corr = defi_px.corr(tradfi_px)

    # Tracking error here is the standard deviation of the absolute price gap.
    price_diff = defi_px - tradfi_px
    tracking_error = price_diff.std()

    # Mean gap as a percentage of the TradFi close.
    avg_diff_pct = (price_diff / tradfi_px * 100).mean()

    print(f"\n{asset_name} ({asset_type})")
    print(f"  Price Correlation:  {price_corr:.4f}")
    print(f"  Tracking Error:     ${tracking_error:,.2f}")
    print(f"  Avg Price Diff:     {avg_diff_pct:+.2f}% (DeFi vs TradFi)")
    print(f"  Days Analyzed:      {n_days}")

## Summary Statistics by Asset Type

In [None]:
# Collect per-asset metrics, grouped under their asset type.
asset_type_stats = {}

for asset_name, asset_info in assets.items():
    df = asset_info["data"]
    asset_type = asset_info["asset_type"]

    # Register the asset type even if this asset contributes no row.
    per_type = asset_type_stats.setdefault(asset_type, [])

    # Only rows with both closes and both notional volumes are used.
    overlap_mask = df[["defi_close", "tradfi_close", "defi_notional_volume", "tradfi_notional_volume"]].notna().all(axis=1)
    if not overlap_mask.any():
        continue

    overlap_df = df.loc[overlap_mask]
    per_type.append({
        "asset": asset_name,
        "price_corr": overlap_df["defi_close"].corr(overlap_df["tradfi_close"]),
        "defi_avg_vol": overlap_df["defi_notional_volume"].mean(),
        "tradfi_avg_vol": overlap_df["tradfi_notional_volume"].mean(),
        "days": overlap_mask.sum(),
    })

# Report header.
print("\n" + "="*100)
print("SUMMARY BY ASSET TYPE")
print("="*100)

for asset_type, stats_list in sorted(asset_type_stats.items()):
    # Asset types without any usable asset are left out of the report.
    if len(stats_list) == 0:
        continue

    print(f"\n{asset_type}")
    print(f"  Assets: {len(stats_list)}")

    # Mean price correlation over assets whose correlation is defined.
    defined_corrs = [s["price_corr"] for s in stats_list if not np.isnan(s["price_corr"])]
    avg_price_corr = np.mean(defined_corrs)

    # Volumes are sums of the per-asset daily averages.
    total_defi_vol = np.sum([s["defi_avg_vol"] for s in stats_list])
    total_tradfi_vol = np.sum([s["tradfi_avg_vol"] for s in stats_list])
    avg_days = np.mean([s["days"] for s in stats_list])

    print(f"  Avg Price Correlation: {avg_price_corr:.3f}")
    print(f"  Total DeFi Volume:     ${total_defi_vol:>15,.0f}/day")
    print(f"  Total TradFi Volume:   ${total_tradfi_vol:>15,.0f}/day")
    print(f"  Avg Days Analyzed:     {avg_days:.0f}")