# Phase 3 & 4: Label Optimization & Better Models

**Objectives:**
1. **Phase 3:** Find optimal prediction horizon and cost treatment
2. **Phase 4:** Test XGBoost and compare with Logistic Regression

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

# Try to import XGBoost
try:
    import xgboost as xgb
    HAS_XGBOOST = True
    print("XGBoost available")
except ImportError:
    HAS_XGBOOST = False
    print("XGBoost not available, using GradientBoosting instead")

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

import sys
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from data import load_all_data
from features import compute_rolling_returns, compute_rolling_volatility, compute_rsi, compute_bollinger_bands
from labels import create_cost_adjusted_labels
from metrics import compute_all_metrics
from backtesting import compute_strategy_returns, compute_portfolio_returns, compute_equity_curve

## 1. Load and Prepare Data (Same as Phase 2)

In [None]:
# Load data
trade_log, prices, glassnode = load_all_data()

# Keep only the timestamps and assets that appear in both the trade log and prices.
common_idx = trade_log.index.intersection(prices.index)
common_assets = trade_log.columns.intersection(prices.columns)
signals = trade_log.loc[common_idx, common_assets]
prices_aligned = prices.loc[common_idx, common_assets]

# Glassnode features
# NOTE(review): 'btc_percent_upply_in_profit' looks like a typo for
# 'btc_percent_supply_in_profit' -- confirm against the real column name before
# changing it; a name that is not a column is dropped by the filter below.
GLASSNODE_FEATURES = [
    'btc_mvrv_z_score', 'btc_puell_multiple', 'reserve_risk',
    'btc_fear_greed_index', 'btc_adjusted_sopr',
    'btc_percent_upply_in_profit', 'btc_network_value_to_transactions_signal',
    'btc_futures_perpetual_funding_rate_mean', 'vocdd', 'mvocdd',
]
available_gn = [f for f in GLASSNODE_FEATURES if f in glassnode.columns]
# Surface requested features that are missing instead of dropping them silently.
missing_gn = [f for f in GLASSNODE_FEATURES if f not in glassnode.columns]
if missing_gn:
    print(f"Glassnode features not found (skipped): {missing_gn}")
gn_selected = glassnode[available_gn].copy()

# Align Glassnode
signals_naive = signals.copy()
signals_naive.index = signals_naive.index.tz_localize(None)

# For every signal timestamp take the latest non-NaN Glassnode observation dated
# on or before that timestamp's calendar day (as-of join, done per column so each
# feature uses its own last valid date).
# NOTE(review): a same-day observation is used for intraday timestamps -- confirm
# the values are already known at the start of that day, otherwise this looks ahead.
signal_dates = signals_naive.index.normalize()
gn_aligned = pd.DataFrame(index=signals_naive.index)
for col in available_gn:
    observed = gn_selected[col].dropna().sort_index()
    # Position of the last observation <= date; -1 means nothing is available yet.
    last_pos = observed.index.searchsorted(signal_dates, side='right') - 1
    has_obs = last_pos >= 0
    aligned_values = np.full(len(signal_dates), np.nan)
    aligned_values[has_obs] = observed.to_numpy(dtype=float)[last_pos[has_obs]]
    gn_aligned[col] = aligned_values

# Technical features
return_features = compute_rolling_returns(prices_aligned, windows=[1, 8, 56])
vol_features = compute_rolling_volatility(prices_aligned, windows=[56])
rsi_features = compute_rsi(prices_aligned, window=112)
bb_features = compute_bollinger_bands(prices_aligned, window=160)

price_features = pd.concat([return_features, vol_features, rsi_features, bb_features], axis=1)

print(f"Data prepared: {len(signals)} timestamps, {len(common_assets)} assets")
print(f"Price features: {price_features.shape[1]}, Glassnode: {gn_aligned.shape[1]}")

In [None]:
def prepare_enhanced_data(price_features, glassnode_features, labels, signals):
    """Prepare stacked data with price and Glassnode features.

    One sample is produced per (timestamp, asset) pair where the signal equals 1,
    the label is not NaN, the asset has price-feature columns (named
    ``<asset>_<feature>``) with no NaN at that timestamp, and the Glassnode row
    at that timestamp has no NaN.

    Args:
        price_features: DataFrame indexed by timestamp with ``<asset>_<feature>``
            columns.
        glassnode_features: DataFrame indexed by tz-naive timestamp; all of its
            columns are appended to every sample.
        labels: DataFrame (timestamp x asset) of labels.
        signals: DataFrame (timestamp x asset); only entries equal to 1 are kept.

    Returns:
        ``(X, y)``: feature DataFrame and label Series, both indexed by a
        ``(timestamp, asset)`` MultiIndex with tz-naive timestamps. Both are
        empty (instead of raising) when no sample qualifies.
    """
    def _drop_tz(frame):
        # Work on a copy so the caller's index is left untouched.
        naive = frame.copy()
        naive.index = naive.index.tz_localize(None)
        return naive

    signals_naive = _drop_tz(signals)
    labels_naive = _drop_tz(labels)
    price_features_naive = _drop_tz(price_features)

    # Resolve each asset's feature columns once instead of once per timestamp.
    # Only the leading "<asset>_" is stripped: str.replace would also remove
    # later occurrences of the prefix inside the feature name.
    asset_columns = {}
    for asset in labels_naive.columns:
        prefix = asset + '_'
        asset_cols = [c for c in price_features_naive.columns if c.startswith(prefix)]
        if asset_cols:
            asset_columns[asset] = (asset_cols, [c[len(prefix):] for c in asset_cols])

    data_rows = []
    for timestamp in labels_naive.index:
        if timestamp not in price_features_naive.index or timestamp not in glassnode_features.index:
            continue
        # The Glassnode row is shared by every asset at this timestamp.
        gn_row = glassnode_features.loc[timestamp]
        if gn_row.isna().any():
            continue
        gn_values = gn_row.to_dict()
        for asset, (asset_cols, feature_names) in asset_columns.items():
            if signals_naive.loc[timestamp, asset] != 1:
                continue
            label_val = labels_naive.loc[timestamp, asset]
            if pd.isna(label_val):
                continue
            price_row = price_features_naive.loc[timestamp, asset_cols]
            if price_row.isna().any():
                continue
            renamed_price = {name: price_row[col] for name, col in zip(feature_names, asset_cols)}
            data_rows.append({'timestamp': timestamp, 'asset': asset, 'label': label_val,
                              **renamed_price, **gn_values})

    if not data_rows:
        # set_index would raise KeyError on a column-less frame; return empty
        # outputs so callers can apply their own minimum-sample check.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['timestamp', 'asset'])
        return pd.DataFrame(index=empty_index), pd.Series(index=empty_index, name='label', dtype=float)

    df = pd.DataFrame(data_rows).set_index(['timestamp', 'asset'])
    return df.drop('label', axis=1), df['label']

---

## 2. Phase 3: Horizon Optimization

Test different prediction horizons (8 periods = 1 day):
- **8 periods (1 day)** - Current setting
- **16 periods (2 days)** and **24 periods (3 days)** - Medium term
- **40 periods (5 days)** and **56 periods (1 week)** - Longer term

In [None]:
def evaluate_horizon(horizon, price_features, gn_aligned, prices_aligned, signals, threshold=0.5):
    """Evaluate a specific prediction horizon.

    Builds cost-adjusted labels for ``horizon``, fits a logistic regression on the
    earliest 60% of sample timestamps, scores the remaining 40%, and compares the
    unfiltered signals with signals where predictions at or below ``threshold``
    are zeroed out.

    Args:
        horizon: Label horizon in periods (reported as ``horizon / 8`` days).
        price_features: Per-asset price features, ``<asset>_<feature>`` columns.
        gn_aligned: Glassnode features aligned to the tz-naive signal timestamps.
        prices_aligned: Prices (timestamp x asset).
        signals: Signals (timestamp x asset).
        threshold: A signal is kept only if its predicted probability is strictly
            above this value.

    Returns:
        Dict with sample counts, AUC, baseline/filtered Sharpe and return, and the
        relative Sharpe improvement in percent (NaN when the baseline Sharpe is 0
        or NaN). ``None`` when there are fewer than 100 samples or when either
        split contains a single class.
    """
    # Create labels with this horizon
    labels = create_cost_adjusted_labels(
        prices_aligned, signals,
        horizon=horizon,
        entry_cost=0.001,
        exit_cost=0.001
    )

    # Prepare data
    X, y = prepare_enhanced_data(price_features, gn_aligned, labels, signals)

    if len(X) < 100:
        return None

    # Single chronological split: first 60% of timestamps train, last 40% test.
    # NOTE(review): the labels look `horizon` periods ahead, so the last training
    # labels presumably overlap the start of the test window -- consider an embargo.
    ts_level = X.index.get_level_values('timestamp')
    timestamps = ts_level.unique().sort_values()
    split_idx = int(len(timestamps) * 0.6)
    train_mask = ts_level.isin(timestamps[:split_idx])
    test_mask = ts_level.isin(timestamps[split_idx:])

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Fitting and roc_auc_score both need two classes; skip this horizon otherwise.
    if y_train.nunique() < 2 or y_test.nunique() < 2:
        return None

    # Train logistic regression (scaler is fitted on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Create predictions DataFrame
    predictions_df = pd.DataFrame({'probability': y_prob, 'actual': y_test.values}, index=y_test.index)

    # Evaluate strategy on the test timestamps (tz-naive, like the sample index)
    pred_timestamps = predictions_df.index.get_level_values('timestamp').unique()
    prices_naive = prices_aligned.copy()
    prices_naive.index = prices_naive.index.tz_localize(None)
    signals_naive = signals.copy()
    signals_naive.index = signals_naive.index.tz_localize(None)

    baseline_signals = signals_naive.loc[pred_timestamps]
    comparison_prices = prices_naive.loc[pred_timestamps]

    # Baseline
    baseline_returns = compute_strategy_returns(baseline_signals, comparison_prices, transaction_cost=0.001)
    baseline_portfolio = compute_portfolio_returns(baseline_returns)
    baseline_metrics = compute_all_metrics(baseline_portfolio.dropna())

    # ML-filtered: only the rejected predictions need to be visited.
    filtered = signals_naive.loc[pred_timestamps].copy()
    rejected = predictions_df.index[predictions_df['probability'] <= threshold]
    for ts, asset in rejected:
        if asset in filtered.columns:
            filtered.loc[ts, asset] = 0

    filt_returns = compute_strategy_returns(filtered, comparison_prices, transaction_cost=0.001)
    filt_portfolio = compute_portfolio_returns(filt_returns)
    filt_metrics = compute_all_metrics(filt_portfolio.dropna())

    # Relative improvement against |baseline| so a negative baseline does not flip
    # the sign; undefined (NaN) when the baseline Sharpe is 0 or NaN.
    base_sharpe = baseline_metrics['sharpe_ratio']
    filt_sharpe = filt_metrics['sharpe_ratio']
    if pd.isna(base_sharpe) or base_sharpe == 0:
        sharpe_improvement = np.nan
    else:
        sharpe_improvement = (filt_sharpe - base_sharpe) / abs(base_sharpe) * 100

    return {
        'horizon': horizon,
        'horizon_days': horizon / 8,
        'n_samples': len(X),
        'positive_rate': y.mean(),
        'auc': roc_auc_score(y_test, y_prob),
        'baseline_sharpe': base_sharpe,
        'baseline_return': baseline_metrics['total_return'],
        'filtered_sharpe': filt_sharpe,
        'filtered_return': filt_metrics['total_return'],
        'sharpe_improvement': sharpe_improvement
    }

In [None]:
# Sweep the candidate label horizons (8 periods = 1 day).
horizons = [8, 16, 24, 40, 56]  # 1d, 2d, 3d, 5d, 1w

print("HORIZON OPTIMIZATION")
print("=" * 70)

horizon_results = []
for horizon in horizons:
    result = evaluate_horizon(horizon, price_features, gn_aligned, prices_aligned, signals, threshold=0.5)
    # evaluate_horizon returns None when a horizon cannot be evaluated.
    if not result:
        continue
    horizon_results.append(result)
    report_lines = [
        f"\nHorizon {horizon}p ({result['horizon_days']:.1f} days):",
        f"  Samples: {result['n_samples']}, Positive rate: {result['positive_rate']*100:.1f}%",
        f"  AUC: {result['auc']:.3f}",
        f"  Baseline Sharpe: {result['baseline_sharpe']:.3f}",
        f"  Filtered Sharpe: {result['filtered_sharpe']:.3f} ({result['sharpe_improvement']:+.1f}%)",
        f"  Filtered Return: {result['filtered_return']*100:.2f}%",
    ]
    print("\n".join(report_lines))

In [None]:
# Plot horizon comparison
horizon_df = pd.DataFrame(horizon_results)

if horizon_df.empty:
    # Every horizon was skipped, so there are no columns to plot or rank.
    # `best_horizon` is intentionally not assigned here; the Phase 4 cell falls
    # back to its default horizon when the name is undefined.
    print("No horizon results available; skipping horizon comparison.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # AUC vs horizon
    axes[0].plot(horizon_df['horizon_days'], horizon_df['auc'], 'bo-', markersize=10, linewidth=2)
    axes[0].axhline(y=0.5, color='r', linestyle='--', label='Random')
    axes[0].set_xlabel('Horizon (days)')
    axes[0].set_ylabel('AUC')
    axes[0].set_title('Model AUC vs Horizon')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Sharpe improvement vs horizon
    axes[1].bar(horizon_df['horizon_days'], horizon_df['sharpe_improvement'], color='green', alpha=0.7)
    axes[1].axhline(y=0, color='black', linewidth=0.5)
    axes[1].set_xlabel('Horizon (days)')
    axes[1].set_ylabel('Sharpe Improvement (%)')
    axes[1].set_title('Sharpe Improvement vs Horizon')
    axes[1].grid(True, alpha=0.3)

    # Return vs horizon
    axes[2].plot(horizon_df['horizon_days'], horizon_df['filtered_return']*100, 'go-', markersize=10, linewidth=2, label='Filtered')
    axes[2].plot(horizon_df['horizon_days'], horizon_df['baseline_return']*100, 'ro--', markersize=8, linewidth=1, label='Baseline')
    axes[2].set_xlabel('Horizon (days)')
    axes[2].set_ylabel('Return (%)')
    axes[2].set_title('Returns vs Horizon')
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Best horizon = highest filtered Sharpe on the test portion.
    # NOTE(review): this selects the horizon on the same test data that Phase 4
    # later reports on, so the Phase 4 numbers are optimistically biased.
    best_idx = horizon_df['filtered_sharpe'].idxmax()
    best_horizon = horizon_df.loc[best_idx, 'horizon']
    print(f"\nBest horizon: {best_horizon} periods ({best_horizon/8:.1f} days)")

---

## 3. Phase 4: Better Models

Compare:
1. **Logistic Regression** (baseline model)
2. **Random Forest** (ensemble of trees)
3. **XGBoost/Gradient Boosting** (boosted trees)

In [None]:
# Take the horizon chosen in Phase 3 when that cell has run; otherwise use 8.
BEST_HORIZON = int(globals().get('best_horizon', 8))
print(f"Using horizon: {BEST_HORIZON} periods ({BEST_HORIZON/8:.1f} days)")

# Cost-adjusted labels for the selected horizon.
label_kwargs = {'horizon': BEST_HORIZON, 'entry_cost': 0.001, 'exit_cost': 0.001}
labels = create_cost_adjusted_labels(prices_aligned, signals, **label_kwargs)

# Stack features and labels into one (timestamp, asset) sample table.
X, y = prepare_enhanced_data(price_features, gn_aligned, labels, signals)
print(f"Data: X={X.shape}, y={y.shape}")

In [None]:
def evaluate_model(model_name, model, X, y, threshold=0.5):
    """Evaluate a model on a single chronological 60/40 train/test split.

    The model is fitted in place on the standardized training features, scored on
    the test part, and the test-period signals are re-evaluated after zeroing out
    every signal whose predicted probability is at or below each cut-off.

    Reads the notebook-level ``prices_aligned`` and ``signals`` frames for the
    strategy evaluation.

    Args:
        model_name: Label stored under ``'model'`` in the results.
        model: Unfitted estimator exposing ``fit`` and ``predict_proba``.
        X: Features indexed by a ``(timestamp, asset)`` MultiIndex.
        y: Labels with the same index as ``X``.
        threshold: Probability cut-off used for accuracy/precision/recall. It is
            also added to the evaluated strategy cut-offs (0.4, 0.5, 0.6).

    Returns:
        ``(results, model, scaler)`` where ``results`` holds the classification
        metrics, ``sharpe_<t>`` / ``return_<t>`` per cut-off, and the baseline
        Sharpe and return.
    """
    ts_level = X.index.get_level_values('timestamp')
    timestamps = ts_level.unique().sort_values()
    split_idx = int(len(timestamps) * 0.6)
    train_mask = ts_level.isin(timestamps[:split_idx])
    test_mask = ts_level.isin(timestamps[split_idx:])

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features (fitted on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train
    model.fit(X_train_scaled, y_train)

    # Predict; the hard predictions honour `threshold` instead of a fixed 0.5.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_prob > threshold).astype(int)

    # Classification metrics
    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)

    # Strategy evaluation
    predictions_df = pd.DataFrame({'probability': y_prob, 'actual': y_test.values}, index=y_test.index)
    pred_timestamps = predictions_df.index.get_level_values('timestamp').unique()

    prices_naive = prices_aligned.copy()
    prices_naive.index = prices_naive.index.tz_localize(None)
    signals_naive = signals.copy()
    signals_naive.index = signals_naive.index.tz_localize(None)

    baseline_signals = signals_naive.loc[pred_timestamps]
    comparison_prices = prices_naive.loc[pred_timestamps]

    # Baseline
    baseline_returns = compute_strategy_returns(baseline_signals, comparison_prices, transaction_cost=0.001)
    baseline_portfolio = compute_portfolio_returns(baseline_returns)
    baseline_metrics = compute_all_metrics(baseline_portfolio.dropna())

    # ML-filtered at different thresholds
    results = {'model': model_name, 'auc': auc, 'accuracy': acc, 'precision': prec, 'recall': rec}

    # The standard cut-offs are always reported (callers read sharpe_0.5 and
    # sharpe_0.6); a non-default `threshold` is evaluated in addition.
    for thresh in sorted({0.4, 0.5, 0.6, threshold}):
        filtered = signals_naive.loc[pred_timestamps].copy()
        # Only the rejected predictions change the signal matrix.
        rejected = predictions_df.index[predictions_df['probability'] <= thresh]
        for ts, asset in rejected:
            if asset in filtered.columns:
                filtered.loc[ts, asset] = 0

        filt_returns = compute_strategy_returns(filtered, comparison_prices, transaction_cost=0.001)
        filt_portfolio = compute_portfolio_returns(filt_returns)
        filt_metrics = compute_all_metrics(filt_portfolio.dropna())

        results[f'sharpe_{thresh}'] = filt_metrics['sharpe_ratio']
        results[f'return_{thresh}'] = filt_metrics['total_return']

    results['baseline_sharpe'] = baseline_metrics['sharpe_ratio']
    results['baseline_return'] = baseline_metrics['total_return']

    return results, model, scaler

In [None]:
# Define models to test
models = {
    'Logistic Regression': LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced', random_state=42, n_jobs=-1),
}

if HAS_XGBOOST:
    # Derive the class weight from the training portion only, so test labels do
    # not leak into the model configuration. The 0.6 fraction must match the
    # chronological split used in evaluate_model.
    _ts_level = y.index.get_level_values('timestamp')
    _ordered_ts = _ts_level.unique().sort_values()
    _y_train = y[_ts_level.isin(_ordered_ts[:int(len(_ordered_ts) * 0.6)])]
    _n_pos = (_y_train == 1).sum()
    _n_neg = (_y_train == 0).sum()
    # Fall back to a neutral weight when there are no positives to divide by.
    pos_weight = _n_neg / _n_pos if _n_pos > 0 else 1.0
    models['XGBoost'] = xgb.XGBClassifier(
        n_estimators=100, max_depth=4, learning_rate=0.1,
        scale_pos_weight=pos_weight,
        random_state=42, n_jobs=-1, verbosity=0
    )
else:
    # NOTE(review): unlike the other models this fallback applies no class
    # re-weighting, so its scores are not directly comparable on imbalanced data.
    models['Gradient Boosting'] = GradientBoostingClassifier(
        n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42
    )

print("MODEL COMPARISON")
print("=" * 70)

model_results = []
best_models = {}  # model name -> (fitted model, fitted scaler)

for name, model in models.items():
    print(f"\nTraining {name}...")
    result, trained_model, scaler = evaluate_model(name, model, X, y)
    model_results.append(result)
    best_models[name] = (trained_model, scaler)

    print(f"  AUC: {result['auc']:.3f}, Accuracy: {result['accuracy']:.3f}")
    print(f"  Baseline Sharpe: {result['baseline_sharpe']:.3f}")
    print(f"  Filtered Sharpe (τ=0.5): {result['sharpe_0.5']:.3f}")
    print(f"  Filtered Sharpe (τ=0.6): {result['sharpe_0.6']:.3f}")

In [None]:
# Tabulate the per-model results collected by the training loop.
results_df = pd.DataFrame(model_results)

rule = "=" * 70
print("\n" + rule)
print("MODEL COMPARISON SUMMARY")
print(rule)

print(f"\n{'Model':<25} {'AUC':>8} {'Sharpe(τ=0.5)':>15} {'Sharpe(τ=0.6)':>15} {'Return(τ=0.5)':>15}")
print("-" * 80)

# The baseline is the same for every model, so the first row's values are used.
baseline_sharpe = results_df['baseline_sharpe'].iloc[0]
baseline_return_pct = results_df['baseline_return'].iloc[0] * 100
print(f"{'Baseline':<25} {'-':>8} {baseline_sharpe:>15.3f} {'-':>15} {baseline_return_pct:>14.2f}%")

for row in results_df.to_dict('records'):
    print(f"{row['model']:<25} {row['auc']:>8.3f} {row['sharpe_0.5']:>15.3f} {row['sharpe_0.6']:>15.3f} {row['return_0.5']*100:>14.2f}%")

In [None]:
# Three side-by-side panels: AUC, Sharpe and total return per model.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: AUC per model against the 0.5 "random" reference line.
colors = ['steelblue', 'forestgreen', 'coral']
auc_ax = axes[0]
auc_ax.bar(results_df['model'], results_df['auc'], color=colors[:len(results_df)], alpha=0.7)
auc_ax.axhline(y=0.5, color='red', linestyle='--', label='Random')
auc_ax.set_ylabel('AUC')
auc_ax.set_title('Model AUC Comparison')
auc_ax.tick_params(axis='x', rotation=15)
auc_ax.legend()
auc_ax.grid(True, alpha=0.3, axis='y')

# Panels 2 and 3 share one layout: grouped bars for τ=0.5 / τ=0.6 plus a dashed
# baseline. Returns are scaled to percent, Sharpe ratios are left unscaled.
x = np.arange(len(results_df))
width = 0.35
grouped_panels = [
    (axes[1], 'sharpe', 1, 'Sharpe Ratio', 'Sharpe Ratio by Model'),
    (axes[2], 'return', 100, 'Return (%)', 'Total Return by Model'),
]
for panel_ax, metric, scale, y_label, panel_title in grouped_panels:
    panel_ax.bar(x - width/2, results_df[f'{metric}_0.5'] * scale, width,
                 label='τ=0.5', color='steelblue', alpha=0.7)
    panel_ax.bar(x + width/2, results_df[f'{metric}_0.6'] * scale, width,
                 label='τ=0.6', color='coral', alpha=0.7)
    panel_ax.axhline(y=results_df[f'baseline_{metric}'].iloc[0] * scale,
                     color='red', linestyle='--', label='Baseline')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(results_df['model'], rotation=15)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

---

## 4. Feature Importance (Boosted Tree Model)

In [None]:
# Inspect the boosted-tree model that was trained (XGBoost when installed,
# otherwise scikit-learn's Gradient Boosting).
tree_model_name = 'Gradient Boosting'
if HAS_XGBOOST:
    tree_model_name = 'XGBoost'

if tree_model_name in best_models:
    tree_model, _ = best_models[tree_model_name]

    if hasattr(tree_model, 'feature_importances_'):
        # Rank features from most to least important.
        importance = pd.DataFrame({
            'feature': X.columns,
            'importance': tree_model.feature_importances_
        })
        importance = importance.sort_values('importance', ascending=False)

        print(f"Feature Importance ({tree_model_name}):")
        print("=" * 50)
        top_rows = importance.head(15)
        for feature_name, score in zip(top_rows['feature'], top_rows['importance']):
            # One block character per 2% of importance.
            bar = '█' * int(score * 50)
            print(f"{feature_name:40s} {score:.4f} {bar}")

In [None]:
# Horizontal bar chart of the top features; runs only when the previous cell
# produced an `importance` table.
if 'importance' in globals():
    top_n = 15
    top_features = importance.head(top_n)
    bar_positions = range(len(top_features))

    # Coral marks Glassnode features, blue marks everything else.
    colors = []
    for feature_name in top_features['feature']:
        colors.append('coral' if feature_name in GLASSNODE_FEATURES else 'steelblue')

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_features['importance'], color=colors, alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top {top_n} Feature Importance ({tree_model_name})\n(Coral = Glassnode, Blue = Technical)')
    # Most important feature at the top.
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.show()

---

## 5. Best Configuration Summary

In [None]:
# Find best overall configuration: the model with the highest filtered Sharpe at τ=0.5.
best_model_idx = results_df['sharpe_0.5'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'model']
best_sharpe = results_df.loc[best_model_idx, 'sharpe_0.5']
best_baseline_sharpe = results_df.loc[best_model_idx, 'baseline_sharpe']

print("=" * 70)
print("BEST CONFIGURATION")
print("=" * 70)
print(f"\nModel: {best_model_name}")
print(f"Horizon: {BEST_HORIZON} periods ({BEST_HORIZON/8:.1f} days)")
print(f"Features: {len(X.columns)}")
print(f"\nPerformance (τ=0.5):")
print(f"  AUC: {results_df.loc[best_model_idx, 'auc']:.3f}")
print(f"  Sharpe: {best_sharpe:.3f}")
print(f"  Return: {results_df.loc[best_model_idx, 'return_0.5']*100:.2f}%")
print(f"\nBaseline:")
print(f"  Sharpe: {best_baseline_sharpe:.3f}")
print(f"  Return: {results_df.loc[best_model_idx, 'baseline_return']*100:.2f}%")
print(f"\nImprovement:")
# Relative to |baseline| so a negative baseline Sharpe does not flip the sign;
# undefined (NaN) when the baseline Sharpe is 0 or NaN.
if pd.isna(best_baseline_sharpe) or best_baseline_sharpe == 0:
    sharpe_imp = np.nan
else:
    sharpe_imp = (best_sharpe - best_baseline_sharpe) / abs(best_baseline_sharpe) * 100
print(f"  Sharpe: {sharpe_imp:+.1f}%")

---

## 6. Summary

### Phase 3 Findings (Horizon Optimization)
- [ ] Fill after running

### Phase 4 Findings (Model Comparison)
- [ ] Fill after running

### Next Steps
- Phase 5: Regime detection with HMM
- Phase 6: Robustness analysis