In [None]:
import json
import random
import warnings
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE(review): blanket filter — this hides every warning category (including
# deprecation notices) for the rest of the session; consider narrowing it.
warnings.filterwarnings('ignore')

print("‚úÖ Imports loaded")

## 1. Configuration & Data Structures

In [None]:
@dataclass
class FillRecord:
    """Single fill record from shadow trading.

    Field names mirror the keys of the fill dicts used throughout this notebook,
    so a fill dict can be turned into a record with ``FillRecord(**fill)``.

    ``notional_usd`` is optional: when omitted it is derived as
    ``abs(quantity) * expected_price``, which is how the synthetic generator
    relates order size, quantity and expected price.
    """
    timestamp: datetime
    symbol: str
    strategy: str
    side: str
    quantity: float
    expected_price: float
    fill_price: float
    slippage_bps: float
    fee_bps: float
    total_cost_bps: float
    alpha_bps: float  # Expected alpha before costs
    net_alpha_bps: float  # Alpha after costs
    latency_ms: float
    order_type: str
    venue: str
    # Declared last with a default so existing 15-argument constructions keep working.
    notional_usd: Optional[float] = None  # Order size in USD

    def __post_init__(self) -> None:
        """Fill in ``notional_usd`` when the caller did not supply it."""
        if self.notional_usd is None:
            self.notional_usd = abs(self.quantity) * self.expected_price

@dataclass
class CostCurve:
    """Cost curve for a strategy/venue combination.

    NOTE(review): nothing in the visible notebook instantiates this class; the
    analysis cells keep per-strategy curves in plain dicts instead. The field
    meanings below are inferred from the names — confirm before relying on them.
    """
    strategy: str  # strategy name
    venue: str  # venue name
    order_sizes: List[float]  # presumably order sizes (USD), parallel to slippage_bps — TODO confirm
    slippage_bps: List[float]  # presumably slippage at each order size, in basis points
    fee_bps: float  # presumably the fee for this venue, in basis points
    break_even_size: float  # presumably the size at which net alpha reaches zero
    optimal_size: float  # presumably the size with the highest net alpha

# Fee structure by venue, in basis points (1 bp = 0.01%).
# Exchange fee schedules quote these rates as percentages (e.g. 0.02% maker).
# They are stored here converted to bps (0.02% -> 2.0 bps) because fee_bps is
# added directly to slippage_bps when total_cost_bps is computed.
VENUE_FEES = {
    'binance': {'maker': 2.0, 'taker': 4.0},
    'okx': {'maker': 2.0, 'taker': 5.0},
    'bybit': {'maker': 1.0, 'taker': 6.0},
    'delta': {'maker': 2.0, 'taker': 5.0}
}

# Expected alpha by strategy (basis points), before execution costs
STRATEGY_ALPHA = {
    'momentum': 8.0,
    'mean_reversion': 5.0,
    'funding_arb': 3.0,
    'stat_arb': 4.0,
    'market_making': 2.0
}

print("‚úÖ Configuration defined")
print(f"   Venues: {list(VENUE_FEES.keys())}")
print(f"   Strategies: {list(STRATEGY_ALPHA.keys())}")

## 2. Load Fill Data

In [None]:
def load_fill_data(log_dir: Optional[str] = None) -> pd.DataFrame:
    """
    Load fill data from shadow trading logs.
    Falls back to synthetic data if no logs found.

    Args:
        log_dir: Directory searched recursively for ``fills*.json`` files. Each
            file may hold a single fill object or a list of them. When omitted
            (or empty), synthetic fills are generated instead.

    Returns:
        One row per fill. A ``notional_usd`` column is guaranteed whenever the
        records carry ``quantity`` and ``expected_price``: if the logs do not
        provide it, it is derived as ``abs(quantity) * expected_price``.
    """
    fills: List[dict] = []

    # Try to load from log directory
    if log_dir:
        # sorted() keeps the row order stable; glob order is filesystem-dependent.
        for file_path in sorted(Path(log_dir).glob("**/fills*.json")):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers malformed JSON and undecodable bytes.
                print(f"Warning: Could not load {file_path}: {e}")
                continue
            fills.extend(data if isinstance(data, list) else [data])

        if not fills:
            # Make the fallback visible so synthetic results are not mistaken for real ones.
            print(f"Warning: No fill records found under {log_dir}; using synthetic data")

    # Generate synthetic data if needed
    if not fills:
        fills = generate_synthetic_fills(n_days=30, fills_per_day=50)

    fills_df = pd.DataFrame(fills)

    # The analysis cells group and bin on notional_usd; derive it when logs omit it.
    if 'notional_usd' not in fills_df.columns and {'quantity', 'expected_price'} <= set(fills_df.columns):
        fills_df['notional_usd'] = fills_df['quantity'].abs() * fills_df['expected_price']

    return fills_df


def generate_synthetic_fills(n_days: int = 30, fills_per_day: int = 50) -> List[dict]:
    """Generate synthetic fill data for demonstration.

    Args:
        n_days: Number of consecutive days to simulate; the first day is
            ``n_days`` before the current time.
        fills_per_day: Number of fills generated for each day.

    Returns:
        A list of ``n_days * fills_per_day`` fill dicts. Apart from timestamps,
        which are anchored to the current time, the values are reproducible
        because the random generators are seeded.
    """
    # Private, seeded generators: deterministic output without reseeding the
    # global `random` / NumPy state that other cells may depend on.
    py_rng = random.Random(42)
    np_rng = np.random.default_rng(42)

    base_time = datetime.now() - timedelta(days=n_days)

    strategies = list(STRATEGY_ALPHA.keys())
    venues = list(VENUE_FEES.keys())
    base_prices = {'BTCUSD': 50000, 'ETHUSD': 3000, 'SOLUSD': 100}
    symbols = list(base_prices)

    fills = []
    for day in range(n_days):
        for _ in range(fills_per_day):
            strategy = py_rng.choice(strategies)
            venue = py_rng.choice(venues)
            symbol = py_rng.choice(symbols)
            side = py_rng.choice(['buy', 'sell'])

            # Order size (notional USD): lognormal with a median of about $22K, capped at $500K
            order_size = min(np_rng.lognormal(mean=10, sigma=1), 500000)

            # Slippage model: 0.5 bps base plus an impact term that grows with sqrt(size)
            base_slip = 0.5
            size_impact = 0.1 * np.sqrt(order_size / 10000)
            slippage = max(0, base_slip + size_impact + np_rng.normal(0, 0.2))

            # Fees
            order_type = 'maker' if py_rng.random() < 0.6 else 'taker'
            fee = VENUE_FEES[venue][order_type]

            # Total cost and alpha
            total_cost = slippage + fee
            expected_alpha = STRATEGY_ALPHA[strategy] + np_rng.normal(0, 1)
            net_alpha = expected_alpha - total_cost

            # Price: slippage is a cost, so buys fill above and sells fill below
            # the expected price. This keeps fill_price consistent with side
            # and with the non-negative slippage_bps recorded for the fill.
            base_price = base_prices[symbol]
            expected_price = base_price + np_rng.normal(0, base_price * 0.001)
            direction = 1 if side == 'buy' else -1
            fill_price = expected_price * (1 + direction * slippage / 10000)

            fill_time = base_time + timedelta(
                days=day,
                hours=py_rng.randint(0, 23),
                minutes=py_rng.randint(0, 59),
            )

            fills.append({
                'timestamp': fill_time.isoformat(),
                'symbol': symbol,
                'strategy': strategy,
                'side': side,
                'quantity': order_size / expected_price,
                'notional_usd': order_size,
                'expected_price': expected_price,
                'fill_price': fill_price,
                'slippage_bps': slippage,
                'fee_bps': fee,
                'total_cost_bps': total_cost,
                'alpha_bps': expected_alpha,
                'net_alpha_bps': net_alpha,
                'latency_ms': np_rng.lognormal(3, 0.5),
                'order_type': order_type,
                'venue': venue
            })

    return fills


# Load the fills and parse their timestamps in one step
fills_df = load_fill_data().assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

first_ts = fills_df['timestamp'].min()
last_ts = fills_df['timestamp'].max()

print(f"‚úÖ Loaded {len(fills_df):,} fill records")
print(f"   Period: {first_ts.date()} to {last_ts.date()}")
print(f"   Strategies: {fills_df['strategy'].nunique()}")
print(f"   Venues: {fills_df['venue'].nunique()}")
fills_df.head()

## 3. Cost Analysis by Strategy

In [None]:
def analyze_costs_by_strategy(df: pd.DataFrame) -> pd.DataFrame:
    """Analyze cost metrics grouped by strategy.

    Args:
        df: Fill records with ``strategy``, ``notional_usd``, ``slippage_bps``,
            ``fee_bps``, ``total_cost_bps``, ``alpha_bps`` and ``net_alpha_bps``
            columns.

    Returns:
        One row per strategy. Aggregate columns are named ``<column>_<stat>``
        and rounded to 2 decimals. ``cost_ratio`` is mean total cost divided by
        mean alpha; ``profitable_pct`` is the percentage of fills whose net
        alpha is positive, rounded to 1 decimal.
    """
    by_strategy = df.groupby('strategy')

    analysis = by_strategy.agg({
        'notional_usd': ['count', 'sum', 'mean'],
        'slippage_bps': ['mean', 'std', 'median'],
        'fee_bps': 'mean',
        'total_cost_bps': ['mean', 'std'],
        'alpha_bps': 'mean',
        'net_alpha_bps': ['mean', 'sum']
    })
    analysis.columns = ['_'.join(col).strip() for col in analysis.columns]

    # Derive the ratio from the unrounded means: dividing values that were
    # already rounded to 2 decimals distorts it, especially for small alphas.
    cost_ratio = analysis['total_cost_bps_mean'] / analysis['alpha_bps_mean']
    profitable_share = df['net_alpha_bps'].gt(0).groupby(df['strategy']).mean()

    analysis = analysis.round(2)
    analysis['cost_ratio'] = cost_ratio.round(2)
    analysis['profitable_pct'] = (profitable_share * 100).round(1)

    return analysis


strategy_analysis = analyze_costs_by_strategy(fills_df)

# Emit the banner and the table in one call; sep supplies the line breaks
print(
    "\n" + "=" * 80,
    "üìä COST ANALYSIS BY STRATEGY",
    "=" * 80,
    strategy_analysis.to_string(),
    sep="\n",
)

In [None]:
# Visualize strategy costs (2x2 panel: mean costs, alpha vs cost, net alpha spread, slippage by size)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

strategies = fills_df['strategy'].unique()
colors = plt.cm.Set2(np.linspace(0, 1, len(strategies)))

# 1. Average costs by strategy
ax1 = axes[0, 0]
# Rename the columns so pandas builds the legend from the bar containers.
# Passing bare labels to ax.legend() after axhline() can attach the first
# label to the baseline instead of the slippage bars.
cost_data = (
    fills_df.groupby('strategy')[['slippage_bps', 'fee_bps']]
    .mean()
    .rename(columns={'slippage_bps': 'Slippage', 'fee_bps': 'Fees'})
)
cost_data.plot(kind='bar', stacked=True, ax=ax1, color=['steelblue', 'coral'])
ax1.axhline(y=0, color='black', linewidth=0.5)
ax1.set_ylabel('Cost (bps)')
ax1.set_title('Average Execution Costs by Strategy')
ax1.tick_params(axis='x', rotation=45)

# 2. Alpha vs Cost scatter
ax2 = axes[0, 1]
for i, strat in enumerate(strategies):
    strat_data = fills_df[fills_df['strategy'] == strat]
    ax2.scatter(strat_data['total_cost_bps'], strat_data['alpha_bps'],
                alpha=0.3, label=strat, color=colors[i], s=20)

# Break-even line: points above it earn more alpha than they cost
ax2.plot([0, 15], [0, 15], 'r--', linewidth=2, label='Break-even')
ax2.set_xlabel('Total Cost (bps)')
ax2.set_ylabel('Alpha (bps)')
ax2.set_title('Alpha vs Cost per Fill')
ax2.legend(loc='upper left', fontsize=8)
ax2.set_xlim(0, 15)
ax2.set_ylim(-5, 20)

# 3. Net alpha distribution by strategy
ax3 = axes[1, 0]
fills_df.boxplot(column='net_alpha_bps', by='strategy', ax=ax3)
ax3.axhline(y=0, color='r', linestyle='--', linewidth=1)
ax3.set_ylabel('Net Alpha (bps)')
ax3.set_title('Net Alpha Distribution by Strategy')
# boxplot(by=...) adds its own figure-level title; clear it
plt.suptitle('')

# 4. Cost vs Order Size
ax4 = axes[1, 1]
# Bin by order size in a local Series so this plotting cell leaves fills_df
# untouched and can be re-run without side effects.
size_bucket = pd.cut(
    fills_df['notional_usd'],
    bins=[0, 10000, 25000, 50000, 100000, 500000],
    labels=['<$10K', '$10-25K', '$25-50K', '$50-100K', '>$100K'],
).rename('size_bucket')
# observed=False keeps empty buckets on the axis and makes the choice explicit
size_costs = fills_df.groupby(size_bucket, observed=False)['slippage_bps'].mean()
size_costs.plot(kind='bar', ax=ax4, color='teal')
ax4.set_ylabel('Avg Slippage (bps)')
ax4.set_xlabel('Order Size')
ax4.set_title('Slippage by Order Size')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('cost_analysis_by_strategy.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Charts saved to cost_analysis_by_strategy.png")

## 4. Cost Sensitivity Curves

In [None]:
def build_cost_curve(
    df: pd.DataFrame, 
    strategy: str,
    size_buckets: int = 20
) -> Tuple[np.ndarray, np.ndarray, float, float]:
    """
    Build empirical cost curve for a strategy.
    Returns (sizes, costs, break_even_size, optimal_size).

    Fills for ``strategy`` are split into up to ``size_buckets`` quantile bins
    of ``notional_usd`` and averaged per bin.

    Args:
        df: Fill records with ``strategy``, ``notional_usd``, ``slippage_bps``,
            ``fee_bps``, ``alpha_bps`` and ``net_alpha_bps`` columns.
        strategy: Strategy whose fills are used.
        size_buckets: Number of quantile bins requested; duplicate bin edges
            are dropped, so fewer bins may result.

    Returns:
        sizes: Mean notional of each bin, in ascending bin order.
        costs: Mean slippage plus mean fee of each bin, in bps.
        break_even_size: Size of the first bin whose mean net alpha is
            negative. When no bin is negative this is the largest bin size,
            i.e. no break-even was observed within the data.
        optimal_size: Size of the bin with the highest mean net alpha.

        With fewer than 10 fills, or when no bin has complete data, the arrays
        are empty and both sizes are 0.
    """
    strat_data = df[df['strategy'] == strategy].copy()

    if len(strat_data) < 10:
        return np.array([]), np.array([]), 0.0, 0.0

    # Bin by order size
    strat_data['size_bin'] = pd.qcut(strat_data['notional_usd'], q=size_buckets, duplicates='drop')

    # observed=True: aggregate only the bins that actually contain fills
    curve = strat_data.groupby('size_bin', observed=True).agg({
        'notional_usd': 'mean',
        'slippage_bps': 'mean',
        'fee_bps': 'mean',
        'alpha_bps': 'mean',
        'net_alpha_bps': 'mean'
    }).dropna()

    # dropna() can remove every bin (e.g. all-NaN costs); indexing or argmax on
    # the empty arrays below would raise, so report "no curve" instead.
    if curve.empty:
        return np.array([]), np.array([]), 0.0, 0.0

    sizes = curve['notional_usd'].values
    costs = (curve['slippage_bps'] + curve['fee_bps']).values
    net_alphas = curve['net_alpha_bps'].values

    # Find break-even size (where net alpha crosses zero)
    break_even_idx = np.where(net_alphas < 0)[0]
    break_even_size = sizes[break_even_idx[0]] if len(break_even_idx) > 0 else sizes[-1]

    # Find optimal size (maximum net alpha)
    optimal_size = sizes[np.argmax(net_alphas)]

    return sizes, costs, break_even_size, optimal_size


# One entry per strategy: the curve tuple is unpacked into named keys and the
# configured expected alpha is attached alongside it
_CURVE_FIELDS = ('sizes', 'costs', 'break_even_size', 'optimal_size')
cost_curves = {
    name: {
        **dict(zip(_CURVE_FIELDS, build_cost_curve(fills_df, name))),
        'expected_alpha': STRATEGY_ALPHA.get(name, 5.0),
    }
    for name in fills_df['strategy'].unique()
}

print("\n" + "=" * 60, "üìà COST SENSITIVITY CURVES", "=" * 60, sep="\n")
for name, info in cost_curves.items():
    print(
        f"\n{name.upper()}:\n"
        f"   Expected Alpha: {info['expected_alpha']:.1f} bps\n"
        f"   Break-even Size: ${info['break_even_size']:,.0f}\n"
        f"   Optimal Size: ${info['optimal_size']:,.0f}"
    )

In [None]:
# Plot cost sensitivity curves: one panel per strategy
strategies = list(cost_curves.keys())

# Size the grid from the strategy count so no strategy is silently left
# unplotted (a fixed 2x3 grid drops everything past the sixth).
n_cols = 3
n_rows = max(1, int(np.ceil(len(strategies) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, strategy in enumerate(strategies):
    ax = axes[idx]
    curve = cost_curves[strategy]

    if len(curve['sizes']) == 0:
        ax.text(0.5, 0.5, 'Insufficient data', ha='center', va='center')
        ax.set_title(strategy.upper())
        continue

    sizes_k = curve['sizes'] / 1000  # x axis is in $K
    costs = curve['costs']
    # Named expected_alpha (not alpha) to keep it distinct from matplotlib's
    # alpha= transparency keyword used below.
    expected_alpha = curve['expected_alpha']

    # Plot cost curve
    ax.plot(sizes_k, costs, 'b-', linewidth=2, label='Total Cost')

    # Plot alpha line
    ax.axhline(y=expected_alpha, color='g', linestyle='--', linewidth=2,
               label=f'Alpha ({expected_alpha:.1f} bps)')

    # Mark break-even
    ax.axvline(x=curve['break_even_size']/1000, color='r', linestyle=':',
               linewidth=2, label=f"Break-even (${curve['break_even_size']/1000:.0f}K)")

    # Mark optimal
    ax.axvline(x=curve['optimal_size']/1000, color='purple', linestyle=':',
               linewidth=2, label=f"Optimal (${curve['optimal_size']/1000:.0f}K)")

    # Shade the gap between cost and alpha: green where cost is below alpha
    ax.fill_between(sizes_k, costs, expected_alpha, where=(costs < expected_alpha),
                    alpha=0.3, color='green', label='Profitable')
    ax.fill_between(sizes_k, costs, expected_alpha, where=(costs >= expected_alpha),
                    alpha=0.3, color='red', label='Unprofitable')

    ax.set_xlabel('Order Size ($K)')
    ax.set_ylabel('Cost / Alpha (bps)')
    ax.set_title(f'{strategy.upper()} Cost Sensitivity')
    ax.legend(loc='upper left', fontsize=7)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, max(expected_alpha * 2, max(costs) * 1.2))

# Hide unused subplots
for unused_ax in axes[len(strategies):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('cost_sensitivity_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Cost sensitivity curves saved to cost_sensitivity_curves.png")

## 5. Break-Even Analysis

In [None]:
def calculate_break_even_bps(df: pd.DataFrame, strategy: str) -> Dict:
    """
    Summarise how far a strategy's average costs sit from its break-even point.

    The break-even total cost equals the configured expected alpha (5.0 bps
    when the strategy is not configured). The maximum tolerable slippage is
    that alpha minus the average fee. The margin is alpha minus average total
    cost, also expressed as a percentage of alpha (0 when alpha is not positive).
    """
    rows = df.loc[df['strategy'] == strategy]
    target_alpha = STRATEGY_ALPHA.get(strategy, 5.0)

    mean_cost = rows['total_cost_bps'].mean()
    mean_slippage = rows['slippage_bps'].mean()
    mean_fee = rows['fee_bps'].mean()

    # Headroom left after paying the average cost
    margin = target_alpha - mean_cost
    if target_alpha > 0:
        margin_pct = (margin / target_alpha) * 100
    else:
        margin_pct = 0

    return dict(
        strategy=strategy,
        expected_alpha_bps=target_alpha,
        avg_total_cost_bps=mean_cost,
        avg_slippage_bps=mean_slippage,
        avg_fee_bps=mean_fee,
        break_even_cost_bps=target_alpha,  # cost equals alpha at break-even
        max_slippage_bps=target_alpha - mean_fee,  # slippage budget with fees held fixed
        cost_margin_bps=margin,
        cost_margin_pct=margin_pct,
        is_profitable=margin > 0,
    )


# One break-even summary per strategy present in the fills
break_even_analysis = [
    calculate_break_even_bps(fills_df, strategy_name)
    for strategy_name in fills_df['strategy'].unique()
]
break_even_df = pd.DataFrame(break_even_analysis).round(2)

print(
    "\n" + "=" * 80,
    "üíπ BREAK-EVEN ANALYSIS",
    "=" * 80,
    break_even_df.to_string(index=False),
    sep="\n",
)

In [None]:
# Visualize break-even analysis: expected alpha next to average cost, per strategy
fig, ax = plt.subplots(figsize=(12, 6))

strategies = break_even_df['strategy'].values
x = np.arange(len(strategies))
width = 0.35
half_width = width / 2

# Paired bars: alpha on the left, cost on the right
bars1 = ax.bar(x - half_width, break_even_df['expected_alpha_bps'], width,
               label='Expected Alpha', color='green', alpha=0.7)
bars2 = ax.bar(x + half_width, break_even_df['avg_total_cost_bps'], width,
               label='Avg Total Cost', color='red', alpha=0.7)

# Zero baseline
ax.axhline(y=0, color='black', linewidth=0.5)

# Label each pair with its signed margin, just above the taller bar
for pos, record in enumerate(break_even_df.itertuples(index=False)):
    margin = record.cost_margin_bps
    label_y = max(record.expected_alpha_bps, record.avg_total_cost_bps) + 0.3
    ax.annotate(
        f'{margin:+.1f}',
        xy=(pos, label_y),
        ha='center',
        fontsize=10,
        color='green' if margin > 0 else 'red',
        fontweight='bold',
    )

ax.set_ylabel('Basis Points (bps)')
ax.set_title('Break-Even Analysis: Alpha vs Cost by Strategy')
ax.set_xticks(x)
ax.set_xticklabels([name.replace('_', '\n') for name in strategies])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('break_even_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Break-even analysis saved to break_even_analysis.png")

## 6. Venue Comparison

In [None]:
# Per-venue cost, latency and net-alpha statistics. Named aggregation yields
# the flat `<column>_<stat>` column names directly.
venue_analysis = fills_df.groupby('venue').agg(
    notional_usd_count=('notional_usd', 'count'),
    notional_usd_sum=('notional_usd', 'sum'),
    slippage_bps_mean=('slippage_bps', 'mean'),
    slippage_bps_median=('slippage_bps', 'median'),
    slippage_bps_std=('slippage_bps', 'std'),
    fee_bps_mean=('fee_bps', 'mean'),
    total_cost_bps_mean=('total_cost_bps', 'mean'),
    latency_ms_mean=('latency_ms', 'mean'),
    latency_ms_median=('latency_ms', 'median'),
    net_alpha_bps_mean=('net_alpha_bps', 'mean'),
).round(2)

print(
    "\n" + "=" * 80,
    "üè¶ VENUE COMPARISON",
    "=" * 80,
    venue_analysis.to_string(),
    sep="\n",
)

In [None]:
# Venue comparison visualization: costs, latency and net alpha side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes

venues = fills_df['venue'].unique()
per_venue = fills_df.groupby('venue')

# 1. Cost comparison (stacked slippage + fees)
venue_costs = per_venue[['slippage_bps', 'fee_bps']].mean()
venue_costs.plot(kind='bar', stacked=True, ax=ax1, color=['steelblue', 'coral'])
ax1.set_ylabel('Cost (bps)')
ax1.set_title('Execution Costs by Venue')
ax1.legend(['Slippage', 'Fees'])
ax1.tick_params(axis='x', rotation=45)

# 2. Latency comparison
fills_df.boxplot(column='latency_ms', by='venue', ax=ax2)
ax2.set_ylabel('Latency (ms)')
ax2.set_title('Latency by Venue')
# boxplot(by=...) adds a figure-level title; blank it out
plt.suptitle('')

# 3. Net alpha by venue (green when positive, red otherwise)
venue_alpha = per_venue['net_alpha_bps'].mean()
colors = np.where(venue_alpha.values > 0, 'green', 'red').tolist()
venue_alpha.plot(kind='bar', ax=ax3, color=colors)
ax3.axhline(y=0, color='black', linewidth=1)
ax3.set_ylabel('Avg Net Alpha (bps)')
ax3.set_title('Net Alpha by Venue')
ax3.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('venue_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Venue comparison saved to venue_comparison.png")

## 7. Recommendations & Summary

In [None]:
def generate_recommendations(
    break_even_df: pd.DataFrame,
    cost_curves: Dict,
    venue_analysis: pd.DataFrame
) -> List[str]:
    """Generate actionable recommendations from the analysis.

    Args:
        break_even_df: One row per strategy with ``strategy``,
            ``cost_margin_bps`` and ``cost_margin_pct`` columns.
        cost_curves: Mapping of strategy name to a dict holding at least
            ``break_even_size`` and ``optimal_size``.
        venue_analysis: Venue-indexed frame with a ``net_alpha_bps_mean`` column.

    Returns:
        Recommendation strings: one per strategy, then a venue preference
        (only when at least two venues can be compared), then a target order
        size (only when at least one strategy has a positive optimal size).
    """
    recommendations = []

    # 1. Strategy recommendations
    for _, row in break_even_df.iterrows():
        strategy = row['strategy']
        margin = row['cost_margin_bps']
        margin_pct = row['cost_margin_pct']

        if pd.isna(margin):
            # NaN fails both comparisons below and would otherwise fall
            # through to the "Healthy margin" branch.
            recommendations.append(
                f"{strategy.upper()}: Insufficient data to assess margin."
            )
        elif margin < 0:
            recommendations.append(
                f"üî¥ {strategy.upper()}: Unprofitable (margin={margin:.1f}bps). "
                f"Consider pausing or reducing position sizes."
            )
        elif margin_pct < 20:
            recommendations.append(
                f"üü° {strategy.upper()}: Thin margin ({margin_pct:.0f}%). "
                f"Focus on execution quality to preserve edge."
            )
        else:
            recommendations.append(
                f"üü¢ {strategy.upper()}: Healthy margin ({margin_pct:.0f}%). "
                f"Can increase size up to ${cost_curves[strategy]['break_even_size']/1000:.0f}K."
            )

    # 2. Venue recommendations
    venue_alpha = venue_analysis['net_alpha_bps_mean'].dropna()
    if not venue_alpha.empty:
        best_venue = venue_alpha.idxmax()
        worst_venue = venue_alpha.idxmin()
        # With a single venue best == worst; "prefer X, reduce X" would be contradictory.
        if best_venue != worst_venue:
            recommendations.append(
                f"\nüìç VENUE: Prefer {best_venue.upper()} (best net alpha), "
                f"reduce allocation to {worst_venue.upper()}."
            )

    # 3. Size recommendations
    optimal_sizes = [c['optimal_size'] for c in cost_curves.values() if c['optimal_size'] > 0]
    # Averaging an empty list yields NaN and a "$nanK" recommendation; skip it instead.
    if optimal_sizes:
        avg_optimal = np.mean(optimal_sizes)
        recommendations.append(
            f"\nüìè SIZE: Target order sizes around ${avg_optimal/1000:.0f}K for optimal cost/alpha balance."
        )

    return recommendations


recommendations = generate_recommendations(break_even_df, cost_curves, venue_analysis)

print("\n" + "=" * 80, "üìã RECOMMENDATIONS", "=" * 80, sep="\n")
# Each recommendation is preceded by a blank line
for rec in recommendations:
    print("\n" + rec)

In [None]:
# Export summary: assemble the JSON payload piece by piece, then write it out
fill_times = fills_df['timestamp']
period_info = {
    'start': fill_times.min().isoformat(),
    'end': fill_times.max().isoformat(),
    'total_fills': len(fills_df),
}
size_summary = {
    strategy: {
        'break_even_usd': float(curve['break_even_size']),
        'optimal_usd': float(curve['optimal_size']),
    }
    for strategy, curve in cost_curves.items()
}

summary = {
    'analysis_date': datetime.now().isoformat(),
    'period': period_info,
    'break_even_by_strategy': break_even_df.to_dict(orient='records'),
    'optimal_sizes': size_summary,
    'best_venue': venue_analysis['net_alpha_bps_mean'].idxmax(),
    'recommendations': recommendations,
}

# default=str stringifies any value the JSON encoder cannot handle natively
with open('cost_sensitivity_summary.json', 'w') as f:
    json.dump(summary, f, indent=2, default=str)

exported_files = [
    'cost_analysis_by_strategy.png',
    'cost_sensitivity_curves.png',
    'break_even_analysis.png',
    'venue_comparison.png',
    'cost_sensitivity_summary.json',
]

print("\n" + "=" * 80, "üìÅ EXPORTED FILES", "=" * 80, sep="\n")
print("\n" + "\n".join(f"‚Ä¢ {file_name}" for file_name in exported_files))

print("\n‚úÖ Cost sensitivity analysis complete!")