# Signal Profitability Stats

This notebook measures how predictive your logged signals are using forward returns.

Questions it answers:
- Do strong/moderate/weak signals have different realized edge?
- Which horizons (1d / 3d / 5d) carry the best signal quality?
- Which symbols and magnitude buckets are most/least profitable?


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from IPython.display import display

# Remove the column cap and the fixed display width so wide frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Plot defaults: seaborn 'whitegrid' style first, then a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# NOTE(review): a star import can rebind any name defined above this line, so the
# position of this statement is significant — TODO confirm which names it exports.
from QuantConnect import *
from QuantConnect.Research import QuantBook

# Notebook-wide QuantBook handle; read_csv_from_store reads qb.ObjectStore.
qb = QuantBook()
print('QuantBook initialized')


def read_csv_from_store(key):
    """Load a CSV stored under ``key`` in the QuantBook ObjectStore.

    Parameters
    ----------
    key : str
        ObjectStore key to read, e.g. ``'wolfpack/signals.csv'``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV, or ``None`` when the key is absent, its content is
        empty, or any exception is raised while reading or parsing. In each
        failure case a one-line message is printed instead of raising, so the
        caller decides whether the missing file is fatal.
    """
    try:
        store = qb.ObjectStore
        if store.ContainsKey(key):
            raw_text = store.Read(key)
            if raw_text:
                return pd.read_csv(StringIO(raw_text))
            print(f'Empty ObjectStore key: {key}')
        else:
            print(f'ObjectStore key not found: {key}')
    except Exception as err:
        # Best-effort load: report the problem and fall through to None.
        print(f'Error reading {key}: {err}')
    return None


In [None]:
# Fetch both logs up front (signals first, then positions). Either name is None
# when read_csv_from_store could not produce a frame for its key.
df_signals, df_positions = (
    read_csv_from_store(store_key)
    for store_key in ('wolfpack/signals.csv', 'wolfpack/positions.csv')
)

# The signal log is mandatory for everything below.
if df_signals is None:
    raise ValueError('signals.csv is required. Run a backtest with signal logging enabled.')

# Fail fast, listing every absent column in the order it is required.
required_cols = ['date', 'symbol', 'direction', 'magnitude', 'price']
available = set(df_signals.columns)
missing = [name for name in required_cols if name not in available]
if len(missing) > 0:
    raise ValueError(f'signals.csv missing required columns: {missing}')

# Work on a copy so the frame loaded from the ObjectStore stays untouched.
signals = df_signals.copy()
signals['date'] = pd.to_datetime(signals['date'])
for col in ['magnitude', 'price']:
    # Unparseable values become NaN instead of raising.
    signals[col] = pd.to_numeric(signals[col], errors='coerce')

# Normalise the direction label, then map it to a sign. Only explicit 'Up' / 'Down'
# labels receive a sign; any other label (for example 'Flat', blanks, or a missing
# value, which astype(str) turns into 'Nan') maps to NaN. Those rows then produce NaN
# signed returns and drop out of the statistics instead of being scored as shorts.
signals['direction'] = signals['direction'].astype(str).str.strip().str.title()
signals['direction_sign'] = signals['direction'].map({'Up': 1.0, 'Down': -1.0}).astype(float)
unsigned_rows = int(signals['direction_sign'].isna().sum())
if unsigned_rows:
    print(f"signals without an Up/Down direction (excluded from signed returns): {unsigned_rows:,}")

# Tier by absolute magnitude: >= 0.7 strong, >= 0.3 moderate, otherwise weak.
signals['abs_magnitude'] = signals['magnitude'].abs()
tier_labels = np.select(
    [signals['abs_magnitude'] >= 0.7, signals['abs_magnitude'] >= 0.3],
    ['strong', 'moderate'],
    default='weak'
)
# Comparisons against NaN are False, so a missing magnitude would fall through to
# 'weak'; blank the tier for those rows so they are not counted as weak signals.
signals['tier'] = pd.Series(tier_labels, index=signals.index, dtype=object).where(
    signals['abs_magnitude'].notna()
)

# Price observations come from the signal log and, when usable, the position log.
price_cols = ['date', 'symbol', 'price']
price_parts = [signals[price_cols].assign(source='signals', source_priority=1)]

positions_usable = df_positions is not None and set(price_cols).issubset(df_positions.columns)
if positions_usable:
    pos_px = df_positions[price_cols].assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'),
    )
    price_parts.append(pos_px.assign(source='positions', source_priority=0))

# One price per (symbol, date). Sorting on source_priority puts position rows (0)
# ahead of signal rows (1), so keep='first' prefers the position price when both exist.
prices = (
    pd.concat(price_parts, ignore_index=True)
      .dropna(subset=['date', 'symbol', 'price'])
      .sort_values(['symbol', 'date', 'source_priority'])
      .drop_duplicates(['symbol', 'date'], keep='first')
      .sort_values(['symbol', 'date'])
)

horizons = [1, 3, 5]

# Forward price / return per symbol. shift(-h) steps h ROWS ahead within a symbol's
# rows of `prices`, so an "h-day" horizon is h trading days only when the symbol has
# one price row for every trading day.
for h in horizons:
    fwd_col, ret_col = f'fwd_price_{h}d', f'fwd_ret_{h}d'
    prices[fwd_col] = prices.groupby('symbol')['price'].shift(-h)
    prices[ret_col] = prices[fwd_col].div(prices['price']).sub(1.0)

# Attach the forward returns to every signal row; signals without a matching
# (date, symbol) price row keep NaN returns because this is a left join.
ret_cols = [f'fwd_ret_{h}d' for h in horizons]
merge_cols = ['date', 'symbol', *ret_cols]
df = pd.merge(signals, prices[merge_cols], how='left', on=['date', 'symbol'])

# Signed return: positive when the price moved in the signalled direction.
for h in horizons:
    df[f'signed_ret_{h}d'] = df[f'fwd_ret_{h}d'].mul(df['direction_sign'])

# Share of signal rows that have a usable signed return at each horizon.
coverage = {}
for h in horizons:
    has_return = df[f'signed_ret_{h}d'].notna()
    coverage[f'{h}d_coverage'] = float(has_return.mean())

formatted_coverage = {label: f'{share:.1%}' for label, share in coverage.items()}
print(f"signal rows: {len(df):,}")
print('forward-return coverage:', formatted_coverage)
display(df.head())


In [None]:
tier_order = ['strong', 'moderate', 'weak']
plot_horizons = [1, 3, 5]

fig, axes = plt.subplots(1, 3, figsize=(20, 5))

# Panel 1: distribution of signed 1D returns for each tier.
sns.boxplot(data=df, x='tier', y='signed_ret_1d', order=tier_order, ax=axes[0])
axes[0].axhline(0, color='black', linewidth=1)
axes[0].set_title('Signed 1D Return by Tier')
axes[0].set_xlabel('Tier')
axes[0].set_ylabel('Signed return')
axes[0].grid(alpha=0.3)

# Panel 2: mean signed return per tier across horizons.
means = (
    df.groupby('tier')[[f'signed_ret_{h}d' for h in plot_horizons]]
      .mean()
      .reindex(tier_order)
)
for tier in tier_order:
    tier_means = means.loc[tier]
    # reindex() adds an all-NaN row for a tier with no data, so an index-membership
    # test can never skip it; check the values so empty tiers get no legend entry.
    if tier_means.isna().all():
        continue
    axes[1].plot(plot_horizons, tier_means.values, marker='o', label=tier)
axes[1].axhline(0, color='black', linewidth=1)
axes[1].set_title('Average Signed Return by Horizon')
axes[1].set_xlabel('Horizon (days)')
axes[1].set_ylabel('Mean signed return')
# Only draw a legend when at least one tier line was plotted.
if axes[1].get_legend_handles_labels()[0]:
    axes[1].legend()
axes[1].grid(alpha=0.3)

# Panel 3: distribution of signed 5D returns over all signals.
sns.histplot(df['signed_ret_5d'].dropna(), bins=35, kde=True, ax=axes[2], color='#ff7f0e')
axes[2].axvline(0, color='black', linewidth=1)
axes[2].set_title('Signed 5D Return Distribution')
axes[2].set_xlabel('Signed 5D return')
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
def _edge_row(tier, h, returns):
    """Summarise one tier/horizon sample of signed returns (NaNs already removed).

    Returns a dict with the sample size, hit rate (share of returns > 0), mean,
    median and a one-sample t-statistic of the mean against zero. Statistics that
    cannot be computed for the sample size are NaN.
    """
    n = len(returns)
    if n:
        hit, avg, med = (returns > 0).mean(), returns.mean(), returns.median()
    else:
        hit = avg = med = np.nan
    std = returns.std(ddof=1) if n > 1 else np.nan
    # std is NaN when n <= 1 and NaN > 0 is False, so this also guards small samples.
    if n > 1 and std > 0:
        t_stat = avg / (std / np.sqrt(n))
    else:
        t_stat = np.nan
    return {
        'tier': tier,
        'horizon_days': h,
        'signals': n,
        'hit_rate': hit,
        'avg_signed_return': avg,
        'median_signed_return': med,
        't_stat': t_stat,
    }


# One row per (tier, horizon) pair.
records = [
    _edge_row(tier, h, grp[f'signed_ret_{h}d'].dropna())
    for tier, grp in df.groupby('tier')
    for h in [1, 3, 5]
]

# Order by horizon then tier, and express the hit rate in percent.
edge_by_tier = (
    pd.DataFrame(records)
      .sort_values(['horizon_days', 'tier'])
      .assign(hit_rate=lambda table: 100 * table['hit_rate'])
)

def _hit_rate(returns):
    """Share of NON-MISSING returns that are strictly positive.

    Missing forward returns are dropped first so the denominator matches the
    'count'-based `signals` column. Comparing NaN > 0 yields False, which would
    otherwise count every missing return as a miss and understate the hit rate.
    Returns NaN when no return is available.
    """
    valid = returns.dropna()
    return (valid > 0).mean() if len(valid) else np.nan


# Quantile buckets of |magnitude|: at most 5, fewer when there are fewer distinct values.
bucket_n = min(5, int(df['abs_magnitude'].nunique()))
if bucket_n >= 2:
    # duplicates='drop' merges repeated quantile edges, so fewer buckets may result.
    df['magnitude_bucket'] = pd.qcut(df['abs_magnitude'], q=bucket_n, duplicates='drop')
    bucket_summary = (
        # observed=True: report only buckets that contain rows, and do not rely on the
        # deprecated implicit default for categorical group keys.
        df.groupby('magnitude_bucket', as_index=False, observed=True)
          .agg(
              signals=('signed_ret_5d', 'count'),
              hit_rate_5d=('signed_ret_5d', _hit_rate),
              avg_signed_ret_5d=('signed_ret_5d', 'mean'),
              median_signed_ret_5d=('signed_ret_5d', 'median')
          )
          .sort_values('magnitude_bucket')
    )
    bucket_summary['hit_rate_5d'] = 100 * bucket_summary['hit_rate_5d']
else:
    # Not enough distinct magnitudes to form buckets.
    bucket_summary = pd.DataFrame()

# Per-symbol 5D edge, restricted to symbols with at least 5 scored signals.
symbol_edge = (
    df.groupby('symbol', as_index=False)
      .agg(
          signals=('signed_ret_5d', 'count'),
          hit_rate_5d=('signed_ret_5d', _hit_rate),
          avg_signed_ret_5d=('signed_ret_5d', 'mean')
      )
)
symbol_edge = symbol_edge[symbol_edge['signals'] >= 5].sort_values('avg_signed_ret_5d', ascending=False)
symbol_edge['hit_rate_5d'] = 100 * symbol_edge['hit_rate_5d']

# (heading, table, show?) in display order; the bucket table is skipped when empty.
report_sections = [
    ('Edge by tier and horizon', edge_by_tier, True),
    ('Edge by |magnitude| bucket (5D signed return)', bucket_summary, not bucket_summary.empty),
    ('Top symbols by 5D signal edge (min 5 observations)', symbol_edge.iloc[:10], True),
    (
        'Bottom symbols by 5D signal edge (min 5 observations)',
        # Last ten rows of the descending ranking, re-sorted worst first.
        symbol_edge.iloc[-10:].sort_values('avg_signed_ret_5d'),
        True,
    ),
]

for heading, table, should_show in report_sections:
    if should_show:
        print(heading)
        display(table)


In [None]:
# Mean signed 5D return per signal date, using only rows that have a return.
scored = df[df['signed_ret_5d'].notna()]
daily_edge = (
    scored.groupby('date')['signed_ret_5d']
          .mean()
          .rename('mean_signed_ret_5d')
          .reset_index()
          .sort_values('date')
)

if len(daily_edge) > 0:
    # Compound the per-date means into a single curve. NOTE(review): each point is a
    # multi-row forward return, so successive points presumably overlap in time; the
    # curve is labelled illustrative rather than a tradable equity line.
    growth = daily_edge['mean_signed_ret_5d'].add(1).cumprod()
    daily_edge['cum_signal_edge'] = growth.sub(1)

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(daily_edge['date'], daily_edge['cum_signal_edge'].mul(100), color='#2ca02c', linewidth=2)
    ax.axhline(0, color='black', linewidth=1)
    ax.set_title('Cumulative Mean 5D Signed Signal Return (Illustrative)')
    ax.set_ylabel('Cumulative return (%)')
    ax.set_xlabel('Date')
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
