# Symbol Stickiness and Selection Drivers

This notebook quantifies portfolio concentration and selection persistence to explain why the same symbols keep being selected week after week.

Primary outputs:
- Per-symbol stickiness metrics
- Week-to-week retention diagnostics
- Concentration trends (top-name share / HHI proxy)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from IPython.display import display

# Show every column and let pandas pick the display width, so wide metric
# tables are not truncated when displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Default look for all charts in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

from QuantConnect import *
from QuantConnect.Research import QuantBook

# Research-environment handle; read_csv_from_store uses qb.ObjectStore to
# fetch the exported CSV files.
qb = QuantBook()
print('QuantBook initialized')


def read_csv_from_store(key):
    """Load a CSV saved in the QuantBook ObjectStore into a DataFrame.

    Args:
        key: ObjectStore key of the CSV file.

    Returns:
        The parsed DataFrame, or None when the key is absent, its content is
        empty, or reading/parsing raises. A short diagnostic is printed in
        each of those cases instead of propagating the error.
    """
    try:
        store = qb.ObjectStore
        if store.ContainsKey(key):
            raw_text = store.Read(key)
            if raw_text:
                return pd.read_csv(StringIO(raw_text))
            print(f'Empty ObjectStore key: {key}')
        else:
            print(f'ObjectStore key not found: {key}')
    except Exception as e:
        # Best effort by design: the caller decides what a missing table means.
        print(f'Error reading {key}: {e}')
    return None


In [None]:
# Fixed symbol universe. The week x symbol panel is built from this list, so a
# symbol gets a row for every week even when it has no signal or target.
# NOTE(review): looks like the 30 Dow Jones Industrial Average constituents — confirm.
UNIVERSE_SYMBOLS = [
    'AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CRM',
    'CSCO', 'CVX', 'DIS', 'DOW', 'GS', 'HD',
    'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO',
    'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG',
    'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT'
]

# k controls how much tanh saturation is reduced (larger k => more separation)
# It is applied further down as tanh(composite_score / k).
score_scale_k = 2.0

# Load the exported signal and target tables; every diagnostic below needs both.
df_signals = read_csv_from_store('wolfpack/signals.csv')
df_targets = read_csv_from_store('wolfpack/targets.csv')

if any(frame is None for frame in (df_signals, df_targets)):
    raise ValueError('signals.csv and targets.csv are required.')

# Normalise dtypes: parse the date and coerce numeric columns that are present
# (unparseable values become NaN).
df_signals['date'] = pd.to_datetime(df_signals['date'])
numeric_signal_cols = ('magnitude', 'price', 'sma_short', 'sma_medium', 'sma_long', 'atr')
for col in numeric_signal_cols:
    if col not in df_signals.columns:
        continue
    df_signals[col] = pd.to_numeric(df_signals[col], errors='coerce')

signal_magnitude = df_signals['magnitude']
df_signals['abs_magnitude'] = signal_magnitude.abs()
df_signals['signal_sign'] = np.sign(signal_magnitude)
# The signal date is used directly as the week key.
# NOTE(review): presumably one signal row per symbol per week — confirm.
df_signals['week_id'] = df_signals['date']

needed_for_composite = {'price', 'sma_short', 'sma_medium', 'sma_long', 'atr'}
if not needed_for_composite.issubset(df_signals.columns):
    # Fallback: invert tanh on the stored magnitude. Clipping keeps arctanh
    # finite when the magnitude sits at +/-1.
    clipped_mag = df_signals['magnitude'].clip(-0.999999, 0.999999)
    df_signals['composite_score'] = np.arctanh(clipped_mag)
    print('Warning: price/SMA/ATR columns missing; composite_score recovered from arctanh(magnitude).')
else:
    # ATR-normalised distance of price from each moving average; a zero ATR is
    # mapped to NaN so the division cannot produce +/-inf.
    safe_atr = df_signals['atr'].replace(0, np.nan)
    for horizon in ('short', 'medium', 'long'):
        df_signals[f'dist_{horizon}'] = (
            df_signals['price'] - df_signals[f'sma_{horizon}']
        ) / safe_atr
    df_signals['composite_score'] = (
        0.5 * df_signals['dist_short'] +
        0.3 * df_signals['dist_medium'] +
        0.2 * df_signals['dist_long']
    )

# Unsaturated strength plus a softer tanh squashing controlled by score_scale_k.
composite = df_signals['composite_score']
df_signals['abs_composite_score'] = composite.abs()
df_signals['scaled_magnitude_k'] = np.tanh(composite / score_scale_k)
df_signals['abs_scaled_magnitude_k'] = df_signals['scaled_magnitude_k'].abs()

# Resolve the week key for targets: use an explicit week_id column when the
# export has one (unparseable values become NaT and are dropped below),
# otherwise fall back to the row date.
if 'week_id' in df_targets.columns:
    df_targets['week_id'] = pd.to_datetime(df_targets['week_id'], errors='coerce')
else:
    df_targets['week_id'] = pd.to_datetime(df_targets['date'])

# Weight columns: coerce to numeric and treat missing/unparseable as zero weight.
for col in ['weekly_target_w', 'actual_w']:
    if col in df_targets.columns:
        df_targets[col] = pd.to_numeric(df_targets[col], errors='coerce').fillna(0.0)

# Order rows by date so that 'last' picks the latest target within each week.
# An export that carries week_id but no date column is kept in file order
# instead of failing with a KeyError on the sort.
dated_targets = df_targets[df_targets['week_id'].notna()]
if 'date' in dated_targets.columns:
    dated_targets = dated_targets.sort_values('date')

# One row per (week, symbol) holding the last target weight seen that week.
weekly_targets = (
    dated_targets
      .groupby(['week_id', 'symbol'], as_index=False)
      .agg(weekly_target_w=('weekly_target_w', 'last'))
)
# A symbol counts as selected when its target weight is meaningfully non-zero.
weekly_targets['selected'] = weekly_targets['weekly_target_w'].abs() > 1e-6

# Per-week signal columns used by the diagnostics; rows without a week key are
# dropped.
weekly_signals = (
    df_signals[
        [
            'week_id', 'symbol', 'magnitude', 'abs_magnitude', 'signal_sign',
            'composite_score', 'abs_composite_score',
            'scaled_magnitude_k', 'abs_scaled_magnitude_k'
        ]
    ]
    .dropna(subset=['week_id'])
    .copy()
)

# Dense week x symbol grid over every week seen in either table. Left-merging
# onto the grid means symbols outside UNIVERSE_SYMBOLS are not carried into the
# panel, and (week, symbol) pairs with no data keep NaN signal columns.
week_ids = sorted(set(weekly_targets['week_id']).union(set(weekly_signals['week_id'])))
panel = pd.MultiIndex.from_product([week_ids, UNIVERSE_SYMBOLS], names=['week_id', 'symbol']).to_frame(index=False)
panel = panel.merge(weekly_signals, on=['week_id', 'symbol'], how='left')
panel = panel.merge(weekly_targets[['week_id', 'symbol', 'weekly_target_w', 'selected']], on=['week_id', 'symbol'], how='left')

panel['has_signal'] = panel['abs_magnitude'].notna()
# After the left merge 'selected' can be an object column of True/False/NaN.
# Comparing with True yields a genuine boolean column (NaN -> False) without
# relying on fillna's deprecated object downcasting, so boolean masks and `~`
# on this column behave as expected downstream.
panel['selected'] = panel['selected'].eq(True)
panel['weekly_target_w'] = panel['weekly_target_w'].fillna(0.0)


In [None]:
# Per-symbol stickiness metrics, declared as named aggregations
# (output column -> (source column, reducer)).
metric_spec = {
    'weeks_observed': ('week_id', 'count'),
    'weeks_signaled': ('has_signal', 'sum'),
    'weeks_selected': ('selected', 'sum'),
}
# Mean and median of each signal-strength measure, in display order.
for strength_col in ('abs_magnitude', 'abs_scaled_magnitude_k', 'abs_composite_score'):
    for stat in ('mean', 'median'):
        metric_spec[f'{stat}_{strength_col}'] = (strength_col, stat)
# Average absolute target weight over the weeks in which the weight is non-zero.
metric_spec['avg_abs_target_when_selected'] = (
    'weekly_target_w',
    lambda w: w.abs().loc[lambda a: a > 1e-6].mean(),
)

symbol_metrics = panel.groupby('symbol', as_index=False).agg(**metric_spec)

# Selection rate conditional on signal existing
def conditional_selection_rate(g):
    """Return the share of signaled rows in *g* that were also selected.

    Args:
        g: DataFrame with boolean-like 'has_signal' and 'selected' columns.

    Returns:
        Mean of 'selected' over the rows where 'has_signal' is true, or NaN
        when no row has a signal.
    """
    signaled_rows = g.loc[g['has_signal']]
    return signaled_rows['selected'].mean() if len(signaled_rows) else np.nan

# Selection rate per symbol, conditional on a signal existing. Only the two
# columns the helper reads are passed to apply: applying over the whole frame
# also operates on the grouping column, which pandas has deprecated.
rates = (
    panel.groupby('symbol')[['has_signal', 'selected']]
         .apply(conditional_selection_rate)
         .rename('selection_rate_when_signaled')
         .reset_index()
)

# Direction consistency: |mean sign of the signal| per symbol. A value near 1
# means the sign is almost always the same; near 0 means it is mixed.
direction_consistency = (
    weekly_signals.groupby('symbol', as_index=False)['signal_sign']
                  .mean()
                  .rename(columns={'signal_sign': 'mean_signal_sign'})
)
direction_consistency['direction_consistency_abs'] = direction_consistency['mean_signal_sign'].abs()

symbol_metrics = symbol_metrics.merge(rates, on='symbol', how='left')
symbol_metrics = symbol_metrics.merge(direction_consistency[['symbol', 'direction_consistency_abs']], on='symbol', how='left')

# Max consecutive selected-week streak
# Rows must be in week order within each symbol for the streak scan.
panel_sorted = panel.sort_values(['symbol', 'week_id']).copy()

def max_streak(selected_values):
    """Return the length of the longest run of consecutive truthy values.

    Args:
        selected_values: Iterable of boolean-like flags in chronological order.

    Returns:
        The longest streak length as an int; 0 when the iterable is empty or
        contains no truthy value.
    """
    longest = current = 0
    for flag in selected_values:
        # A falsy value breaks the run; a truthy one extends it.
        current = current + 1 if flag else 0
        if current > longest:
            longest = current
    return longest

# Longest run of consecutive selected weeks per symbol (rows are already in
# week order within each symbol).
streaks = (
    panel_sorted.groupby('symbol')['selected']
               .apply(max_streak)
               .reset_index(name='max_selected_streak')
)

symbol_metrics = symbol_metrics.merge(streaks, on='symbol', how='left')
# Most frequently selected first; ties broken by the conditional selection rate.
symbol_metrics = symbol_metrics.sort_values(
    by=['weeks_selected', 'selection_rate_when_signaled'],
    ascending=False,
)

display(symbol_metrics)


In [None]:
# Bar chart of the 15 symbols at the head of symbol_metrics (its current row
# order decides which ones appear).
top = symbol_metrics.head(15)

bar_fig, bar_ax = plt.subplots(figsize=(14, 6))
bar_ax.bar(top['symbol'], top['weeks_selected'], color='#1f77b4')
bar_ax.set_title('Top Symbols by Weeks Selected')
bar_ax.set_ylabel('Weeks selected')
plt.xticks(rotation=35)
bar_ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Signal strength vs selection probability, shown for three strength measures
# that share the same y axis, bubble size and colour encoding.
y = 100 * symbol_metrics['selection_rate_when_signaled']
size = 25 + 18 * symbol_metrics['weeks_selected'].fillna(0)
color = symbol_metrics['direction_consistency_abs'].fillna(0)

fig, axes = plt.subplots(1, 3, figsize=(22, 6), sharey=True)

# (x column, x label, panel title) for each of the three axes, left to right.
scatter_specs = [
    ('median_abs_magnitude', 'Median abs tanh(score)', 'Saturated'),
    (
        'median_abs_scaled_magnitude_k',
        f'Median abs tanh(score/{score_scale_k:g})',
        'Reduced Saturation',
    ),
    (
        'median_abs_composite_score',
        'Median abs composite_score (unsaturated)',
        'Unsaturated',
    ),
]
scatter_handles = []
for ax, (x_column, x_label, panel_title) in zip(axes, scatter_specs):
    handle = ax.scatter(
        symbol_metrics[x_column],
        y,
        s=size,
        c=color,
        cmap='viridis',
        alpha=0.85
    )
    scatter_handles.append(handle)
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    ax.grid(alpha=0.3)

# The first scatter provides the colour mapping for the shared colorbar.
sc0 = scatter_handles[0]

# Label the first 12 symbols on the unsaturated panel only.
for _, row in symbol_metrics.head(12).iterrows():
    axes[2].annotate(
        row['symbol'],
        (row['median_abs_composite_score'], 100 * row['selection_rate_when_signaled']),
        fontsize=8
    )

axes[0].set_ylabel('Selection rate when signaled (%)')
fig.suptitle('Signal Strength vs Selection Probability (bubble size = weeks selected)', y=1.02)
fig.colorbar(sc0, ax=axes, label='Direction consistency (abs mean sign)')
# NOTE(review): tight_layout may not account for a colorbar attached to several
# axes — consider constrained layout if the colorbar overlaps; confirm visually.
plt.tight_layout()
plt.show()

# Top 20 symbols by unsaturated median signal strength, with their selection stats.
strength_columns = [
    'symbol',
    'median_abs_magnitude',
    'median_abs_scaled_magnitude_k',
    'median_abs_composite_score',
    'selection_rate_when_signaled',
    'weeks_selected'
]
strength_table = symbol_metrics[strength_columns].sort_values(
    'median_abs_composite_score', ascending=False
)
display(strength_table.head(20))


In [None]:
# Weekly concentration diagnostics
# One absolute target weight per (week, symbol).
weekly_abs_weights = (
    panel.groupby(['week_id', 'symbol'], as_index=False)['weekly_target_w']
         .last()
         .assign(abs_w=lambda frame: frame['weekly_target_w'].abs())
)

concentration_rows = []
for week, week_weights in weekly_abs_weights.groupby('week_id'):
    abs_w = week_weights['abs_w']
    gross = abs_w.sum()
    # Default row for a week with no gross exposure: shares are undefined.
    row = {
        'week_id': week,
        'gross_target': 0.0,
        'top_5_share': np.nan,
        'top_10_share': np.nan,
        'hhi_proxy': np.nan
    }
    if not (gross <= 0):
        # Shares of gross absolute weight; the HHI proxy is the sum of squared shares.
        row.update(
            gross_target=gross,
            top_5_share=abs_w.nlargest(5).sum() / gross,
            top_10_share=abs_w.nlargest(10).sum() / gross,
            hhi_proxy=((abs_w / gross) ** 2).sum(),
        )
    concentration_rows.append(row)

concentration = pd.DataFrame(concentration_rows).sort_values('week_id')

display(concentration.tail(20))

# Two stacked charts on a shared week axis: top-N shares above, HHI proxy below.
fig, axes = plt.subplots(2, 1, figsize=(14, 9), sharex=True)
share_ax, hhi_ax = axes
week_axis = concentration['week_id']

# (column, legend label, line colour) for each share series, in draw order.
share_series = (
    ('top_5_share', 'Top 5 share %', '#ff7f0e'),
    ('top_10_share', 'Top 10 share %', '#1f77b4'),
)
for share_column, share_label, line_color in share_series:
    share_ax.plot(week_axis, 100 * concentration[share_column], label=share_label, color=line_color)
share_ax.set_ylabel('Share of gross target (%)')
share_ax.set_title('Target Concentration Over Time')
share_ax.legend()
share_ax.grid(alpha=0.3)

hhi_ax.plot(week_axis, concentration['hhi_proxy'], color='#2ca02c')
hhi_ax.set_ylabel('HHI proxy')
hhi_ax.set_xlabel('Week')
hhi_ax.set_title('Concentration HHI Proxy (higher = stickier universe)')
hhi_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Week-over-week retention diagnostics
# Build the set of selected symbols for EVERY week in the panel. Weeks in which
# nothing was selected are kept as empty sets, so each row is compared with the
# immediately preceding panel week; dropping them would silently compare against
# the last week that happened to hold selections and misstate retention,
# entered and exited counts around those gaps.
selected_mask = panel['selected'].astype(bool)
selected_sets = (
    panel['symbol'].where(selected_mask)
         .groupby(panel['week_id'])
         .apply(lambda names: set(names.dropna()))
         .sort_index()
)

rows = []
weeks = list(selected_sets.index)
prev_set = set()  # no prior week for the first row, so everything "entered"
for week, current_set in zip(weeks, selected_sets):
    stayed = sorted(current_set & prev_set)
    entered = sorted(current_set - prev_set)
    exited = sorted(prev_set - current_set)
    rows.append({
        'week_id': week,
        'selected_count': len(current_set),
        'stayed_count': len(stayed),
        'entered_count': len(entered),
        'exited_count': len(exited),
        # Undefined (NaN) when the previous week had no selections.
        'retention_from_prev_week': len(stayed) / len(prev_set) if prev_set else np.nan,
        'entered_symbols': ', '.join(entered),
        'exited_symbols': ', '.join(exited)
    })
    prev_set = current_set

retention_report = pd.DataFrame(rows)
display(retention_report.tail(20))

# Snapshot of the most recent panel week: which symbols had the strongest
# signals but were still not selected.
latest_week = panel['week_id'].max()
latest = panel[panel['week_id'] == latest_week].copy()
# Symbols without a signal rank last rather than dropping out of the sort.
latest['abs_magnitude'] = latest['abs_magnitude'].fillna(0.0)
# Cast before inverting: on an object-dtype column `~` is applied element-wise
# to Python bools (giving -1/-2) instead of producing a boolean mask.
not_selected = ~latest['selected'].astype(bool)
latest_non_selected = latest[not_selected].sort_values('abs_magnitude', ascending=False)

print(f'Latest week: {latest_week.date()}')
print('Top non-selected symbols by abs signal magnitude:')
display(latest_non_selected.head(10)[['symbol', 'abs_magnitude', 'weekly_target_w', 'has_signal']])
