# Concentration Risk

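Concentration diagnostics for the live portfolio weights, read from the ObjectStore key `wolfpack/positions.csv`. If that file has no `weight` column, weights are derived as `market_value / nav` using `wolfpack/daily_snapshots.csv`.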

What this notebook covers:
- Daily top-name and top-bucket concentration (`top_1`, `top_3`, `top_5`, `top_10`)
- Herfindahl-Hirschman concentration index (HHI) and effective number of bets
- Long/short concentration split
- Latest-day concentration leaderboard
- Threshold breach scorecard for the top-bucket shares and HHI


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from IPython.display import display

# Lift pandas' column-count cap and fixed display width for frames shown below.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Plot defaults: seaborn 'whitegrid' style and a 14x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# QuantConnect research environment imports.
from QuantConnect import *
from QuantConnect.Research import QuantBook

# `qb` is the module-level QuantBook whose ObjectStore is read by
# read_csv_from_store.
qb = QuantBook()
print('QuantBook initialized')


def read_csv_from_store(key):
    """Load a CSV stored under ``key`` in the QuantBook ObjectStore.

    Best-effort read: every failure mode is reported with ``print`` and
    signalled by returning ``None`` instead of raising.

    Args:
        key: ObjectStore key of the CSV text to read.

    Returns:
        A ``pandas.DataFrame`` parsed from the stored text, or ``None`` when
        the key is missing, the stored content is empty, or any exception is
        raised while reading or parsing.
    """
    try:
        if qb.ObjectStore.ContainsKey(key):
            raw_text = qb.ObjectStore.Read(key)
            if raw_text:
                return pd.read_csv(StringIO(raw_text))
            print(f'Empty ObjectStore key: {key}')
        else:
            print(f'ObjectStore key not found: {key}')
    except Exception as exc:
        # Deliberately broad: a failed read must not abort the notebook here;
        # callers decide what to do with the None result.
        print(f'Error reading {key}: {exc}')
    return None


In [None]:
# Load the position history (required) and the daily snapshots (optional; only
# needed to derive weights from market values when no weight column exists).
df_positions = read_csv_from_store('wolfpack/positions.csv')
df_snapshots = read_csv_from_store('wolfpack/daily_snapshots.csv')

if df_positions is None:
    raise ValueError('positions.csv is required for concentration analysis.')

if 'date' not in df_positions.columns or 'symbol' not in df_positions.columns:
    raise ValueError('positions.csv must include date and symbol columns.')

# Unparseable dates become NaT and those rows are dropped.
df_positions['date'] = pd.to_datetime(df_positions['date'], errors='coerce')
df_positions = df_positions[df_positions['date'].notna()].copy()

if 'weight' not in df_positions.columns:
    # Weights can only be derived when positions carry market_value and the
    # snapshots provide both the join key (date) and nav.
    can_derive_weight = (
        'market_value' in df_positions.columns
        and df_snapshots is not None
        and {'date', 'nav'}.issubset(df_snapshots.columns)
    )
    if not can_derive_weight:
        raise ValueError('positions.csv must include weight or market_value with snapshots nav.')

    df_snapshots['date'] = pd.to_datetime(df_snapshots['date'], errors='coerce')
    # Keep exactly one nav per date: duplicate snapshot dates would otherwise
    # duplicate position rows in the merge and inflate every concentration metric.
    nav_by_date = (
        df_snapshots[['date', 'nav']]
        .dropna(subset=['date'])
        .drop_duplicates(subset='date', keep='last')
    )
    merged = df_positions.merge(nav_by_date, on='date', how='left')

    # Coerce both operands the same way `weight` is coerced below, so stray
    # non-numeric cells become NaN (then weight 0) instead of raising.
    market_value = pd.to_numeric(merged['market_value'], errors='coerce')
    nav = pd.to_numeric(merged['nav'], errors='coerce')
    # Missing or near-zero nav yields weight 0 rather than inf/NaN.
    merged['weight'] = np.where(nav.abs() > 1e-9, market_value / nav, 0.0)
    df_positions = merged.drop(columns=['nav'])

df_positions['weight'] = pd.to_numeric(df_positions['weight'], errors='coerce').fillna(0.0)
has_weight = df_positions['weight'].abs() > 1e-9
if 'invested' in df_positions.columns:
    # Keep rows flagged as invested as well as any row with a non-zero weight.
    is_invested = df_positions['invested'].astype(str).isin(['1', 'True', 'true'])
    df_positions = df_positions[is_invested | has_weight].copy()
else:
    df_positions = df_positions[has_weight].copy()

# Fail with a clear message instead of crashing on NaT.date() in the summary below.
if df_positions.empty:
    raise ValueError('No active positions remain after filtering.')

print(f'Rows after filtering: {len(df_positions):,}')
print(f'Date range: {df_positions["date"].min().date()} to {df_positions["date"].max().date()}')
print(f'Unique symbols: {df_positions["symbol"].nunique():,}')


In [None]:
def _top_bucket_sum(weights, n):
    """Return the sum of the ``n`` largest values in ``weights``.

    Args:
        weights: pandas Series of numeric values.
        n: number of largest entries to include.

    Returns:
        The sum of the top ``n`` entries (all entries when the Series is
        shorter than ``n``), or ``np.nan`` for an empty Series.
    """
    if not len(weights):
        return np.nan
    largest = weights.nlargest(n)
    return largest.sum()

# Build one concentration record per date from absolute position weights.
records = []
for date, group in df_positions.groupby('date'):
    abs_w = group['weight'].abs().astype(float)
    gross_w = abs_w.sum()
    # Dates with no gross exposure have no defined shares; skip them.
    if gross_w <= 1e-12:
        continue

    # HHI is the sum of squared gross shares; effective N is its reciprocal.
    normalized = abs_w / gross_w
    hhi = float((normalized ** 2).sum())
    eff_n = float(1.0 / hhi) if hhi > 0 else np.nan

    long_abs = group.loc[group['weight'] > 0, 'weight'].abs()
    short_abs = group.loc[group['weight'] < 0, 'weight'].abs()

    # Compute each top-N bucket once and reuse it for both the raw sum and
    # its share of gross.
    top_sums = {n: _top_bucket_sum(abs_w, n) for n in (1, 3, 5, 10)}

    record = {
        'date': date,
        'positions': int((abs_w > 0).sum()),
        'gross_weight_sum': float(gross_w),
    }
    record.update({f'top_{n}_abs_w': total for n, total in top_sums.items()})
    record.update({f'top_{n}_share_of_gross': total / gross_w for n, total in top_sums.items()})
    record.update({
        'hhi_abs_weight': hhi,
        'effective_n_bets': eff_n,
        'long_gross_abs': float(long_abs.sum()),
        'short_gross_abs': float(short_abs.sum()),
        # _top_bucket_sum returns NaN when a side has no positions.
        'long_top_1_abs_w': _top_bucket_sum(long_abs, 1),
        'short_top_1_abs_w': _top_bucket_sum(short_abs, 1),
    })
    records.append(record)

# Check before building the frame: an empty record list yields a frame with no
# 'date' column, so sort_values('date') would raise KeyError and mask this error.
if not records:
    raise ValueError('No active daily positions found for concentration analysis.')

concentration = pd.DataFrame(records).sort_values('date').reset_index(drop=True)

summary = {
    'Days analyzed': len(concentration),
    'Avg positions': concentration['positions'].mean(),
    'Avg top-1 share of gross': concentration['top_1_share_of_gross'].mean(),
    'Avg top-5 share of gross': concentration['top_5_share_of_gross'].mean(),
    'Avg HHI': concentration['hhi_abs_weight'].mean(),
    'Avg effective N': concentration['effective_n_bets'].mean(),
}

print('Concentration summary:')
for k, v in summary.items():
    if 'share' in k.lower() or k == 'Avg HHI':
        # Shares and HHI are fractions; print them scaled to percent.
        print(f'  {k}: {100 * v:.2f}%')
    elif 'effective' in k.lower() or 'positions' in k.lower():
        print(f'  {k}: {v:.2f}')
    else:
        print(f'  {k}: {v}')

display(concentration.tail(15))


In [None]:
# Two stacked panels sharing the date axis: top-N shares of gross exposure
# above, HHI and effective number of bets below.
fig, axes = plt.subplots(2, 1, figsize=(16, 10), sharex=True)

share_ax = axes[0]
bucket_series = [
    ('top_1_share_of_gross', 'Top 1', {}),
    ('top_3_share_of_gross', 'Top 3', {}),
    ('top_5_share_of_gross', 'Top 5', {}),
    ('top_10_share_of_gross', 'Top 10', {'alpha': 0.8}),
]
for column, label, extra in bucket_series:
    # Shares are stored as fractions; plot them as percentages.
    share_ax.plot(concentration['date'], 100 * concentration[column], label=label, linewidth=2, **extra)
share_ax.set_title('Concentration Buckets as Share of Gross Exposure')
share_ax.set_ylabel('Share of gross (%)')
share_ax.legend(loc='upper left')
share_ax.grid(alpha=0.3)

# HHI is at most 1 while effective N (its reciprocal) is at least 1, so a
# shared y-axis flattens the HHI line; give effective N its own right-hand axis.
hhi_ax = axes[1]
eff_ax = hhi_ax.twinx()
hhi_lines = hhi_ax.plot(concentration['date'], concentration['hhi_abs_weight'], label='HHI', color='#d62728', linewidth=2)
eff_lines = eff_ax.plot(concentration['date'], concentration['effective_n_bets'], label='Effective N Bets', color='#1f77b4', linewidth=2)
hhi_ax.set_title('HHI and Effective Number of Bets')
hhi_ax.set_ylabel('HHI')
eff_ax.set_ylabel('Effective N Bets')
hhi_ax.set_xlabel('Date')
hhi_ax.grid(alpha=0.3)
# A second, misaligned grid from the twin axis would clutter the panel.
eff_ax.grid(False)

# One legend covering the lines of both y-axes, drawn on the top-most axes.
legend_lines = hhi_lines + eff_lines
eff_ax.legend(legend_lines, [line.get_label() for line in legend_lines], loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
# Leaderboard for the most recent date present in the concentration table.
latest_date = concentration['date'].max()
on_latest_date = df_positions['date'] == latest_date
latest_positions = df_positions.loc[on_latest_date, ['symbol', 'weight']].copy()
latest_positions['abs_weight'] = latest_positions['weight'].abs()
latest_positions = latest_positions.sort_values('abs_weight', ascending=False)

# Each symbol's share of that day's gross exposure, in percent (0 when the
# day carries no gross exposure).
latest_total_abs = latest_positions['abs_weight'].sum()
if latest_total_abs > 1e-12:
    latest_positions['share_of_gross_pct'] = 100 * latest_positions['abs_weight'] / latest_total_abs
else:
    latest_positions['share_of_gross_pct'] = 0.0

print(f'Latest date: {latest_date.date()}')
display(latest_positions.head(20))

# Plot up to 12 names; reverse the order so the largest bar ends up on top.
plot_n = min(12, len(latest_positions))
plot_df = latest_positions.iloc[:plot_n].iloc[::-1]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(plot_df['symbol'], plot_df['share_of_gross_pct'], color='#4c78a8')
ax.set_title(f'Top {plot_n} Symbol Concentration ({latest_date.date()})')
ax.set_xlabel('Share of gross exposure (%)')
ax.set_ylabel('Symbol')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Concentration limits per metric, expressed as fractions (not percent).
thresholds = {
    'top_1_share_of_gross': 0.20,
    'top_3_share_of_gross': 0.45,
    'top_5_share_of_gross': 0.65,
    'hhi_abs_weight': 0.12,
}

# One scorecard row per metric: how many days sat strictly above the limit,
# what fraction of observed days that is, and the most recent value.
breach_rows = []
for metric_name, limit in thresholds.items():
    observed = concentration[metric_name].dropna()
    n_observed = len(observed)
    n_above = int(observed.gt(limit).sum())
    breach_rows.append(dict(
        metric=metric_name,
        threshold=limit,
        days_above=n_above,
        pct_days_above=(n_above / n_observed) if n_observed else np.nan,
        latest_value=concentration[metric_name].iloc[-1],
    ))

# Force the numeric columns to float for consistent display.
breaches = pd.DataFrame(breach_rows).astype({
    'threshold': float,
    'pct_days_above': float,
    'latest_value': float,
})

print('Concentration breach scorecard:')
display(breaches)
