# Cluster-Wise Dimensionality Reduction with LightGBM
This notebook implements a sophisticated pipeline:
1.  **Clustering**: Use KMeans to partition the feature space into 6 distinct regimes.
2.  **Regime-Specific Compression**: Apply Nystroem-approximated Kernel PCA within each cluster to capture local non-linearities.
3.  **Modeling**: Train a global LightGBM model on these regime-distilled features.
4.  **Backtesting**: Evaluate performance using the interactive Bokeh dashboard.

In [None]:
!pip install lightgbm --quiet


In [1]:
print("Training LightGBM on cluster-distilled features...")
model = lgb.LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=8, n_jobs=-1, random_state=SEED, verbose=-1)
model.fit(X_train_reduced, y_train)

preds = model.predict(X_test_reduced)

test_syms = df_test['Asset_ID'].unique()
close_prices = align_close_prices(load_cleaned_assets(symbols=test_syms))
close_prices = close_prices[close_prices.index >= CUTOFF]

df_out = df_test.copy(); df_out['y_pred'] = preds
w = df_out.pivot(columns='Asset_ID', values='y_pred').reindex(close_prices.index).fillna(0)
w_rank = w.rank(axis=1, ascending=False)
w_raw = ((w_rank <= 5) & (w > 0)).astype(float)
w_raw = w_raw.div(w_raw.sum(axis=1).replace(0, 1), axis=0)

# Reduce turnover: Rebalance only every 5 days
w_final = w_raw.copy()
for i in range(1, len(w_final)):
    if i % 5 != 0:
        w_final.iloc[i] = w_final.iloc[i-1]

res = run_backtest(close_prices, w_final, BacktestConfig())
report = compute_backtest_report(result=res, close_prices=close_prices)
print(report)

mkt = pd.DataFrame(index=close_prices.index); mkt['Close'] = close_prices.iloc[:, 0]
for c in ['Open', 'High', 'Low']: mkt[c] = mkt['Close']
mkt['Volume'] = 0

show(build_interactive_portfolio_layout(
    market_ohlcv=mkt, 
    equity=res.equity, 
    returns=res.returns, 
    weights=res.weights, 
    turnover=res.turnover, 
    costs=res.costs, 
    title=f"KMeans({N_CLUSTERS}) + KPCA({COMPONENTS_PER_CLUSTER}) + LightGBM (5-day rebalance)"
))

## 1) Data Preparation

In [2]:
print("Loading features...")
df = pd.read_parquet(FEATURES_PATH)
if 'Date' in df.columns: df = df.set_index('Date')

# Create target
df[TARGET_FWD] = df.groupby('Asset_ID', sort=False)[TARGET_COL].shift(-1)
df = df.dropna(subset=[TARGET_FWD])

feature_cols = [c for c in df.columns if c not in ['Asset_ID', TARGET_FWD, TARGET_COL, 'close', 'volume']]

# Clean and Scale
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

X = df[feature_cols].replace([np.inf, -np.inf], np.nan)
X_clean = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_clean)

df_train = df[df.index < CUTOFF]
df_test = df[df.index >= CUTOFF]

X_train = X_scaled[:len(df_train)]
X_test = X_scaled[len(df_train):]
y_train = df_train[TARGET_FWD]

Loading features...


## 2) KMeans Clustering & Cluster-wise Kernel PCA

In [3]:
print(f"Clustering into {N_CLUSTERS} regimes...")
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=SEED, n_init=10)
clusters_train = kmeans.fit_predict(X_train)
clusters_test = kmeans.predict(X_test)

feature_names = [f"c{i}_pca{j}" for i in range(N_CLUSTERS) for j in range(COMPONENTS_PER_CLUSTER)]
X_train_reduced = pd.DataFrame(0.0, index=X_train.index, columns=feature_names)
X_test_reduced = pd.DataFrame(0.0, index=X_test.index, columns=feature_names)

for i in range(N_CLUSTERS):
    print(f"  Processing Cluster {i} features...")
    mask_tr = (clusters_train == i)
    
    # Using Nystroem for non-linear mapping
    nystroem = Nystroem(kernel='rbf', n_components=100, random_state=SEED)
    pca = PCA(n_components=COMPONENTS_PER_CLUSTER)
    
    if mask_tr.any():
        # Fit on cluster data
        X_cluster = X_train[mask_tr]
        nystroem.fit(X_cluster)
        X_mapped = nystroem.transform(X_cluster)
        pca.fit(X_mapped)
        
        # Transform ALL data
        cluster_cols = [f"c{i}_pca{j}" for j in range(COMPONENTS_PER_CLUSTER)]
        
        X_train_reduced[cluster_cols] = pca.transform(nystroem.transform(X_train))
        X_test_reduced[cluster_cols] = pca.transform(nystroem.transform(X_test))

print(f"Reduced feature shape: {X_train_reduced.shape}")

Clustering into 6 regimes...
  Processing Cluster 0 features...
  Processing Cluster 1 features...
  Processing Cluster 2 features...
  Processing Cluster 3 features...
  Processing Cluster 4 features...
  Processing Cluster 5 features...
Reduced feature shape: (174800, 18)


## 3) LightGBM Training & Backtesting

In [4]:
print("Training LightGBM on cluster-distilled features...")
model = lgb.LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=8, n_jobs=-1, random_state=SEED, verbose=-1)
model.fit(X_train_reduced, y_train)

preds = model.predict(X_test_reduced)

test_syms = df_test['Asset_ID'].unique()
close_prices = align_close_prices(load_cleaned_assets(symbols=test_syms))
close_prices = close_prices[close_prices.index >= CUTOFF]

df_out = df_test.copy(); df_out['y_pred'] = preds
w = df_out.pivot(columns='Asset_ID', values='y_pred').reindex(close_prices.index).fillna(0)
w_rank = w.rank(axis=1, ascending=False)
w_raw = ((w_rank <= 5) & (w > 0)).astype(float)
w_raw = w_raw.div(w_raw.sum(axis=1).replace(0, 1), axis=0)

# Reduce turnover: Rebalance only every 5 days
w_final = w_raw.copy()
for i in range(1, len(w_final)):
    if i % 5 != 0:
        w_final.iloc[i] = w_final.iloc[i-1]

res = run_backtest(close_prices, w_final, BacktestConfig())
report = compute_backtest_report(result=res, close_prices=close_prices)
print(report)

mkt = pd.DataFrame(index=close_prices.index); mkt['Close'] = close_prices.iloc[:, 0]
for c in ['Open', 'High', 'Low']: mkt[c] = mkt['Close']
mkt['Volume'] = 0

show(build_interactive_portfolio_layout(
    market_ohlcv=mkt, 
    equity=res.equity, 
    returns=res.returns, 
    weights=res.weights, 
    turnover=res.turnover, 
    costs=res.costs, 
    title=f"KMeans({N_CLUSTERS}) + KPCA({COMPONENTS_PER_CLUSTER}) + LightGBM (5-day rebalance)"
))

Training LightGBM on cluster-distilled features...




Start                         2023-01-03 00:00:00
End                           2026-01-16 00:00:00
Duration                       1109 days 00:00:00
Initial Equity                          1000000.0
Final Equity                       1067919.100706
Equity Peak                        1138770.712949
Total Return [%]                          6.79191
CAGR [%]                                 2.196937
Volatility (ann) [%]                    17.503723
Sharpe                                    0.21132
Sortino                                  0.345136
Max Drawdown [%]                       -21.124486
Calmar                                      0.104
Best Day [%]                             7.828659
Worst Day [%]                           -4.970383
Avg Gross Exposure                       0.999572
Avg Net Exposure                         0.999572
Exposure Time [%]                       99.868938
Rebalance Days                                763
Total Turnover                  1332044640.569925
