# 02: Candlestick Pattern Dimensionality Reduction
This notebook applies PCA, Kernel PCA, and ICA to candlestick features and evaluates their performance in predicting 5-7 day returns.

In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA, KernelPCA
from bokeh.io import output_notebook, show
output_notebook()

if os.getcwd().endswith('notebooks/candlestick_analysis'):
    os.chdir('../..')
ROOT = os.getcwd()
if ROOT not in sys.path: sys.path.insert(0, ROOT)

from src.features.candlestick_patterns import extract_candlestick_patterns
from src.backtester.data import align_close_prices, load_cleaned_assets
from src.backtester.engine import BacktestConfig, run_backtest
from src.backtester.report import compute_backtest_report
from src.backtester.bokeh_plots import build_interactive_portfolio_layout

DATA_PATH = 'dataset/cleaned/cleaned_stock_data.parquet'
TARGET_HORIZON = 5
CUTOFF_DATE = '2023-01-01'
SEED = 42

## 1) Data Load & Pattern Extraction

In [2]:
df = pd.read_parquet(DATA_PATH)
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')

def process_asset(group):
    # Get Asset_ID from group name (more robust in pandas apply)
    aid = group.name
    patterns = extract_candlestick_patterns(group)
    patterns['Asset_ID'] = aid
    patterns['y_target'] = group['Close'].shift(-TARGET_HORIZON) / group['Close'] - 1
    return patterns

print("Extracting patterns...")
processed_df = df.groupby('Asset_ID', group_keys=False).apply(process_asset)
processed_df = processed_df.dropna(subset=['y_target'])
feature_cols = [c for c in processed_df.columns if c not in ['Asset_ID', 'y_target']]

train_df = processed_df[processed_df.index < CUTOFF_DATE]
test_df = processed_df[processed_df.index >= CUTOFF_DATE]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_df[feature_cols])
X_test_scaled = scaler.transform(test_df[feature_cols])

Extracting patterns...


## 2) Dimensionality Reduction Pipeline

In [3]:
def run_dimred_experiment(name, reducer_obj):
    print(f'\n--- Running {name} ---')
    X_tr_red = reducer_obj.fit_transform(X_train_scaled)
    X_te_red = reducer_obj.transform(X_test_scaled)
    
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, n_jobs=-1, random_state=SEED)
    rf.fit(X_tr_red, train_df['y_target'])
    preds = rf.predict(X_te_red)
    
    test_syms = test_df['Asset_ID'].unique()
    close_prices = align_close_prices(load_cleaned_assets(symbols=test_syms))
    close_prices = close_prices[close_prices.index >= CUTOFF_DATE]
    
    out = test_df.copy(); out['y_pred'] = preds
    w = out.pivot(columns='Asset_ID', values='y_pred').reindex(close_prices.index).fillna(0)
    w_rank = w.rank(axis=1, ascending=False)
    w_final = ((w_rank <= 5) & (w > 0)).astype(float)
    w_final = w_final.div(w_final.sum(axis=1).replace(0, 1), axis=0)
    
    res = run_backtest(close_prices, w_final, BacktestConfig())
    rep = compute_backtest_report(result=res, close_prices=close_prices)
    print(rep)
    
    mkt = pd.DataFrame(index=close_prices.index)
    mkt['Close'] = close_prices.iloc[:, 0]
    for c in ['Open', 'High', 'Low']: mkt[c] = mkt['Close']
    mkt['Volume'] = 0
    p = build_interactive_portfolio_layout(
        market_ohlcv=mkt, 
        equity=res.equity, 
        returns=res.returns, 
        weights=res.weights, 
        turnover=res.turnover, 
        costs=res.costs, 
        title=f"{name} analysis"
    )
    show(p)
    return rep

pca_rep = run_dimred_experiment('PCA', PCA(n_components=5, random_state=SEED))
ica_rep = run_dimred_experiment('ICA', FastICA(n_components=5, random_state=SEED))
kpca_rep = run_dimred_experiment('KernelPCA', KernelPCA(n_components=5, kernel='rbf', random_state=SEED))


--- Running PCA ---
Start                         2023-01-03 00:00:00
End                           2026-01-16 00:00:00
Duration                       1109 days 00:00:00
Initial Equity                          1000000.0
Final Equity                        843161.114608
Equity Peak                        1224488.015616
Total Return [%]                       -15.683889
CAGR [%]                                  -5.4856
Volatility (ann) [%]                    17.649121
Sharpe                                  -0.231274
Sortino                                 -0.385765
Max Drawdown [%]                       -31.141742
Calmar                                  -0.176149
Best Day [%]                             6.546247
Worst Day [%]                           -4.712312
Avg Gross Exposure                       0.933976
Avg Net Exposure                         0.933976
Exposure Time [%]                       93.315858
Rebalance Days                                753
Total Turnover               


--- Running ICA ---
Start                         2023-01-03 00:00:00
End                           2026-01-16 00:00:00
Duration                       1109 days 00:00:00
Initial Equity                          1000000.0
Final Equity                        830384.581816
Equity Peak                        1207103.876467
Total Return [%]                       -16.961542
CAGR [%]                                -5.961659
Volatility (ann) [%]                    17.393637
Sharpe                                  -0.266258
Sortino                                 -0.447089
Max Drawdown [%]                       -31.208523
Calmar                                  -0.191027
Best Day [%]                             6.546247
Worst Day [%]                           -4.712312
Avg Gross Exposure                       0.939224
Avg Net Exposure                         0.939224
Exposure Time [%]                       93.840105
Rebalance Days                                755
Total Turnover               


--- Running KernelPCA ---


MemoryError: Unable to allocate 228. GiB for an array with shape (174800, 174800) and data type float64