# 01: Candlestick Pattern Baseline Model
This notebook extracts standard candlestick patterns and trains a Random Forest model to predict subsequent 5-7 day returns.

In [5]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from bokeh.io import output_notebook, show
output_notebook()

if os.getcwd().endswith('notebooks/candlestick_analysis'):
    os.chdir('../..')
ROOT = os.getcwd()
if ROOT not in sys.path: sys.path.insert(0, ROOT)

from src.features.candlestick_patterns import extract_candlestick_patterns
from src.backtester.data import align_close_prices, load_cleaned_assets
from src.backtester.engine import BacktestConfig, run_backtest
from src.backtester.report import compute_backtest_report
from src.backtester.bokeh_plots import build_interactive_portfolio_layout

DATA_PATH = 'dataset/cleaned/cleaned_stock_data.parquet'
TARGET_HORIZON = 5 # 5-7 days requested
CUTOFF_DATE = '2023-01-01'
SEED = 42

## 1) Data Load & Pattern Extraction

In [6]:
df = pd.read_parquet(DATA_PATH)
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')

def process_asset(group):
    # Get Asset_ID from group name (more robust in pandas apply)
    aid = group.name
    patterns = extract_candlestick_patterns(group)
    patterns['Asset_ID'] = aid
    # Target: Return over the next N days
    patterns['y_target'] = group['Close'].shift(-TARGET_HORIZON) / group['Close'] - 1
    return patterns

print("Extracting patterns... (may take a moment for 100 assets)")
processed_df = df.groupby('Asset_ID', group_keys=False).apply(process_asset)
processed_df = processed_df.dropna(subset=['y_target'])
feature_cols = [c for c in processed_df.columns if c not in ['Asset_ID', 'y_target']]
print(f"Features created: {feature_cols}")

Extracting patterns... (may take a moment for 100 assets)
Features created: ['bullish_engulfing', 'bearish_engulfing', 'morning_star', 'evening_star', 'three_white_soldiers', 'three_black_crows', 'piercing_line', 'hanging_man', 'hammer', 'inverse_hammer', 'tweezer_tops', 'doji', 'spinning_tops']


## 2) Model Training (Time-Wise Split)

In [7]:
train = processed_df[processed_df.index < CUTOFF_DATE]
test = processed_df[processed_df.index >= CUTOFF_DATE]

X_train, y_train = train[feature_cols], train['y_target']
X_test, y_test = test[feature_cols], test['y_target']

rf = RandomForestRegressor(n_estimators=100, max_depth=8, n_jobs=-1, random_state=SEED)
rf.fit(X_train, y_train)

preds = rf.predict(X_test)
test_out = test.copy()
test_out['y_pred'] = preds

## 3) Backtesting

In [8]:
test_syms = test_out['Asset_ID'].unique()
close_prices = align_close_prices(load_cleaned_assets(symbols=test_syms))
close_prices = close_prices[close_prices.index >= CUTOFF_DATE]

w = test_out.pivot(columns='Asset_ID', values='y_pred').reindex(close_prices.index).fillna(0)
w_rank = w.rank(axis=1, ascending=False)
w_final = ((w_rank <= 5) & (w > 0)).astype(float)
w_final = w_final.div(w_final.sum(axis=1), axis=0).fillna(0)

res = run_backtest(close_prices, w_final, BacktestConfig(rebalance='D'))
report = compute_backtest_report(result=res, close_prices=close_prices)
print(report)

# Bokeh Plot
mkt = pd.DataFrame(index=close_prices.index)
mkt['Close'] = close_prices.iloc[:, 0] # Proxy
for c in ['Open', 'High', 'Low']: mkt[c] = mkt['Close']
mkt['Volume'] = 0

p = build_interactive_portfolio_layout(
    market_ohlcv=mkt, 
    equity=res.equity, 
    returns=res.returns, 
    weights=res.weights, 
    turnover=res.turnover, 
    costs=res.costs, 
    title="Candlestick Baseline RF"
)
show(p)

Start                         2023-01-03 00:00:00
End                           2026-01-16 00:00:00
Duration                       1109 days 00:00:00
Initial Equity                          1000000.0
Final Equity                        826492.912075
Equity Peak                        1197147.845277
Total Return [%]                       -17.350709
CAGR [%]                                -6.107638
Volatility (ann) [%]                    17.182667
Sharpe                                  -0.280593
Sortino                                 -0.465967
Max Drawdown [%]                         -30.9615
Calmar                                  -0.197266
Best Day [%]                             6.546247
Worst Day [%]                           -4.712312
Avg Gross Exposure                       0.936601
Avg Net Exposure                         0.936601
Exposure Time [%]                       93.577982
Rebalance Days                                755
Total Turnover                   1315725430.32319
