In [4]:
import sys
import os
import pandas as pd
# Add the parent directory (where 'src' folder is located) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import build_stock_features_orchestrator, apply_kalman_filter_with_lag
import warnings
warnings.filterwarnings('ignore')

# -------- Output folder --------
base_output_dir = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\processed"
macro_folder = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\raw"
os.makedirs(base_output_dir, exist_ok=True)

In [5]:
# -------- Sector definitions --------
SECTORS = {
    "staples": {
        "tickers": ["WMT","PG","KO","PEP","COST","CL","CLX","KMB","GIS","MDLZ","KR","TGT","XLP","^GSPC"],
        "sector_etf": "XLP",
    },
    "discretionary": {
        "tickers": ["AMZN","HD","MCD","NKE","SBUX","TJX","LOW","BKNG","ROST","MAR","XLY","^GSPC"],
        "sector_etf": "XLY",
    },
    "healthcare": {
        "tickers": ["UNH","LLY","JNJ","ABBV","MRK","TMO","ABT","PFE","MDT","ISRG","CVS","HUM","XLV","^GSPC"],
        "sector_etf": "XLV",
    },
    "technology": {
        "tickers": ["AAPL","MSFT","NVDA","AVGO","ADBE","CRM","AMD","INTC","CSCO","QCOM","ORCL","TXN","XLK","^GSPC"],
        "sector_etf": "XLK",
    },
    "financials": {
        "tickers": ["JPM","BAC","WFC","MS","GS","C","BLK","PGR","AXP","USB","SCHW","CB","XLF","^GSPC"],
        "sector_etf": "XLF",
    },
    "energy": {
        "tickers": ["XOM","CVX","COP","EOG","SLB","OXY","PSX","MPC","VLO","HAL","KMI","XLE","^GSPC"],
        "sector_etf": "XLE",
    },
    "industrials": {
        "tickers": ["CAT","BA","HON","GE","UPS","UNP","DE","RTX","LMT","ETN","EMR","MMM","XLI","^GSPC"],
        "sector_etf": "XLI",
    },
    "utilities": {
        "tickers": ["NEE","SO","DUK","AEP","EXC","SRE","XEL","D","PEG","ED","XLU","^GSPC"],
        "sector_etf": "XLU",
    },
    "materials": {
        "tickers": ["LIN","APD","ECL","NEM","FCX","NUE","SHW","ALB","MLM","VMC","XLB","^GSPC"],
        "sector_etf": "XLB",
    },
    "communication_services": {
        "tickers": ["META","GOOGL","GOOG","NFLX","CMCSA","DIS","T","VZ","XLC","^GSPC"],
        "sector_etf": "XLC",
    },
    "real_estate": {
        "tickers": ["AMT","PLD","EQIX","PSA","SPG","CCI","O","WELL","XLRE","^GSPC"],
        "sector_etf": "XLRE",
    },
}

In [3]:
from src.macro_features import macro_data_orchestrator
# (optional) date range
start_date_str = None  
end_date_str   = None  

FRED_series_ids = {
        'CPI': 'CPIAUCSL',
        'FEDERAL_FUNDS_RATE': 'DFF',
        'TREASURY_YIELD': 'DGS10',
        'UNEMPLOYMENT': 'UNRATE',
        'REAL_GDP': 'GDPC1',
        'RETAIL_SALES': 'RSAFS',
        'PAYEMS': 'PAYEMS' 
    }

macro_funcs = { 'CPI', 'FEDERAL_FUNDS_RATE', 'TREASURY_YIELD', 
                'UNEMPLOYMENT', 'REAL_GDP', 'RETAIL_SALES', 'PAYEMS' }

# Try load macro data, if no macro data run orchestraotr:
try:
    macro_df = pd.read_csv(
        r'C:\Users\epoch_bpjmdqk\Documents\Code\data\raw\macros.csv'
    )
    print("Loaded existing macro data from CSV.")
    print(f"Data loaded: {macro_df.shape}")
except FileNotFoundError:
    macro_df = macro_data_orchestrator(
        macro_funcs_to_fetch=macro_funcs,
        fred_series_ids_dict=FRED_series_ids,
        start_date=start_date_str,
        save_path=macro_folder,
    )

Loaded existing macro data from CSV.
Data loaded: (31652, 8)


In [None]:
# -------- Build sector datasets --------
for sector_name, cfg in SECTORS.items():
    tickers = cfg["tickers"]
    sector_etf = cfg.get("sector_etf")

    # equities = all non-index, non-ETF names
    equities = [t for t in tickers if not t.startswith("^") and t != sector_etf]

    for target in equities:
        suppliers = [t for t in equities if t != target]  # peers (exclude target)
        print(f"\n--- Building {sector_name} :: target={target} ---")

        df = build_stock_features_orchestrator(
            tickers=tickers,
            target_ticker=target,
            supplier_tickers=suppliers,
            benchmark_ticker="^GSPC",
            kalman_lags=[1,5,10],           # keep inside orchestrator
            kalman_targets="all",
            sector_etf=sector_etf,
            dropna_frac=0.90
        )

        # make sure index is tz-naive before merging macro
        if getattr(df.index, "tz", None) is not None:
            df.index = df.index.tz_localize(None)

        if not macro_df.empty:
            merged = pd.merge(df, macro_df, left_index=True, right_index=True, how='left')
            macro_cols = list(FRED_series_ids.values())
            merged[macro_cols] = merged[macro_cols].shift(1)  # avoid lookahead
        else:
            merged = df

        out_path = os.path.join(base_output_dir, f"{sector_name}__{target}.csv")
        merged.to_csv(out_path, index=True)
        print(f"Saved {sector_name}:: {target} → {out_path}  rows={len(merged):,} cols={merged.shape[1]}")


--- Building staples (14 tickers) ---

--- Starting Stock Feature Pipeline ---
Fetching data for 14 tickers...
Date range: All available history to Current date
Fetching full history for WMT...
Fetching full history for PG...
Fetching full history for KO...
Fetching full history for PEP...
Fetching full history for COST...
Fetching full history for CL...
Fetching full history for CLX...
Fetching full history for KMB...
Fetching full history for GIS...
Fetching full history for MDLZ...
Fetching full history for KR...
Fetching full history for TGT...
Fetching full history for XLP...
Fetching full history for ^GSPC...

Final merged DataFrame has 6090 common entries.

Discovered stock prefixes: ['CL', 'CLX', 'COST', 'GIS', 'KMB', 'KO', 'KR', 'MDLZ', 'PEP', 'PG', 'TGT', 'WMT', 'XLP', '^GSPC']

Processing features for stock prefix: CL

Processing features for stock prefix: CLX

Processing features for stock prefix: COST

Processing features for stock prefix: GIS

Processing features for sto

TypeError: sector_and_market_relatives() got an unexpected keyword argument 'sector_etf'

In [None]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import average_precision_score

from src.modelling_functions import build_datasets_for_models, base_rule_ma_cross, purged_time_series_splits

def _fit_with_es(pipe, X_tr, y_tr, X_va, y_va, rounds=50):
    """Fit pipeline with XGBoost early stopping (handles <2.0 and ≥2.0)."""
    common = dict(model__eval_set=[(X_va, y_va)], model__verbose=False)
    try:
        pipe.fit(X_tr, y_tr, model__early_stopping_rounds=rounds, **common)
    except TypeError:
        es = xgb.callback.EarlyStopping(rounds=rounds, save_best=True)
        pipe.fit(X_tr, y_tr, model__callbacks=[es], **common)
    return pipe

def quick_score_ticker(tkr, engineered_df, H=10, pt=1.0, sl=1.0, span_vol=20, seed=42, n_trials=40):
    close = engineered_df[f"Close_{tkr}"].astype(float)
    X_base = engineered_df.drop(
        columns=[f'Open_{tkr}', f'High_{tkr}', f'Low_{tkr}', f'Close_{tkr}'],
        errors='ignore'
    )
    base_side = base_rule_ma_cross(close, fast=10, slow=20)

    packs = build_datasets_for_models(
        X=X_base, close=close, span_vol=span_vol, H=H, pt_sl=(pt, sl), base_side=base_side
    )
    X, y, t1 = packs["meta_X"], packs["meta_y"], packs["meta_t1"]

    # class stats for imbalance + baseline PR
    n_events = len(y)
    n_pos = int(y.sum())
    base_pr = (n_pos / n_events) if n_events else np.nan
    cw = ((n_events - n_pos) / max(n_pos, 1))  # scale_pos_weight

    def build_pipe(params, sel='median'):
        base_est = XGBClassifier(**{**params, "verbosity": 0})
        return Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("selector", SelectFromModel(base_est, threshold=sel)),
            ("model", XGBClassifier(**{**params, "verbosity": 0}))
        ])

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 300, 1200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 20.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "scale_pos_weight": cw,           # <- imbalance help
            "eval_metric": "logloss",
            "random_state": seed, "n_jobs": -1, "tree_method": "hist",
        }
        sel = trial.suggest_categorical("selector_threshold", ["median", "mean", "0.75*mean"])
        pipe = build_pipe(params, sel)

        splits = list(purged_time_series_splits(X.index, t1.reindex(X.index), n_splits=3, test_size=None, embargo=H))
        scores = []
        for tr, va in splits:
            X_tr, X_va = X.iloc[tr], X.iloc[va]
            y_tr, y_va = y.iloc[tr], y.iloc[va]
            _fit_with_es(pipe, X_tr, y_tr, X_va, y_va, rounds=50)
            p = pipe.predict_proba(X_va)[:, 1]
            scores.append(average_precision_score(y_va, p))  # PR-AUC
        return float(np.nanmean(scores)) if scores else -10.0

    # very small samples → PR-AUC is unstable; short-circuit with NaN
    if (n_events < 250) or (n_pos < 40):
        return {"ticker": tkr, "cv_pr_auc": np.nan, "n_events": n_events, "n_pos": n_pos,
                "base_pr": base_pr, "lift": np.nan}

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    best = study.best_value
    return {
        "ticker": tkr,
        "cv_pr_auc": best,
        "n_events": n_events,
        "n_pos": n_pos,
        "base_pr": base_pr,
        "lift": (best / (base_pr + 1e-12)) if np.isfinite(base_pr) and base_pr > 0 else np.nan
    }

def scan_tickers(tickers, engineered_df, n_trials=40):
    rows = []
    for t in tickers:
        try:
            rows.append(quick_score_ticker(t, engineered_df, n_trials=n_trials))
        except Exception as e:
            print(f"[warn] {t}: {e}")
            rows.append({"ticker": t, "cv_pr_auc": np.nan, "n_events": np.nan,
                         "n_pos": np.nan, "base_pr": np.nan, "lift": np.nan})
    res = pd.DataFrame(rows).sort_values(["lift", "cv_pr_auc"], ascending=False)
    print(res)
    return res
