In [None]:
# --- Standard library ---
import sys
import os
import warnings
from datetime import datetime

# --- Third-party ---
import numpy as np
import pandas as pd

# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# --- Pathing: make src importable no matter where you run the script/notebook ---
# This has to run BEFORE any `from src...` import: on a fresh kernel the parent
# directory is not on sys.path yet, so importing `src` first would raise
# ModuleNotFoundError.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# --- Project imports (require PROJECT_ROOT on sys.path) ---
from src.stock_features import (
    build_stock_features_orchestrator,
    build_sector_base_features,
    make_target_view,
)

# Macro helpers: fetching plus date normalisation / merge utilities.
from src.macro_features import (
    macro_data_orchestrator,
    normalize_date_col,
    prepare_macro_for_daily_merge,
    merge_stocks_and_macros,
)

# -------- Output folder --------
# NOTE(review): absolute, machine-specific paths; consider deriving them from
# PROJECT_ROOT or an environment variable so the notebook runs elsewhere.
base_output_dir = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\processed"
macro_folder = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\raw"
os.makedirs(base_output_dir, exist_ok=True)

In [None]:
# -------- Sector definitions --------
# Maps a sector name to its configuration:
#   "tickers"    - symbols for the sector; every list ends with the sector ETF
#                  followed by "^GSPC"
#   "sector_etf" - the ETF symbol for that sector (also present in "tickers")
# Insertion order is the order in which sectors are processed downstream.
SECTORS = dict(
    staples=dict(
        tickers=[
            "WMT", "PG", "KO", "PEP", "COST", "CL",
            "CLX", "KMB", "GIS", "MDLZ", "KR", "TGT",
            "XLP", "^GSPC",
        ],
        sector_etf="XLP",
    ),
    discretionary=dict(
        tickers=[
            "AMZN", "HD", "MCD", "NKE", "SBUX",
            "TJX", "LOW", "BKNG", "ROST", "MAR",
            "XLY", "^GSPC",
        ],
        sector_etf="XLY",
    ),
    healthcare=dict(
        tickers=[
            "UNH", "LLY", "JNJ", "ABBV", "MRK", "TMO",
            "ABT", "PFE", "MDT", "ISRG", "CVS", "HUM",
            "XLV", "^GSPC",
        ],
        sector_etf="XLV",
    ),
    technology=dict(
        tickers=[
            "AAPL", "MSFT", "NVDA", "AVGO", "ADBE", "CRM",
            "AMD", "INTC", "CSCO", "QCOM", "ORCL", "TXN",
            "XLK", "^GSPC",
        ],
        sector_etf="XLK",
    ),
    financials=dict(
        tickers=[
            "JPM", "BAC", "WFC", "MS", "GS", "C",
            "BLK", "PGR", "AXP", "USB", "SCHW", "CB",
            "XLF", "^GSPC",
        ],
        sector_etf="XLF",
    ),
    energy=dict(
        tickers=[
            "XOM", "CVX", "COP", "EOG", "SLB", "OXY",
            "PSX", "MPC", "VLO", "HAL", "KMI",
            "XLE", "^GSPC",
        ],
        sector_etf="XLE",
    ),
    industrials=dict(
        tickers=[
            "CAT", "BA", "HON", "GE", "UPS", "UNP",
            "DE", "RTX", "LMT", "ETN", "EMR", "MMM",
            "XLI", "^GSPC",
        ],
        sector_etf="XLI",
    ),
    utilities=dict(
        tickers=[
            "NEE", "SO", "DUK", "AEP", "EXC",
            "SRE", "XEL", "D", "PEG", "ED",
            "XLU", "^GSPC",
        ],
        sector_etf="XLU",
    ),
    materials=dict(
        tickers=[
            "LIN", "APD", "ECL", "NEM", "FCX",
            "NUE", "SHW", "ALB", "MLM", "VMC",
            "XLB", "^GSPC",
        ],
        sector_etf="XLB",
    ),
    communication_services=dict(
        tickers=[
            "META", "GOOGL", "GOOG", "NFLX",
            "CMCSA", "DIS", "T", "VZ",
            "XLC", "^GSPC",
        ],
        sector_etf="XLC",
    ),
    real_estate=dict(
        tickers=[
            "AMT", "PLD", "EQIX", "PSA",
            "SPG", "CCI", "O", "WELL",
            "XLRE", "^GSPC",
        ],
        sector_etf="XLRE",
    ),
)

In [None]:
# -------------------- Data quality helper --------------------
def validate_df(df: pd.DataFrame, name: str, require_dtindex: bool = True) -> pd.DataFrame:
    """Run basic quality checks on ``df`` and return a cleaned frame.

    Steps, in order:
      1. If ``require_dtindex`` and the index is not a ``DatetimeIndex``, build
         one from the ``'date'`` column when present, otherwise from the
         existing index (unparseable values become ``NaT``).
      2. Make a tz-aware index tz-naive.
      3. Drop rows whose index label repeats (first occurrence wins) and sort
         by index.
      4. Replace +/-inf with NaN, reporting how many numeric cells were inf.
      5. Drop columns that contain only NaN.
      6. Report index labels that are ``NaT``.

    The caller's frame is never modified; every step works on a new object.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to check. ``None`` or an empty frame is returned unchanged.
    name : str
        Label used as a prefix in the printed ``[validate]`` messages.
    require_dtindex : bool, default True
        Whether to coerce the index to a ``DatetimeIndex``.

    Returns
    -------
    pd.DataFrame
        The cleaned frame (or the input itself when it is ``None``/empty).
    """
    if df is None or len(df) == 0:
        print(f"[validate] {name}: empty dataframe")
        return df

    # Ensure DateTimeIndex without touching the caller's object
    if require_dtindex and not isinstance(df.index, pd.DatetimeIndex):
        try:
            if 'date' in df.columns:
                # assign() returns a new frame, so the caller keeps its raw column
                df = df.assign(
                    date=pd.to_datetime(df['date'], utc=False, errors='coerce')
                ).set_index('date')
            else:
                df = df.set_axis(
                    pd.to_datetime(df.index, utc=False, errors='coerce'), axis=0
                )
        except Exception as e:
            # Best effort: keep going with the index as it is
            print(f"[validate] {name}: failed to coerce datetime index ({e})")

    # Drop timezone (again on a new object rather than in place)
    if getattr(df.index, "tz", None) is not None:
        df = df.set_axis(df.index.tz_localize(None), axis=0)

    # Drop duplicate index labels, then sort
    before = len(df)
    df = df[~df.index.duplicated(keep='first')].sort_index()
    if len(df) != before:
        print(f"[validate] {name}: removed {before - len(df)} duplicate index rows")

    # Replace infs. np.isinf is used (not ~np.isfinite) so ordinary NaNs are
    # neither counted as infinities nor trigger a needless replace().
    numeric_values = df.select_dtypes(include=[np.number]).to_numpy(
        dtype=float, na_value=np.nan
    )
    n_infs = int(np.isinf(numeric_values).sum())
    if n_infs:
        df = df.replace([np.inf, -np.inf], np.nan)
        print(f"[validate] {name}: replaced {n_infs} ±inf with NaN")

    # Drop columns that are entirely NaN (mask-based, so duplicate labels are fine)
    all_na_mask = df.isna().all().to_numpy()
    if all_na_mask.any():
        print(f"[validate] {name}: dropping all-NaN columns: {list(df.columns[all_na_mask])}")
        df = df.loc[:, ~all_na_mask]

    # The index is already sorted at this point; the only way it can still be
    # non-monotonic is through NaT labels produced by errors='coerce'.
    n_nat = int(df.index.isna().sum())
    if n_nat:
        print(f"[validate] {name}: {n_nat} row(s) have a missing/unparseable (NaT) index label")

    return df

In [None]:
from src.macro_features import macro_data_orchestrator, normalize_date_col, prepare_macro_for_daily_merge, merge_stocks_and_macros
# (optional) date range
start_date_str = None
end_date_str   = None

# Label -> series id, handed to the orchestrator as `fred_series_ids_dict`.
FRED_series_ids = {
        'CPI': 'CPIAUCSL',
        'FEDERAL_FUNDS_RATE': 'DFF',
        'TREASURY_YIELD': 'DGS10',
        'UNEMPLOYMENT': 'UNRATE',
        'REAL_GDP': 'GDPC1',
        'RETAIL_SALES': 'RSAFS',
        'PAYEMS': 'PAYEMS'
    }

# Labels to fetch, handed to the orchestrator as `macro_funcs_to_fetch`.
macro_funcs = { 'CPI', 'FEDERAL_FUNDS_RATE', 'TREASURY_YIELD',
                'UNEMPLOYMENT', 'REAL_GDP', 'RETAIL_SALES', 'PAYEMS' }

# Try to load cached macro data; if the CSV is missing, run the orchestrator.
# The path is derived from `macro_folder` so the cache location and the
# orchestrator's `save_path` cannot drift apart.
macro_csv_path = os.path.join(macro_folder, 'macros.csv')
try:
    macro_df = pd.read_csv(macro_csv_path)
except FileNotFoundError:
    macro_df = macro_data_orchestrator(
        macro_funcs_to_fetch=macro_funcs,
        fred_series_ids_dict=FRED_series_ids,
        start_date=start_date_str,
        save_path=macro_folder,
    )
else:
    print("Loaded existing macro data from CSV.")
    print(f"Data loaded: {macro_df.shape}")

    # read_csv yields a plain RangeIndex, but the macro frame is later merged
    # on its index against date-indexed frames. Restore a DatetimeIndex here.
    # NOTE(review): the CSV schema is not visible from this notebook; the
    # candidate column names below are an assumption. Confirm against the file
    # that macro_data_orchestrator writes.
    macro_date_col = next(
        (c for c in macro_df.columns
         if str(c).strip().lower() in {"date", "index", "unnamed: 0"}),
        None,
    )
    if macro_date_col is None:
        print("[macro] WARNING: no date column found in cached CSV; "
              "macro_df keeps a non-datetime index")
    else:
        macro_df[macro_date_col] = pd.to_datetime(macro_df[macro_date_col], errors='coerce')
        macro_df = macro_df.set_index(macro_date_col).sort_index()

In [None]:
# -------- Build sector datasets --------
# For each sector: obtain the shared BASE feature frame (parquet cache or fresh
# build), then write one CSV per equity containing that equity's target view
# plus the macro columns lagged by one row.
for sector_name, cfg in SECTORS.items():
    tickers, sector_etf = cfg["tickers"], cfg.get("sector_etf")

    # 1) BASE features: build only when no cached parquet exists
    base_path = os.path.join(base_output_dir, f"{sector_name}__BASE.parquet")
    if not os.path.exists(base_path):
        base_df = build_sector_base_features(
            tickers=tickers,
            kalman_lags=[1, 5, 10],
            dropna_frac=0.90,
            output_path=base_path,
        )
    else:
        base_df = pd.read_parquet(base_path)
        print(f"[cache] loaded {base_path}")

    # Work with a tz-naive index
    index_tz = getattr(base_df.index, "tz", None)
    if index_tz is not None:
        base_df.index = base_df.index.tz_localize(None)

    # Targets: every ticker except "^"-prefixed symbols and the sector ETF
    equities = [t for t in tickers if not (t.startswith("^") or t == sector_etf)]

    # 2) One view per target; the remaining equities are passed as suppliers
    for target in equities:
        suppliers = [t for t in equities if t != target]
        print(f"\n--- {sector_name} :: target={target} ---")
        df_t = make_target_view(
            base_df,
            target_ticker=target,
            supplier_tickers=suppliers,
            benchmark_ticker="^GSPC",
            sector_etf=sector_etf,
        )

        # Attach macro columns, lagged one row to avoid lookahead.
        # NOTE(review): this is an exact-label left join on the index, so it
        # needs macro_df to be date-indexed, and macro rows whose date is not in
        # df_t's index are not carried forward. Confirm macro_df is already on
        # a daily calendar.
        if macro_df.empty:
            merged = df_t
        else:
            merged = df_t.merge(macro_df, how='left', left_index=True, right_index=True)
            macro_cols = macro_df.columns.tolist()
            merged[macro_cols] = merged[macro_cols].shift(1)

        out_path = os.path.join(base_output_dir, f"{sector_name}__{target}.csv")
        merged.to_csv(out_path, index=True)
        print(f"Saved {sector_name}::{target} → {out_path} rows={len(merged):,} cols={merged.shape[1]}")