In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from polygon import RESTClient
import datetime as dt

import factor_analysis_functions
import stock_data_functions
import ewma_beta_tuning
from matplotlib.dates import MonthLocator, DateFormatter
from stock_data_functions import TickerComparison
from typing import List, Optional, Dict, Any, Tuple
import requests 
import time
import json
import seaborn as sns
import math
import importlib
importlib.reload(stock_data_functions)
importlib.reload(factor_analysis_functions)
importlib.reload(ewma_beta_tuning)


client = RESTClient('tt2gOLH0fHAmPX70a4QURLFy59PRCZr3')
API_key = 'tt2gOLH0fHAmPX70a4QURLFy59PRCZr3'


import scipy.stats as stats
from fredapi import Fred
fred_api = Fred('e48d0413b1cd0a3b30b58d42225373de')

In [2]:
def missing_per_day(df):
    df = df.copy()
    df['time'] = df.index.strftime('%H:%M')

    return df.groupby('time').count()

def select_time(df):
    t = (df.index.time > dt.time(9,30)) & (df.index.time < dt.time(16,30))

    return df[t]

def test(df):
    df = df.copy()
    df = df.isnull().sum()
    return df

def missing_by_time(df):
    df = df.copy()
    df['time'] = df.index.strftime('%H:%M')
    df = df.groupby('time').apply(test) / df.groupby('time').agg(len)

    return df

### Backtest 1

Idea :
- If we see e.g., 5% + move intraday, does it revert? Does it revert in post market? 
- Categorise based on news
- Categorise based on market cap
- Categorise based on volume etc
- Adjust for beta - calculate EWMA beta signal adjusted

Time frame : last 6 months

##### Initialise Data

In [855]:
import factor_analysis_functions
import importlib
importlib.reload(factor_analysis_functions)
from factor_analysis_functions import run_full_pipeline, rolling_r2_from_intraday, _drop_weekends_index, _select_ext_hours_index, run_full_pipeline_multifactor_r2
importlib.reload(factor_analysis_functions)

<module 'factor_analysis_functions' from '/Users/phillip/Desktop/Moon2/factor_analysis_functions.py'>

In [None]:
def setup_clean_experiment(
    *,
    filing_date_gte: str,
    stock: List[str],
    regressor: List[str],                 # e.g., ["SPY"] or ["SPY","I:NDX"]; we use regressor[0] as the factor
    dt_updated: bool = False,
    dt_updated_reg: bool = False,
    # regime lookbacks
    short_lookback: int = 20,
    long_lookback: int = 60,
    # residual z-scoring
    sigma_lookback_days: int = 20,        # days for time-of-day residual std
    event_windows: Tuple[int, ...] = (5, 10, 15),   # m-minute windows for agg residuals / z
    # minute fetch knobs (pass-through to your pipeline)
    minute_waiting_time: int = 60,
    minute_chunksize: int = 200,
    minute_fetch_in_chunks: bool = False,
    # optional date bounds
    daily_start_date: Optional[str] = None,
    daily_end_date: Optional[str] = None,
    minute_start_date: Optional[str] = None,
    minute_end_date: Optional[str] = None,
    use_optimal_betas: bool = True,
    optimal_beta_kwargs: Dict[str, str] = None
) -> Dict[str, Any]:
    """
    Sets up and runs a clean experiment for multifactor intraday R² analysis.
    """
    # ----------------------- 0) Run your pipeline once -----------------------
    pipe = run_full_pipeline_multifactor_r2(
        filing_date_gte=filing_date_gte,
        dt_updated=dt_updated,
        dt_updated_reg=dt_updated_reg,
        stock=stock,
        regressor=regressor,
        lookback_long=long_lookback, 
        lookback_short=short_lookback, 
        minute_waiting_time=minute_waiting_time,
        minute_chunksize=minute_chunksize,
        minute_fetch_in_chunks=minute_fetch_in_chunks,
        daily_start_date=daily_start_date,
        minute_start_date=minute_start_date,
        daily_end_date=daily_end_date,
        minute_end_date=minute_end_date,
        use_optimal_betas = use_optimal_betas,
        optimal_beta_kwargs = optimal_beta_kwargs
    )

    # Convenience handles
    stock_min_prices = pipe["stock_object_minute"].tickers_stocks_prices     # MultiIndex: (ticker, field)
    reg_min_prices   = pipe["regressor_object_minute"].tickers_stocks_prices
    stock_rets_min   = pipe["stock_returns_minute"]                          # minute close log returns
    reg_rets_min     = pipe["regressor_returns_minute"]
    stock_rets_day   = pipe["stock_returns_daily"]                           # daily close log returns
    reg_rets_day     = pipe["regressor_returns_daily"]
    beta_daily       = pipe["betas_df"]                                      # EWMA betas (daily) vs factor_ticker

    # Minute factor return (Series)
    factor_min_ret = reg_rets_min.copy()

    # ----------------------- 1) Minute meta: trade_date & time-of-day -----------------------
    # Ensure tz-aware US/Eastern minute index
    min_index = stock_rets_min.index
    if getattr(min_index, "tz", None) is None:
        # if not tz-aware, assume US/Eastern per your invariants
        min_index = min_index.tz_localize("US/Eastern")
        stock_rets_min.index = min_index
        reg_rets_min.index   = min_index
        factor_min_ret.index = min_index
    td_series = pd.Series(min_index.tz_convert("US/Eastern").date, index=min_index, name="trade_date")
    tod_series = pd.Series(min_index.tz_convert("US/Eastern").strftime("%H:%M"), index=min_index, name="tod")

    return {'stock_min_prices' : stock_min_prices,
            'reg_min_prices'   : reg_min_prices,
            'stock_rets_min'   : stock_rets_min,
            'reg_rets_min'     : reg_rets_min,
            'stock_rets_day'   : stock_rets_day,
            'reg_rets_day'     : reg_rets_day,
            'beta_daily'       : beta_daily,
            'factor_min_ret'   : factor_min_ret,
            'td_series'        : td_series,
            'tod_series'       : tod_series,
            'min_index'        : min_index,
            'pipe'             : pipe}

In [1680]:
# Regressors and Stocks 
regressor                   = ['SPY','SMH','QQQ','SPLV', 'QVAL', 'SRVR']
stock                       = ['META', 'MU','ALAB', 'TER', 'CRDO', 'AMKR', 'AMD', 'NVDA', 'INTC', 'MSFT', 'AAPL', 'GOOG', 'AAPL']

# Data Pulling
daily_start_date            = (dt.datetime.today() - pd.Timedelta(weeks=104)).strftime('%Y-%m-%d')
dt_updated_reg              = False
dt_updated                  = False

minute_waiting_time     = 30
minute_chunksize        = 200
minute_fetch_in_chunks  = True

daily_start_date        = '2023-11-01'
minute_start_date       = '2023-11-01'
daily_end_date          = '2025-11-03'
minute_end_date         = '2025-11-03'

# Beta Tuning
factor_sets = [
    ['SPY'], ['QQQ'], ['SMH'], ['SRVR'],
    ['SPY', 'SMH'], ['QQQ', 'SMH'],
    ['QQQ', 'SRVR'], ['SPY', 'SRVR'],
    ['SPY', 'QVAL', 'SMH'],
    ['QQQ', 'QVAL', 'SMH'],
    ['QQQ', 'SPLV'], ['SMH', 'SPLV'],
    ['SPLV', 'SRVR'], ['SPLV', 'QVAL','SRVR']
]


ridge_grid  = [1e-8, 1e-6, 1e-4, 1e-3]
lambdas     = [0.90, 0.94, 0.96, 0.98, 0.99]
thresh      = 0.1                       # Threshold for parameter tuning where e.g., 10% difference in pseudo OOS MSE is treated as the same
# Idiosyncratic Return Params
warmup                  = 30 
short_lookback          = 5
long_lookback           = 20
sigma_lookback_days     = 5
event_windows           = (1, 3, 12, 24)

optimal_beta_kwargs = {
    'factor_sets': factor_sets,
    'lambdas': lambdas,
    'thresh': thresh,
    'ridge_grid':ridge_grid,
    'daily_start_date':daily_start_date,
    'date_updated':dt_updated,
    'date_updated_regressor':dt_updated_reg,
}

data = setup_clean_experiment(
    filing_date_gte="2023-09-01",
    stock=stock,
    regressor=regressor,
    dt_updated=dt_updated,
    dt_updated_reg=dt_updated_reg,
    short_lookback=short_lookback,
    long_lookback=long_lookback,
    sigma_lookback_days=sigma_lookback_days,
    event_windows=event_windows,
    minute_waiting_time=minute_waiting_time,
    minute_chunksize=minute_chunksize,
    minute_fetch_in_chunks=minute_fetch_in_chunks,
    optimal_beta_kwargs=optimal_beta_kwargs,
    minute_end_date=minute_end_date,
    daily_end_date=daily_end_date
)

[1/13] META: start
last date saved :  2025-11-03 19:55:00-05:00
Minute level data loaded from CSV (earliest): META_5minute_2023-09_minute_level_data.csv
saving because we have the file and we do not want to update date : 2025-11-03 19:55:00-05:00
2025-11-03 19:55:00-05:00 META
[1/13] META: done
[2/13] MU: start
last date saved :  2025-11-03 19:55:00-05:00
Minute level data loaded from CSV (earliest): MU_5minute_2023-11_minute_level_data.csv
saving because we have the file and we do not want to update date : 2025-11-03 19:55:00-05:00
2025-11-03 19:55:00-05:00 MU
[2/13] MU: done
[3/13] ALAB: start
last date saved :  2025-11-03 19:55:00-05:00
Minute level data loaded from CSV (earliest): ALAB_5minute_2024-03_minute_level_data.csv
saving because we have the file and we do not want to update date : 2025-11-03 19:55:00-05:00
2025-11-03 19:55:00-05:00 ALAB
[3/13] ALAB: done
[4/13] TER: start
last date saved :  2025-11-03 19:30:00-05:00
Minute level data loaded from CSV (earliest): TER_5minute


The behavior of 'isin' with dtype=datetime64[ns] and castable values (e.g. strings) is deprecated. In a future version, these will not be considered matching by isin. Explicitly cast to the appropriate dtype before calling isin instead.



In [314]:
def run_clean_experiment(
        stock            = stock,
        factor_ticker    = 'SPY',
        stock_min_prices = data['stock_min_prices'],
        reg_min_prices   = data['reg_min_prices'],
        stock_rets_min   = data['stock_rets_min'],
        reg_rets_min     = data['reg_rets_min'],
        stock_rets_day   = data['stock_rets_day'],
        reg_rets_day     = data['reg_rets_day'],
        beta_daily       = data['beta_daily'],
        factor_min_ret   = data['factor_min_ret'],
        td_series        = data['td_series'],
        tod_series       = data['tod_series'],
        min_index        = data['min_index'],
        pipe             = data['pipe']
):
   # ----------------------- 2) Lagged betas (no look-ahead) & minute mapping -----------------------
    beta_daily_lag1 = beta_daily.shift(1)  # strict D-1
    # Map D-1 betas to minutes by trade_date
    beta_minute_lag1 = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    for s in stock:
        m = td_series.map(beta_daily_lag1[s])
        beta_minute_lag1[s] = m.values

    # ----------------------- 3) Minute residuals (stock - beta_{D-1} * factor) -----------------------
    residual_1m = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    for s in stock:
        residual_1m[s] = stock_rets_min[s] - beta_minute_lag1[s] * factor_min_ret

    # ----------------------- 4) m-minute residual sums & explained-share -----------------------
    def _roll_sum(df: pd.DataFrame, m: int) -> pd.DataFrame:
        return df.rolling(m, min_periods=m).sum()

    residual_m: Dict[int, pd.DataFrame] = {m: _roll_sum(residual_1m, m) for m in event_windows}
    # For explained share, need stock m-min move and factor m-min move
    stock_m = {m: _roll_sum(stock_rets_min, m) for m in event_windows}
    factor_m = {m: _roll_sum(factor_min_ret.to_frame("f"), m)["f"] for m in event_windows}

    explained_share_m: Dict[int, pd.DataFrame] = {}
    for m in event_windows:
        es = pd.DataFrame(index=min_index, columns=stock, dtype=float)
        fmove = factor_m[m]
        for s in stock:
            # predicted = beta_{D-1} * factor move (use minute-mapped beta)
            pred = beta_minute_lag1[s] * fmove
            actual = stock_m[m][s]
            num = pred.abs()
            den = actual.abs().replace(0, np.nan)
            es[s] = (num / den).clip(0.0, 1.0)
        explained_share_m[m] = es

    # ----------------------- 5) Time-of-day sigma for residual m-min (lagged, no leakage) ---------
    # Helper to compute lagged TOD std by minute bucket across past N days
    def _tod_sigma_lagged(x: pd.Series, td: pd.Series, tod: pd.Series, window_days: int) -> pd.Series:
        df = pd.DataFrame({"val": x, "td": td, "tod": tod})
        df = df.dropna(subset=["val"])
        # sort by (tod, trade_date) so each minute-of-day series is ordered by day
        df = df.sort_values(["tod", "td"])
        # rolling std over past N days PER time-of-day, using shift(1) to avoid using same-day value
        grp = df.groupby("tod", sort=False)["val"]
        sigma = grp.apply(lambda s: s.shift(1).rolling(window_days, min_periods=max(5, window_days//2)).std())
        # restore original index order
        df["sigma"] = sigma.values
        df = df.sort_index()
        out = pd.Series(index=x.index, dtype=float)
        out.loc[df.index] = df["sigma"].values
        return out

    sigma_tod_m: Dict[int, pd.DataFrame] = {}
    z_m: Dict[int, pd.DataFrame] = {}
    for m in event_windows:
        # Compute sigma per stock independently
        sig_df = pd.DataFrame(index=min_index, columns=stock, dtype=float)
        z_df   = pd.DataFrame(index=min_index, columns=stock, dtype=float)
        for s in stock:
            rs = residual_m[m][s]
            sig = _tod_sigma_lagged(rs, td_series, tod_series, sigma_lookback_days)
            sig_df[s] = sig
            z_df[s] = rs / sig
        sigma_tod_m[m] = sig_df
        z_m[m] = z_df

    # ----------------------- 6) Daily regime features: R2 short/long (lagged) -----------------------
    # Recompute R² (short & long) from your minute-labelled table to keep consistency
    labelled_min = pipe["labelled_combined_returns_min"]  # has 'trade_date' already
    # short
    r2_short_daily = rolling_r2_from_intraday(
        labelled_min, beta_daily, lookback_days=short_lookback, factor_col=factor_ticker
    )
    # long
    r2_long_daily = rolling_r2_from_intraday(
        labelled_min, beta_daily, lookback_days=long_lookback, factor_col=factor_ticker
    )
    # Align index types & sort
    r2_short_daily = r2_short_daily.sort_index()
    r2_long_daily  = r2_long_daily.sort_index()
    delta_daily    = (r2_short_daily - r2_long_daily).reindex_like(r2_short_daily)

    r2_short_lag1 = r2_short_daily.shift(1)
    delta_lag1    = delta_daily.shift(1)

    # Map lagged regime features to each minute by trade_date (prior day values)
    regime_short_minute = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    regime_delta_minute = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    td_no_tz = pd.Series(pd.to_datetime(td_series).values.astype("datetime64[D]"), index=td_series.index)
    for s in stock:
        # map by date only
        r2_map   = r2_short_lag1[s].copy()
        r2_map.index = pd.to_datetime(r2_map.index).date
        dlt_map  = delta_lag1[s].copy()
        dlt_map.index = pd.to_datetime(dlt_map.index).date
        regime_short_minute[s] = td_series.map(r2_map).values
        regime_delta_minute[s] = td_series.map(dlt_map).values

    # ----------------------- 7) Volume TOD z-scores (microstructure proxy) -----------------------
    # Pull 5-min volume from minute price tables
    vol_5m = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    for s in stock:
        vol_5m[s] = stock_min_prices[(s, "volume")].reindex(min_index)

    def _tod_z_lagged(x: pd.Series, td: pd.Series, tod: pd.Series, window_days: int) -> pd.Series:
        # like sigma, but z-score
        df = pd.DataFrame({"val": x, "td": td, "tod": tod}).dropna(subset=["val"])
        df = df.sort_values(["tod", "td"])
        grp = df.groupby("tod", sort=False)["val"]
        mean_ = grp.apply(lambda s: s.shift(1).rolling(window_days, min_periods=max(5, window_days//2)).mean())
        std_  = grp.apply(lambda s: s.shift(1).rolling(window_days, min_periods=max(5, window_days//2)).std())
        df["z"] = (df["val"] - mean_.values) / std_.values
        df = df.sort_index()
        out = pd.Series(index=x.index, dtype=float)
        out.loc[df.index] = df["z"].values
        return out

    volume_z_tod = pd.DataFrame(index=min_index, columns=stock, dtype=float)
    for s in stock:
        volume_z_tod[s] = _tod_z_lagged(vol_5m[s], td_series, tod_series, sigma_lookback_days)

    # ----------------------- 8) Package & return -----------------------
    return {
        "pipe": pipe,
        "minute_meta": {"trade_date": td_series, "tod": tod_series},
        "minute_stock_rets": stock_rets_min,
        "minute_factor_ret": factor_min_ret,
        "beta_daily": beta_daily,
        "beta_daily_lag1": beta_daily_lag1,
        "beta_minute_lag1": beta_minute_lag1,
        "residual_1m": residual_1m,
        "residual_m": residual_m,
        "sigma_tod_m": sigma_tod_m,
        "z_m": z_m,
        "explained_share_m": explained_share_m,
        "volume_5m": vol_5m,
        "volume_z_tod": volume_z_tod,
        "regime_daily": {
            "r2_short_daily": r2_short_daily,
            "r2_long_daily":  r2_long_daily,
            "delta_daily":    delta_daily,
            "r2_short_lag1":  r2_short_lag1,
            "delta_lag1":     delta_lag1,
        },
        "regime_minute_lag1": {
            "r2_short": regime_short_minute,
            "delta":    regime_delta_minute,
        },
    }

In [None]:
beta_daily      = data['beta_daily'].copy()
td_series       = data['td_series'].copy()
min_index       = data['min_index'].copy()
stock_rets_min  = data['stock_rets_min'].copy()
factor_min_ret  = data['factor_min_ret'].copy()
tod_series      = data['tod_series'].copy()

# Filter out weekends and times outside when market is open
min_index            = _drop_weekends_index(min_index)
min_index            = _select_ext_hours_index(min_index)
factor_min_ret_index = _drop_weekends_index(factor_min_ret.index)
factor_min_ret_index = _select_ext_hours_index(factor_min_ret_index)

# Filter out when markets are closed
tod_series     = tod_series[min_index].copy()
td_series      = td_series[min_index].copy()
factor_min_ret = factor_min_ret.loc[factor_min_ret_index].copy()

##### Create beta and idioscryntaic matrix


In [None]:
# ----------------------- Single Beta Version (Not Used) -----------------------
# beta_daily_lag1 = beta_daily.shift(1)  # strict D-1
# # Map D-1 betas to minutes by trade_date
# beta_minute_lag1 = pd.DataFrame(index=min_index, columns=stock, dtype=float)
# for s in stock:
#     m = td_series.map(beta_daily_lag1[s])
#     beta_minute_lag1[s] = m.values

# # ----------------------- 3) Minute residuals (stock - beta_{D-1} * factor) -----------------------
# residual_1m = pd.DataFrame(index=min_index, columns=stock, dtype=float)
# for s in stock:
#     if s not in stock_rets_min.columns or stock_rets_min[s].isnull().all():
#         residual_1m.drop(columns=[s], inplace=True)
#         stock.remove(s)
#         print(f'Stock {s} : Data not loaded properly - removing from backtest')
#     else:
#         residual_1m[s] = stock_rets_min[s] - beta_minute_lag1[s] * factor_min_ret


In [None]:
# We want to shift betas back by 1 Day, then assign D-1 betas to each 5 minute interval by trade_date
beta_daily_lag1 = beta_daily.shift(1)  
# Map D-1 betas to minutes by trade_date
beta_minute_lag1 = pd.DataFrame(index=min_index, columns=stock, dtype=float)
beta_minute_lag1 = {}
for s in stock:
    # go by each stock
    cols                  = beta_daily_lag1[s].columns
    beta_daily_lag1_stock = beta_daily_lag1[s].copy()

    beta_daily_lag1        = beta_daily.shift(1)  # strict D-1
    beta_minute_lag1_stock = pd.DataFrame(index=min_index, columns=cols, dtype=float)
    for i in cols:
        # Assign each regressor to the D-1 trade date
        m = td_series.map(beta_daily_lag1_stock[i])
        beta_minute_lag1_stock[i] = m.values

    beta_minute_lag1[s] = beta_minute_lag1_stock.copy()

In [1090]:
#### Calculate residuals
def calc_idio_rets(s, *, stock_rets_min, factor_min_ret, beta_minute_lag1, dtype='float32'):
    # stock returns: drop missing upfront
    Sr    = stock_rets_min[s].dropna().astype(dtype)

    # betas: drop any row with any missing leg
    B_all = beta_minute_lag1[s]
    B     = B_all.dropna(how='any')

    # factors: use only the columns this stock actually uses, then align to B's minutes and drop rows with any NaN
    cols = B.columns.intersection(factor_min_ret.columns)
    F    = factor_min_ret[cols].reindex(B.index).dropna(how='any')

    # final time index = minutes present (and fully observed) in B, F, and Sr
    idx = B.index.intersection(F.index, sort=False).intersection(Sr.index, sort=False)

    B = B.loc[idx, cols].astype(dtype)
    F = F.loc[idx, cols].astype(dtype)
    Sr = Sr.loc[idx].astype(dtype)
    # row-wise dot for factor contribution; no fills anywhere
    fac = np.einsum('ij,ij->i', B.to_numpy(copy=False), F.to_numpy(copy=False), optimize=True)
    fac = pd.Series(fac, index=idx, dtype=dtype)
    resid = Sr - fac
    return resid, fac

# single pass to build both outputs (no double compute)
results = {s: calc_idio_rets(s,
                             stock_rets_min=stock_rets_min,
                             factor_min_ret=factor_min_ret,
                             beta_minute_lag1=beta_minute_lag1)
           for s in stock}

residual_1m = pd.DataFrame({s: r[0] for s, r in results.items()})
fac_rets    = pd.DataFrame({s: r[1] for s, r in results.items()})


In [1091]:
def _roll_sum(df: pd.DataFrame, m: int) -> pd.DataFrame:
    return df.rolling(m, min_periods=m).sum()

stocks_aligned = stock_rets_min[residual_1m.columns].reindex(residual_1m.index)

residual_m: Dict[int, pd.DataFrame] = {m: _roll_sum(residual_1m, m) for m in event_windows}
stock_m:    Dict[int, pd.DataFrame] = {m: _roll_sum(stocks_aligned, m) for m in event_windows}

factor_1m = stocks_aligned - residual_1m
factor_m:  Dict[int, pd.DataFrame] = {m: _roll_sum(factor_1m, m) for m in event_windows}


#### Idiosyncratic Vol & Diagnostics - Event Detection 

In [534]:
import factor_analysis_functions
import importlib
importlib.reload(factor_analysis_functions)
from factor_analysis_functions import sanity_check_sigma_and_z, build_sigma_and_z_from_tod, z_exceedance_diagnostic, plot_hourly_net_resid_for_exceedance_hours

Idio syncratic z score prior days diurnal approach:
- Idea is to get a score that sees how much of the stock's returns are explained by the regressor
- Normalize by the rolling standard deviation of idiosyncratic returns at that time of the day (e.g., 9:30-9:35 past m days)
- in the _tod_sigma_lagged function, we require either half the window length or 5 observations to form an actual sigma observation
- The tod_sigma_lagged function gets you the volatility to normalize - but uses the volatility of the hour not of 5 minutes. 

In [1092]:
sigma_tod_m, z_m = build_sigma_and_z_from_tod(
    residual_m=residual_m,
    td_series=td_series,
    tod_series=tod_series,
    stocks=stock,              
    window_days=sigma_lookback_days
)

This checks the z score anomalies e.g., Z > 10. This groups by
- Session (pre post regular) and ticker and date
- Plots where this happens
- Provides table of the top 10 occurance of these deviations
- This is idiosyncratic movements
- Count is number of times we had something exceed threshold in that session
- z_avg is the average of the abs

In [867]:
m = 3
out = z_exceedance_diagnostic(
    sigma_tod_m=sigma_tod_m,
    z_m=z_m,
    residual_m=residual_m,
    m=m,                   
    Z=3.0,
    top_k=10,
    return_plot=False,
    integrity_atol=1e-2,
    integrity_rtol=1e-2
)

summary     = out["summary_wide"].round(2)        # MultiIndex columns: ticker → ['date','session','z_avg','z_count']
reversion   = out["reversion_wide"]    # ticker → ['date','session','spread','count_above_spread','z_count']
events      = out["events_long"]          # per-bar exceedances with z, sigma, resid

In [876]:
import plotly.io as pio
pio.renderers.default = "browser"

In [877]:
res = plot_hourly_net_resid_for_exceedance_hours(
    sigma_tod_m=sigma_tod_m,
    z_m=z_m,
    residual_m=residual_m,
    m=1,
    Z=3.0,
    tickers=['AMD', 'NVDA'],   # or None to auto-pick top-2
    session=None)
fig = res["fig"]
pio.renderers.default = "browser"
fig.show()


#### Sanity Check

In [552]:
from factor_analysis_functions import sanity_check_sigma_and_z
importlib.reload(factor_analysis_functions)

checks = sanity_check_sigma_and_z(sigma_tod_m=sigma_tod_m, z_m=z_m)
print(checks[1].keys())

dict_keys(['alignment', 'sigma_summary', 'z_summary', 'z_by_hour', 'flags'])


#### Regime Analysis : R2 plots with multi factor 

In [810]:
from factor_analysis_functions import sanity_check_sigma_and_z, run_full_pipeline, rolling_r2_intraday_multifactor_perstock, run_full_pipeline_multifactor_r2
importlib.reload(factor_analysis_functions)

<module 'factor_analysis_functions' from '/Users/phillip/Desktop/Moon2/factor_analysis_functions.py'>

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

def plot_interactive_r2_dashboard_multifactor(
    *,
    filing_date_gte: str,
    stock: Sequence[str],
    regressor: Sequence[str],
    short_lookback: int = 20,
    long_lookback: int = 60,
    # fetch knobs
    dt_updated: bool = False,
    dt_updated_reg: bool = False,
    minute_waiting_time: int = 60,
    minute_chunksize: int = 200,
    minute_fetch_in_chunks: bool = False,
    daily_start_date: str | None = None,
    daily_end_date: str | None = None,
    minute_start_date: str | None = None,
    minute_end_date: str | None = None,
    # presentation
    init_stock: str | None = None,
    title: str | None = None,
    optimal_beta_kwargs: Dict[str, Any] = None,
):
    # ---------- build data ----------
    pipe = run_full_pipeline_multifactor_r2(
        filing_date_gte=filing_date_gte,
        stock=stock,
        regressor=regressor,
        lookback_short=short_lookback,
        lookback_long=long_lookback,
        dt_updated=dt_updated,
        dt_updated_reg=dt_updated_reg,
        minute_waiting_time=minute_waiting_time,
        minute_chunksize=minute_chunksize,
        minute_fetch_in_chunks=minute_fetch_in_chunks,
        daily_start_date=daily_start_date,
        daily_end_date=daily_end_date,
        minute_start_date=minute_start_date,
        minute_end_date=minute_end_date,
        use_optimal_betas=True,
        optimal_beta_kwargs=optimal_beta_kwargs,
    )

    r2s = pipe["r2_short_daily"]   # index: trading_day, cols: tickers
    r2l = pipe["r2_long_daily"]

    # ensure aligned index across both frames
    idx = r2s.index.union(r2l.index).sort_values()
    r2s = r2s.reindex(idx)
    r2l = r2l.reindex(idx)

    # ---------- figure ----------
    fig = go.Figure()
    tickers = list(r2s.columns)
    init = init_stock or (tickers[0] if tickers else None)
    if init is None:
        return go.Figure()

    # add 2 traces per stock, default hidden; show only init
    for t in tickers:
        fig.add_trace(go.Scatter(
            x=idx, y=r2s[t], mode="lines", name=f"{t} R² (short {short_lookback}d)",
            hovertemplate="%{x|%Y-%m-%d}<br>R²=%{y:.3f}<extra></extra>",
            visible=(t == init)
        ))
        fig.add_trace(go.Scatter(
            x=idx, y=r2l[t], mode="lines", name=f"{t} R² (long {long_lookback}d)",
            line=dict(dash="dash"),  # dashed line for long lookback
            hovertemplate="%{x|%Y-%m-%d}<br>R²=%{y:.3f}<extra></extra>",
            visible=(t == init)
        ))

    # dropdown to toggle stocks: flip visibility for the pair of traces belonging to that ticker
    buttons = []
    total = len(fig.data)
    traces_per_stock = 2
    for i, t in enumerate(tickers):
        vis = [False] * total
        base = i * traces_per_stock
        vis[base] = True
        vis[base + 1] = True
        buttons.append(dict(
            label=t,
            method="update",
            args=[{"visible": vis},
                  {"title": f"{t} — Rolling multi-factor R² (short vs long)"}],
        ))

    fig.update_layout(
        updatemenus=[dict(
            type="dropdown", direction="down", x=1.0, xanchor="right", y=1.12, yanchor="top",
            buttons=buttons, showactive=True, bgcolor="white", bordercolor="#ccc", pad=dict(r=8, t=2, b=2, l=2),
        )],
        xaxis=dict(title="Trading day"),
        yaxis=dict(title="R²", rangemode="tozero"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        template="plotly_white",
        margin=dict(l=70, r=70, t=70, b=60),
        title=title or f"{init} — Rolling multi-factor R² (short vs long)"
    )
    pio.templates.default = "plotly_white"
    return fig


In [None]:
fig = plot_interactive_r2_dashboard_multifactor(
        filing_date_gte="2023-09-01",
        stock=stock,
        regressor=['QQQ', 'SPY', 'SMH'],
        short_lookback=short_lookback,
        long_lookback=long_lookback,
        dt_updated=dt_updated,
        dt_updated_reg=dt_updated_reg,
        minute_waiting_time=minute_waiting_time,
        minute_chunksize=minute_chunksize,
        minute_fetch_in_chunks=minute_fetch_in_chunks,
        optimal_beta_kwargs=optimal_beta_kwargs,
        minute_end_date = '2025-11-04'
    )

In [1762]:
import plotly.io as pio
pio.renderers.default = "browser"
fig.show()  # pops a new browser tab

python(98753) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
