In [2]:
import pandas as pd
import numpy as np
from pathlib import Path


In [4]:
def load_artifacts_for_ticker(
    out_dir: str = "results_model_test",
    ticker: str = None,
    winsor: str | tuple | None = None,
    engine: str ='pyarrow',   # e.g. "pyarrow" or "fastparquet"
    verbose: bool = True
):
    """
    Load all parquet artifacts for a given ticker (and optional winsor label)
    into a nested dict: artifacts[winsor_label][tag] -> DataFrame.

    winsor:
      - None  -> load ALL winsor variants found for the ticker
      - tuple -> e.g. (0.0, 1.0) becomes "0.0000-1.0000"
      - str   -> use as-is, e.g. "0.0000-1.0000"
    """
    if ticker is None:
        raise ValueError("Please provide `ticker`.")

    def _wlab(w):
        if w is None: return "*"
        if isinstance(w, tuple) and len(w) == 2:
            return f"{w[0]:.4f}-{w[1]:.4f}"
        return str(w)

    wlabel = _wlab(winsor)
    base = Path(out_dir)
    pattern = f"{ticker}__winsor_{wlabel}__*.parquet"
    files = sorted(base.glob(pattern))

    if verbose:
        print(f"[load] scanning {base} for {pattern} → {len(files)} files")

    artifacts: dict[str, dict[str, pd.DataFrame]] = {}
    for fp in files:
        # filename format: {ticker}__winsor_{wlab}__{tag}.parquet or {tag} may contain "__"
        stem = fp.stem  # without .parquet
        parts = stem.split("__")
        if len(parts) < 3:
            if verbose: print(f"[skip] unexpected name: {fp.name}")
            continue
        tkr, _, rest = parts[0], parts[1], "__".join(parts[2:])  # keep any extra "__" inside tag
        # extract winsor label from the second part: "winsor_{wlab}"
        if not parts[1].startswith("winsor_"):
            if verbose: print(f"[skip] missing winsor in name: {fp.name}")
            continue
        wlab = parts[1].replace("winsor_", "", 1)
        tag = rest  # e.g. simple_coef, simple_diag__simple_diag_VIF, rolling_betas, etc.

        try:
            df = pd.read_parquet(fp, engine=engine)
            artifacts.setdefault(wlab, {})[tag] = df
            if verbose:
                print(f"[ok] {ticker} winsor={wlab} tag={tag} rows={len(df)}")
        except Exception as e:
            if verbose:
                print(f"[warn] failed to read {fp.name}: {e}")
            continue

    return artifacts


In [16]:
out_dir   = "results_model_test"
ticker    = 'AAOI'
artifacts = load_artifacts_for_ticker(
    ticker=ticker, out_dir=out_dir,
    winsor=None, verbose=False
    )

In [56]:
from pathlib import Path
import pandas as pd

def load_all_metrics_table(
    out_dir: str = "results_model_test",
    tickers: list[str] | None = None,   # None = auto-discover from filenames
    winsor: str | tuple | None = None,  # None = all; tuple -> "0.0100-0.9900"
    engine: str | None = None,          # e.g. "pyarrow" or "fastparquet"
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Returns a single DataFrame with one row per (ticker, winsor), merging:
      - simple_ols diagnostics (scalars only)  -> columns prefixed 'simple_'
      - ortho_ols  diagnostics (scalars only)  -> columns prefixed 'ortho_'
      - rolling_ols metrics (oos_r2, window)   -> columns prefixed 'rolling_'
    Missing pieces are skipped; rows are created from whatever is available.

    Parquet reading uses pandas.read_parquet; pass `engine` if you want to force
    'pyarrow' or 'fastparquet'.
    """
    base = Path(out_dir)
    if not base.exists():
        raise FileNotFoundError(f"{out_dir} does not exist")

    def _wlab(w):
        if w is None: return "*"
        if isinstance(w, tuple) and len(w) == 2:
            return f"{w[0]:.4f}-{w[1]:.4f}"
        return str(w)

    # discover tickers if not provided
    if tickers is None:
        tickers = sorted({p.name.split("__", 1)[0] for p in base.glob("*__winsor_*__*.parquet")})

    want_wlabel = _wlab(winsor)
    rows = []

    for t in tickers:
        # enumerate winsor labels available for this ticker (optionally filter)
        wlabels = sorted({
            p.name.split("__")[1].replace("winsor_", "", 1)
            for p in base.glob(f"{t}__winsor_*__*.parquet")
        })
        if want_wlabel != "*":
            wlabels = [w for w in wlabels if w == want_wlabel]

        for wlab in wlabels:
            rec = {"ticker": t, "winsor": wlab}

            # simple diag scalars
            try:
                df = pd.read_parquet(base / f"{t}__winsor_{wlab}__simple_diag_scalars.parquet", engine=engine)
                for c, v in df.iloc[0].items():
                    rec[f"simple_{c}"] = v
            except Exception as e:
                if verbose: print(f"[miss] {t} {wlab} simple_diag_scalars: {e}")

            # ortho diag scalars
            try:
                df = pd.read_parquet(base / f"{t}__winsor_{wlab}__ortho_diag_scalars.parquet", engine=engine)
                for c, v in df.iloc[0].items():
                    rec[f"ortho_{c}"] = v
            except Exception as e:
                if verbose: print(f"[miss] {t} {wlab} ortho_diag_scalars: {e}")

            # rolling metrics
            try:
                df = pd.read_parquet(base / f"{t}__winsor_{wlab}__rolling_metrics.parquet", engine=engine)
                # expect columns: ticker, winsor, window, oos_r2
                if "window" in df.columns: rec["rolling_window"] = df["window"].iloc[0]
                if "oos_r2" in df.columns: rec["rolling_oos_r2"] = df["oos_r2"].iloc[0]
            except Exception as e:
                if verbose: print(f"[miss] {t} {wlab} rolling_metrics: {e}")

            # only append if we found something beyond (ticker, winsor)
            if len(rec) > 2:
                rows.append(rec)
            elif verbose:
                print(f"[skip] {t} {wlab}: no metrics found")

    out = pd.DataFrame(rows)
    if not out.empty:
        out = out.sort_values(["ticker", "winsor"]).reset_index(drop=True)
    return out


In [71]:
out = load_all_metrics_table(
    out_dir="results_model_test",
    tickers=None,
    winsor=None,
    engine='pyarrow',
    verbose=False
)

out_100 = load_all_metrics_table(
    out_dir="results_model_test_100",
    tickers=None,
    winsor=None,
    engine='pyarrow',
    verbose=False
)

out_60 = load_all_metrics_table(
    out_dir="results_model_test_60",
    tickers=None,
    winsor=None,
    engine='pyarrow',
    verbose=False
)

In [175]:
out_100

Unnamed: 0,ticker,winsor,rolling_window,rolling_oos_r2
0,AAOI,0.0000-1.0000,100,0.299380
1,AAOI,0.0100-0.9900,100,0.301446
2,AAOI,0.0500-0.9500,100,0.308549
3,ACLS,0.0000-1.0000,100,0.474009
4,ACLS,0.0100-0.9900,100,0.473911
...,...,...,...,...
697,ZM,0.0100-0.9900,100,0.364870
698,ZM,0.0500-0.9500,100,0.368852
699,ZS,0.0000-1.0000,100,0.411632
700,ZS,0.0100-0.9900,100,0.429224


In [177]:
rolling_200 = out.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_200")
rolling_100 = out_100.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_100")
rolling_60 = out_60.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_60")

rolling = pd.concat([rolling_200, rolling_100, rolling_60], axis=1)
rolling['winner'] = rolling.idxmax(axis=1)

  rolling_200 = out.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_200")
  rolling_100 = out_100.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_100")
  rolling_60 = out_60.groupby('ticker').apply(lambda df : df['rolling_oos_r2'].max()).rename("rolling_oos_r2_60")


In [178]:
best_window_per_ticker = rolling.groupby('ticker').apply(lambda df: df[df['winner']].max()).to_frame('oos_r2')
best_window_per_ticker = best_window_per_ticker.reset_index().set_index('level_1').sort_index()
best_window_per_ticker = best_window_per_ticker.sort_values(by=['level_1', 'oos_r2'], ascending=[True, False])

In [179]:
tech_mkt_caps = long[['Symbol', 'Market Cap']].reset_index()
tech_mkt_caps = tech_mkt_caps.drop(columns=['level_0', 'level_1'])
tech_mkt_caps = tech_mkt_caps.set_index('Symbol')

In [236]:
df = pd.concat([out, out_100, out_60], axis=0, join='inner')
winsor_best = df.groupby(['ticker', 'rolling_window'])[['winsor', 'rolling_oos_r2']].apply(lambda df: df[df['rolling_oos_r2'] == df['rolling_oos_r2'].max()])
winsor_ = winsor_best.groupby(['ticker','winsor']).count().reset_index()

In [162]:
best_window_per_ticker['Market Cap'] = best_window_per_ticker['ticker'].map(tech_mkt_caps['Market Cap'])

In [171]:
best_window_per_ticker.sort_values('Market Cap', ascending=False)

Unnamed: 0_level_0,ticker,oos_r2,Market Cap
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rolling_oos_r2_100,NVDA,0.586656,4570000.00
rolling_oos_r2_100,MSFT,0.536114,3690000.00
rolling_oos_r2_100,AVGO,0.441710,1650000.00
rolling_oos_r2_100,TSM,0.533832,1220000.00
rolling_oos_r2_200,ORCL,0.289863,682080.00
...,...,...,...
rolling_oos_r2_200,BTCT,0.011856,18.94
rolling_oos_r2_200,SVRE,-0.019983,13.84
rolling_oos_r2_60,WATT,0.001084,13.16
rolling_oos_r2_200,SCKT,-0.010166,8.76


In [147]:
from Modules.read_in_data_functions import get_sub_sectors_tickers, SUB_SECTOR_DICT

cut_off_cap           = 10_000 
tech_stocks_marketcap = get_sub_sectors_tickers(sector='Technology', n = 60) 
tech_sectors          = list(tech_stocks_marketcap.columns.get_level_values(0).unique())

mask = (tech_stocks_marketcap[tech_stocks_marketcap.loc[:,(slice(None), 'Market Cap')] > cut_off_cap] > 0)
tech_stocks_marketcap = tech_stocks_marketcap.loc[mask.any(axis=1), :]

long = (
    tech_stocks_marketcap
    .stack(level=0, future_stack=True)                       # index: (row, sector); columns: ['Symbol','Market Cap']
    .dropna(subset=['Market Cap'])        # keep rows with a market cap
)

result = (
    long
    .groupby(level=1)                     # group by sector (the stacked level)
    .agg(avg_market_cap=('Market Cap', lambda x : x.mean().round(2)),  # average market cap per sector
        n_stocks=('Symbol', 'count'))    # counts non-null symbols
    .sort_index()
)

semi_tickers                             = tech_stocks_marketcap['semiconductors']['Symbol']
semi_equipment_tickers                   = tech_stocks_marketcap['semiconductor-equipment-and-materials']['Symbol']
software_tickers                         = tech_stocks_marketcap['software-application']['Symbol']
computer_hardware_tickers                = tech_stocks_marketcap['computer-hardware']['Symbol']
comm_equipment_tickers                   = tech_stocks_marketcap['communication-equipment']['Symbol']
software_infrastructure_tickers          = tech_stocks_marketcap['software-infrastructure']['Symbol']
electronic_components_tickers            = tech_stocks_marketcap['electronic-components']['Symbol']
scientific_technical_instruments_tickers = tech_stocks_marketcap['scientific-and-technical-instruments']['Symbol']

stock_tickers = {
            'semi' : semi_tickers,
            'semi_equipment' : semi_equipment_tickers,
            'software' : software_tickers, 'computer_hardware' : computer_hardware_tickers,
            'comm_equipment' : comm_equipment_tickers,
            'software_infrastructure' : software_infrastructure_tickers,
            'electronic_components' : electronic_components_tickers,
            'scientific_technical_instruments' : scientific_technical_instruments_tickers
        }

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbols['Market Cap'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbols['Market Cap'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  symbols['Market Cap'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0_level_0,ticker,oos_r2
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1
rolling_oos_r2_100,MSTR,0.221952
rolling_oos_r2_100,MSFT,0.536114
rolling_oos_r2_100,IONQ,0.197482
rolling_oos_r2_100,TER,0.429309
rolling_oos_r2_100,INTT,0.213073
...,...,...
rolling_oos_r2_60,INDI,0.290508
rolling_oos_r2_60,TTMI,0.370866
rolling_oos_r2_60,OLED,0.468915
rolling_oos_r2_60,BSY,0.263897
