# Dataset Screening — Fast Triage

This notebook screens all `{sector}__{ticker}.csv` files for:
- Sample depth & continuity
- Target viability
- Missingness
- Feature diversity
- **Preliminary signal (fast Pearson IC, with optional Spearman confirm)**
- Regime robustness
- Leakage / live-feasibility checks

It then scores and ranks datasets, and writes `dataset_screen_report.json`.

### Speed-up strategies in this version
- Two-stage IC: **fast rolling Pearson on winsorized z-scores** over recent history, then **Spearman confirm** only on promoted features (default top 30 or abs(medIC) ≥ 0.01).
- Trim feature set before IC (regex + variance + correlation reps).
- Reuse the fast pass wherever possible.

You can tune the speed/rigor tradeoff in the **Configuration** cell.


In [None]:
import os, sys, glob, time, math, re
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.modelling_functions import create_target_variable

# Directory containing the processed per-ticker CSVs.
# A raw string keeps the Windows backslashes literal.
DATA_DIR = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\processed"

# --- Triage tuning knobs (safe to adjust) ---
RECENT_ROWS = 750           # wide triage only looks at the newest N rows
MIN_SAMPLES = 150           # a feature needs at least this many aligned samples
MAX_NANS = 0.3              # features missing in more than 30% of rows are skipped
TOP_FEATURES_PER_SET = 5    # dataset score = mean of its K best feature scores
TOP_DATASETS_TO_KEEP = 30   # number of datasets forwarded to triage_fine

# --- Target construction (forwarded to create_target_variable) ---
TARGET_WINDOW = 1
TARGET_THRESHOLD = 0.0

# --- Report locations (written next to the input data) ---
RESULTS_CSV = os.path.join(DATA_DIR, "triage_wide_results.csv")
TOPLIST_CSV = os.path.join(DATA_DIR, "triage_wide_toplist.csv")

# Directories that may hold the project's importable packages: the working
# directory, its parent, and a guess at the project root derived from DATA_DIR.
CANDIDATE_PATHS = [
    os.getcwd(),
    os.path.abspath(os.path.join(os.getcwd(), "..")),
    os.path.abspath(os.path.join(os.path.dirname(DATA_DIR), "..")),
]
# Register each candidate that exists on disk and is not yet importable.
for p in CANDIDATE_PATHS:
    if os.path.isdir(p) and p not in sys.path:
        sys.path.append(p)


In [2]:
def parse_sector_ticker(filename):
    """Split a dataset filename into its ``(sector, ticker)`` parts.

    The expected layout is ``{sector}__{ticker}.csv`` (e.g. ``staples__WMT.csv``).
    Any directory components and the extension are ignored. When no double
    underscore is present, the name is split on single underscores and the
    first and last pieces are used instead.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    sector, separator, ticker = stem.partition("__")
    if separator:
        return sector, ticker
    # Fallback for files named with a single underscore.
    pieces = stem.split("_")
    return pieces[0], pieces[-1]

def _point_biserial_fast(x, y_bin):
    """
    Pearson correlation between numeric x and binary y (0/1).
    Returns absolute value; NaN if insufficient variance.
    """
    if x.size < MIN_SAMPLES:
        return np.nan
    if (y_bin.max() == y_bin.min()):
        return np.nan  # constant target
    # standardize to avoid overflow
    x = x.astype(np.float64)
    y = y_bin.astype(np.float64)
    x_mean = np.nanmean(x)
    y_mean = np.nanmean(y)
    x_std = np.nanstd(x)
    y_std = np.nanstd(y)
    if x_std == 0 or y_std == 0 or np.isnan(x_std) or np.isnan(y_std):
        return np.nan
    cov = np.nanmean((x - x_mean) * (y - y_mean))
    r = cov / (x_std * y_std)
    return abs(r)


In [3]:
def _triage_result(file_path, sector, ticker, *, n_rows=0, pos_rate=np.nan,
                   score=np.nan, n_features_scored=0, top_features=None,
                   error=""):
    """Build one triage result row; ``top_features`` is included only when given."""
    result = {
        "dataset": os.path.basename(file_path),
        "sector": sector,
        "ticker": ticker,
        "score": score,
        "n_rows": n_rows,
        "pos_rate": pos_rate,
        "n_features_scored": n_features_scored,
    }
    if top_features is not None:
        result["top_features"] = top_features
    result["error"] = error
    return result


def score_dataset(file_path):
    """Score one dataset CSV by how strongly its features track the target.

    Steps: load the CSV, make sure a ``Close_{ticker}`` column exists, order
    rows by ``Date`` when available, keep the last ``RECENT_ROWS`` rows, build
    the target via ``create_target_variable``, then compute the absolute
    point-biserial correlation of every usable numeric feature against the
    target. The dataset score is the mean of the ``TOP_FEATURES_PER_SET``
    highest feature scores.

    Parameters
    ----------
    file_path : str
        Path to a ``{sector}__{ticker}.csv`` file.

    Returns
    -------
    dict
        Always contains ``dataset``, ``sector``, ``ticker``, ``score``,
        ``n_rows``, ``pos_rate``, ``n_features_scored`` and ``error``.
        ``error`` is ``""`` on success, in which case ``top_features`` is
        present as well; on failure ``score`` is NaN and ``error`` names the
        failing step. Read and target-construction failures are reported this
        way rather than raised.
    """
    sector, ticker = parse_sector_ticker(file_path)

    # Load
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        return _triage_result(file_path, sector, ticker, error=f"read_error: {e}")

    # Ensure the ticker-specific close column exists, accepting plain "Close".
    close_col = f"Close_{ticker}"
    if close_col not in df.columns:
        if "Close" not in df.columns:
            return _triage_result(
                file_path, sector, ticker,
                n_rows=len(df),
                error=f"missing_close_col ({close_col})",
            )
        df = df.rename(columns={"Close": close_col})

    # Sort by date if present (best effort: on failure the file order is kept).
    if "Date" in df.columns:
        try:
            df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
            # Unparseable dates become NaT; put them first so they are not
            # mistaken for the most recent rows by the tail() below.
            df = df.sort_values("Date", na_position="first")
        except Exception:
            pass

    # Use only recent rows for speed
    if RECENT_ROWS and len(df) > RECENT_ROWS:
        df = df.tail(RECENT_ROWS).copy()

    # Create the target with the project helper.
    try:
        df = create_target_variable(
            df, ticker=ticker, window=TARGET_WINDOW, threshold=TARGET_THRESHOLD
        )
    except Exception as e:
        return _triage_result(
            file_path, sector, ticker,
            n_rows=len(df),
            error=f"target_error: {e}",
        )

    n_rows = len(df)
    target_col = f"{ticker}_Target"
    ret_col = f"{ticker}_target_return_{TARGET_WINDOW}D_{TARGET_THRESHOLD}"
    if target_col not in df.columns:
        return _triage_result(
            file_path, sector, ticker,
            n_rows=n_rows,
            error=f"missing_target_col ({target_col})",
        )

    # Candidate features: non-object columns that are neither the target,
    # the return column it was derived from, the date, nor another target.
    exclude = {target_col, ret_col, "Date"}
    feature_cols = [c for c in df.columns
                    if c not in exclude and df[c].dtype != "O" and not c.endswith("_Target")]
    if not feature_cols:
        return _triage_result(
            file_path, sector, ticker,
            n_rows=n_rows,
            pos_rate=df[target_col].mean(),
            error="no_numeric_features",
        )

    y = df[target_col].astype("float32").to_numpy()
    y_missing = np.isnan(y)
    pos_rate = float(np.nanmean(y)) if y.size else np.nan
    scores = []

    # Score each feature with fast point-biserial correlation vs binary target
    for col in feature_cols:
        # Force a float array with NaN for missing values so nullable dtypes
        # do not produce object arrays that np.isnan cannot handle.
        x = pd.to_numeric(df[col], errors="coerce").to_numpy(
            dtype="float64", na_value=np.nan
        )
        mask = ~(np.isnan(x) | y_missing)
        n_valid = int(mask.sum())
        if n_valid < MIN_SAMPLES:
            continue
        # drop columns with too many NaNs after alignment
        if 1 - (n_valid / len(mask)) > MAX_NANS:
            continue
        r = _point_biserial_fast(x[mask], y[mask])
        if not np.isnan(r):
            scores.append((col, r))

    if not scores:
        return _triage_result(
            file_path, sector, ticker,
            n_rows=n_rows,
            pos_rate=pos_rate,
            error="no_scored_features",
        )

    # Aggregate: mean of top-K absolute correlations
    scores.sort(key=lambda t: t[1], reverse=True)
    top_k = scores[:TOP_FEATURES_PER_SET]
    agg_score = float(np.mean([s for _, s in top_k]))

    return _triage_result(
        file_path, sector, ticker,
        score=agg_score,
        n_rows=n_rows,
        pos_rate=pos_rate,
        n_features_scored=len(scores),
        top_features=[c for c, _ in top_k],
    )


In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_wide_triage():
    """Score every dataset CSV in ``DATA_DIR`` and write the ranked reports.

    Each ``*.csv`` in ``DATA_DIR`` is scored with ``score_dataset`` on a
    thread pool. The full ranking is written to ``RESULTS_CSV`` and the best
    ``TOP_DATASETS_TO_KEEP`` successfully scored datasets to ``TOPLIST_CSV``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The full result table sorted by descending score (failed datasets
        last), and the toplist subset.

    Raises
    ------
    FileNotFoundError
        If ``DATA_DIR`` contains no dataset CSVs.
    """
    def _norm(path):
        return os.path.normcase(os.path.abspath(path))

    # The reports are written into DATA_DIR itself, so without this filter a
    # second run would pick up its own previous outputs as datasets.
    own_outputs = {_norm(RESULTS_CSV), _norm(TOPLIST_CSV)}
    # Sorted so that the processing (and tie-breaking) order is reproducible.
    all_files = sorted(
        f for f in glob.glob(os.path.join(DATA_DIR, "*.csv"))
        if _norm(f) not in own_outputs
    )
    if not all_files:
        raise FileNotFoundError(f"No CSVs found in {DATA_DIR}")

    start = time.time()

    # Good default for I/O-bound work
    max_workers = min(32, (os.cpu_count() or 4) * 4)

    def _safe_score(path):
        try:
            return score_dataset(path)
        except Exception as e:
            # Ensure a single bad file doesn't kill the batch
            sector, ticker = parse_sector_ticker(path)
            return {
                "dataset": os.path.basename(path),
                "sector": sector,
                "ticker": ticker,
                "score": np.nan,
                "n_rows": 0,
                "pos_rate": np.nan,
                "n_features_scored": 0,
                "error": f"worker_exception: {type(e).__name__}: {e}",
            }

    # map() yields results in submission order, keeping the table deterministic;
    # _safe_score already converts exceptions into error rows.
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        results = list(ex.map(_safe_score, all_files))

    dfres = pd.DataFrame(results)
    scored = dfres["score"].notna()
    # If everything failed, be explicit
    if scored.sum() == 0:
        print("Warning: all datasets failed; check 'error' column for details.")
    # Stable sort so equal scores keep their (file-name) order.
    dfres = dfres.sort_values(
        ["score"], ascending=[False], na_position="last", kind="mergesort"
    )
    dfres.to_csv(RESULTS_CSV, index=False)

    # Only datasets that actually received a score are promoted; failed ones
    # (NaN score) must not be handed on even if fewer than the quota succeed.
    toplist = dfres[dfres["score"].notna()].head(TOP_DATASETS_TO_KEEP).copy()
    toplist[["dataset", "sector", "ticker", "score"]].to_csv(TOPLIST_CSV, index=False)

    elapsed = time.time() - start
    print(f"Triage complete in {elapsed:,.1f}s")
    print(f"Saved: {RESULTS_CSV}")
    print(f"Saved top {TOP_DATASETS_TO_KEEP}: {TOPLIST_CSV}")
    return dfres, toplist