# Dataset Screening — Fast Triage

This notebook screens all `{sector}__{ticker}.csv` files for:
- Sample depth & continuity
- Target viability
- Missingness
- Feature diversity
- **Preliminary signal (fast Pearson IC, with optional Spearman confirm)**
- Regime robustness
- Leakage / live-feasibility checks

It then scores and ranks datasets, and writes `dataset_screen_report.json`.

### Speed-up strategies in this version
- Two-stage IC: a **fast rolling Pearson IC on winsorized z-scores** over recent history, followed by a **Spearman confirmation** run only on promoted features (by default the top 30, or any feature with abs(median IC) ≥ 0.01).
- Trim the feature set before computing IC (regex filter, variance filter, and correlation-group representatives).
- Reuse the fast pass wherever possible.

You can tune the speed/rigor tradeoff in the **Configuration** cell.


In [None]:
import os
import sys
import glob
import time
import json
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.modelling_functions import create_target_variable 
from src.qa.file_checks import qa_one_file 

# === Paths ===
# Machine-specific absolute Windows paths; adjust them before running on another machine.
DATA_DIR = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\processed"  # folder holding the {sector}__{ticker}.csv input files
# NOTE(review): SAVE_DIR points at data\raw although it is described as the
# place for aligned/processed output — confirm this is the intended folder.
SAVE_DIR = r"C:\Users\epoch_bpjmdqk\Documents\Code\data\raw"  # where to save aligned/processed data

# Output files of the QA sweep; both are placed directly under SAVE_DIR.
QA_RESULTS_CSV = os.path.join(SAVE_DIR, "qa_sweep_results.csv")
QA_SUMMARY_JSON= os.path.join(SAVE_DIR, "qa_sweep_results.json")


In [None]:
def parse_sector_ticker(filename: str):
    """Split a dataset file name into its ``(sector, ticker)`` parts.

    Directory components and the final extension are dropped first. A stem
    containing ``"__"`` is cut at the first occurrence of that separator.
    Otherwise the stem is split on single underscores and the first and last
    pieces are returned, so a stem without any underscore yields the same
    string for both sector and ticker.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]

    # Preferred naming scheme: "{sector}__{ticker}".
    head, sep, tail = stem.partition("__")
    if sep:
        return head, tail

    # Fallback for names without the double-underscore separator.
    pieces = stem.split("_")
    return pieces[0], pieces[-1]

In [None]:
# %%
# QA sweep: run `qa_one_file` on every CSV found in DATA_DIR, log per-file
# progress, then persist the collected rows (CSV + JSON) and print a compact
# summary table.


def _json_default(obj):
    """Convert values the stdlib ``json`` encoder rejects into plain Python.

    ``json.dump`` only calls this for objects it cannot serialize natively, so
    the output for ordinary Python values is unchanged. It handles NumPy
    scalars, NumPy arrays and anything exposing ``isoformat()`` (for example
    datetime-like objects); everything else still raises ``TypeError``.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# discover files
all_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
total = len(all_files)
if total == 0:
    raise FileNotFoundError(f"No CSVs found in {DATA_DIR}")

# Create the output folder up front so a long sweep cannot fail at write time
# merely because the directory is missing.
os.makedirs(SAVE_DIR, exist_ok=True)

print(f"[qa] Found {total} datasets in: {DATA_DIR}")
if total <= 10:
    print("[qa] Files:", ", ".join(os.path.basename(f) for f in all_files))
else:
    head = ", ".join(os.path.basename(f) for f in all_files[:5])
    tail = ", ".join(os.path.basename(f) for f in all_files[-3:])
    print(f"[qa] First 5: {head}")
    print(f"[qa] Last  3: {tail}")

print("-" * 80)
results = []
ok_count, err_count = 0, 0
t0_all = time.time()
width = len(str(total))  # pad the running index so progress lines align

for i, path in enumerate(all_files, start=1):
    ds = os.path.basename(path)
    sector, ticker = parse_sector_ticker(ds)
    tag = f"[qa {i:>{width}}/{total}]"

    print(f"{tag} START  {ds}  (sector={sector}, ticker={ticker})", flush=True)
    t0 = time.time()
    try:
        # Only the QA call (and its conversion to a dict) is guarded, so a
        # dataset can never be recorded twice.
        row = dict(qa_one_file(path))
    except Exception as e:
        # Sweep boundary: one bad file must not abort the whole run, so the
        # failure is recorded as an error row instead of being re-raised.
        took = time.time() - t0
        err = {"dataset": ds, "sector": sector, "ticker": ticker, "ok": False,
               "error": f"qa_exception: {type(e).__name__}: {e}", "seconds": round(took, 3)}
        results.append(err)
        err_count += 1
        print(f"{tag} ERROR  {ds}  → {err['error']}  ({took:,.2f}s)", flush=True)
    else:
        took = time.time() - t0
        # normalize row & add extras
        row.setdefault("dataset", ds)
        row.setdefault("ok", True)
        row.setdefault("error", "")
        row["sector"] = sector
        row["ticker"] = ticker
        row["seconds"] = round(took, 3)
        results.append(row)

        # A row counts as OK only if it is flagged ok AND carries no error text.
        passed = bool(row["ok"]) and not row["error"]
        if passed:
            ok_count += 1
        else:
            err_count += 1

        status = "OK" if passed else "WARN/ERR"
        print(f"{tag} DONE   {ds}  → {status}  ({took:,.2f}s)", flush=True)

elapsed = time.time() - t0_all
print("-" * 80)
print(f"[qa] Finished {total} datasets in {elapsed:,.1f}s  ✓ ok={ok_count}  ✖ errors|warnings={err_count}")

# persist
qa_df = pd.DataFrame(results)
qa_df.to_csv(QA_RESULTS_CSV, index=False)
with open(QA_SUMMARY_JSON, "w", encoding="utf-8") as f:
    # `default` keeps the dump from failing on NumPy scalars/arrays in the rows.
    json.dump(results, f, indent=2, default=_json_default)

print(f"[qa] Wrote CSV  → {QA_RESULTS_CSV}")
print(f"[qa] Wrote JSON → {QA_SUMMARY_JSON}")

# pretty print a compact table in the notebook/stdout
cols_pref = [
    "dataset","ok","rows","cols","nan_overall","n_sparse_cols","n_near_constant",
    "dup_index_rows","median_gap_days","worst_macro_staleness_days","n_identical_pairs",
    "seconds","error"
]
# Show only the preferred columns that the QA rows actually produced.
show_cols = [c for c in cols_pref if c in qa_df.columns]
print("\n[qa] Summary (first 25 rows):")
print(qa_df[show_cols].head(25).to_string(index=False))
