In [None]:
# --- Calibrate Stage 2 (scalar temperature on eval) --------------------------
import json
print("\nðŸ§ª Calibrating Stage 2 (scalar T on eval set)...")

pred_eval = trainer_s2.predict(eval_dataset_s2)
logits_s2_val = torch.from_numpy(pred_eval.predictions).to(device)   # [N, C]
labels_s2_val = torch.from_numpy(pred_eval.label_ids).long().to(device)

# If your S2 fitter takes (logits, labels), use it directly:
T_s2 = fit_temperature(logits_s2_val, labels_s2_val)  # LBFGS-based; returns float

# Save alongside the S2 export so inference can pick it up automatically
s2_calib_path = os.path.join(SAVE_DIR, "emotion_classifier_model", "stage2_calibration.json")
os.makedirs(os.path.dirname(s2_calib_path), exist_ok=True)
with open(s2_calib_path, "w") as f:
    json.dump({
        "T": float(max(1e-6, T_s2)),
        "val_size": int(labels_s2_val.numel()),
        "notes": "Scalar temperature via NLL on eval; seed=42"
    }, f, indent=2)

print(f"âœ… S2 calibration done: T={T_s2:.3f} â†’ {s2_calib_path}")

# (Optional) if you want the final on-screen eval to reflect calibrated T:
trainer_s2.compute_metrics = partial(
    compute_metrics_with_confusion,
    label_names=RELEVANT_CLASSES,
    stage_name="Stage2",
    s2_temperature=float(T_s2),
)
_ = trainer_s2.evaluate(eval_dataset_s2)


In [1]:
"""
V35 dataset preflight & curation helper (final, merged)
------------------------------------------------------
Automates 4 tasks before running V35 Stage-2 training:

1) Shortlist parity check & regeneration from full inference log (thresholded by confidence).
2) Patch-set assembly for fragile corridors (sadness/speech_action vs neutral_speech) using V34 artifacts,
   with an integrated, safe "top-up sadness" step that prefers shortlist sources and falls back to full log.
3) Label & file integrity checks (exists, readable image, label in RELEVANT_CLASSES).
4) Split hygiene: ensure patch items are train-only (exclude any residing under eval_dir).

Outputs:
  - curation_shortlist_V35_auto.csv
  - patch_V35.csv
  - v35_prep_report.txt  (human-readable summary)
"""

import os
from pathlib import Path
from typing import List, Optional, Tuple, Dict
import pandas as pd
from PIL import Image

In [2]:
# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
CONF_THRESHOLD = 0.85   # shortlist threshold
RELEVANT_CLASSES = [
    "anger","contempt","disgust","fear","happiness","neutral",
    "questioning","sadness","surprise","neutral_speech","speech_action"
]

# Your V35 run root (unchanged)
V35_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112"
DATASET_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels"
CONF_THRESHOLD = 0.85

ARTIFACTS = {
    "full_inference":  f"{V35_DIR}/V35_full_inference_log.csv",   # if not present, set to V34 log and note it
    "shortlist_auto":  f"{V35_DIR}/curation_shortlist_V36_auto.csv",
    "shortlist_prev":  f"{V35_DIR}/curation_shortlist_V35.csv",    # if absent, this will be skipped gracefully
    "patch":           f"{V35_DIR}/patch_V36.csv"
}

# 1) Build shortlist from V35 full log (below threshold)
n_short, n_total = build_shortlist(
    ARTIFACTS["full_inference"],
    CONF_THRESHOLD,
    ARTIFACTS["shortlist_auto"]
)
print(f"[Shortlist] threshold={CONF_THRESHOLD} â†’ {n_short} / {n_total} rows â†’ {ARTIFACTS['shortlist_auto']}")

# 2) Assemble patch (corridors + sadness top-up + cap speech_action)
n_patch_raw, raw_counts = assemble_patch(ARTIFACTS, classes=[
    'anger','contempt','disgust','fear','happiness','neutral','questioning',
    'sadness','surprise','neutral_speech','speech_action'
], out_csv=ARTIFACTS["patch"])
print(f"[Patch] pre-validation rows={n_patch_raw} (counts by label={raw_counts})")

# 3) Validate (existence, read, no eval-leak); write back to same path
n_in, n_drop_missing, n_drop_eval, n_out = validate_patch(
    ARTIFACTS["patch"], 
    classes=[
        'anger','contempt','disgust','fear','happiness','neutral','questioning',
        'sadness','surprise','neutral_speech','speech_action'
    ],
    stage2_train_dir=DATASET_ROOT,      # your data are organized by label folders; no split dirs
    stage2_eval_dir=None,               # no dedicated val dir â€“ function handles None
    out_csv=ARTIFACTS["patch"]
)
print(f"[Validate] input={n_in}, dropped_missing_or_unreadable={n_drop_missing}, "
      f"dropped_eval_leak={n_drop_eval}, final={n_out} â†’ {ARTIFACTS['patch']}")

# Optional: dataset dirs for split hygiene. If unknown, leave as None.
STAGE2_TRAIN_DIR = None  # e.g., "/Users/you/datasets/stage2/train"
STAGE2_EVAL_DIR  = None  # e.g., "/Users/you/datasets/stage2/val"

# Outputs go inside V34 root
OUT_SHORTLIST = f"{V34_ROOT}/curation_shortlist_V35_auto.csv"
OUT_PATCH     = f"{V34_ROOT}/patch_V35.csv"
OUT_REPORT    = f"{V34_ROOT}/v35_prep_report.txt"
ARTIFACTS["shortlist_auto"] = OUT_SHORTLIST

# --- class balancing / top-up knobs ---
FLOOR_SADNESS       = 200      # ensure at least this many sadness examples in patch
CAP_SPEECH_ACTION   = 900      # cap speech_action in patch
TARGET_SADNESS_ADD  = 250      # preferred additional sadness to try add (if available)

In [3]:
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
def _ensure_dir(p: str):
    Path(p).parent.mkdir(parents=True, exist_ok=True)

def _load_csv_safe(path: str) -> Optional[pd.DataFrame]:
    if not path or not Path(path).exists():
        return None
    try:
        return pd.read_csv(path)
    except Exception:
        return None

def _open_ok(img_path: str) -> bool:
    try:
        with Image.open(img_path) as im:
            im.verify()  # quick decode check
        return True
    except Exception:
        return False

def _guess_path_column(df: pd.DataFrame) -> Optional[str]:
    candidates = ["filepath","file_path","path","image_path","img_path","file","image","filename"]
    for c in candidates:
        if c in df.columns:
            return c
    return None

def _guess_label_column(df: pd.DataFrame) -> Optional[str]:
    candidates = ["label","target_label","true_label","assigned_label","class","category"]
    for c in candidates:
        if c in df.columns:
            return c
    return None

def _is_under_dir(child: str, parent: str) -> bool:
    try:
        child_path = Path(child).resolve()
        parent_path = Path(parent).resolve()
        return parent_path in child_path.parents or child_path == parent_path
    except Exception:
        return False

# -----------------------------------------------------------------------------
# 1) Shortlist parity check & regeneration
# -----------------------------------------------------------------------------
def build_shortlist(full_inference_csv: str, threshold: float, out_csv: str) -> Tuple[int, int]:
    df = _load_csv_safe(full_inference_csv)
    if df is None:
        return 0, 0
    if "confidence" not in df.columns:
        # cannot compute; write empty file with header if possible
        df0 = pd.DataFrame(columns=list(df.columns))
        _ensure_dir(out_csv)
        df0.to_csv(out_csv, index=False)
        return 0, len(df)
    mask = df["confidence"] < threshold
    shortlist = df.loc[mask].copy()
    _ensure_dir(out_csv)
    shortlist.to_csv(out_csv, index=False)
    return len(shortlist), len(df)

# -----------------------------------------------------------------------------
# 2) Patch-set assembly (corridor-focused + curated sources + top-up sadness)
# -----------------------------------------------------------------------------
def assemble_patch(artifacts: Dict[str,str],
                   classes: List[str],
                   out_csv: str) -> Tuple[int, Dict[str,int]]:
    """
    Build a patch set from curated + hard_negatives + (previous & auto) shortlists.
    * Dedupes by filepath.
    * Infers label from folder name if missing.
    * Prioritizes corridor (sadness/neutral_speech/speech_action).
    * TOP-UP: add sadness from shortlist_auto/shortlist_prev/full_log (safe, deduped).
    * Applies FLOOR/CAP (sadness floor, speech_action cap).
    """
    frames = []
    # 2.1) Include curated + hard-negatives + shortlists (NOT the full log here)
    for key in [
        "curated_additions",
        "curated_additions_merged",
        "hard_neg_1",
        "hard_neg_2",
        "shortlist_prev",
        "shortlist_auto",
    ]:
        p = artifacts.get(key)
        df = _load_csv_safe(p)
        if df is not None and not df.empty:
            df["__source"] = key
            frames.append(df)

    if not frames:
        _ensure_dir(out_csv)
        pd.DataFrame(columns=["filepath","label","__source"]).to_csv(out_csv, index=False)
        return 0, {}

    big = pd.concat(frames, axis=0, ignore_index=True)

    # 2.2) Standardize columns
    path_col = _guess_path_column(big)
    if path_col is None:
        # heuristic: fall back to first column that looks like a path
        for col in big.columns:
            if big[col].astype(str).str.contains(r"/|\\").any():
                path_col = col
                break
    if path_col is None:
        _ensure_dir(out_csv)
        pd.DataFrame(columns=["filepath","label","__source"]).to_csv(out_csv, index=False)
        return 0, {}

    big = big.rename(columns={path_col: "filepath"})
    if "label" not in big.columns:
        big["label"] = ""

    # infer label from folder name if missing/invalid
    def _infer_label_from_path(fp, classes):
        try:
            parent = Path(str(fp)).parent.name
            return parent if parent in classes else ""
        except Exception:
            return ""

    big["label"] = big["label"].where(
        big["label"].isin(classes),
        big["filepath"].apply(lambda x: _infer_label_from_path(x, classes))
    )

    # keep only recognized labels + non-null paths
    big = big.dropna(subset=["filepath"]).copy()
    big["filepath"] = big["filepath"].astype(str)
    big = big[big["label"].isin(classes)].copy()

    # 2.3) Dedup by filepath (keep first occurrence per source order)
    big = big.drop_duplicates(subset=["filepath"], keep="first")

    # 2.4) Priority-first ordering (corridor first)
    if "predicted_class" in big.columns:
        mask_corridor = big["predicted_class"].isin(["neutral_speech","sadness","speech_action"])
    else:
        mask_corridor = big["label"].isin(["neutral_speech","sadness","speech_action"])
    mask_corridor = mask_corridor.reindex(big.index, fill_value=False)

    patch = pd.concat([big[mask_corridor], big[~mask_corridor]], ignore_index=True)


    # 2.5) TOP-UP sadness (prefer shortlist_auto, then shortlist_prev, then full log) â€” robust
    have_sad = int((patch["label"] == "sadness").sum())
    need_floor = max(0, FLOOR_SADNESS - have_sad)
    
    # Also try to add up to TARGET_SADNESS_ADD total new sadness samples (without exceeding availability)
    target_add = max(need_floor, TARGET_SADNESS_ADD if have_sad < FLOOR_SADNESS else 0)
    
    if target_add > 0:
        # choose candidate source
        cands = None
        for key in ("shortlist_auto", "shortlist_prev"):
            cands = _load_csv_safe(artifacts.get(key, ""))
            if cands is not None and not cands.empty:
                cands["__source"] = key
                break
        if cands is None:
            cands = _load_csv_safe(artifacts.get("full_inference", ""))
            if cands is not None and not cands.empty:
                cands["__source"] = "full_inference"
    
        if cands is not None and not cands.empty:
            # ---- normalize path column for cands ----
            path_col_c = _guess_path_column(cands)
            if path_col_c is None:
                # heuristic: pick any column that looks like a path
                for col in cands.columns:
                    try:
                        if cands[col].astype(str).str.contains(r"/|\\").any():
                            path_col_c = col
                            break
                    except Exception:
                        pass
            if path_col_c is None:
                # cannot proceed without a path column
                pass
            else:
                if path_col_c != "filepath":
                    cands = cands.rename(columns={path_col_c: "filepath"})
                cands = cands.dropna(subset=["filepath"]).copy()
                cands["filepath"] = cands["filepath"].astype(str)
    
                # ensure expected columns exist
                for col in ["predicted_class", "top2_class", "confidence"]:
                    if col not in cands.columns:
                        cands[col] = None
    
                # build sadness corridor mask
                has_pred = "predicted_class" in cands.columns and cands["predicted_class"].notna().any()
                if has_pred:
                    mask_sad = (
                        (cands["predicted_class"] == "sadness") |
                        (
                            (cands["predicted_class"] == "neutral_speech") &
                            (
                                (cands.get("top2_class") == "sadness") |
                                (cands.get("confidence").fillna(1.0) <= 0.90)
                            )
                        )
                    )
                    sad_cands = cands.loc[mask_sad, ["filepath", "predicted_class", "confidence"]].copy()
                else:
                    # fallback: infer from folder name if no predictions present
                    sad_cands = cands[["filepath"]].copy()
                    sad_cands["__folder_label"] = sad_cands["filepath"].apply(
                        lambda p: Path(p).parent.name if isinstance(p, str) else ""
                    )
                    sad_cands = sad_cands[sad_cands["__folder_label"] == "sadness"].copy()
    
                # existence check and dedupe against current patch
                sad_cands = sad_cands[sad_cands["filepath"].map(lambda p: os.path.isfile(p))]
                already = set(patch["filepath"].astype(str))
                sad_cands = sad_cands[~sad_cands["filepath"].isin(already)]
    
                # bias by lower confidence first if available
                if "confidence" in sad_cands.columns:
                    try:
                        sad_cands = sad_cands.sort_values(by="confidence", ascending=True, na_position="last")
                    except Exception:
                        pass
    
                # take up to target_add
                if len(sad_cands) > target_add:
                    sad_cands = sad_cands.head(target_add)
    
                if not sad_cands.empty:
                    add_df = sad_cands[["filepath"]].copy()
                    add_df["label"] = "sadness"
                    patch = pd.concat([patch, add_df], ignore_index=True)
                    patch = patch.drop_duplicates(subset=["filepath"], keep="first")

    # 2.6) CAP speech_action if needed
    sa_count = int((patch["label"] == "speech_action").sum())
    if sa_count > CAP_SPEECH_ACTION:
        keep_sa = patch[patch["label"] == "speech_action"].sample(
            n=CAP_SPEECH_ACTION, random_state=42, replace=False
        )
        non_sa = patch[patch["label"] != "speech_action"]
        patch  = pd.concat([non_sa, keep_sa], ignore_index=True)

    # 2.7) Write out
    _ensure_dir(out_csv)
    patch.to_csv(out_csv, index=False)

    counts = patch["label"].value_counts().to_dict()
    return len(patch), counts

# -----------------------------------------------------------------------------
# 3) Validate + sanitize patch (files/labels/split hygiene)
# -----------------------------------------------------------------------------
def validate_patch(in_csv: str,
                   classes: List[str],
                   train_dir: Optional[str],
                   eval_dir: Optional[str],
                   out_csv: str) -> Tuple[int,int,int,int]:
    """
    Validate and sanitize a patch CSV:
      - keep only rows with existing files
      - keep only labels in `classes`
      - optionally drop rows that live under `eval_dir` (leak guard)
      - write cleaned CSV to `out_csv`
    Returns: (n_input, n_dropped_missing, n_dropped_eval, n_output)
    """
    def _ensure_dir_local(p: str):
        Path(os.path.dirname(p) or ".").mkdir(parents=True, exist_ok=True)

    def _safe_read(csv_path: str) -> Optional[pd.DataFrame]:
        try:
            df = pd.read_csv(csv_path)
            return df if not df.empty else None
        except Exception:
            return None

    df = _safe_read(in_csv)
    if df is None:
        _ensure_dir_local(out_csv)
        pd.DataFrame(columns=["filepath","label"]).to_csv(out_csv, index=False)
        return (0, 0, 0, 0)

    # normalize columns
    if "filepath" not in df.columns:
        for c in df.columns:
            if df[c].astype(str).str.contains(r"/|\\").any():
                df = df.rename(columns={c: "filepath"})
                break
    if "filepath" not in df.columns:
        _ensure_dir_local(out_csv)
        pd.DataFrame(columns=["filepath","label"]).to_csv(out_csv, index=False)
        return (0, 0, 0, 0)

    if "label" not in df.columns:
        df["label"] = ""

    n_input = len(df)

    # drop missing/unreadable files
    df["filepath"] = df["filepath"].astype(str)
    mask_exists = df["filepath"].apply(lambda p: os.path.isfile(p))
    n_drop_missing = int((~mask_exists).sum())
    df = df[mask_exists].copy()

    # keep only allowed labels
    df["label"] = df["label"].astype(str)
    df = df[df["label"].isin(classes)].copy()

    # drop eval-leak rows, if eval_dir provided
    n_drop_eval = 0
    if eval_dir:
        eval_dir_resolved = str(Path(eval_dir).resolve())
        def _is_in_eval(p):
            try:
                return str(Path(p).resolve()).startswith(eval_dir_resolved + os.sep)
            except Exception:
                return False
        mask_eval = df["filepath"].apply(_is_in_eval)
        n_drop_eval = int(mask_eval.sum())
        df = df[~mask_eval].copy()

    # dedupe by filepath
    df = df.drop_duplicates(subset=["filepath"], keep="first")

    # write out
    _ensure_dir_local(out_csv)
    df.to_csv(out_csv, index=False)

    n_output = len(df)
    return (n_input, n_drop_missing, n_drop_eval, n_output)

In [4]:
# -----------------------------------------------------------------------------
# 4) Driver
# -----------------------------------------------------------------------------
def main():
    report_lines = []

    # (1) Shortlist regeneration
    n_short, n_total = build_shortlist(ARTIFACTS["full_inference"], CONF_THRESHOLD, OUT_SHORTLIST)
    report_lines.append(f"[Shortlist] threshold={CONF_THRESHOLD} â†’ {n_short} / {n_total} rows written to {OUT_SHORTLIST}")

    # Compare against prior shortlist if present
    prev = _load_csv_safe(ARTIFACTS["shortlist_prev"])
    if prev is not None:
        report_lines.append(f"[Shortlist] previous shortlist detected: {ARTIFACTS['shortlist_prev']} (rows={len(prev)})")

    # (2) Assemble patch (includes corridor priority + TOP-UP sadness + CAP speech_action)
    n_patch_raw, raw_counts = assemble_patch(ARTIFACTS, RELEVANT_CLASSES, OUT_PATCH)
    report_lines.append(f"[Patch] pre-validation rows={n_patch_raw} (counts by label={raw_counts})")

    # (3) Validate patch files/labels and enforce split hygiene
    n_in, n_drop_missing, n_drop_eval, n_out = validate_patch(
        OUT_PATCH, RELEVANT_CLASSES, STAGE2_TRAIN_DIR, STAGE2_EVAL_DIR, OUT_PATCH
    )
    report_lines.append(
        f"[Validate] input={n_in}, dropped_missing_or_unreadable={n_drop_missing}, "
        f"dropped_eval_leak={n_drop_eval}, final={n_out} â†’ {OUT_PATCH}"
    )

    # (4) Save summary report
    report = "\n".join(report_lines) + "\n"
    Path(OUT_REPORT).write_text(report, encoding="utf-8")
    print(report)

if __name__ == "__main__":
    main()

[Shortlist] threshold=0.85 â†’ 5207 / 26902 rows written to /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V34_20251013_211825/curation_shortlist_V35_auto.csv
[Shortlist] previous shortlist detected: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V34_20251013_211825/curation_shortlist_V34.csv (rows=2590)
[Patch] pre-validation rows=1924 (counts by label={'speech_action': 900, 'neutral_speech': 320, 'sadness': 200, 'happiness': 189, 'neutral': 157, 'surprise': 65, 'questioning': 60, 'fear': 20, 'contempt': 12, 'anger': 1})
[Validate] input=1924, dropped_missing_or_unreadable=0, dropped_eval_leak=0, final=1924 â†’ /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V34_20251013_211825/patch_V35.csv

