In [1]:
# ==============================================================================
# 🧠 V32 Review Shortlist Generator
# ------------------------------------------------------------------------------

# Macro Description:
# ------------------
# This script automates the process of creating a *targeted shortlist* of
# low-confidence or high-entropy predictions from the full inference results
# (`V32_full_inference_log.csv`). It is designed to help you quickly identify
# and manually verify *high-value review samples* that will most improve
# training quality for the next model run (e.g., V33).

# Instead of reviewing all low-confidence predictions, this script:
#   1. Loads the V32 full inference log.
#   2. Filters only the classes you care about most (e.g., sadness, neutral, etc.).
#   3. Sorts each class subset by lowest confidence (or highest entropy).
#   4. Copies the top N images per class into a new `review_shortlist/` folder.
#   5. Saves:
#         - `curation_shortlist_V32.csv`: list of the copied samples, with
#           original predicted class and confidence.
#         - `curated_additions_V32.csv`: blank template CSV for you to record
#           correct labels (used as your curation patch in V33).

# This process enables fast, structured human review and clean integration
# of curated corrections into your next fine-tuning stage.

# -------------------------------------------------------------------------------
# Typical Usage:
#     1. Adjust `V32_DIR` and the `CLASSES` list to match your environment.
#     2. Run this script once.
#     3. Open `review_shortlist/` and inspect images visually.
#     4. Fill in `curated_additions_V32.csv` with correct labels and notes.
#     5. Feed that file into V33’s dataset builder (for oversampling).
# -------------------------------------------------------------------------------

In [2]:
import os, shutil
import pandas as pd

In [3]:
# --------------------------------------------------------------------------
# 🔧 CONFIGURATION — edit these values to match your environment
# --------------------------------------------------------------------------
# Predicted-label values to shortlist; rows predicted as any other class are ignored.
CLASSES = [
    "sadness",
    "neutral",
    "neutral_speech",
    "happiness",
    "speech_action",
]  # adjust as needed

# Upper bound on the number of rows kept for each class.
N_PER_CLASS = 200

# Prefix joined onto non-absolute paths from the CSV; leave empty to use the
# paths exactly as stored (e.g., set to "/Users/.../dataset_root").
BASE_DATA_ROOT = ""

# V32 run folder: the inference log is read from here and the review folder
# is created inside it.
V32_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114"
LOG_CSV = os.path.join(V32_DIR, "V32_full_inference_log.csv")
OUT_DIR = os.path.join(V32_DIR, "review_shortlist")

In [4]:
# --------------------------------------------------------------------------
# 🧩 LOAD AND PREPARE DATA
# --------------------------------------------------------------------------
# Read the full inference log into memory.
df = pd.read_csv(LOG_CSV)

# --- Column autodetect (robust to different header names) ---
# Map each lowercased header to the header exactly as written in the CSV, so
# lookups can ignore case while still yielding the real column name.
cols = dict((header.lower(), header) for header in df.columns)
def pick(*names):
    """Return the real CSV header matching the first candidate that exists.

    Candidates are tried in the order given and compared case-insensitively
    against the lowercase-keyed ``cols`` mapping, so earlier names win when
    several candidates are present in the log.

    Args:
        *names: Candidate column names, most preferred first.

    Returns:
        The matching column name exactly as it appears in the CSV header, or
        ``None`` if none of the candidates is present.
    """
    for n in names:
        key = n.lower()
        # `cols` is keyed by lowercased header, so a direct lookup gives the
        # same answer as scanning every (key, value) pair.
        if key in cols:
            return cols[key]
    return None

# Resolve each logical field to whichever header the log actually uses.
# Candidates are listed most-preferred first; a field is None when absent.
col_pred = pick(
    "predicted_label", "prediction", "pred", "pred_label", "label_pred"
)
col_path = pick(
    "filepath", "path", "image_path", "img_path", "file", "filename"
)
col_conf = pick(
    "confidence", "conf", "score", "prob", "prob_max"
)
col_entropy = pick("entropy", "ent")

# The predicted label and the file path are mandatory; the ranking columns
# (confidence / entropy) are optional at this point.
if None in (col_pred, col_path):
    raise RuntimeError(f"Need predicted label and filepath columns; got: {df.columns.tolist()}")

In [5]:
# --------------------------------------------------------------------------
# 🔍 BUILD SHORTLIST: LOWEST-CONFIDENCE (or HIGHEST-ENTROPY)
# --------------------------------------------------------------------------
# With neither a confidence nor an entropy column there is nothing to rank
# by, so each class contributes its first N_PER_CLASS rows in CSV order.
# Report that explicitly rather than silently emitting an unranked shortlist.
if not (col_conf or col_entropy):
    print(
        "Warning: no confidence or entropy column found; "
        "taking the first N_PER_CLASS rows per class in CSV order."
    )

shortlist_frames = []
for cls in CLASSES:
    d = df[df[col_pred] == cls]
    if d.empty:
        # Nothing was predicted as this class; skip it.
        continue
    # Confidence takes priority over entropy when both columns exist.
    if col_conf:
        d = d.sort_values(col_conf, ascending=True)       # least confident first
    elif col_entropy:
        d = d.sort_values(col_entropy, ascending=False)   # most uncertain first
    shortlist_frames.append(d.head(N_PER_CLASS))

# Fail here instead of only printing: the following cells read `short`, and
# leaving it undefined would surface later as a NameError (or, in a live
# kernel, silently reuse a `short` left over from an earlier run).
if not shortlist_frames:
    raise RuntimeError("No rows matched selected classes. Check CLASSES or CSV headers.")

short = pd.concat(shortlist_frames, ignore_index=True)

In [6]:
# --------------------------------------------------------------------------
# 📂 COPY IMAGES FOR VISUAL REVIEW
# --------------------------------------------------------------------------
os.makedirs(OUT_DIR, exist_ok=True)
copied, missing = 0, 0
resolved_paths = []   # source paths that exist on disk, in shortlist order
claimed_names = {}    # destination filename -> source path it was assigned to (this run)

for p in short[col_path]:
    src = p
    # Expand relative paths if needed
    if not os.path.isabs(src) and BASE_DATA_ROOT:
        src = os.path.join(BASE_DATA_ROOT, p)
    if not os.path.exists(src):
        missing += 1
        continue
    resolved_paths.append(src)

    # The review folder is flat, so two different sources sharing a basename
    # would map to the same destination and the second would never be copied.
    # Give later arrivals a numeric suffix so every source gets its own file.
    dst_name = os.path.basename(src)
    dst_stem, dst_ext = os.path.splitext(dst_name)
    suffix = 1
    while dst_name in claimed_names and claimed_names[dst_name] != src:
        dst_name = f"{dst_stem}__{suffix}{dst_ext}"
        suffix += 1
    claimed_names[dst_name] = src

    dst = os.path.join(OUT_DIR, dst_name)
    # Skip the copy when the destination already exists (e.g. from a previous run).
    if not os.path.exists(dst):
        shutil.copy2(src, dst)
    # Counts every source now available in OUT_DIR, including files that were
    # already present and therefore not re-copied.
    copied += 1

In [7]:
# --------------------------------------------------------------------------
# 💾 SAVE SHORTLIST AND TEMPLATE FILES
# --------------------------------------------------------------------------
# 1️⃣  Save a CSV describing what was copied
# Columns: file path, predicted label, and whichever ranking column exists
# (confidence preferred, otherwise entropy, otherwise none).
out_cols = [col_path, col_pred] + (
    [col_conf] if col_conf else ([col_entropy] if col_entropy else [])
)

# Keep only rows whose source file was actually found, so this CSV lists the
# copied samples and has the same rows as the patch template below. The path
# is resolved the same way as in the copy step and matched against
# `resolved_paths`.
resolved_series = short[col_path].map(
    lambda p: os.path.join(BASE_DATA_ROOT, p)
    if (not os.path.isabs(p) and BASE_DATA_ROOT)
    else p
)
was_copied = resolved_series.isin(set(resolved_paths))
short.loc[was_copied, out_cols].to_csv(
    os.path.join(V32_DIR, "curation_shortlist_V32.csv"), index=False
)

# 2️⃣  Create a blank patch template to fill in during review
# NOTE: to_csv overwrites an existing file, so re-running this cell replaces
# curated_additions_V32.csv — including any labels already entered by hand.
blank_column = [""] * len(resolved_paths)
patch = pd.DataFrame(
    {
        "filepath": resolved_paths,
        "correct_label": list(blank_column),
        "notes": list(blank_column),
    }
)
patch.to_csv(os.path.join(V32_DIR, "curated_additions_V32.csv"), index=False)

print(
    f"‚úÖ Shortlist ready: copied {copied} files to {OUT_DIR} (missing: {missing}).\n"
    "Wrote curation_shortlist_V32.csv and curated_additions_V32.csv in the V32 folder."
)

‚úÖ Shortlist ready: copied 888 files to /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114/review_shortlist (missing: 0).
Wrote curation_shortlist_V32.csv and curated_additions_V32.csv in the V32 folder.
