In [1]:
# ==============================================================================
# V35 RESCUE CODE  -  One-off stabilization & rerun prep (non-standard VML flow)
# Summary: Reuse existing V35 folder; load S1 from disk; de-dupe SAVE_DIR logic;
    # ensure S2 trains cleanly without patch_V35.csv for this run.
# ==============================================================================

# overview: Stabilize runtime and pathing for an in-notebook rerun; skip S1 retrain;
#           keep Stage-2 flow clean and spawn-safe on macOS; minimal edits to proceed.

# section #0 (imports / housekeeping / paths):
#   - Defined single source of truth:
#       SAVE_DIR  = ".../V35_20251014_162112"
#       VERSION_TAG = os.path.basename(SAVE_DIR)
#       VERSION     = VERSION_TAG
#   - Consolidated SAVE_DIR setup (removed duplicate assignments/prints).
#   - Ensured imports precede first use (os/glob/re/time/datetime).
#   - Rationale: avoid accidental new-run folders and NameErrors.
#   - Expected Impact: deterministic artifact locations; clean bootstrap.

# section #2 (checkpoint discovery):
#   - find_latest_checkpoint(...) sorts by (version, timestamp) and excludes current run.
#   - Rationale: when reusing V35, latest-previous resolves to V34 by design.
#   - Expected Impact: S2 initializes from the most recent completed baseline.


# Stage 1 (relevance) - loader path
# section #3 (S1 load instead of train):
#   - Replaced training with disk load from SAVE_DIR/relevance_filter_model.
#   - Loaded processor from same folder for S2 parity.
#   - Rationale: skip multi-hour S1 retrain.
#   - Expected Impact: immediate availability of model_s1 + processor.

# section #3 (S1 calibration block removal):
#   - Disabled the calibration block referencing trainer_s1/eval_dataset_s1.
#   - Rationale: those objects do not exist in loader-only mode.
#   - Expected Impact: prevent NameError; optional calibration deferred.


# Stage 2 (emotion) - training safety & cleanliness:
# section #5 (TrainingArguments):
#   - Set overwrite_output_dir=True to allow reruns into the same directory.
#   - Rationale: avoid "directory not empty" conflicts.
#   - Expected Impact: idempotent reruns.

# section #5 (early stopping):
#   - Defined EarlyStoppingCallback(patience=2, threshold=0.0).
#   - Rationale: previously referenced but undefined.
#   - Expected Impact: stable convergence guard (no NameError).

# section #5 (sampler bind order):
#   - Bound WeightedRandomSampler only AFTER trainer_s2 construction.
#   - Rationale: prevent early reference to trainer_s2.
#   - Expected Impact: valid sampler override; no shuffle conflicts.


# Stage 2 (emotion) - DataLoader stability (macOS spawn-safe):
# section #5 (TrainingArguments - workers):
#   - Added: dataloader_num_workers=0
#   - Rationale: prevent multiprocessing from pickling DataCollatorWithAugmentation
#     (defined in __main__), which raised AttributeError in worker processes.
#   - Expected Impact: stable training loop; minor I/O slowdown acceptable.

# section #5 (custom train DataLoader override):
#   - Updated override to: num_workers=0, pin_memory=False  (CPU-only).
#   - Rationale: keep collation in main process to avoid spawn-time pickling.
#   - Expected Impact: eliminates worker exit errors; deterministic batches.

# notes (future optional):
#   - To re-enable multi-worker loading, move DataCollatorWithAugmentation into
#     an importable module (e.g., collators.py) and import it; then raise
#     dataloader_num_workers > 0 safely.


# Utilities / nits:
# section #9 (CSV writing):
#   - Fixed writer argument where applicable: newline="" (not line="").
#   - Rationale: correct CSV semantics; avoid extra blank lines.
#   - Expected Impact: cleaner artifact CSVs (minor).


# Out-of-scope for this rescue run:
# section #10 (dataset patching):
#   - Intentionally not using patch_V35.csv (train-only injection) in this pass.
#   - Rationale: expedite a clean S2 rerun first; revisit targeted patching later.
#   - Expected Impact: standard 80/20 split for S2 in this run.


# Operator Notes:
# - This "Rescue Code" changelog documents a one-off stabilization lane to complete V35
#   safely without data prep or S1 retraining. Fold changes selectively into mainline
#   once metrics are reviewed.

In [2]:
# --------------------------
# 0. Imports
# --------------------------
# WORKAROUND for PyTorch MPS bug
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Standard Library Imports
import datasets
import csv
import gc
import glob
import multiprocessing as mp
import torch
import random
import re
import shutil
import subprocess
import sys
import time
import json

# Third-Party Imports
import accelerate
import dill
import face_recognition
import matplotlib.pyplot as plt
import numpy as np, cv2
import pandas as pd
import seaborn as sns
import torch.nn.functional as F
import torchvision.transforms as T
import transformers

# From Imports
from collections import Counter
from datasets import ClassLabel, Dataset, Features, Image as DatasetsImage, concatenate_datasets, load_dataset
from datetime import datetime
from functools import partial
from imagehash import phash, hex_to_hash
from io import BytesIO
from pathlib import Path
from PIL import Image, ImageOps, ImageStat, ExifTags, UnidentifiedImageError
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from torch.optim import AdamW, LBFGS
from torch.utils.data import WeightedRandomSampler, DataLoader
from torchvision import transforms
from torchvision.transforms import (
    RandAugment,
)
from tqdm import tqdm
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    EarlyStoppingCallback,
    TrainingArguments,
    Trainer,
    ViTForImageClassification,
)
from types import SimpleNamespace

In [3]:
# --------------------------
# 1. Global Configurations
# --------------------------

# (ensure these imports appear before this block in your notebook/file)
# import os, glob, re, time
# from datetime import datetime

# --- Core Paths ---
# Root of the raw multi-class image dataset and root under which every run
# folder (V<num>_<timestamp>) is created.
# NOTE(review): both are absolute, machine-specific paths.
BASE_DATASET_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels"
OUTPUT_ROOT_DIR   = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training"

# --- Run Configuration ---
RUN_INFERENCE     = True          # safer default for dev
PREPARE_DATASETS  = False         # set True only when layout changes

# --- Reuse existing V35 run directory (no new folder) ---
USE_EXISTING_SAVE_DIR = True
EXISTING_V35_DIR      = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112"

# Resolve SAVE_DIR / VERSION_TAG / VERSION exactly once
if USE_EXISTING_SAVE_DIR:
    SAVE_DIR    = EXISTING_V35_DIR
    VERSION_TAG = os.path.basename(SAVE_DIR)   # e.g., "V35_20251014_162112"
else:
    # (fallback: create a new run folder only if you ever flip the flag)
    VERSION_TAG = f"V35_{time.strftime('%Y%m%d_%H%M%S')}"
    SAVE_DIR    = os.path.join(OUTPUT_ROOT_DIR, VERSION_TAG)

# NOTE: makedirs(exist_ok=True) also creates the directory when it is missing,
# so a mistyped EXISTING_V35_DIR produces a new empty folder instead of an error.
os.makedirs(SAVE_DIR, exist_ok=True)
# VERSION is an alias of VERSION_TAG; compute_metrics_with_confusion uses it in artifact file names.
VERSION = VERSION_TAG
print(f"üìÅ Using SAVE_DIR: {SAVE_DIR}")

# --- Model Configuration ---
# Fallback model identifier used when no previous run checkpoint is found.
BASE_MODEL_NAME = "google/vit-base-patch16-224-in21k"

# --- Dataset & Label Definitions ---
# Class folder names under BASE_DATASET_PATH. The list order of RELEVANT_CLASSES
# defines the Stage 2 label ids (see id2label_s2 below).
RELEVANT_CLASSES = [
    'anger','contempt','disgust','fear','happiness',
    'neutral','questioning','sadness','surprise',
    'neutral_speech','speech_action'
]
IRRELEVANT_CLASSES = ['hard_case']  # 'unknown' is inside 'hard_case' recursively

# Stage 2 (emotion) id <-> label maps, derived from the list order above.
id2label_s2 = dict(enumerate(RELEVANT_CLASSES))
label2id_s2 = {v: k for k, v in id2label_s2.items()}
# Stage 1 (relevance) id <-> label maps: 0 = irrelevant, 1 = relevant.
id2label_s1 = {0: 'irrelevant', 1: 'relevant'}
label2id_s1 = {v: k for k, v in id2label_s1.items()}

# NOTE(review): not used in the visible part of the notebook; presumably the
# confidence cut-off below which predictions are queued for manual review - confirm.
REVIEW_CONF_THRESHOLD = 0.85

# File extensions (lower-case) accepted as images by the dataset builders.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".tif", ".tiff")
def is_valid_image(filename):
    """
    Return True when `filename` looks like a usable image file.

    A file qualifies when its name ends with one of VALID_EXTENSIONS
    (case-insensitive) and is not a macOS AppleDouble sidecar ("._*").

    Args:
        filename: bare file name or a full path (str or os.PathLike). Only the
            final path component is inspected, so "dir/._x.jpg" is rejected
            just like "._x.jpg".
    """
    # Check the basename: testing the whole path would let sidecar files through
    # whenever a caller passes a path instead of a bare name.
    name = os.path.basename(filename)
    return name.lower().endswith(VALID_EXTENSIONS) and not name.startswith("._")

# --- üî¢ Versioning helpers (optional) ---
def get_next_version(base_dir):
    """
    Compute the label for the next run folder.

    Scans `base_dir` for sub-directories named "V<digits>_<anything>" and
    returns "V<max + 1>"; returns "V1" when no such directory exists.
    """
    highest = 0
    for entry in glob.glob(os.path.join(base_dir, "V*_*")):
        if not os.path.isdir(entry):
            continue  # plain files never count as run folders
        folder = os.path.basename(entry)
        number_part = folder[1:].split("_")[0]
        if folder.startswith("V") and "_" in folder and number_part.isdigit():
            highest = max(highest, int(number_part))
    return f"V{highest + 1}"

def find_latest_checkpoint(root_dir, current_run_basename=None):
    """
    Locate the newest previous run directory under `root_dir`.

    A directory is considered only if its name matches
    V<num>_YYYYMMDD_HHMMSS (e.g., V34_20251013_211825), it is not
    `current_run_basename`, and it contains at least one known artifact
    sub-folder. Runs are ranked by (version number, timestamp string).

    Returns:
        Full path of the highest-ranked run directory, or None if none qualify.
    """
    run_name_re = re.compile(r"^V(\d+)_(\d{8}_\d{6})$")
    artifact_dirs = (
        "emotion_classifier_model",
        "relevance_filter_model",
        "stage_2_emotion_model_training",
    )

    best_key, best_path = None, None
    for entry in os.listdir(root_dir):
        entry_path = os.path.join(root_dir, entry)
        if not entry.startswith("V") or not os.path.isdir(entry_path):
            continue
        if current_run_basename and entry == current_run_basename:
            continue

        match = run_name_re.match(entry)
        if match is None:
            continue

        # A run counts as "completed" once any artifact folder exists in it.
        if not any(os.path.isdir(os.path.join(entry_path, sub)) for sub in artifact_dirs):
            continue

        # The timestamp has a fixed width, so string comparison orders it correctly.
        key = (int(match.group(1)), match.group(2))
        if best_key is None or key > best_key:
            best_key, best_path = key, entry_path

    return best_path

# Dynamically find the latest checkpoint (excluding the current run dir)
# Resolve the Stage 2 starting point. VERSION_TAG is passed so that the run
# directory being reused/created right now is never selected as "previous".
latest_checkpoint = find_latest_checkpoint(OUTPUT_ROOT_DIR, current_run_basename=VERSION_TAG)
if latest_checkpoint:
    # NOTE: this is the run directory itself (e.g. .../V34_...), not a model sub-folder.
    PRETRAINED_CHECKPOINT_PATH = latest_checkpoint
    print(f"‚úÖ Dynamically loading latest checkpoint: {os.path.basename(PRETRAINED_CHECKPOINT_PATH)}")
else:
    # No qualifying previous run: fall back to the BASE_MODEL_NAME identifier.
    PRETRAINED_CHECKPOINT_PATH = BASE_MODEL_NAME
    print("‚ö†Ô∏è No previous checkpoint found ‚Äî falling back to base model.")

üìÅ Using SAVE_DIR: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112
‚úÖ Dynamically loading latest checkpoint: V34_20251013_211825


In [4]:
# ----------------------------------------------------
# 2. Hierarchical Dataset Preparation
# ----------------------------------------------------
# This function organizes the original multi-class dataset into two separate
# folder structures required for the two-stage training process. It recursively
# searches through subdirectories (no matter how deep) and is smart enough to
# skip non-image files.
def _copy_images_flat(src_dir, dest_dir, class_name, used_names):
    """
    Copy every valid image below `src_dir` (recursively) into the flat folder `dest_dir`.

    Args:
        src_dir: pathlib.Path of the class folder to scan.
        dest_dir: destination directory (must exist).
        class_name: class label, used as a prefix when a name must be disambiguated.
        used_names: set of case-folded file names already written to `dest_dir`
            during this run; updated in place.

    Returns:
        (copied, renamed): number of files copied and how many of them needed a
        disambiguated name.
    """
    copied = renamed = 0
    # Sorted traversal keeps the choice of "who gets the plain name" stable across runs.
    for file_path in sorted(src_dir.rglob('*')):
        if not (file_path.is_file() and is_valid_image(file_path.name)):
            continue

        dest_name = file_path.name
        if dest_name.casefold() in used_names:
            # Same file name already copied from another sub-folder/class:
            # build a unique name from the class and the relative path.
            rel_parts = file_path.relative_to(src_dir).parts
            dest_name = f"{class_name}__" + "__".join(rel_parts)
            stem, ext = os.path.splitext(dest_name)
            suffix = 1
            while dest_name.casefold() in used_names:
                dest_name = f"{stem}_{suffix}{ext}"
                suffix += 1
            renamed += 1

        used_names.add(dest_name.casefold())
        shutil.copy(file_path, os.path.join(dest_dir, dest_name))
        copied += 1
    return copied, renamed


def prepare_hierarchical_datasets(base_path, output_path):
    """
    Build the two folder layouts needed for two-stage training.

    Stage 1 (relevance filter):  <output_path>/stage_1_relevance_dataset/{0_irrelevant,1_relevant}
    Stage 2 (emotion classifier): <output_path>/stage_2_emotion_dataset/<class_name>

    Images are collected recursively from <base_path>/<class_name> and flattened
    into the destination folders; non-image files are skipped. Stage 2 class
    folders are wiped before copying; Stage 1 folders are created if missing and
    otherwise written into as they are.

    Files that share a name with an image already copied to the same destination
    in this run are stored under a "<class>__<relative_path>" name instead of
    silently overwriting the earlier file.

    Args:
        base_path: root folder of the original multi-class dataset.
        output_path: folder under which both stage datasets are created.

    Returns:
        (stage1_path, stage2_path)
    """
    stage1_path = os.path.join(output_path, "stage_1_relevance_dataset")
    stage2_path = os.path.join(output_path, "stage_2_emotion_dataset")

    print(f"üóÇÔ∏è Preparing hierarchical datasets at: {output_path}")

    # --- Create Stage 1 Dataset (Relevance Filter) ---
    print("\n--- Creating Stage 1 Dataset ---")
    irrelevant_dest = os.path.join(stage1_path, "0_irrelevant")
    relevant_dest = os.path.join(stage1_path, "1_relevant")
    os.makedirs(irrelevant_dest, exist_ok=True)
    os.makedirs(relevant_dest, exist_ok=True)

    stage1_groups = (
        ("irrelevant", IRRELEVANT_CLASSES, irrelevant_dest),
        ("relevant", RELEVANT_CLASSES, relevant_dest),
    )
    for group_name, class_names, dest_dir in stage1_groups:
        print(f"Processing '{group_name}' classes...")
        used_names = set()  # shared by all classes that land in the same folder
        for class_name in class_names:
            src_dir = Path(os.path.join(base_path, class_name))
            if not src_dir.is_dir():
                print(f"  ‚ö†Ô∏è Warning: Source directory not found for '{class_name}'")
                continue
            print(f"  Recursively copying from '{class_name}'...")
            _, renamed = _copy_images_flat(src_dir, dest_dir, class_name, used_names)
            if renamed:
                print(f"    {renamed} file name collision(s) resolved by renaming")

    # --- Create Stage 2 Dataset (Emotion Classifier) ---
    print("\n--- Creating Stage 2 Dataset ---")
    for class_name in RELEVANT_CLASSES:
        src_dir = Path(os.path.join(base_path, class_name))
        dest_dir = os.path.join(stage2_path, class_name)

        # Ensure destination is clean before copying
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        os.makedirs(dest_dir)

        if not src_dir.is_dir():
            print(f"  ‚ö†Ô∏è Warning: Source directory not found for '{class_name}'")
            continue
        print(f"  Copying '{class_name}' to Stage 2 directory...")
        _, renamed = _copy_images_flat(src_dir, dest_dir, class_name, set())
        if renamed:
            print(f"    {renamed} file name collision(s) resolved by renaming")

    print("\n‚úÖ Hierarchical dataset preparation complete.")
    return stage1_path, stage2_path

In [5]:
# -----------------------------------------------
# 3. Utility Functions & Custom Classes
# -----------------------------------------------

# --- Part A: Data Augmentation ---

# üì¶ Applies augmentations and processes images on-the-fly for each batch.
# This is a more robust approach than pre-processing the entire dataset.
class DataCollatorWithAugmentation:
    """
    Batch collator that augments PIL images on the fly and then runs the HF processor.

    Per example: convert to RGB -> class-specific (or base) PIL transform ->
    tensor round-trip with optional RandomErasing -> back to PIL. The whole list
    is then handed to the processor and the labels are attached as a long tensor.
    """

    def __init__(self,
                 processor,
                 augment_dict=None,
                 base_augment=None,
                 random_erasing_prob: float = 0.10,
                 random_erasing_scale = (0.02, 0.08),
                 skip_erasing_label_ids=None):
        """
        Args:
            processor: HF image processor; called with images=<list of PIL> and return_tensors="pt"
            augment_dict: dict[label_id -> PIL transform] with class-specific pipelines
            base_augment: PIL transform used when a label has no entry in augment_dict
                (defaults to a 224x224 resize)
            random_erasing_prob: probability of tensor-level RandomErasing; falsy or <= 0 disables it
            random_erasing_scale: area range of the erased region
            skip_erasing_label_ids: iterable of label ids that never get erased
        """
        self.processor = processor
        self.augment_dict = augment_dict or {}
        self.base_augment = base_augment or T.Compose([T.Resize((224, 224))])

        # PIL <-> tensor converters used around the erasing step in __call__
        self.to_tensor = T.ToTensor()
        self.to_pil = T.ToPILImage()

        # Tensor-level erasing is optional; None means "disabled"
        erasing_enabled = random_erasing_prob and random_erasing_prob > 0.0
        if erasing_enabled:
            self.random_erasing = T.RandomErasing(
                p=random_erasing_prob, scale=random_erasing_scale, value="random"
            )
        else:
            self.random_erasing = None

        self.skip_erasing_label_ids = set(skip_erasing_label_ids or [])

    def __call__(self, features):
        """Turn a list of {"image": PIL, "label": int} examples into a processor batch with "labels"."""
        label_ids = []
        augmented = []
        for example in features:
            label_id = example["label"]
            source_rgb = example["image"].convert("RGB")

            # Class-specific PIL pipeline when registered, otherwise the base one
            transform = self.augment_dict.get(label_id, self.base_augment)
            tensor_img = self.to_tensor(transform(source_rgb))   # PIL -> Tensor [C,H,W]

            erase = self.random_erasing is not None and label_id not in self.skip_erasing_label_ids
            if erase:
                tensor_img = self.random_erasing(tensor_img)

            augmented.append(self.to_pil(tensor_img))
            label_ids.append(label_id)

        batch = self.processor(images=augmented, return_tensors="pt")
        batch["labels"] = torch.tensor(label_ids, dtype=torch.long)
        return batch

# --- normalize any image-like object to 3-channel RGB (PIL) ---
def _ensure_rgb(img):
    """
    Normalize an image-like object to a 3-channel RGB PIL image.

    Args:
        img: a PIL image, or anything np.asarray can turn into an (H, W),
            (H, W, 1), (H, W, 3) or (H, W, 4) array. Array values are cast to
            uint8 as-is (no rescaling).

    Returns:
        PIL.Image.Image in mode "RGB".
    """
    # If already PIL, force RGB mode
    if isinstance(img, Image.Image):
        return img.convert("RGB")

    arr = np.asarray(img)
    if arr.ndim == 3 and arr.shape[-1] == 1:
        arr = arr[..., 0]                      # (H, W, 1) -> (H, W); fromarray rejects the former
    if arr.ndim == 2:
        arr = np.stack([arr, arr, arr], axis=-1)   # grayscale -> 3 identical channels

    # The final convert drops an alpha channel (RGBA input) so the result is always RGB.
    return Image.fromarray(arr.astype(np.uint8)).convert("RGB")


# --- Part B: Model & Training Components ---

# üèãÔ∏è Defines a custom Trainer that can use either a targeted loss function or class weights.
class CustomLossTrainer(Trainer):
    """
    Trainer whose loss is either a caller-supplied callable or weighted cross-entropy.

    Args (keyword-only, in addition to the regular Trainer arguments):
        loss_fct: callable(logits, labels) -> scalar loss. When given it takes
            precedence (Stage 2 passes the targeted-smoothing loss).
        class_weights: optional 1-D tensor of per-class weights for the
            CrossEntropyLoss fallback (Stage 1).
    """

    def __init__(self, *args, loss_fct=None, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = loss_fct
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` (minus "labels") and return the loss, optionally with outputs."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Identity check, not truthiness: a custom loss object must be used
        # whenever one was supplied, however it evaluates as a boolean.
        if self.loss_fct is not None:
            loss = self.loss_fct(logits, labels)
        else:
            weights = self.class_weights
            if weights is not None:
                # CrossEntropyLoss needs the weight on the logits' device and in
                # their dtype; aligning here avoids a runtime error when the
                # weights were built on CPU / as float64.
                weights = weights.to(device=logits.device, dtype=logits.dtype)
            loss = nn.CrossEntropyLoss(weight=weights)(logits, labels)

        return (loss, outputs) if return_outputs else loss


# üîÑ Implements Cross-Entropy Loss with *Targeted* Label Smoothing.
# Smoothing is turned OFF for specified classes to encourage confident predictions. This is used for Stage 2.
class TargetedSmoothedCrossEntropyLoss(nn.Module):
    """
    Cross-entropy with label smoothing that is switched off for selected classes.

    Samples whose target is one of `target_class_names` get hard one-hot targets;
    all other samples get smoothed targets. An optional focal factor
    (1 - p_t) ** focal_gamma rescales each per-sample loss.
    """

    def __init__(self, smoothing=0.05, target_class_names=None, label2id_map=None, focal_gamma=None):
        super().__init__()
        self.smoothing = smoothing
        self.focal_gamma = focal_gamma  # None (or <= 0) disables focal scaling
        have_targets = target_class_names and label2id_map
        self.target_class_ids = (
            [label2id_map[name] for name in target_class_names] if have_targets else []
        )

    def _soft_targets(self, logits, target):
        """Build the [B, C] target distribution (smoothed, or one-hot for targeted classes)."""
        n_classes = logits.size(1)
        with torch.no_grad():
            soft = torch.full_like(logits, self.smoothing / (n_classes - 1))
            soft.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
            if self.target_class_ids:
                sharp_ids = torch.tensor(self.target_class_ids, device=target.device)
                sharp_rows = torch.isin(target, sharp_ids)
                if sharp_rows.any():
                    soft[sharp_rows] = F.one_hot(target[sharp_rows], num_classes=n_classes).float()
        return soft

    def forward(self, logits, target):
        soft = self._soft_targets(logits, target)
        per_sample = -(soft * F.log_softmax(logits, dim=1)).sum(dim=1)

        gamma = self.focal_gamma
        if gamma is None or not gamma > 0:
            return per_sample.mean()

        # Focal factor is treated as a constant (no gradient flows through p_t).
        with torch.no_grad():
            p_t = (torch.softmax(logits, dim=1) * soft).sum(dim=1).clamp_min(1e-6)
        return (((1 - p_t) ** gamma) * per_sample).mean()

# ------------------------------------------------------------------------------
# Stage 1 loss function: focal-modulated cross-entropy (relevant-only)
#   - We keep class weights for imbalance handling.
#   - We add focal modulation ONLY when the ground truth is "relevant"
#     to emphasize difficult positives without exploding FP on easy negatives.
# ------------------------------------------------------------------------------
class RelevantFocalCrossEntropy(torch.nn.Module):
    """
    Class-weighted cross-entropy with focal modulation applied only to 'relevant' targets.

    Samples whose target equals `relevant_id` are scaled by (1 - p_t) ** gamma;
    every other sample keeps its plain weighted cross-entropy. The result is the
    unweighted mean over the batch.
    """

    def __init__(self, class_weights: torch.Tensor, gamma: float = 2.0, relevant_id: int = 1):
        """
        Args:
            class_weights: per-class weight tensor passed to CrossEntropyLoss
            gamma: focal exponent (higher -> more emphasis on hard examples)
            relevant_id: integer id of the 'relevant' class
        """
        super().__init__()
        self.gamma = gamma
        self.relevant_id = relevant_id
        self.ce = torch.nn.CrossEntropyLoss(weight=class_weights, reduction="none")

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Return the scalar loss for logits [B, C] and integer targets [B]."""
        per_sample_ce = self.ce(logits, targets)  # [B]

        # Probability assigned to the true class; constant w.r.t. the gradient
        with torch.no_grad():
            p_true = torch.softmax(logits, dim=-1).gather(1, targets.unsqueeze(1)).squeeze(1)

        is_relevant = (targets == self.relevant_id).float()
        hard_factor = (1.0 - p_true).pow(self.gamma)
        # relevant rows -> focal factor, all others -> 1.0
        scale = hard_factor * is_relevant + (1.0 - is_relevant)

        return (scale * per_sample_ce).mean()


# --- Part C: Metrics & Evaluation ---

# üìä Computes metrics and generates a confusion matrix plot for each evaluation step.
def compute_metrics_with_confusion(eval_pred, label_names, stage_name=""):
    """
    Metrics callback: prints a classification report and writes evaluation artifacts.

    Side effects (all under SAVE_DIR):
        - logits_eval_<stage>_<VERSION>.npy / labels_eval_<stage>_<VERSION>.npy
        - per_class_metrics_<stage>.csv (one row appended per call)
        - confusion_matrix_<stage>_<VERSION>.png

    Args:
        eval_pred: (logits, labels) pair; logits are [N, C], labels are [N] class ids.
        label_names: class names, where position i names class id i.
        stage_name: tag used in file names; "Stage1" selects the global
            trainer_s1 for the epoch lookup, anything else selects trainer_s2.

    Returns:
        dict with a single "accuracy" entry.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    preds = np.argmax(logits, axis=-1)
    # Pin the full label set: without it scikit-learn infers the classes from
    # the data, so an eval split missing a class breaks target_names and
    # yields a smaller confusion matrix than the tick labels expect.
    class_ids = list(range(len(label_names)))

    print(f"\nüìà Classification Report for {stage_name}:")
    report = classification_report(
        labels, preds, labels=class_ids, target_names=label_names, output_dict=True, zero_division=0
    )
    print(classification_report(labels, preds, labels=class_ids, target_names=label_names, zero_division=0))

    # Save raw logits/labels for later analysis like temperature scaling
    np.save(os.path.join(SAVE_DIR, f"logits_eval_{stage_name}_{VERSION}.npy"), logits)
    np.save(os.path.join(SAVE_DIR, f"labels_eval_{stage_name}_{VERSION}.npy"), labels)

    # Per-class F1 / recall / precision, in label order
    f1s = [report[name]["f1-score"] for name in label_names]
    recalls = [report[name]["recall"] for name in label_names]
    precisions = [report[name]["precision"] for name in label_names]

    # Mean prediction entropy per true class (0.0 when a class has no samples)
    softmax_probs = F.softmax(torch.tensor(logits), dim=-1)
    entropies = -torch.sum(softmax_probs * torch.log(softmax_probs + 1e-12), dim=-1)
    entropy_per_class = []
    for idx, class_name in enumerate(label_names):
        mask = torch.from_numpy(labels == idx)
        class_entropy = entropies[mask].mean().item() if mask.any() else 0.0
        entropy_per_class.append((class_name, class_entropy))
    entropy_dict = dict(entropy_per_class)

    # CSV logging (append one row per evaluation)
    epoch_metrics_path = os.path.join(SAVE_DIR, f"per_class_metrics_{stage_name}.csv")
    # The trainer lives in the notebook's global namespace. Look it up lazily so
    # a missing trainer (e.g. loader-only Stage 1) records epoch=None instead of
    # raising NameError in the middle of an evaluation.
    trainer_name = "trainer_s1" if stage_name == "Stage1" else "trainer_s2"
    active_trainer = globals().get(trainer_name)
    epoch = getattr(getattr(active_trainer, "state", None), "epoch", None)

    df_row = pd.DataFrame({
        "epoch": [epoch],
        **{f"f1_{n}": [f] for n, f in zip(label_names, f1s)},
        **{f"recall_{n}": [r] for n, r in zip(label_names, recalls)},
        **{f"precision_{n}": [p] for n, p in zip(label_names, precisions)},
        **{f"entropy_{n}": [entropy_dict[n]] for n in label_names}
    })

    if os.path.exists(epoch_metrics_path):
        df_row.to_csv(epoch_metrics_path, mode="a", header=False, index=False)
    else:
        df_row.to_csv(epoch_metrics_path, mode="w", header=True, index=False)

    # Generate and save a heatmap of the confusion matrix
    cm = confusion_matrix(labels, preds, labels=class_ids)
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix - {stage_name}")
    plt.tight_layout()
    plt.savefig(os.path.join(SAVE_DIR, f"confusion_matrix_{stage_name}_{VERSION}.png"))
    plt.close(fig)

    # Top confused pairs (off-diagonal cells with at least one sample)
    confusion_pairs = [
        ((label_names[i], label_names[j]), cm[i][j])
        for i in class_ids
        for j in class_ids if i != j and cm[i][j] > 0
    ]
    top_confusions = sorted(confusion_pairs, key=lambda x: x[1], reverse=True)[:3]
    if top_confusions:
        print("\nTop 3 confused class pairs:")
        for (true_label, pred_label), count in top_confusions:
            print(f"  - {true_label} ‚Üí {pred_label}: {count} instances")

    # Compute and print entropy metrics
    avg_entropy = entropies.mean().item()
    print(f"\nüß† Avg prediction entropy: {avg_entropy:.4f}")

    sorted_entropy = sorted(entropy_per_class, key=lambda x: x[1], reverse=True)
    if sorted_entropy:
        print("\nüîç Class entropies (sorted):")
        for class_name, entropy in sorted_entropy:
            print(f"  - {class_name}: entropy = {entropy:.4f}")

    accuracy = float((preds == labels).mean())
    return {"accuracy": accuracy}


# ------------------------------------------------------------------------------
# Stage 1: Temperature scaling + threshold (tau) sweep
#   - Fit a single scalar T on eval logits (minimize NLL) to calibrate probabilities.
#   - Sweep tau in [0.30, 0.55] to pick the value that maximizes F1(relevant).
#   - Persist T and tau for hierarchical inference.
# ------------------------------------------------------------------------------
def fit_temperature(model, eval_ds, processor, device):
    """
    Fit a single temperature scalar by minimizing NLL on the eval set.

    Args:
        model: classifier whose forward pass returns an object with `.logits`.
        eval_ds: iterable of examples with "image" and "label" entries.
        processor: HF image processor (called with images=..., return_tensors="pt").
        device: device the processor output is moved to before the forward pass.

    Returns:
        float: learned temperature, never below 1e-3 (the same floor the
        optimization applies). Falls back to 1.0 if the fit ends non-finite.
    """
    min_temperature = 1e-3

    model.eval()
    logits_list, labels_list = [], []
    with torch.no_grad():
        for ex in eval_ds:
            # Normalize every eval image to 3-channel RGB before the processor
            img = _ensure_rgb(ex["image"])
            inputs = processor(images=img, return_tensors="pt").to(device)
            logits_list.append(model(**inputs).logits.cpu())
            labels_list.append(int(ex["label"]))

    logits = torch.cat(logits_list, dim=0)  # [N, num_classes]
    labels = torch.tensor(labels_list)

    # Named `temperature` (not `T`) so the torchvision.transforms alias `T`
    # is not shadowed inside this function.
    temperature = torch.nn.Parameter(torch.ones(1))
    opt = torch.optim.LBFGS([temperature], lr=0.1, max_iter=50)
    ce = torch.nn.CrossEntropyLoss()

    def _closure():
        """LBFGS closure: scale logits by 1/T, compute CE loss, backprop to T."""
        opt.zero_grad()
        scaled = logits / temperature.clamp(min=min_temperature)
        loss = ce(scaled, labels)
        loss.backward()
        return loss

    opt.step(_closure)

    if not torch.isfinite(temperature).all():
        print("Temperature fit did not converge to a finite value; using T=1.0")
        return 1.0
    # Return the value the loss actually used: the raw parameter may sit below
    # the clamp floor (or be negative) even though the loss never saw it there.
    return max(float(temperature.item()), min_temperature)

def sweep_tau(model, eval_ds, processor, device, T=1.0, tau_min=0.30, tau_max=0.55, num_taus=26):
    """
    Sweep tau (threshold on P(relevant)) to maximize F1 of the 'relevant' class.

    Args:
        model: Stage 1 classifier whose forward pass returns `.logits`.
        eval_ds: iterable of examples with "image" and "label" entries.
        processor: HF image processor.
        device: device the processor output is moved to.
        T: temperature the logits are divided by (floored at 1e-3).
        tau_min, tau_max, num_taus: evenly spaced threshold grid; the defaults
            reproduce the original sweep over [0.30, 0.55] in 0.01 steps.

    Returns:
        dict: {'tau', 'f1', 'prec', 'rec'} with 3-decimal rounding for logging.
        On ties the smallest tau wins.
    """
    relevant_id = label2id_s1['relevant']
    temperature = max(T, 1e-3)  # robustness vs tiny T

    model.eval()
    y_true, y_prob = [], []
    with torch.no_grad():
        for ex in eval_ds:
            # Normalize to 3-channel RGB to avoid ndim==2 errors
            img = _ensure_rgb(ex["image"])
            inputs = processor(images=img, return_tensors="pt").to(device)
            logits = model(**inputs).logits / temperature
            y_prob.append(torch.softmax(logits, dim=-1)[0, relevant_id].item())
            y_true.append(int(ex["label"]) == relevant_id)

    y_true = np.array(y_true, dtype=bool)
    y_prob = np.array(y_prob, dtype=float)

    best = {"tau": None, "f1": -1.0, "prec": None, "rec": None}
    for tau in np.linspace(tau_min, tau_max, num_taus):
        pred = (y_prob >= tau)
        tp = (pred & y_true).sum()
        fp = (pred & ~y_true).sum()
        fn = (~pred & y_true).sum()
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec  = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1   = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0.0
        if f1 > best["f1"]:
            best = {"tau": round(float(tau), 3),
                    "f1": round(float(f1), 3),
                    "prec": round(float(prec), 3),
                    "rec": round(float(rec), 3)}
    return best
    

# --- Part D: Model Saving ---

# üíæ Saves the model and its associated processor to a specified directory.
def save_model_and_processor(model, processor, save_dir, model_name):
    """
    Persist a model and its processor under <save_dir>/<model_name>.

    The model is moved to CPU first and written with safe serialization; the
    target folder is created when missing.
    """
    print(f"üíæ Saving {model_name} and processor to: {save_dir}")
    target_dir = os.path.join(save_dir, model_name)
    os.makedirs(target_dir, exist_ok=True)

    cpu_model = model.to("cpu")
    processor.save_pretrained(target_dir)
    cpu_model.save_pretrained(target_dir, safe_serialization=True)

    print(f"‚úÖ {model_name} saved successfully.")


# --- Part E: Post-Training Analysis ---
# ==========================================================================
#   POST-TRAINING ANALYSIS UTILITIES (OFFLINE / OPTIONAL)
#   - Qualitative error bucketing (QE)
#   - Attention rollout (XAI) for S1 inspection
#   - Ablation helpers
# ==========================================================================

def check_deployment_readiness(metrics_csv_path, f1_threshold=0.80):
    """
    Print a per-class pass/fail table based on the last row of a metrics CSV.

    Every column named "f1_<label>" is compared against `f1_threshold`. A
    missing (NaN) score counts as a failure. Nothing is returned; the verdict
    is printed.

    Args:
        metrics_csv_path: CSV with one row per evaluation and "f1_<label>" columns
            (the format written by compute_metrics_with_confusion).
        f1_threshold: minimum F1 every class must reach.
    """
    print("\n" + "="*60)
    print("  DEPLOYMENT READINESS CHECK")
    print("="*60)

    if not os.path.exists(metrics_csv_path):
        print(f"‚ö†Ô∏è Metrics file not found at: {metrics_csv_path}")
        return

    try:
        metrics_df = pd.read_csv(metrics_csv_path)
    except pd.errors.EmptyDataError:
        metrics_df = pd.DataFrame()  # zero-byte file: handled like "no rows"

    if metrics_df.empty:
        # iloc[-1] on an empty frame would raise IndexError
        print(f"‚ö†Ô∏è Metrics file has no rows: {metrics_csv_path}")
        return

    last_epoch_metrics = metrics_df.iloc[-1]

    # Strip only the leading prefix; str.replace would also eat "f1_" inside a label name.
    prefix = "f1_"
    label_names = [col[len(prefix):] for col in metrics_df.columns if col.startswith(prefix)]
    if not label_names:
        print(f"‚ö†Ô∏è No f1_<label> columns found in: {metrics_csv_path}")
        return

    print(f"Threshold: F1-Score >= {f1_threshold}\n")

    issues_found = False
    for label in label_names:
        f1_score = last_epoch_metrics.get(f"f1_{label}", 0)
        # NaN compares False against any threshold, so test for it explicitly;
        # otherwise a missing score would be reported as passing.
        if pd.isna(f1_score) or f1_score < f1_threshold:
            print(f"  - ‚ùå {label:<15} | F1-Score: {f1_score:.2f} (Below Threshold)")
            issues_found = True
        else:
            print(f"  - ‚úÖ {label:<15} | F1-Score: {f1_score:.2f}")

    if issues_found:
        print("\n Model is NOT ready for production.")
    else:
        print("\n Model meets the minimum F1-score threshold for all classes.")

# --- Qualitative Error Bucketing (Stage 1) ---
# Scans an inference CSV and tags each row with simple visual heuristics:
# blur/shadow/occlusion/low-res. Outputs a QE report CSV for targeting data fixes.
def variance_of_laplacian(gray):
    """Return the variance of the 64-bit Laplacian response of ``gray``."""
    laplacian_response = cv2.Laplacian(gray, cv2.CV_64F)
    return laplacian_response.var()

def is_dark(img_pil, thresh=40):
    """Return True when the mean of the image's "L" (grayscale) band is below ``thresh``."""
    grayscale = img_pil.convert("L")
    mean_luminance = ImageStat.Stat(grayscale).mean[0]
    return mean_luminance < thresh

def qualitative_buckets_s1(inference_csv, out_csv):
    """Tag every image listed in an inference CSV with cheap visual-quality flags.

    For each row whose ``filepath`` exists on disk, the image is loaded and
    four heuristic 0/1 flags are computed:

    * ``blur``      - variance of the Laplacian below 60
    * ``shadow``    - mean grayscale level below 45 (via ``is_dark``)
    * ``occlusion`` - grayscale histogram entropy below 4.5
    * ``lowres``    - shortest image side below 80 px

    Rows whose path is missing, not a string, or not present on disk are skipped.

    Args:
        inference_csv: CSV with a ``filepath`` column; ``true_label``,
            ``predicted_label`` and ``confidence`` are copied through when present.
        out_csv: Destination path of the report CSV.

    Returns:
        ``out_csv``, unchanged, after the report has been written.
    """
    import csv
    import pandas as pd

    df = pd.read_csv(inference_csv)
    # consider only S1 mistakes if you logged them; otherwise filter low conf or S2 mismatches
    rows = []
    for _, r in df.iterrows():
        path = r['filepath']
        # An empty CSV cell is read back as NaN (a float), which os.path.exists rejects.
        if not isinstance(path, str) or not os.path.exists(path):
            continue

        # Context manager closes the file handle as soon as the pixels are decoded.
        with Image.open(path) as opened:
            img = opened.convert("RGB")

        arr = np.array(img)
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
        blur = variance_of_laplacian(gray) < 60         # motion blur proxy
        dark = is_dark(img, thresh=45)                  # shadows proxy
        lowres = min(img.size) < 80

        # Cheap occlusion proxy: entropy of the grayscale histogram.
        hist = cv2.calcHist([gray], [0], None, [256], [0, 256]).flatten()
        prob = hist / hist.sum()
        ent = -np.sum((prob + 1e-9) * np.log2(prob + 1e-9))
        occl = ent < 4.5                                 # low entropy proxy

        rows.append([path, r.get('true_label', '?'), r.get('predicted_label', '?'), r.get('confidence', np.nan),
                     int(blur), int(dark), int(occl), int(lowres)])

    # newline="" is what the csv module requires; the previous `line=""` keyword
    # is not accepted by open() and raised TypeError before anything was written.
    with open(out_csv, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["filepath", "true", "pred", "conf", "blur", "shadow", "occlusion", "lowres"])
        w.writerows(rows)
    return out_csv

# --- Ablation summary utility for Stage 1 ---
# Summarizes precision/recall/F1 for S1 given (T, tau).
def summarize_s1(eval_ds, model, processor, device, T: float, tau: float):
    """Compute Stage-1 precision/recall/F1 for a given temperature and threshold.

    Each example's logits are divided by ``max(T, 1e-3)``, the softmax
    probability of the 'relevant' class is taken, and an example is predicted
    relevant when that probability is at least ``tau``.

    Args:
        eval_ds: Iterable of examples exposing ``"image"`` and ``"label"``.
        model: Classifier returning an object with ``.logits``.
        processor: Image processor called as ``processor(images=..., return_tensors="pt")``.
        device: Device the processed inputs are moved to.
        T: Temperature applied to the logits.
        tau: Decision threshold on the 'relevant' probability.

    Returns:
        Dict with ``precision``, ``recall`` and ``f1`` (rounded to 3 decimals)
        plus the ``tau`` and ``T`` that were used.
    """
    import numpy as np

    truths = []
    relevant_probs = []
    model.eval()
    with torch.no_grad():
        for example in eval_ds:
            raw_image = example["image"]
            label_id = int(example["label"])

            # Force 3-channel RGB so the processor never sees a 2-D array
            rgb_image = _ensure_rgb(raw_image)

            batch = processor(images=rgb_image, return_tensors="pt").to(device)
            scaled_logits = model(**batch).logits / max(T, 1e-3)
            prob_relevant = torch.softmax(scaled_logits, dim=-1)[0, label2id_s1['relevant']].item()
            truths.append(label_id == label2id_s1['relevant'])
            relevant_probs.append(prob_relevant)

    truths = np.array(truths, bool)
    relevant_probs = np.array(relevant_probs, float)
    predicted = relevant_probs >= tau

    tp = (predicted & truths).sum()
    fp = (predicted & ~truths).sum()
    fn = (~predicted & truths).sum()

    if tp + fp > 0:
        prec = tp / (tp + fp)
    else:
        prec = 0.0
    if tp + fn > 0:
        rec = tp / (tp + fn)
    else:
        rec = 0.0
    if prec + rec > 0:
        f1 = 2 * prec * rec / (prec + rec)
    else:
        f1 = 0.0

    return {"precision": round(prec, 3), "recall": round(rec, 3), "f1": round(f1, 3), "tau": tau, "T": T}


# --- Attention Rollout heatmaps for ViT (offline) ---
def vit_attention_rollout(model, inputs, discard_ratio=0.9):
    """Placeholder for ViT attention rollout; currently does nothing.

    The intended result is an [H, W] mask normalized to 0..1 that can be
    overlaid on the input image. The implementation is still omitted, so the
    function returns None regardless of its arguments.
    """
    return None

In [6]:
# --------------------------
# 4. Main Training Script
# --------------------------

def main(device):
    """Load the Stage-1 relevance model from disk, then train and save the Stage-2 emotion classifier.

    Steps, in order:
      1. Verify that PRETRAINED_CHECKPOINT_PATH exists.
      2. Prepare the two-stage datasets (when PREPARE_DATASETS is truthy) or
         reuse the directories under OUTPUT_ROOT_DIR/prepared_datasets.
      3. Load the Stage-1 model and image processor from
         SAVE_DIR/relevance_filter_model, falling back to a hard-coded V34
         directory when that folder is missing. No Stage-1 training happens here.
      4. Build the Stage-2 dataset split, model, augmentation map, loss,
         TrainingArguments and CustomLossTrainer, and replace the trainer's
         train dataloader with one driven by a class-weighted sampler.
      5. Train Stage 2 and save the resulting model plus processor to SAVE_DIR.

    Side effects: rebinds the module-level names ``trainer_s1`` (a
    SimpleNamespace wrapping the loaded Stage-1 model) and ``trainer_s2``.

    Args:
        device: Device the Stage-1 and Stage-2 models are moved to.

    Returns:
        Tuple ``(model_s1, trainer_s2.model, processor)``.

    Raises:
        FileNotFoundError: If PRETRAINED_CHECKPOINT_PATH does not exist.
    """
    # Make trainer objects accessible to metrics function
    global trainer_s1, trainer_s2
    
    # --- Sanity Check for Checkpoint Path ---
    if not os.path.exists(PRETRAINED_CHECKPOINT_PATH):
        raise FileNotFoundError(f"Fatal: Pretrained checkpoint not found at {PRETRAINED_CHECKPOINT_PATH}")

    # --- Define specific model paths from the latest checkpoint ---
    # NOTE(review): s1_checkpoint_path is not read again in this function;
    # Stage 1 is loaded from S1_LOAD_DIR below instead.
    s1_checkpoint_path = os.path.join(PRETRAINED_CHECKPOINT_PATH, "relevance_filter_model")
    s2_checkpoint_path = os.path.join(PRETRAINED_CHECKPOINT_PATH, "emotion_classifier_model")

    # The device is now passed in, so the local definition is removed.
    print(f"\nüñ•Ô∏è Using device: {device}")

    # --- Step 0: Prepare Datasets ---
    # This function copies files into the required two-stage structure.
    # It only needs to be run once.
    prepared_data_path = os.path.join(OUTPUT_ROOT_DIR, "prepared_datasets")
    if PREPARE_DATASETS:
        stage1_dataset_path, stage2_dataset_path = prepare_hierarchical_datasets(BASE_DATASET_PATH, prepared_data_path)
    else:
        stage1_dataset_path = os.path.join(prepared_data_path, "stage_1_relevance_dataset")
        stage2_dataset_path = os.path.join(prepared_data_path, "stage_2_emotion_dataset")
        print("‚úÖ Skipping dataset preparation, using existing directories.")
    
    # # --- Set hardware device ---
    # # commented out due to present mps and pytorch incompatibilities
    # device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    # print(f"\nUsing device: {device}")


    # ============================================================
    #   STAGE 1: LOAD RELEVANCE FILTER FROM SAVED ARTIFACTS
    # ============================================================

    # We are NOT retraining S1 for this run
    # NOTE(review): these two flags are assigned but never read in this function.
    TRAIN_STAGE1 = False
    CALIBRATE_STAGE1 = False
    
    # Reuse the already-created V35 directory as SAVE_DIR (must be defined earlier)
    # SAVE_DIR should be: "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112"
    # Ensure SAVE_DIR exists and you printed it earlier like: print(f"Using SAVE_DIR: {SAVE_DIR}")
    
    # Prefer loading S1 from the SAME V35 run; fallback to V34 if missing
    S1_LOAD_DIR = os.path.join(SAVE_DIR, "relevance_filter_model")
    if not os.path.isdir(S1_LOAD_DIR):
        # Hard-coded absolute path: this fallback only works on the original author's machine.
        S1_LOAD_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V34_20251013_211825/relevance_filter_model"
    print(f"üîé Stage-1 load path: {S1_LOAD_DIR}")
    
    # Device should already be defined earlier; if not, uncomment:
    # device = torch.device("cpu")
    
    # Load S1 model + processor from disk (no training)
    model_s1  = ViTForImageClassification.from_pretrained(S1_LOAD_DIR).to(device)
    processor = AutoImageProcessor.from_pretrained(S1_LOAD_DIR)
    print("‚è≠Ô∏è Loaded Stage 1 model/processor from disk (training skipped).")

    # Stand-in for a real Trainer: exposes only `.model` and `.args` (None).
    trainer_s1 = SimpleNamespace(model=model_s1, args=None)

    
    # Optional: load S1 calibration so inference gate can use it later
    # The file is looked up next to the model folder (the parent of S1_LOAD_DIR).
    # NOTE(review): s1_calib is only printed here; it is neither returned nor stored globally.
    s1_calib = None
    calib_path = os.path.join(os.path.dirname(S1_LOAD_DIR), "stage1_calibration.json")
    if os.path.isfile(calib_path):
        with open(calib_path, "r") as f:
            s1_calib = json.load(f)
        try:
            print(f"üìè Loaded S1 calibration: T={s1_calib.get('T')} œÑ={s1_calib.get('tau')}")
        except Exception:
            # Best-effort log line only; a malformed calibration payload must not stop the run.
            pass


    # ==========================================================================
    #   STAGE 2: TRAIN EMOTION CLASSIFIER (11-CLASS)
    # ==========================================================================
    print("\n" + "="*60)
    print(f"  STAGE 2: TRAINING EMOTION CLASSIFIER ({len(RELEVANT_CLASSES)}-CLASS)")
    print("="*60)

    # --- Load Stage 2 data ---
    # 80/20 train/validation split with a fixed seed so the split is repeatable.
    stage2_output_dir = os.path.join(SAVE_DIR, "stage_2_emotion_model_training")
    dataset_s2 = load_dataset("imagefolder", data_dir=stage2_dataset_path, split='train').train_test_split(test_size=0.2, seed=42)
    train_dataset_s2 = dataset_s2["train"]
    eval_dataset_s2 = dataset_s2["test"]
    print(f"Stage 2: {len(train_dataset_s2)} training samples, {len(eval_dataset_s2)} validation samples.")
    print("Stage 2 Label Distribution (Train):", Counter(train_dataset_s2['label']))


    # --- Configure Stage 2 model ---
    # Load the pretrained Stage-2 checkpoint with a classifier head sized to RELEVANT_CLASSES.
    model_s2 = ViTForImageClassification.from_pretrained(
        s2_checkpoint_path, # <-- Use the specific path for the Stage 2 model
        num_labels=len(RELEVANT_CLASSES),
        label2id=label2id_s2,
        id2label=id2label_s2,
        ignore_mismatched_sizes=True
    ).to(device)

    # --- Define Augmentation and Loss for Stage 2 ---
    # Apply stronger augmentation to the minority classes to help the model learn them better.
    minority_aug = T.Compose([
        RandAugment(num_ops=2, magnitude=11),  
        T.RandomResizedCrop(224, scale=(0.7, 1.0)),
        T.ColorJitter(0.3, 0.3, 0.3, 0.1),
    ])
    minority_classes_s2 = [label2id_s2[n] for n in ['disgust','questioning','contempt','fear']]
    minority_augment_map_s2 = {lid: minority_aug for lid in minority_classes_s2}
    
    # very mild, targeted aug ONLY for the weakest classes
    mild_aug = T.Compose([
        T.RandomResizedCrop(224, scale=(0.95, 1.0)),
        T.RandomHorizontalFlip(),
        T.ColorJitter(0.05, 0.05, 0.05, 0.02),
        T.RandomAffine(degrees=3, translate=(0.02, 0.02), scale=(0.98, 1.02)),
    ])

    # targeted mild augmentation for fragile classes
    #     - Keep 'sadness' and 'speech_action' on very mild pipeline (no RandAug)
    # NOTE(review): an earlier note said this list was extended to 'neutral_speech',
    # but only 'sadness' and 'speech_action' are listed below - confirm which is intended.
    targeted_mild_classes = [
        label2id_s2['sadness'],
        label2id_s2['speech_action'],
    ]
    targeted_mild_map_s2 = {label_id: mild_aug for label_id in targeted_mild_classes}

    # MERGE: single mapping passed to the collator (class id -> transform)
    # On a key collision the mild map wins because it is unpacked last.
    augment_dict = {**minority_augment_map_s2, **targeted_mild_map_s2}

    # Custom loss: smoothing=0.05 with special handling for the classes named in
    # target_class_names, plus focal_gamma=1.6.
    # NOTE(review): the exact treatment of the target classes is defined in
    # TargetedSmoothedCrossEntropyLoss, which is not visible here - confirm there.
    loss_fct_s2 = TargetedSmoothedCrossEntropyLoss(
        smoothing=0.05,
        target_class_names=['sadness', 'speech_action'],
        label2id_map=label2id_s2,
        focal_gamma=1.6   # <-- NEW restored 1.6 gamma like V33
    )

    # --- Set up Stage 2 Trainer ---
    # Adding weight decay, cosine scheduler + warmup, grad accumulation improves stability 
        # (especially on CPU/small batch) without altering your high-level flow.
    
    training_args_s2 = TrainingArguments(
        output_dir=stage2_output_dir,
        overwrite_output_dir=True,            # important: reuse the folder
        evaluation_strategy="epoch",
        save_strategy="epoch",
        use_cpu=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=6,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=os.path.join(stage2_output_dir, "logs"),
        logging_strategy="epoch",
        remove_unused_columns=False,
        weight_decay=0.05,
        lr_scheduler_type="cosine",
        warmup_ratio=0.10,
        gradient_accumulation_steps=2,
        dataloader_num_workers=0,
        learning_rate=4e-5,                   # you set this later; ok to put here
    )

    # --- Set up Stage 2 Trainer ---
    # As with Stage 1, the complex fine-tuning strategy implemented in V31 failed. 
        # This change reverts the Stage 2 training process to V30's more effective 
        # uniform learning rate strategy to restore model performance.
    # Redundant with learning_rate=4e-5 passed to TrainingArguments above (same value).
    training_args_s2.learning_rate = 4e-5 # Set learning rate directly

    # skip erasing for fragile classes: sadness and neutral_speech
    # NEW added speech_action
    fragile_ids = [
        label2id_s2['sadness'],
        label2id_s2['neutral_speech'],
        label2id_s2['speech_action']
    ]

    early_stop_callback = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.0
    )

    # Use the CustomLossTrainer again, passing the targeted loss function.
    trainer_s2 = CustomLossTrainer(
        model=model_s2,
        args=training_args_s2,
        train_dataset=train_dataset_s2,
        eval_dataset=eval_dataset_s2,
        compute_metrics=partial(compute_metrics_with_confusion, 
                                label_names=RELEVANT_CLASSES, 
                                stage_name="Stage2"),
        data_collator=DataCollatorWithAugmentation(
            processor=processor,
            augment_dict=augment_dict,                  # your merged S2 map
            random_erasing_prob=0.10,
            random_erasing_scale=(0.02, 0.08),
            skip_erasing_label_ids=fragile_ids          # <-- skip erasing for fragile classes
        ),
        loss_fct=loss_fct_s2, # Pass custom loss function
        callbacks=[early_stop_callback] # Keep early stopping
    )

    # --- Single sampler override (bind AFTER trainer_s2 exists) ---
    # Inverse-frequency class weights, with extra boosts for 'sadness' (x2.5)
    # and 'speech_action' (x2.0); each sample inherits its class's weight.
    labels = np.array(train_dataset_s2["label"])
    num_classes = len(label2id_s2)
    
    class_counts  = np.bincount(labels, minlength=num_classes)
    class_weights = 1.0 / np.clip(class_counts, 1, None)  # clip avoids division by zero for empty classes
    class_weights[label2id_s2['sadness']]       *= 2.5
    class_weights[label2id_s2['speech_action']]  *= 2.0
    
    sample_weights = torch.as_tensor(class_weights[labels], dtype=torch.float)
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
    
    def _custom_train_loader():
        # Replacement for trainer_s2.get_train_dataloader that draws batches via the weighted sampler.
        return DataLoader(
            train_dataset_s2,
            batch_size=training_args_s2.per_device_train_batch_size,
            sampler=sampler,                      # no shuffle when sampler is set
            collate_fn=trainer_s2.data_collator,
            num_workers=0,                        # avoid pickling collator in notebook
            pin_memory=False                      # CPU run; pin not needed
        )
    
    trainer_s2.get_train_dataloader = _custom_train_loader
    # --- end override ---


    # --- Train Stage 2 model ---
    print("üöÄ Starting Stage 2 training...")
    start_time_s2 = time.time() # Record start time
    trainer_s2.train()
    end_time_s2 = time.time()   # Record end time
    
    # Calculate and print the duration
    duration_s2 = end_time_s2 - start_time_s2
    print(f"‚åõ Stage 2 training took: {time.strftime('%H:%M:%S', time.gmtime(duration_s2))}")
    save_model_and_processor(trainer_s2.model, processor, SAVE_DIR, model_name="emotion_classifier_model")
    print("\n‚úÖ Stage 2 Training Complete.")
    print("\nüéâ Hierarchical Training Pipeline Finished Successfully.")

    
    # Return the loaded Stage-1 model, the trained Stage-2 model and the shared
    # processor so the analysis functions can reuse them.
    return model_s1, trainer_s2.model, processor

In [7]:
# ----------------------------------
# 5. Hierarchical Inference
# ----------------------------------
# This function defines the two-step prediction pipeline for images.
# It first checks for relevance (Stage 1) and then classifies the emotion (Stage 2).

def _load_stage1_calibration(calib_path, default_T=1.0, default_tau=0.45):
    """Return ``(T, tau)`` from a Stage-1 calibration JSON, or the defaults if it is missing or unreadable."""
    if not os.path.exists(calib_path):
        return default_T, default_tau
    try:
        with open(calib_path, "r") as f:
            calib = json.load(f)
        return float(calib.get("T", default_T)), float(calib.get("tau", default_tau))
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # Stay best-effort, but say so: a silently ignored calibration file
        # changes the relevance gate without leaving any trace in the logs.
        print(f"Could not read Stage-1 calibration at {calib_path} ({exc}); using defaults.")
        return default_T, default_tau


def hierarchical_predict(image_paths, model_s1, model_s2, processor, device, batch_size=32):
    """Run the two-stage (relevance -> emotion) prediction over a list of image paths.

    Stage 1 logits are divided by the temperature ``T`` and an image is treated
    as relevant when its 'relevant' probability is at least ``tau``. Both values
    come from ``SAVE_DIR/stage1_calibration.json`` (defaults: T=1.0, tau=0.45)
    and are read once per call. Only relevant images are passed to Stage 2.

    Args:
        image_paths: Sequence of image file paths.
        model_s1: Stage-1 relevance classifier (switched to eval mode).
        model_s2: Stage-2 emotion classifier (switched to eval mode).
        processor: Image processor shared by both models.
        device: Device the processed batch is moved to.
        batch_size: Number of paths handled per forward pass.

    Returns:
        List of dicts with keys ``image_path``, ``prediction`` and
        ``confidence``, in input order. Images that cannot be opened are
        omitted; the number skipped is printed at the end.
    """
    results = []
    if len(image_paths) == 0:
        # Nothing to score: avoid touching the models or the calibration file.
        return results

    # Calibration does not change between batches, so load it once up front.
    calib_path = os.path.join(SAVE_DIR, "stage1_calibration.json")
    T_s1, tau = _load_stage1_calibration(calib_path)
    T_s1 = max(T_s1, 1e-3)  # guard against a zero/negative temperature

    relevant_id = label2id_s1['relevant']
    irrelevant_id = label2id_s1['irrelevant']

    # Inference must run with dropout (and similar layers) disabled.
    model_s1.eval()
    model_s2.eval()

    skipped = 0
    for i in tqdm(range(0, len(image_paths), batch_size), desc="üî¨ Running Hierarchical Inference"):
        batch_paths = image_paths[i:i+batch_size]
        images = []
        valid_paths = []
        for path in batch_paths:
            try:
                with Image.open(path) as opened:
                    img = opened.convert("RGB")
            except Exception:
                # Unreadable or corrupt file: leave it out of the results, but count it.
                skipped += 1
                continue
            images.append(img)
            valid_paths.append(path)

        if not images:
            continue

        inputs = processor(images=images, return_tensors="pt").to(device)

        # --- Stage 1: temperature-scaled probabilities, gated on tau ---
        with torch.no_grad():
            logits_s1 = model_s1(**inputs).logits / T_s1
            probs_s1 = F.softmax(logits_s1, dim=-1)
        relevant_mask = probs_s1[:, relevant_id] >= tau

        # --- Stage 2: only the images that passed the relevance gate ---
        if relevant_mask.any():
            relevant_inputs = {k: v[relevant_mask] for k, v in inputs.items()}
            with torch.no_grad():
                logits_s2 = model_s2(**relevant_inputs).logits
                probs_s2 = F.softmax(logits_s2, dim=-1)
                confs_s2, preds_s2 = torch.max(probs_s2, dim=-1)

        # --- Aggregate: walk the batch in order, consuming Stage-2 outputs as needed ---
        s2_idx = 0
        for j, path in enumerate(valid_paths):
            if relevant_mask[j]:
                pred_label = id2label_s2[preds_s2[s2_idx].item()]
                confidence = confs_s2[s2_idx].item()
                s2_idx += 1
            else:
                pred_label = "irrelevant"
                # Calibrated probability of the 'irrelevant' class for this image.
                confidence = probs_s1[j, irrelevant_id].item()

            results.append({
                "image_path": path,
                "prediction": pred_label,
                "confidence": confidence
            })

    if skipped:
        print(f"Skipped {skipped} image(s) that could not be opened.")
    return results

In [8]:
# ==============================================================================
# 6. Post-Training Analysis, Review, and Curation
# ==============================================================================

def run_post_training_analysis(model_s1, model_s2, processor, device, base_dataset_path, save_dir, version):
    """Run inference over the whole dataset and write review/curation artifacts.

    Everything is written under ``save_dir``:
      * ``{version}_full_inference_log.csv`` - one row per image with the
        prediction, confidence and the true label (the parent folder name).
      * ``review_candidates_by_predicted_class/`` - copies of images whose
        confidence is below REVIEW_CONF_THRESHOLD, grouped by predicted class.
      * ``curation_shortlist_{version}.csv`` and ``curated_additions_{version}.csv``
        - focus-class shortlist and an empty correction template.
      * ``curation_shortlist_merged.csv`` / ``curated_additions_merged.csv``
        - this run merged with the V32 files, when those exist.
      * ``hard_negatives_<a>_vs_<b>.csv`` - rows confused between selected class pairs.

    Args:
        model_s1: Stage-1 relevance model.
        model_s2: Stage-2 emotion model.
        processor: Image processor shared by both models.
        device: Device used for inference.
        base_dataset_path: Root folder searched recursively for images.
        save_dir: Output folder for every artifact listed above.
        version: Version tag used in output file names.

    Returns:
        None.
    """
    import pandas as pd   # ensure pd is local; prevents UnboundLocalError in notebooks

    print("\n" + "="*60)
    print("  RUNNING POST-TRAINING ANALYSIS & CURATION WORKFLOW")
    print("="*60)

    # --- Part A: Run Hierarchical Inference on the Entire Dataset ---
    all_image_paths = [str(p) for p in Path(base_dataset_path).rglob("*") if is_valid_image(p.name)]
    print(f"Found {len(all_image_paths)} images to process for inference.")

    predictions = hierarchical_predict(all_image_paths, model_s1, model_s2, processor, device)
    if not predictions:
        # An empty frame has no 'image_path' column; everything below would raise KeyError.
        print("No predictions were produced; skipping post-training analysis.")
        return
    df = pd.DataFrame(predictions)

    # Derive true label from path for analysis
    df['true_label'] = df['image_path'].apply(lambda p: Path(p).parent.name)

    # Save the full log
    full_log_path = os.path.join(save_dir, f"{version}_full_inference_log.csv")
    df.to_csv(full_log_path, index=False)
    print(f"\n‚úÖ Full inference log saved to: {full_log_path}")

    # --- Part B: Identify and Organize Images for Manual Review ---
    # Tag images with low confidence as "REVIEW"
    review_threshold = REVIEW_CONF_THRESHOLD
    review_df = df[df['confidence'] < review_threshold]

    review_sort_dir = os.path.join(save_dir, "review_candidates_by_predicted_class")
    os.makedirs(review_sort_dir, exist_ok=True)

    print(f"\nFound {len(review_df)} images below {review_threshold} confidence for review.")
    for _, row in tqdm(review_df.iterrows(), total=len(review_df), desc="Sorting review images"):
        dest_dir = os.path.join(review_sort_dir, row['prediction'])
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(row['image_path'], dest_dir)
    print(f"üìÇ Sorted review images into folders at: {review_sort_dir}")

    # --- Generate shortlist and curated patch CSVs for THIS run ---
    #     - Shortlist: low-confidence items in focus classes (for targeted manual review)
    #     - Curated patch: template CSV for corrected labels to be fed back into VNext
    focus_classes = ['sadness','speech_action','neutral','neutral_speech','happiness']

    short_csv = os.path.join(save_dir, f"curation_shortlist_{version}.csv")
    patch_csv = os.path.join(save_dir, f"curated_additions_{version}.csv")

    # Defensive: accept either naming of the predicted-label column
    pred_col = next((c for c in ('prediction', 'predicted_label') if c in df.columns), None)
    if pred_col is not None:
        # Sort by confidence ascending (uncertain first)
        df_focus = df[df[pred_col].isin(focus_classes)].copy()
        if 'confidence' in df_focus.columns:
            df_focus = df_focus.sort_values('confidence', ascending=True)

        # Write shortlist with a stable set of columns
        keep_cols = [c for c in ['image_path','filepath','true_label',pred_col,'confidence'] if c in df_focus.columns]
        df_focus[keep_cols].to_csv(short_csv, index=False)
        print(f"‚úÖ Shortlist written: {short_csv}")

        # Create empty curated patch template
        src_path_col = 'image_path' if 'image_path' in df_focus.columns else 'filepath'
        patch_df = pd.DataFrame({
            "filepath": df_focus[src_path_col],
            "correct_label": "",
            "notes": ""
        })
        patch_df.to_csv(patch_csv, index=False)
        print(f"‚úÖ Curated patch template written: {patch_csv}")
    else:
        print("‚ÑπÔ∏è Skipped shortlist/patch CSVs: no predicted label column found in full log.")

    # --- Merge this run's shortlist/patch with V32 to create canonical merged artifacts ---
    # Column-name variants mapped to the canonical names used for de-duplication.
    column_aliases = {
        "filepath": "image_path",
        "path": "image_path",
        "prediction": "predicted_label",
        "predicted": "predicted_label",
    }

    def _normalize_cols(frame):
        """Rename path / predicted-label column variants to their canonical names."""
        colmap = {}
        # path columns
        if "image_path" not in frame.columns:
            if "filepath" in frame.columns:
                colmap["filepath"] = "image_path"
            elif "path" in frame.columns:
                colmap["path"] = "image_path"
        # predicted label columns
        if "predicted_label" not in frame.columns:
            if "prediction" in frame.columns:
                colmap["prediction"] = "predicted_label"
            elif "predicted" in frame.columns:
                colmap["predicted"] = "predicted_label"
        return frame.rename(columns=colmap)

    def _merge_csvs(csv_list, key_cols, out_csv):
        """Concatenate the CSVs that exist, drop duplicates on ``key_cols``, write ``out_csv``."""
        frames = []
        for p in csv_list:
            if not os.path.exists(p):
                continue
            try:
                frames.append(_normalize_cols(pd.read_csv(p)))
            except Exception as exc:
                # Keep merging the remaining files, but do not hide the failure.
                print(f"Could not read {p} for merge ({exc}); skipping it.")

        if not frames:
            return

        merged = pd.concat(frames, ignore_index=True)

        # The frames were renamed above, so the requested keys must be translated
        # the same way; otherwise 'prediction' silently drops out of the key and
        # rows are de-duplicated on the path alone.
        available_keys = []
        for k in key_cols:
            canonical = k if k in merged.columns else column_aliases.get(k)
            if canonical in merged.columns and canonical not in available_keys:
                available_keys.append(canonical)
        if not available_keys:
            print(f"‚ÑπÔ∏è Skipped merge for {out_csv}: none of the key columns {key_cols} exist in merged data.")
            return

        merged = merged.drop_duplicates(subset=available_keys, keep="first")
        merged.to_csv(out_csv, index=False)
        print(f"‚úÖ Merged: {out_csv} ({len(merged)} rows)")

    # V32 paths (if present)
    v32_short = os.path.join(save_dir, "curation_shortlist_V32.csv")
    v32_patch = os.path.join(save_dir, "curated_additions_V32.csv")

    # Canonical merged outputs
    short_merged = os.path.join(save_dir, "curation_shortlist_merged.csv")
    patch_merged = os.path.join(save_dir, "curated_additions_merged.csv")

    # Merge (shortlist merges on [filepath, predicted_label]; patch merges on [filepath])
    if pred_col is not None:
        # Figure out the filepath column available
        path_col = next((c for c in ('image_path', 'filepath') if c in df.columns), None)

        if path_col is not None:
            _merge_csvs([v32_short, short_csv], key_cols=[path_col, pred_col], out_csv=short_merged)
            _merge_csvs([v32_patch, patch_csv], key_cols=[path_col], out_csv=patch_merged)
        else:
            print("‚ÑπÔ∏è Skipped merge: no filepath column present in full log.")
    else:
        print("‚ÑπÔ∏è Skipped merge: no predicted label column present in full log.")


    # --- Part C: Mine for "Hard Negative" Confusion Pairs (toggleable & robust) ---
    MINING_HARD_NEGATIVES = True  # set False for deployment runs

    if MINING_HARD_NEGATIVES:
        # Prefer the freshly generated full log from THIS run; fallback to prior runs only if missing.
        # All lookups and outputs use the `save_dir` argument so this step stays
        # in the same folder as every other artifact of the run.
        inference_log_path = full_log_path
        if not os.path.exists(inference_log_path):
            fallback_logs = [
                os.path.join(save_dir, "V33_full_inference_log.csv"),
                os.path.join(save_dir, "V32_full_inference_log.csv"),
            ]
            inference_log_path = next((p for p in fallback_logs if os.path.exists(p)), None)

        # os.path.exists(None) raises TypeError, so test for None explicitly.
        if inference_log_path is None:
            print("‚è© Skipping hard-negative mining: no full inference log found.")
        else:
            print("\n‚õèÔ∏è  Mining for hard negative confusion pairs...")
            print(f"   using: {inference_log_path}")
            log_df = pd.read_csv(inference_log_path)

            # Normalize column names between runs (V32 used 'prediction', V33 uses 'predicted_label')
            cols = {c.lower(): c for c in log_df.columns}
            col_true = cols.get("true_label", "true_label")
            col_pred = cols.get("predicted_label") or cols.get("prediction")
            if col_pred is None:
                raise RuntimeError(f"Could not find predicted label column in {log_df.columns.tolist()}")

            # (Optional) keep a stable sort by confidence descending if available
            col_conf = cols.get("confidence")
            if col_conf:
                log_df = log_df.sort_values(col_conf, ascending=False)

            # Which pairs to mine
            confusion_pairs_to_mine = [
                ('contempt', 'questioning'),
                ('contempt', 'neutral'),
                ('fear', 'surprise')
            ]

            for c1, c2 in confusion_pairs_to_mine:
                # Confusions in either direction count for the pair.
                mask = ((log_df[col_true] == c1) & (log_df[col_pred] == c2)) | \
                       ((log_df[col_true] == c2) & (log_df[col_pred] == c1))
                hard_negatives = log_df.loc[mask]

                if not hard_negatives.empty:
                    out_path = os.path.join(save_dir, f"hard_negatives_{c1}_vs_{c2}.csv")
                    hard_negatives.to_csv(out_path, index=False)
                    print(f"  - Found {len(hard_negatives)} hard negatives for ({c1} ‚Üî {c2}). Saved: {out_path}")
                else:
                    print(f"  - No hard negatives found for ({c1} ‚Üî {c2}).")
    else:
        print("‚è© Hard-negative mining disabled (set MINING_HARD_NEGATIVES=True to enable).")

In [9]:
# ==============================================================================
# 7. Model Calibration
# ==============================================================================

def apply_temperature_scaling(logits, labels):
    """Fit a single temperature that minimizes cross-entropy of ``logits / T``.

    The temperature starts at 1.5 and is optimized with one L-BFGS step call
    (lr=0.01, up to 50 inner iterations).

    Args:
        logits: Array-like of raw logits, one row per sample.
        labels: Array-like of integer class ids.

    Returns:
        The fitted temperature as a Python float.
    """
    scores = torch.tensor(logits, dtype=torch.float32)
    targets = torch.tensor(labels, dtype=torch.long)

    # The only learnable quantity: a scalar temperature, initialised at 1.5
    temperature = nn.Parameter(torch.ones(1) * 1.5)
    optimizer = LBFGS([temperature], lr=0.01, max_iter=50)

    def closure():
        # L-BFGS re-evaluates the objective itself, so it needs a closure
        optimizer.zero_grad()
        nll = F.cross_entropy(scores / temperature, targets)
        nll.backward()
        return nll

    optimizer.step(closure)
    return temperature.item()

def plot_reliability_diagram(logits, labels, temperature, save_dir, version, stage_name):
    """Compute max-softmax confidences before/after temperature scaling (plotting is a stub).

    ``logits`` and ``labels`` must be NumPy arrays. The confidences are
    computed, but the actual plotting has not been ported yet: the function
    only prints a placeholder message and returns None.
    """
    logit_tensor = torch.from_numpy(logits)
    label_tensor = torch.from_numpy(labels)

    # Uncalibrated confidence: highest softmax probability per row
    uncalibrated_conf = F.softmax(logit_tensor, dim=1).max(dim=1).values

    # Calibrated confidence: same, after dividing the logits by the temperature
    calibrated_conf = F.softmax(logit_tensor / temperature, dim=1).max(dim=1).values

    # The detailed plotting code from the earlier script still has to be ported here.
    print(f"üìä Reliability diagram generation logic would go here.")

In [10]:
# ==============================================================================
# 8. Hierarchical Model Ensembling
# ==============================================================================

def hierarchical_ensemble_predict(image_path, processor, s1_models, s2_models, device):
    """Classify one image with a two-stage ensemble.

    Stage 1 models vote (argmax each, majority wins) on relevance; if the
    winning vote is not ``label2id_s1['relevant']`` the image is rejected.
    Otherwise the Stage 2 models' softmax outputs are averaged and the
    highest-probability class is reported.

    Args:
        image_path: Path of the image to open.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``.
        s1_models: Iterable of Stage 1 models exposing ``.logits`` on their output.
        s2_models: Iterable of Stage 2 models exposing ``.logits`` on their output.
        device: Device the processed inputs are moved to.

    Returns:
        tuple: ``(None, None)`` if loading/preprocessing raised,
        ``("irrelevant", None)`` if Stage 1 rejected the image, otherwise
        ``(label, confidence)`` from the averaged Stage 2 probabilities.
    """
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        batch = processor(images=rgb_image, return_tensors="pt").to(device)
    except Exception:
        # Best-effort: any load/preprocess failure is reported as "no result".
        return None, None

    # --- Stage 1: one hard vote per model ---
    with torch.no_grad():
        relevance_votes = [
            torch.argmax(gate(**batch).logits, dim=-1).item()
            for gate in s1_models
        ]

    # most_common(1) keeps the first-seen vote when counts tie.
    winning_vote, _ = Counter(relevance_votes).most_common(1)[0]
    if winning_vote != label2id_s1['relevant']:
        return "irrelevant", None

    # --- Stage 2: soft voting via mean of per-model probabilities ---
    with torch.no_grad():
        per_model_probs = [
            F.softmax(classifier(**batch).logits, dim=-1)
            for classifier in s2_models
        ]

    mean_probs = torch.stack(per_model_probs).mean(dim=0)
    top_prob, top_idx = mean_probs.max(dim=-1)

    return id2label_s2[top_idx.item()], top_prob.item()

In [11]:
# ==============================================================================
# 9. Script Execution Entry Point
# ==============================================================================
if __name__ == "__main__":
    # Entry point: train/load the models, optionally run the inference pass,
    # check deployment readiness, calibrate Stage 2, then run a one-image
    # ensemble demo against a previous run's saved models.

    # Define the device once for the entire script run.
    device = torch.device("cpu")

    # --- Step 1: Execute Training Pipeline ---
    # main() is expected to hand back (Stage 1 model, Stage 2 model, processor).
    model_s1, model_s2, processor = main(device)

    # --- Step 2: Run Post-Training Analysis & Curation ---
    if RUN_INFERENCE:
        # Uses the in-memory models returned from main().
        run_post_training_analysis(model_s1, model_s2, processor, device, BASE_DATASET_PATH, SAVE_DIR, VERSION)

    # --- Step 3: Run Final Model Checks ---
    stage2_metrics_path = os.path.join(SAVE_DIR, "per_class_metrics_Stage2.csv")
    check_deployment_readiness(stage2_metrics_path, f1_threshold=0.80)

    # --- Step 4: Calibrate the Stage 2 Model ---
    logits_s2_path = os.path.join(SAVE_DIR, f"logits_eval_Stage2_{VERSION}.npy")
    labels_s2_path = os.path.join(SAVE_DIR, f"labels_eval_Stage2_{VERSION}.npy")

    if os.path.exists(logits_s2_path) and os.path.exists(labels_s2_path):
        print("\n" + "="*60)
        print("  CALIBRATING STAGE 2 MODEL")
        print("="*60)
        logits_s2 = np.load(logits_s2_path)
        labels_s2 = np.load(labels_s2_path)

        optimal_temp = apply_temperature_scaling(logits_s2, labels_s2)
        print(f"‚úÖ Optimal temperature for Stage 2 model: {optimal_temp:.4f}")
        # plot_reliability_diagram(logits_s2, labels_s2, optimal_temp, SAVE_DIR, VERSION, "Stage2")
    else:
        print("‚ö†Ô∏è Skipping calibration, logits/labels files for Stage 2 not found.")

    # TODO: resolve the previous-run folder dynamically instead of hard-coding it.
    # --- Step 5: (Hypothetical) Run Ensemble Analysis ---
    # Use the saved V32 artifacts as the "previous" models for ensembling
    v_prev_path = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114"

    if os.path.exists(v_prev_path):
        print("\n" + "="*60)
        print("  RUNNING HIERARCHICAL ENSEMBLE ANALYSIS (current + V32)")
        print("="*60)

        # Load the older V32 models for the ensemble
        s1_model_prev = AutoModelForImageClassification.from_pretrained(
            os.path.join(v_prev_path, "relevance_filter_model")
        ).to(device).eval()
        s2_model_prev = AutoModelForImageClassification.from_pretrained(
            os.path.join(v_prev_path, "emotion_classifier_model")
        ).to(device).eval()

        # Pair this run's in-memory models with the previous run's models.
        s1_models_ensemble = [model_s1, s1_model_prev]
        s2_models_ensemble = [model_s2, s2_model_prev]

        # Auto-pick a real image from the first non-empty predicted-class folder.
        # Listings are sorted so the same image is chosen on every run
        # (os.listdir order is arbitrary).
        review_root = os.path.join(v_prev_path, "review_candidates_by_predicted_class")
        example_image_path = None
        if os.path.isdir(review_root):
            for cls in sorted(os.listdir(review_root)):
                cls_dir = os.path.join(review_root, cls)
                if os.path.isdir(cls_dir):
                    imgs = sorted(
                        f for f in os.listdir(cls_dir)
                        if f.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".tiff"))
                    )
                    if imgs:
                        example_image_path = os.path.join(cls_dir, imgs[0])
                        break

        if example_image_path and os.path.exists(example_image_path):
            prediction, confidence = hierarchical_ensemble_predict(
                example_image_path, processor, s1_models_ensemble, s2_models_ensemble, device
            )
            image_name = Path(example_image_path).name
            if confidence is None:
                # hierarchical_ensemble_predict returns (None, None) when the image
                # cannot be loaded and ("irrelevant", None) when Stage 1 rejects it;
                # formatting None with ':.2f' would raise TypeError.
                outcome = prediction if prediction is not None else "image could not be processed"
                print(f"Ensemble prediction for {image_name}: {outcome} (no confidence available)")
            else:
                print(f"Ensemble prediction for {image_name}: {prediction} (Confidence: {confidence:.2f})")
        else:
            print("‚ÑπÔ∏è Skipping ensemble demo: no example image found under 'review_candidates_by_predicted_class'.")


üñ•Ô∏è Using device: cpu
‚úÖ Skipping dataset preparation, using existing directories.
üîé Stage-1 load path: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/relevance_filter_model
‚è≠Ô∏è Loaded Stage 1 model/processor from disk (training skipped).
üìè Loaded S1 calibration: T=3.8392443656921387 œÑ=0.3

  STAGE 2: TRAINING EMOTION CLASSIFIER (11-CLASS)


Resolving data files:   0%|          | 0/6175 [00:00<?, ?it/s]

Stage 2: 4940 training samples, 1235 validation samples.
Stage 2 Label Distribution (Train): Counter({9: 1608, 4: 651, 8: 554, 5: 530, 0: 388, 6: 382, 1: 251, 3: 240, 10: 135, 7: 101, 2: 100})
üöÄ Starting Stage 2 training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.047,0.28807,0.874494
2,0.0571,0.261487,0.892308
3,0.0379,0.209864,0.897166
4,0.0271,0.199238,0.910931
5,0.0255,0.216251,0.894737
6,0.021,0.23304,0.893927



üìà Classification Report for Stage2:
                precision    recall  f1-score   support

         anger       0.82      0.99      0.90        85
      contempt       0.68      0.80      0.73        60
       disgust       0.88      0.81      0.84        26
          fear       0.98      0.77      0.87        71
     happiness       0.97      0.91      0.94       167
       neutral       0.93      0.94      0.94       135
   questioning       0.88      0.62      0.73        92
       sadness       0.60      0.70      0.64        40
      surprise       0.99      0.94      0.96       147
neutral_speech       0.87      0.90      0.89       381
 speech_action       0.63      0.84      0.72        31

      accuracy                           0.87      1235
     macro avg       0.84      0.84      0.83      1235
  weighted avg       0.88      0.87      0.87      1235


Top 3 confused class pairs:
  - questioning ‚Üí neutral_speech: 18 instances
  - neutral_speech ‚Üí sadness: 16 inst

NameError: name 'trainer_s1' is not defined

In [20]:
# --- Load Stage-1 model + processor from the V35 folder (no retrain) ---
import os, json
from types import SimpleNamespace
from transformers import ViTForImageClassification, ViTImageProcessor

# Directory inside the current run folder that holds the exported Stage 1 model.
s1_dir = os.path.join(SAVE_DIR, "relevance_filter_model")

# Load the Stage 1 (relevance) classifier from disk instead of retraining it.
model_s1 = ViTForImageClassification.from_pretrained(
    s1_dir,
    num_labels=2,
    label2id=label2id_s1,
    id2label=id2label_s1
).to(device)

# Ensure a ViT processor is in scope: keep an existing one if it has the right
# type, otherwise load the one exported alongside the Stage 1 model.
try:
    processor
except NameError:
    processor = None
if processor is None or not isinstance(processor, ViTImageProcessor):
    processor = ViTImageProcessor.from_pretrained(s1_dir)

# Tiny shim to satisfy any downstream references to trainer_s1.model.
trainer_s1 = SimpleNamespace(model=model_s1, args=None)
print("‚úÖ Loaded S1 model/processor from:", s1_dir)

# Optional Stage 1 calibration (temperature T, threshold tau). When the file is
# absent, fall back to the defaults below instead of raising FileNotFoundError.
calib_path = os.path.join(SAVE_DIR, "stage1_calibration.json")
if os.path.exists(calib_path):
    with open(calib_path, "r") as f:
        calib = json.load(f)
else:
    calib = {}
    print(f"S1 calibration file not found at {calib_path}; using default T and tau.")
T_cal = float(calib.get("T", 1.0))
tau   = float(calib.get("tau", REVIEW_CONF_THRESHOLD))
print(f"üß™ Using S1 calibration: T={T_cal} œÑ={tau}")

‚úÖ Loaded S1 model/processor from: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/relevance_filter_model
üß™ Using S1 calibration: T=3.8392443656921387 œÑ=0.3


In [24]:
# === Signature-safe call to run_post_training_analysis (no retrain) ===
import os, inspect

# ----- Inputs we already have in memory/files -----
# Candidate arguments, keyed by the parameter names run_post_training_analysis
# may declare. Only keys that match its actual signature are passed on.
MODELS_KIT = {
    "model_s1": model_s1,                    # loaded S1
    "model_s2": trainer_s2.model,            # loaded S2
    "processor": processor,                  # ViT processor
    "device": device,                        # torch.device("cpu") or cuda
    "base_dataset_path": BASE_DATASET_PATH,  # your 14-folder dataset root
    "version": VERSION,                      # e.g., "V35_20251014_162112"
    # Preferred outputs/controls
    "save_dir": SAVE_DIR,
    "full_log_filename": "V35_full_inference_log.csv",
    "review_dir": os.path.join(SAVE_DIR, "review_candidates_by_predicted_class"),
    "conf_threshold": REVIEW_CONF_THRESHOLD,
    "temperature": T_cal,                    # from stage1_calibration.json
    "tau": tau,                              # from stage1_calibration.json
    "make_shortlists": False,                # dedupe: keep in curation nb
    "make_mining_pairs": False,
}

sig = inspect.signature(run_post_training_analysis)
pos_args = []
kw_args  = {}

for name, param in sig.parameters.items():
    if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD):
        if name in MODELS_KIT:
            pos_args.append(MODELS_KIT[name])
        elif param.default is not inspect.Parameter.empty:
            # Keep positional alignment by passing the declared default explicitly.
            pos_args.append(param.default)
        else:
            raise TypeError(f"Missing required positional argument for {name}")
    elif param.kind == param.KEYWORD_ONLY:
        if name in MODELS_KIT:
            kw_args[name] = MODELS_KIT[name]
    # (*args/**kwargs not needed here)

# Surface options the function does not declare; they would otherwise be
# dropped silently and the function would run with its own behavior.
ignored_keys = [key for key in MODELS_KIT if key not in sig.parameters]
if ignored_keys:
    print(
        "Note: run_post_training_analysis does not accept these options "
        f"(ignored): {', '.join(ignored_keys)}"
    )

result = run_post_training_analysis(*pos_args, **kw_args)

# Confirm where the log actually landed: prefer the requested file name, and
# otherwise report the most recently modified '*full_inference_log.csv' in SAVE_DIR.
expected_log_path = os.path.join(SAVE_DIR, MODELS_KIT["full_log_filename"])
if os.path.exists(expected_log_path):
    found_logs = [expected_log_path]
elif os.path.isdir(SAVE_DIR):
    found_logs = sorted(
        (
            os.path.join(SAVE_DIR, fname)
            for fname in os.listdir(SAVE_DIR)
            if fname.endswith("full_inference_log.csv")
        ),
        key=os.path.getmtime,
    )
else:
    found_logs = []

if found_logs:
    print(f"‚úÖ Full inference log saved ‚Üí {found_logs[-1]}")
else:
    print(f"WARNING: no full inference log found in {SAVE_DIR}")



  RUNNING POST-TRAINING ANALYSIS & CURATION WORKFLOW
Found 26902 images to process for inference.


üî¨ Running Hierarchical Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 841/841 [23:55<00:00,  1.71s/it]



‚úÖ Full inference log saved to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/V35_20251014_162112_full_inference_log.csv

Found 5850 images below 0.85 confidence for review.


Sorting review images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5850/5850 [00:02<00:00, 2857.83it/s]

üìÇ Sorted review images into folders at: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/review_candidates_by_predicted_class
‚úÖ Shortlist written: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/curation_shortlist_V35_20251014_162112.csv
‚úÖ Curated patch template written: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/curated_additions_V35_20251014_162112.csv
‚úÖ Merged: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/curation_shortlist_merged.csv (2126 rows)
‚úÖ Merged: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/curated_additions_merged.csv (2126 rows)

‚õèÔ∏è  Mining for hard negative confusion pairs...
   using: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V35_20251014_162112/V35_20251014_162112_full_inference_log.csv
  - No hard negatives found for (contempt


