In [None]:
# ==============================================================================
# V31 changes:
    # overview: Re-integrated advanced diagnostics from V28 and automated the end-to-end
        # workflow. Added early stopping, dynamic checkpoint loading, and a full suite
        # of post-hoc analysis tools for model validation, calibration, and curation.
    # section #1 - Replaced the hard-coded PRETRAINED_CHECKPOINT_PATH with a new
        # `find_latest_checkpoint` function that dynamically finds and loads the
        # most recent model run, automating iterative training.
    # section #3 - Overhauled `compute_metrics_with_confusion` to restore advanced
        # diagnostics: per-epoch CSV logging (F1, precision, recall), calculation of
        # top confusion pairs, and class-based prediction entropy.
    # section #3 - Added `check_deployment_readiness` function to automate final
        # validation against a production F1-score threshold.
    # section #4 - Added a sanity check to `main()` to verify the checkpoint path exists.
    # section #4 - Integrated `EarlyStoppingCallback` to prevent overfitting and find
        # the optimal number of training epochs automatically.
    # section #4 - Modified the `main` function to return trained models and the processor,
        # enabling an efficient in-memory workflow for post-training analysis.
    # section #4 - deleted device = torch.device("cpu") since it's defined
        # in main execution block (if __name__...)
    # section #4 - added Discriminative Learning Rate Implementation in training_args_s1
    # section #4 - added 'sadness' and 'speech_action; to minoirity_classes_s2
    # section #4 - created global trainer_s1, trainer_s2 to bypass the 
        # NameError: name 'trainer_s1' is not defined as a result for Hugging Face Trainer 
        # running its evaluation at the end of an epoch calling compute_metrics_with_...
    # section #6 - Added a new `run_post_training_analysis` function to consolidate
        # the entire post-hoc workflow, including inference, low-confidence review
        # candidate generation, and hard-negative mining.
    # section #7 & #8 - Re-integrated `apply_temperature_scaling` for model calibration
        # and `hierarchical_ensemble_predict` for advanced ensembling analysis,
        # adapting both for the two-stage architecture.
    # section #9 - Consolidated all scattered execution blocks into a single, unified
        # `if __name__ == "__main__":` entry point. This new block orchestrates the
        # entire train-then-analyze pipeline in a logical sequence.
# ==============================================================================

In [1]:
# --------------------------
# 0. Imports
# --------------------------
# WORKAROUND for PyTorch MPS bug
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Standard Library Imports
import datasets
import csv
import gc
import glob
import multiprocessing as mp
import os
import random
import re
import shutil
import subprocess
import sys
import time

# Third-Party Imports
import accelerate
import dill
import face_recognition
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
import torchvision.transforms as T
import transformers

# From Imports
from collections import Counter
from datasets import ClassLabel, Dataset, Features, Image as DatasetsImage, concatenate_datasets, load_dataset
from datetime import datetime
from functools import partial
from imagehash import phash, hex_to_hash
from io import BytesIO
from pathlib import Path
from PIL import Image, ImageOps, ExifTags, UnidentifiedImageError
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from torch.nn import functional as F
from torch.optim import AdamW, LBFGS
from torchvision import transforms
from torchvision.transforms import (
    RandAugment,
)
from tqdm import tqdm
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    EarlyStoppingCallback,
    TrainingArguments,
    Trainer,
    ViTForImageClassification,
)

In [2]:
# --------------------------
# 1. Global Configurations
# --------------------------

# --- üìÇ Core Paths ---
# This is the root directory containing your original 14-class dataset structure.
BASE_DATASET_PATH = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels"
# This is the root directory where all outputs (models, logs, prepared datasets) will be saved.
OUTPUT_ROOT_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training"

# --- ‚öôÔ∏è Run Configuration ---
# Set to True to run the hierarchical inference pipeline on the full dataset after training is complete.
RUN_INFERENCE = True
# Set to True on the first run to copy and organize files. Set to False on subsequent runs to save time.
PREPARE_DATASETS = True

# Finds the most recent V* model directory based on modification time.
def find_latest_checkpoint(root_dir):
    all_run_dirs = [
        os.path.join(root_dir, d)
        for d in os.listdir(root_dir)
        if d.startswith("V") and os.path.isdir(os.path.join(root_dir, d))
    ]
    if not all_run_dirs:
        return None

    # Sort directories by modification time, newest first
    sorted_dirs = sorted(all_run_dirs, key=os.path.getmtime, reverse=True)

    # The newest directory is the current run's empty folder.
    # We need the second newest, which is the latest *completed* run.
    if len(sorted_dirs) > 1:
        return sorted_dirs[1] # <-- Return the second item in the list
    else:
        # If there's only one (or zero), no previous checkpoint exists
        return None

# --- ü§ñ Model Configuration ---
# The pretrained Vision Transformer model from Hugging Face to be used as a base.
BASE_MODEL_NAME = "google/vit-base-patch16-224-in21k"

# Dynamically find the latest checkpoint to train from
latest_checkpoint = find_latest_checkpoint(OUTPUT_ROOT_DIR)

if latest_checkpoint:
    PRETRAINED_CHECKPOINT_PATH = latest_checkpoint
    print(f"‚úÖ Dynamically loading latest checkpoint: {os.path.basename(PRETRAINED_CHECKPOINT_PATH)}")
else:
    # If no checkpoint is found, fall back to the base model from Hugging Face
    PRETRAINED_CHECKPOINT_PATH = BASE_MODEL_NAME
    print(f"‚ö†Ô∏è No local checkpoint found. Starting from base model: {BASE_MODEL_NAME}")
    
# --- üè∑Ô∏è Dataset & Label Definitions ---
# These lists define the structure for the hierarchical pipeline.
# All folders listed here will be grouped into the 'relevant' class for Stage 1
# and used for training the final 11-class classifier in Stage 2.
RELEVANT_CLASSES = [
    'anger', 'contempt', 'disgust', 'fear', 'happiness',
    'neutral', 'questioning', 'sadness', 'surprise',
    'neutral_speech', 'speech_action'
]
# **IMPORTANT**: Since 'unknown' is a subfolder of 'hard_case', we only need to
# list 'hard_case' here. The script will find all images inside it recursively.
IRRELEVANT_CLASSES = ['hard_case']

# Mappings for the Stage 2 (11-class Emotion) model
id2label_s2 = dict(enumerate(RELEVANT_CLASSES))
label2id_s2 = {v: k for k, v in id2label_s2.items()}

# Mappings for the Stage 1 (binary Relevance) model
id2label_s1 = {0: 'irrelevant', 1: 'relevant'}
label2id_s1 = {v: k for k, v in id2label_s1.items()}

# --- üñºÔ∏è File Handling ---
# Defines valid image extensions and provides a function to check them.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".tif", ".tiff")
def is_valid_image(filename):
    return filename.lower().endswith(VALID_EXTENSIONS) and not filename.startswith("._")

# --- üî¢ Versioning and Output Directory Setup ---
# Automatically determines the next version number (e.g., V31) and creates a timestamped output folder.
def get_next_version(base_dir):
    all_entries = glob.glob(os.path.join(base_dir, "V*_*"))
    existing = [os.path.basename(d) for d in all_entries if os.path.isdir(d)]
    versions = [
        int(d[1:].split("_")[0]) for d in existing
        if d.startswith("V") and "_" in d and d[1:].split("_")[0].isdigit()
    ]
    next_version = max(versions, default=0) + 1
    return f"V{next_version}"

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
VERSION = get_next_version(OUTPUT_ROOT_DIR)
VERSION_TAG = VERSION + "_" + timestamp
SAVE_DIR = os.path.join(OUTPUT_ROOT_DIR, VERSION_TAG)
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"üìÅ Output directory created: {SAVE_DIR}")

‚úÖ Dynamically loading latest checkpoint: V29_20250710_082807
üìÅ Output directory created: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512


In [3]:
# ----------------------------------------------------
# 2. Hierarchical Dataset Preparation
# ----------------------------------------------------
# This function organizes the original multi-class dataset into two separate
# folder structures required for the two-stage training process. It recursively
# searches through subdirectories (no matter how deep) and is smart enough to
# skip non-image files.
def prepare_hierarchical_datasets(base_path, output_path):
    
    stage1_path = os.path.join(output_path, "stage_1_relevance_dataset")
    stage2_path = os.path.join(output_path, "stage_2_emotion_dataset")

    print(f"üóÇÔ∏è Preparing hierarchical datasets at: {output_path}")

    # --- Create Stage 1 Dataset (Relevance Filter) ---
    print("\n--- Creating Stage 1 Dataset ---")
    irrelevant_dest = os.path.join(stage1_path, "0_irrelevant")
    relevant_dest = os.path.join(stage1_path, "1_relevant")
    os.makedirs(irrelevant_dest, exist_ok=True)
    os.makedirs(relevant_dest, exist_ok=True)

    # Copy irrelevant files recursively
    print("Processing 'irrelevant' classes...")
    for class_name in IRRELEVANT_CLASSES:
        src_dir = Path(os.path.join(base_path, class_name))
        if src_dir.is_dir():
            print(f"  Recursively copying from '{class_name}'...")
            # Here, rglob('*') finds every file in every sub-folder.
            for file_path in src_dir.rglob('*'):
                if file_path.is_file() and is_valid_image(file_path.name):
                    shutil.copy(file_path, irrelevant_dest)
        else:
            print(f"  ‚ö†Ô∏è Warning: Source directory not found for '{class_name}'")

    # Copy relevant files recursively
    print("Processing 'relevant' classes...")
    for class_name in RELEVANT_CLASSES:
        src_dir = Path(os.path.join(base_path, class_name))
        if src_dir.is_dir():
            print(f"  Recursively copying from '{class_name}'...")
            for file_path in src_dir.rglob('*'):
                if file_path.is_file() and is_valid_image(file_path.name):
                    shutil.copy(file_path, relevant_dest)
        else:
            print(f"  ‚ö†Ô∏è Warning: Source directory not found for '{class_name}'")

    # --- Create Stage 2 Dataset (Emotion Classifier) ---
    print("\n--- Creating Stage 2 Dataset ---")
    for class_name in RELEVANT_CLASSES:
        src_dir = Path(os.path.join(base_path, class_name))
        dest_dir = os.path.join(stage2_path, class_name)

        # Ensure destination is clean before copying
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        os.makedirs(dest_dir)

        if src_dir.is_dir():
            print(f"  Copying '{class_name}' to Stage 2 directory...")
            for file_path in src_dir.rglob('*'):
                 if file_path.is_file() and is_valid_image(file_path.name):
                    shutil.copy(file_path, dest_dir)
        else:
            print(f"  ‚ö†Ô∏è Warning: Source directory not found for '{class_name}'")

    print("\n‚úÖ Hierarchical dataset preparation complete.")
    return stage1_path, stage2_path

In [4]:
# -----------------------------------------------
# 3. Utility Functions & Custom Classes
# -----------------------------------------------

# --- Part A: Data Augmentation ---

# üì¶ Applies augmentations and processes images on-the-fly for each batch.
# This is a more robust approach than pre-processing the entire dataset.
class DataCollatorWithAugmentation:
    def __init__(self, processor, augment_dict):
        self.processor = processor
        self.augment_dict = augment_dict
        # Baseline augmentation for majority classes.
        self.base_augment = T.Compose([
            T.RandomResizedCrop(size=(224, 224)), # <-- Use this instead of T.Resize
            T.RandomHorizontalFlip(),
            T.RandomRotation(10),
            T.ColorJitter(brightness=0.1, contrast=0.1)
        ])
    def __call__(self, features):
        processed_images = []
        for x in features:
            label = x["label"]
            # Select the correct augmentation pipeline, default to base_augment
            aug_pipeline = self.augment_dict.get(label, self.base_augment)
            rgb_image = x["image"].convert("RGB")
            augmented_image = aug_pipeline(rgb_image)
            processed_images.append(augmented_image)

        batch = self.processor(
            images=processed_images,
            return_tensors="pt"
        )
        batch["labels"] = torch.tensor([x["label"] for x in features], dtype=torch.long)
        return batch

# --- Part B: Model & Training Components ---

# üèãÔ∏è Defines a custom Trainer that can use either a targeted loss function or class weights.
class CustomLossTrainer(Trainer):
    def __init__(self, *args, loss_fct=None, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = loss_fct
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        if self.loss_fct:
            # Stage 2 uses the custom targeted smoothing loss
            loss = self.loss_fct(logits, labels)
        else:
            # Stage 1 uses standard CrossEntropyLoss with class weights (all on CPU)
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits, labels)
            
        return (loss, outputs) if return_outputs else loss


# üîÑ Implements Cross-Entropy Loss with *Targeted* Label Smoothing.
# Smoothing is turned OFF for specified classes to encourage confident predictions. This is used for Stage 2.
class TargetedSmoothedCrossEntropyLoss(nn.Module):
    def __init__(self, smoothing=0.05, target_class_names=None, label2id_map=None):
        super().__init__()
        self.smoothing = smoothing
        if target_class_names and label2id_map:
            self.target_class_ids = [label2id_map[name] for name in target_class_names]
        else:
            self.target_class_ids = []

    def forward(self, logits, target):
        num_classes = logits.size(1)
        with torch.no_grad():
            smooth_labels = torch.full_like(logits, self.smoothing / (num_classes - 1))
            smooth_labels.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)

            if self.target_class_ids:
                target_mask = torch.isin(target, torch.tensor(self.target_class_ids, device=target.device))
                if target_mask.any():
                    sharp_labels = F.one_hot(target[target_mask], num_classes=num_classes).float()
                    smooth_labels[target_mask] = sharp_labels

        log_probs = F.log_softmax(logits, dim=1)
        loss = -(smooth_labels * log_probs).sum(dim=1).mean()
        return loss

# --- Part C: Metrics & Evaluation ---

# üìä Computes metrics and generates a confusion matrix plot for each evaluation step.
def compute_metrics_with_confusion(eval_pred, label_names, stage_name=""):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    print(f"\nüìà Classification Report for {stage_name}:")
    report = classification_report(labels, preds, target_names=label_names, output_dict=True, zero_division=0)
    print(classification_report(labels, preds, target_names=label_names, zero_division=0))

    # Save raw logits/labels for later analysis like temperature scaling
    np.save(os.path.join(SAVE_DIR, f"logits_eval_{stage_name}_{VERSION}.npy"), logits)
    np.save(os.path.join(SAVE_DIR, f"labels_eval_{stage_name}_{VERSION}.npy"), labels)

    # --- Re-integrated from V28 ---
    # Save per-class F1/precision/recall/entropy to CSV (append per epoch)
    f1s = [report[name]["f1-score"] for name in label_names]
    recalls = [report[name]["recall"] for name in label_names]
    precisions = [report[name]["precision"] for name in label_names]

    # Entropy per class (sorted by entropy)
    softmax_probs = F.softmax(torch.tensor(logits), dim=-1)
    entropies = -torch.sum(softmax_probs * torch.log(softmax_probs + 1e-12), dim=-1)
    entropy_per_class = []
    for idx, class_name in enumerate(label_names):
        mask = (np.array(labels) == idx)
        if mask.any():
            class_entropy = entropies[mask].mean().item()
            entropy_per_class.append((class_name, class_entropy))
        else:
            entropy_per_class.append((class_name, 0.0))
    
    # Create a dictionary for entropies in the correct order for the CSV
    entropy_dict = dict(entropy_per_class)

    # CSV logging
    epoch_metrics_path = os.path.join(SAVE_DIR, f"per_class_metrics_{stage_name}.csv")
    # Access the trainer instance through its global-like availability during compute_metrics call
    active_trainer = trainer_s1 if stage_name == "Stage1" else trainer_s2
    epoch = getattr(active_trainer.state, "epoch", None)

    df_row = pd.DataFrame({
        "epoch": [epoch],
        **{f"f1_{n}": [f] for n, f in zip(label_names, f1s)},
        **{f"recall_{n}": [r] for n, r in zip(label_names, recalls)},
        **{f"precision_{n}": [p] for n, p in zip(label_names, precisions)},
        **{f"entropy_{n}": [entropy_dict[n]] for n in label_names}
    })
    
    if os.path.exists(epoch_metrics_path):
        df_row.to_csv(epoch_metrics_path, mode="a", header=False, index=False)
    else:
        df_row.to_csv(epoch_metrics_path, mode="w", header=True, index=False)
    # --- End Re-integration ---

    # Generate and save a heatmap of the confusion matrix
    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix - {stage_name}")
    plt.tight_layout()
    plt.savefig(os.path.join(SAVE_DIR, f"confusion_matrix_{stage_name}_{VERSION}.png"))
    plt.close()

    # --- Re-integrated from V28 ---
    # Top confused pairs
    confusion_pairs = [
        ((label_names[i], label_names[j]), cm[i][j])
        for i in range(len(label_names))
        for j in range(len(label_names)) if i != j and cm[i][j] > 0
    ]
    top_confusions = sorted(confusion_pairs, key=lambda x: x[1], reverse=True)[:3]
    if top_confusions:
        print("\nTop 3 confused class pairs:")
        for (true_label, pred_label), count in top_confusions:
            print(f"  - {true_label} ‚Üí {pred_label}: {count} instances")

    # Compute and print entropy metrics
    avg_entropy = entropies.mean().item()
    print(f"\nüß† Avg prediction entropy: {avg_entropy:.4f}")

    sorted_entropy = sorted(entropy_per_class, key=lambda x: x[1], reverse=True)
    if sorted_entropy:
        print("\nüîç Class entropies (sorted):")
        for class_name, entropy in sorted_entropy:
            print(f"  - {class_name}: entropy = {entropy:.4f}")
    # --- End Re-integration ---
    
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# --- Part D: Model Saving ---

# üíæ Saves the model and its associated processor to a specified directory.
def save_model_and_processor(model, processor, save_dir, model_name):
    print(f"üíæ Saving {model_name} and processor to: {save_dir}")
    model_path = os.path.join(save_dir, model_name)
    os.makedirs(model_path, exist_ok=True)
    model = model.to("cpu")
    processor.save_pretrained(model_path)
    model.save_pretrained(model_path, safe_serialization=True)
    print(f"‚úÖ {model_name} saved successfully.")


# --- Part E: Post-Training Analysis ---

def check_deployment_readiness(metrics_csv_path, f1_threshold=0.80):
    """Analyzes the final metrics CSV to check for production readiness."""
    print("\n" + "="*60)
    print("  DEPLOYMENT READINESS CHECK")
    print("="*60)
    
    if not os.path.exists(metrics_csv_path):
        print(f"‚ö†Ô∏è Metrics file not found at: {metrics_csv_path}")
        return

    metrics_df = pd.read_csv(metrics_csv_path)
    last_epoch_metrics = metrics_df.iloc[-1]
    
    label_names = [col.replace("f1_", "") for col in metrics_df.columns if col.startswith("f1_")]
    
    print(f"Threshold: F1-Score >= {f1_threshold}\n")
    
    issues_found = False
    for label in label_names:
        f1_score = last_epoch_metrics.get(f"f1_{label}", 0)
        if f1_score < f1_threshold:
            print(f"  - ‚ùå {label:<15} | F1-Score: {f1_score:.2f} (Below Threshold)")
            issues_found = True
        else:
            print(f"  - ‚úÖ {label:<15} | F1-Score: {f1_score:.2f}")
            
    if issues_found:
        print("\n Model is NOT ready for production.")
    else:
        print("\n Model meets the minimum F1-score threshold for all classes.")

In [5]:
# --------------------------
# 4. Main Training Script
# --------------------------

def main(device):
    # Make trainer objects accessible to metrics function
    global trainer_s1, trainer_s2
    
    # --- Sanity Check for Checkpoint Path ---
    if not os.path.exists(PRETRAINED_CHECKPOINT_PATH):
        raise FileNotFoundError(f"Fatal: Pretrained checkpoint not found at {PRETRAINED_CHECKPOINT_PATH}")

    # The device is now passed in, so the local definition is removed.
    print(f"\nüñ•Ô∏è Using device: {device}")

    # --- Step 0: Prepare Datasets ---
    # This function copies files into the required two-stage structure.
    # It only needs to be run once.
    prepared_data_path = os.path.join(OUTPUT_ROOT_DIR, "prepared_datasets")
    if PREPARE_DATASETS:
        stage1_dataset_path, stage2_dataset_path = prepare_hierarchical_datasets(BASE_DATASET_PATH, prepared_data_path)
    else:
        stage1_dataset_path = os.path.join(prepared_data_path, "stage_1_relevance_dataset")
        stage2_dataset_path = os.path.join(prepared_data_path, "stage_2_emotion_dataset")
        print("‚úÖ Skipping dataset preparation, using existing directories.")
    
    # # --- Set hardware device ---
    # # commented out due to present mps and pytorch incompatibilities
    # device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    # print(f"\nüñ•Ô∏è Using device: {device}")

    # ==========================================================================
    #   STAGE 1: TRAIN RELEVANCE FILTER (BINARY CLASSIFIER)
    # ==========================================================================
    print("\n" + "="*60)
    print("  STAGE 1: TRAINING RELEVANCE FILTER (BINARY CLASSIFIER)")
    print("="*60)

    # --- Load Stage 1 data ---
    stage1_output_dir = os.path.join(SAVE_DIR, "stage_1_relevance_model_training")
    dataset_s1 = load_dataset("imagefolder", data_dir=stage1_dataset_path, split='train').train_test_split(test_size=0.2, seed=42)
    train_dataset_s1 = dataset_s1["train"]
    eval_dataset_s1 = dataset_s1["test"]
    print(f"Stage 1: {len(train_dataset_s1)} training samples, {len(eval_dataset_s1)} validation samples.")

    # --- Configure Stage 1 model ---
    # We load the base processor once.
    processor = AutoImageProcessor.from_pretrained(BASE_MODEL_NAME)
    # Load the pretrained checkpoint but replace the final layer (classifier head)
    # for our binary (2-label) task.
    model_s1 = ViTForImageClassification.from_pretrained(
        PRETRAINED_CHECKPOINT_PATH,
        num_labels=2,
        label2id=label2id_s1,
        id2label=id2label_s1,
        ignore_mismatched_sizes=True
    ).to(device)

    # --- Handle Extreme Class Imbalance in Stage 1 with Class Weights ---
    # This is critical because the 'irrelevant' class is much larger than the 'relevant' class.
    class_weights_s1 = compute_class_weight('balanced', classes=np.unique(train_dataset_s1['label']), y=train_dataset_s1['label'])
    class_weights_s1 = torch.tensor(class_weights_s1, dtype=torch.float).to(device)
    print(f"‚öñÔ∏è Stage 1 Class Weights: {class_weights_s1}")

    # --- Define Early Stopping ---
    # Stops training if validation loss doesn't improve for 2 consecutive epochs
    early_stop_callback = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.001
    )
    
    # --- Set up Stage 1 Trainer ---
    training_args_s1 = TrainingArguments(
        output_dir=stage1_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        use_cpu=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=os.path.join(stage1_output_dir, "logs"),
        logging_strategy="steps",
        logging_steps=50,
        remove_unused_columns=False,
    )

    # --- Discriminative Learning Rate for Stage 1 ---
    # This implements discriminative learning rate, allowing 
        # the new classifier head to learn quickly while only 
        # making small, careful updates to pre-trained backbone
    # Freeze all layers by default
    for param in model_s1.parameters():
        param.requires_grad = False
    # Unfreeze the classifier head
    for param in model_s1.classifier.parameters():
        param.requires_grad = True
    # Unfreeze the last 4 encoder layers for fine-tuning
    for param in model_s1.vit.encoder.layer[-4:].parameters():
        param.requires_grad = True

    # Create parameter groups with different learning rates
    optimizer_grouped_parameters_s1 = [
        {'params': model_s1.classifier.parameters(), 'lr': 3e-5},
        {'params': model_s1.vit.encoder.layer[-4:].parameters(), 'lr': 1e-7}
    ]
    optimizer_s1 = torch.optim.AdamW(optimizer_grouped_parameters_s1, weight_decay=0.01)

    # Use the flexible CustomLossTrainer, passing the class weights to it.
    trainer_s1 = CustomLossTrainer(
        model=model_s1,
        args=training_args_s1,
        train_dataset=train_dataset_s1,
        eval_dataset=eval_dataset_s1,
        compute_metrics=partial(compute_metrics_with_confusion, label_names=list(id2label_s1.values()), stage_name="Stage1"),
        data_collator=DataCollatorWithAugmentation(processor=processor, augment_dict={}), # Use base augmentation for all
        class_weights=class_weights_s1, # Pass weights to the trainer
        callbacks=[early_stop_callback], # early stopping
        optimizers=(optimizer_s1, None) # Pass the custom optimizer
    )

    # --- Train Stage 1 model ---
    print("üöÄ Starting Stage 1 training...")
    start_time_s1 = time.time() # Record start time
    trainer_s1.train()
    end_time_s1 = time.time()   # Record end time
    
    # Calculate and print the duration
    duration_s1 = end_time_s1 - start_time_s1
    print(f"‚åõ Stage 1 training took: {time.strftime('%H:%M:%S', time.gmtime(duration_s1))}")
    save_model_and_processor(trainer_s1.model, processor, SAVE_DIR, model_name="relevance_filter_model")
    print("\n‚úÖ Stage 1 Training Complete.")

    # ==========================================================================
    #   STAGE 2: TRAIN EMOTION CLASSIFIER (11-CLASS)
    # ==========================================================================
    print("\n" + "="*60)
    print(f"  STAGE 2: TRAINING EMOTION CLASSIFIER ({len(RELEVANT_CLASSES)}-CLASS)")
    print("="*60)

    # --- Load Stage 2 data ---
    stage2_output_dir = os.path.join(SAVE_DIR, "stage_2_emotion_model_training")
    dataset_s2 = load_dataset("imagefolder", data_dir=stage2_dataset_path, split='train').train_test_split(test_size=0.2, seed=42)
    train_dataset_s2 = dataset_s2["train"]
    eval_dataset_s2 = dataset_s2["test"]
    print(f"Stage 2: {len(train_dataset_s2)} training samples, {len(eval_dataset_s2)} validation samples.")
    print("Stage 2 Label Distribution (Train):", Counter(sorted(train_dataset_s2['label'])))

    # --- Configure Stage 2 model ---
    # Load the pretrained checkpoint again, this time with a classifier head for our 11 emotion classes.
    model_s2 = ViTForImageClassification.from_pretrained(
        PRETRAINED_CHECKPOINT_PATH,
        num_labels=len(RELEVANT_CLASSES),
        label2id=label2id_s2,
        id2label=id2label_s2,
        ignore_mismatched_sizes=True
    ).to(device)

    # --- Define Augmentation and Loss for Stage 2 ---
    # Apply stronger augmentation to the minority classes to help the model learn them better.
    minority_aug = T.Compose([
        RandAugment(num_ops=2, magnitude=9),
        T.RandomResizedCrop(224, scale=(0.7, 1.0)),
        T.ColorJitter(0.3, 0.3, 0.3, 0.1),
    ])

    minority_classes_s2 = [label2id_s2[name] for name in ['disgust', 'questioning', 'contempt', 'fear', 'sadness', 'speech_action']]
    minority_augment_map_s2 = {label_id: minority_aug for label_id in minority_classes_s2}

    # Use the custom loss function to turn off label smoothing for historically difficult classes.
    loss_fct_s2 = TargetedSmoothedCrossEntropyLoss(
        smoothing=0.05,
        target_class_names=['contempt', 'disgust'],
        label2id_map=label2id_s2
    )

    # --- Set up Stage 2 Trainer ---
    training_args_s2 = TrainingArguments(
        output_dir=stage2_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        use_cpu=True, 
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=os.path.join(stage2_output_dir, "logs"),
        logging_strategy="epoch",
        remove_unused_columns=False,
    )

    # --- Discriminative Learning Rate for Stage 2 ---
    # Implements a discriminative learning rate, allowing 
        # new classifier head to learn quickly while only 
        # making small, careful updates to pre-trained backbone
    # Freeze all layers by default
    for param in model_s2.parameters():
        param.requires_grad = False
    # Unfreeze the classifier head
    for param in model_s2.classifier.parameters():
        param.requires_grad = True
    # Unfreeze the last 4 encoder layers for fine-tuning
    for param in model_s2.vit.encoder.layer[-4:].parameters():
        param.requires_grad = True

    # Create parameter groups with different learning rates
    optimizer_grouped_parameters_s2 = [
        {'params': model_s2.classifier.parameters(), 'lr': 5e-5},
        {'params': model_s2.vit.encoder.layer[-4:].parameters(), 'lr': 2e-7}
    ]
    optimizer_s2 = torch.optim.AdamW(optimizer_grouped_parameters_s2, weight_decay=0.01)

    # Use the CustomLossTrainer again, passing the targeted loss function and new optimizer.
    trainer_s2 = CustomLossTrainer(
        model=model_s2,
        args=training_args_s2,
        train_dataset=train_dataset_s2,
        eval_dataset=eval_dataset_s2,
        compute_metrics=partial(compute_metrics_with_confusion, label_names=RELEVANT_CLASSES, stage_name="Stage2"),
        data_collator=DataCollatorWithAugmentation(processor=processor, augment_dict=minority_augment_map_s2),
        loss_fct=loss_fct_s2, # Pass custom loss function
        callbacks=[early_stop_callback], # early stopping
        optimizers=(optimizer_s2, None) # Pass the custom optimizer
    )

    # --- Train Stage 2 model ---
    print("üöÄ Starting Stage 2 training...")
    start_time_s2 = time.time() # Record start time
    trainer_s2.train()
    end_time_s2 = time.time()   # Record end time
    
    # Calculate and print the duration
    duration_s2 = end_time_s2 - start_time_s2
    print(f"‚åõ Stage 2 training took: {time.strftime('%H:%M:%S', time.gmtime(duration_s2))}")
    save_model_and_processor(trainer_s2.model, processor, SAVE_DIR, model_name="emotion_classifier_model")
    print("\n‚úÖ Stage 2 Training Complete.")
    print("\nüéâ Hierarchical Training Pipeline Finished Successfully.")
    
    # Return the trained models and processor to be used by analysis functions
    return trainer_s1.model, trainer_s2.model, processor

In [6]:
# ----------------------------------
# 5. Hierarchical Inference
# ----------------------------------
# This function defines the two-step prediction pipeline for new images.
# It first checks for relevance (Stage 1) and then classifies the emotion (Stage 2).

def hierarchical_predict(image_paths, model_s1, model_s2, processor, device, batch_size=32):
    results = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="üî¨ Running Hierarchical Inference"):
        batch_paths = image_paths[i:i+batch_size]
        images = []
        valid_paths = []
        for path in batch_paths:
            try:
                img = Image.open(path).convert("RGB")
                images.append(img)
                valid_paths.append(path)
            except Exception:
                continue

        if not images:
            continue

        inputs = processor(images=images, return_tensors="pt").to(device)

        # --- Stage 1 Prediction: Is the image relevant? ---
        with torch.no_grad():
            logits_s1 = model_s1(**inputs).logits
            preds_s1 = torch.argmax(logits_s1, dim=-1)

        # Create a mask of images that were classified as 'relevant'
        relevant_mask = (preds_s1 == label2id_s1['relevant'])

        # --- Stage 2 Prediction (only on relevant images) ---
        if relevant_mask.any():
            # Filter the input tensors to only include the relevant images
            relevant_inputs = {k: v[relevant_mask] for k, v in inputs.items()}

            with torch.no_grad():
                logits_s2 = model_s2(**relevant_inputs).logits
                probs_s2 = F.softmax(logits_s2, dim=-1)
                confs_s2, preds_s2 = torch.max(probs_s2, dim=-1)

        # --- Aggregate Results ---
        # Loop through the original batch and assign the correct prediction
        s2_idx = 0
        for j in range(len(valid_paths)):
            if relevant_mask[j]:
                # If relevant, get the prediction from the Stage 2 model
                pred_label = id2label_s2[preds_s2[s2_idx].item()]
                confidence = confs_s2[s2_idx].item()
                s2_idx += 1
            else:
                # If not relevant, label it and stop
                pred_label = "irrelevant"
                confidence = torch.softmax(logits_s1[j], dim=-1)[preds_s1[j]].item()

            results.append({
                "image_path": valid_paths[j],
                "prediction": pred_label,
                "confidence": confidence
            })
    return results

In [7]:
# ==============================================================================
# 6. Post-Training Analysis, Review, and Curation
# ==============================================================================

def run_post_training_analysis(model_s1, model_s2, processor, device, base_dataset_path, save_dir, version):
    """
    Runs a full inference pass and generates logs for review, curation, and analysis.
    Combines logic from old sections 15 and 16.
    """
    print("\n" + "="*60)
    print("  RUNNING POST-TRAINING ANALYSIS & CURATION WORKFLOW")
    print("="*60)

    # --- Part A: Run Hierarchical Inference on the Entire Dataset ---
    all_image_paths = [str(p) for p in Path(base_dataset_path).rglob("*") if is_valid_image(p.name)]
    print(f"Found {len(all_image_paths)} images to process for inference.")
    
    predictions = hierarchical_predict(all_image_paths, model_s1, model_s2, processor, device)
    df = pd.DataFrame(predictions)
    
    # Derive true label from path for analysis
    df['true_label'] = df['image_path'].apply(lambda p: Path(p).parent.name)

    # Save the full log
    full_log_path = os.path.join(save_dir, f"{version}_full_inference_log.csv")
    df.to_csv(full_log_path, index=False)
    print(f"\n‚úÖ Full inference log saved to: {full_log_path}")

    # --- Part B: Identify and Organize Images for Manual Review ---
    # Tag images with low confidence as "REVIEW"
    review_threshold = 0.85
    review_df = df[df['confidence'] < review_threshold]
    
    review_sort_dir = os.path.join(save_dir, "review_candidates_by_predicted_class")
    os.makedirs(review_sort_dir, exist_ok=True)
    
    print(f"\nFound {len(review_df)} images below {review_threshold} confidence for review.")
    for _, row in tqdm(review_df.iterrows(), total=len(review_df), desc="Sorting review images"):
        dest_dir = os.path.join(review_sort_dir, row['prediction'])
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(row['image_path'], dest_dir)
    print(f"üìÇ Sorted review images into folders at: {review_sort_dir}")

    # --- Part C: Mine for "Hard Negative" Confusion Pairs ---
    # Find images where the model was confused between specific, problematic classes
    confusion_pairs_to_mine = [('contempt', 'questioning'), ('contempt', 'neutral'), ('fear', 'surprise')]
    
    print("\n‚õèÔ∏è  Mining for hard negative confusion pairs...")
    for pair in confusion_pairs_to_mine:
        c1, c2 = pair
        # Find images where true is c1 but predicted is c2, OR true is c2 and predicted is c1
        mask = ((df['true_label'] == c1) & (df['prediction'] == c2)) | \
               ((df['true_label'] == c2) & (df['prediction'] == c1))
        
        hard_negatives = df[mask]
        
        if not hard_negatives.empty:
            out_path = os.path.join(save_dir, f"hard_negatives_{c1}_vs_{c2}.csv")
            hard_negatives.to_csv(out_path, index=False)
            print(f"  - Found {len(hard_negatives)} hard negatives for {pair}. Saved to: {out_path}")

In [8]:
# ==============================================================================
# 7. Model Calibration
# ==============================================================================

def apply_temperature_scaling(logits, labels):
    """Finds the optimal temperature for calibrating model confidence."""
    logits_tensor = torch.tensor(logits, dtype=torch.float32)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    class TemperatureScaler(nn.Module):
        def __init__(self):
            super().__init__()
            self.temperature = nn.Parameter(torch.ones(1) * 1.5)

        def forward(self, logits):
            return logits / self.temperature

    model = TemperatureScaler()
    optimizer = LBFGS([model.temperature], lr=0.01, max_iter=50)

    def eval_fn():
        optimizer.zero_grad()
        loss = F.cross_entropy(model(logits_tensor), labels_tensor)
        loss.backward()
        return loss

    optimizer.step(eval_fn)
    return model.temperature.item()

def plot_reliability_diagram(logits, labels, temperature, save_dir, version, stage_name):
    """Visualizes model calibration before and after temperature scaling."""
    logits = torch.from_numpy(logits)
    labels = torch.from_numpy(labels)
    
    # Calculate before
    probs_before = F.softmax(logits, dim=1)
    confs_before, _ = torch.max(probs_before, 1)
    
    # Calculate after
    probs_after = F.softmax(logits / temperature, dim=1)
    confs_after, _ = torch.max(probs_after, 1)

    # Plotting logic remains the same...
    # (For brevity, the detailed plotting code from your old script goes here)
    print(f"üìä Reliability diagram generation logic would go here.")

In [9]:
# ==============================================================================
# 8. Hierarchical Model Ensembling
# ==============================================================================

def hierarchical_ensemble_predict(image_path, processor, s1_models, s2_models, device):
    """Performs an ensembled prediction using multiple hierarchical models."""
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(image, return_tensors="pt").to(device)
    except Exception:
        return None, None

    # --- Stage 1 Ensemble (Majority Vote) ---
    s1_votes = []
    with torch.no_grad():
        for model in s1_models:
            logits = model(**inputs).logits
            pred = torch.argmax(logits, dim=-1).item()
            s1_votes.append(pred)
    
    # Decide relevance based on majority vote (1 = relevant)
    is_relevant = Counter(s1_votes).most_common(1)[0][0] == label2id_s1['relevant']

    if not is_relevant:
        return "irrelevant", None

    # --- Stage 2 Ensemble (Average Probabilities) ---
    s2_probs = []
    with torch.no_grad():
        for model in s2_models:
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1)
            s2_probs.append(probs)
            
    # Average the probabilities across all models
    avg_probs = torch.mean(torch.stack(s2_probs), dim=0)
    confidence, pred_idx = torch.max(avg_probs, dim=-1)
    
    final_prediction = id2label_s2[pred_idx.item()]
    final_confidence = confidence.item()
    
    return final_prediction, final_confidence

In [10]:
# ==============================================================================
# 9. Script Execution Entry Point
# ==============================================================================
if __name__ == "__main__":

    # Define the device once for the entire script run.
    device = torch.device("cpu")
    
    # --- Step 1: Execute Training Pipeline ---
    # The main function now returns the trained models and processor
    model_s1, model_s2, processor = main(device)
    
    # --- Step 2: Run Post-Training Analysis & Curation ---
    if RUN_INFERENCE:
        # This function runs the full inference pass and generates logs for review.
        # It uses the in-memory models returned from main().
        run_post_training_analysis(model_s1, model_s2, processor, device, BASE_DATASET_PATH, SAVE_DIR, VERSION)
    
    # --- Step 3: Run Final Model Checks ---
    # Check if the model is ready for "deployment" based on F1 scores
    stage2_metrics_path = os.path.join(SAVE_DIR, "per_class_metrics_Stage2.csv")
    check_deployment_readiness(stage2_metrics_path, f1_threshold=0.80)
    
    # --- Step 4: Calibrate the Stage 2 Model ---
    logits_s2_path = os.path.join(SAVE_DIR, f"logits_eval_Stage2_{VERSION}.npy")
    labels_s2_path = os.path.join(SAVE_DIR, f"labels_eval_Stage2_{VERSION}.npy")
    
    if os.path.exists(logits_s2_path) and os.path.exists(labels_s2_path):
        print("\n" + "="*60)
        print("  CALIBRATING STAGE 2 MODEL")
        print("="*60)
        logits_s2 = np.load(logits_s2_path)
        labels_s2 = np.load(labels_s2_path)
        
        optimal_temp = apply_temperature_scaling(logits_s2, labels_s2)
        print(f"‚úÖ Optimal temperature for Stage 2 model: {optimal_temp:.4f}")
        # plot_reliability_diagram(logits_s2, labels_s2, optimal_temp, SAVE_DIR, VERSION, "Stage2")
    else:
        print("‚ö†Ô∏è Skipping calibration, logits/labels files for Stage 2 not found.")

    # --- Step 5: (Hypothetical) Run Ensemble Analysis ---
    # This is a hypothetical example assuming a V30 model has been trained
    v30_path = "/path/to/your/V30_run_folder" 
    
    if os.path.exists(v30_path):
        print("\n" + "="*60)
        print("  RUNNING HIERARCHICAL ENSEMBLE ANALYSIS")
        print("="*60)
        
        # Load the older V30 models for the ensemble
        s1_model_v30 = AutoModelForImageClassification.from_pretrained(os.path.join(v30_path, "relevance_filter_model")).to(device).eval()
        s2_model_v30 = AutoModelForImageClassification.from_pretrained(os.path.join(v30_path, "emotion_classifier_model")).to(device).eval()
        
        # Use the in-memory V31 models (model_s1, model_s2) from the current run
        s1_models_ensemble = [model_s1, s1_model_v30]
        s2_models_ensemble = [model_s2, s2_model_v30]
        
        example_image_path = "path/to/a/difficult/image.jpg"
        if os.path.exists(example_image_path):
            prediction, confidence = hierarchical_ensemble_predict(example_image_path, processor, s1_models_ensemble, s2_models_ensemble, device)
            print(f"Ensemble prediction for {Path(example_image_path).name}: {prediction} (Confidence: {confidence:.2f})")


üñ•Ô∏è Using device: cpu
üóÇÔ∏è Preparing hierarchical datasets at: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/prepared_datasets

--- Creating Stage 1 Dataset ---
Processing 'irrelevant' classes...
  Recursively copying from 'hard_case'...
Processing 'relevant' classes...
  Recursively copying from 'anger'...
  Recursively copying from 'contempt'...
  Recursively copying from 'disgust'...
  Recursively copying from 'fear'...
  Recursively copying from 'happiness'...
  Recursively copying from 'neutral'...
  Recursively copying from 'questioning'...
  Recursively copying from 'sadness'...
  Recursively copying from 'surprise'...
  Recursively copying from 'neutral_speech'...
  Recursively copying from 'speech_action'...

--- Creating Stage 2 Dataset ---
  Copying 'anger' to Stage 2 directory...
  Copying 'contempt' to Stage 2 directory...
  Copying 'disgust' to Stage 2 directory...
  Copying 'fear' to Stage 2 directory...
  Copying 'happiness' to Stage 2 direc

Resolving data files:   0%|          | 0/26881 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Stage 1: 21504 training samples, 5377 validation samples.


Some weights of ViTForImageClassification were not initialized from the model checkpoint at /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V29_20250710_082807 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚öñÔ∏è Stage 1 Class Weights: tensor([0.6492, 2.1761])
üöÄ Starting Stage 1 training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6438,0.610737,0.678817
2,0.6313,0.60148,0.705412
3,0.6311,0.597912,0.716943
4,0.6204,0.605464,0.70876
5,0.6249,0.600617,0.715827



üìà Classification Report for Stage1:
              precision    recall  f1-score   support

  irrelevant       0.87      0.68      0.77      4132
    relevant       0.39      0.66      0.49      1245

    accuracy                           0.68      5377
   macro avg       0.63      0.67      0.63      5377
weighted avg       0.76      0.68      0.70      5377


Top 3 confused class pairs:
  - irrelevant ‚Üí relevant: 1305 instances
  - relevant ‚Üí irrelevant: 422 instances

üß† Avg prediction entropy: 0.6290

üîç Class entropies (sorted):
  - relevant: entropy = 0.6329
  - irrelevant: entropy = 0.6279

üìà Classification Report for Stage1:
              precision    recall  f1-score   support

  irrelevant       0.87      0.73      0.79      4132
    relevant       0.41      0.64      0.50      1245

    accuracy                           0.71      5377
   macro avg       0.64      0.68      0.65      5377
weighted avg       0.76      0.71      0.72      5377


Top 3 confused c

Resolving data files:   0%|          | 0/6175 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Stage 2: 4940 training samples, 1235 validation samples.
Stage 2 Label Distribution (Train): Counter({9: 1608, 4: 651, 8: 554, 5: 530, 0: 388, 6: 382, 1: 251, 3: 240, 10: 135, 7: 101, 2: 100})


Some weights of ViTForImageClassification were not initialized from the model checkpoint at /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V29_20250710_082807 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ Starting Stage 2 training...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.706,1.340401,0.62996
2,1.3036,1.264631,0.634008
3,1.2171,1.228003,0.650202
4,1.1875,1.208875,0.663158
5,1.1759,1.194901,0.647773



üìà Classification Report for Stage2:
                precision    recall  f1-score   support

         anger       0.64      0.60      0.62        85
      contempt       0.46      0.72      0.56        60
       disgust       0.46      1.00      0.63        26
          fear       0.64      0.79      0.71        71
     happiness       0.79      0.75      0.77       167
       neutral       0.62      0.49      0.55       135
   questioning       0.50      0.49      0.49        92
       sadness       0.00      0.00      0.00        40
      surprise       0.66      0.78      0.71       147
neutral_speech       0.68      0.65      0.67       381
 speech_action       0.15      0.13      0.14        31

      accuracy                           0.63      1235
     macro avg       0.51      0.58      0.53      1235
  weighted avg       0.62      0.63      0.62      1235


Top 3 confused class pairs:
  - questioning ‚Üí neutral_speech: 32 instances
  - neutral_speech ‚Üí happiness: 28 in

üî¨ Running Hierarchical Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 841/841 [54:46<00:00,  3.91s/it]



‚úÖ Full inference log saved to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512/V31_full_inference_log.csv

Found 26555 images below 0.85 confidence for review.


Sorting review images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26555/26555 [00:08<00:00, 3255.00it/s]

üìÇ Sorted review images into folders at: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512/review_candidates_by_predicted_class

‚õèÔ∏è  Mining for hard negative confusion pairs...
  - Found 1 hard negatives for ('contempt', 'questioning'). Saved to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512/hard_negatives_contempt_vs_questioning.csv
  - Found 28 hard negatives for ('contempt', 'neutral'). Saved to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512/hard_negatives_contempt_vs_neutral.csv
  - Found 3 hard negatives for ('fear', 'surprise'). Saved to: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V31_20251007_153512/hard_negatives_fear_vs_surprise.csv

  DEPLOYMENT READINESS CHECK
Threshold: F1-Score >= 0.8

  - ‚ùå anger           | F1-Score: 0.57 (Below Threshold)
  - ‚ùå contempt        | F1-Score: 0.62 (Below Threshold)
  - ‚ùå disgust         |


