# Visual Prototype Per-Class Analysis (CVCL Training Classes Only)

This notebook runs class discrimination tests using visual prototypes with controlled visual properties.
**Only classes that appear in CVCL training data are tested** (based on CVCLKonkMatches.csv).
Distractors are matched for size, color, and texture to ensure the model must rely on class identity.
Multiple seeds are run to obtain confidence intervals for each class.

In [None]:
import os
import sys
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time

# Path setup
# NOTE: REPO_ROOT is a machine-specific absolute path; point it at the local
# checkout before running this notebook on another machine.
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
# Prepend both roots so modules found there win over same-named modules
# elsewhere on the import path.
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
# (its src/ directory has to be on sys.path before the two imports below run)
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# SyntheticKonkle paths - Using 224x224 resized images for faster processing
DATA_DIR = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224')
RESULTS_DIR = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation')
# Create the output folder up front; exist_ok=True makes re-running the cell safe.
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results will be saved to: {RESULTS_DIR}")

In [None]:
# Load CVCL training classes
def load_cvcl_training_classes():
    """Load the list of classes that appear in CVCL training data.

    Reads ``CVCLKonkMatches.csv`` under ``REPO_ROOT/data/CVCL_Konkle_Overlap``
    and returns the distinct values of its ``Class`` column in order of first
    appearance.

    Returns:
        list[str]: Unique class names. Rows with a missing ``Class`` value
        are ignored.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV has no ``Class`` column.
    """
    cvcl_csv_path = os.path.join(REPO_ROOT, 'data', 'CVCL_Konkle_Overlap', 'CVCLKonkMatches.csv')

    print(f"Loading CVCL training classes from: {cvcl_csv_path}")
    cvcl_df = pd.read_csv(cvcl_csv_path)

    if 'Class' not in cvcl_df.columns:
        raise KeyError(
            f"'Class' column not found in {cvcl_csv_path}; "
            f"available columns: {list(cvcl_df.columns)}"
        )

    # Drop empty cells first: a NaN would otherwise end up in the list and
    # break the sorted()/join() below (float vs. str comparison).
    cvcl_classes = cvcl_df['Class'].dropna().astype(str).unique().tolist()

    print(f"Found {len(cvcl_classes)} CVCL training classes")
    print(f"Classes: {', '.join(sorted(cvcl_classes))}")

    return cvcl_classes

# Load CVCL classes
# Evaluated once when this cell runs; later cells pass the list as `filter_classes`.
CVCL_TRAINING_CLASSES = load_cvcl_training_classes()

In [None]:
# Dataset setup with CVCL class filtering
def build_synthetic_dataset(filter_classes=None):
    """Read master_labels.csv and return its usable rows as a DataFrame.

    The labels file is looked up in DATA_DIR first; when it is absent there,
    the copy under REPO_ROOT/data/SyntheticKonkle is used instead. Rows that
    lack any of the required attribute columns are discarded.

    Args:
        filter_classes: List of class names to include. If None, include all.

    Returns:
        DataFrame with (at least) the columns folder, filename, class, color,
        size and texture.

    Raises:
        ValueError: If one of the required columns is missing from the CSV.
    """
    primary_csv = os.path.join(DATA_DIR, 'master_labels.csv')
    if os.path.exists(primary_csv):
        master_csv = primary_csv
    else:
        print(f"Warning: {primary_csv} not found, trying alternative path...")
        # Fall back to the labels shipped with the original SyntheticKonkle folder
        master_csv = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')

    print(f"Loading master labels from: {master_csv}")
    df = pd.read_csv(master_csv)

    # Every downstream step needs these columns, so fail early if any is absent
    required_cols = ['folder', 'filename', 'class', 'color', 'size', 'texture']
    missing_cols = [name for name in required_cols if name not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Keep only fully-labelled rows
    df = df.dropna(subset=required_cols)

    # Optionally restrict to the requested classes
    if filter_classes is not None:
        print(f"\nFiltering for {len(filter_classes)} CVCL training classes...")
        keep_mask = df['class'].isin(filter_classes)
        df = df.loc[keep_mask]
        print(f"After filtering: {len(df)} images from {df['class'].nunique()} classes")

    print(f"\nDataset statistics:")
    print(f"Total images: {len(df)}")
    print(f"Classes: {df['class'].nunique()} unique")
    print(f"Colors: {df['color'].nunique()} unique")
    print(f"Sizes: {df['size'].nunique()} unique")
    print(f"Textures: {df['texture'].nunique()} unique")

    return df

class SyntheticImageDataset(Dataset):
    """Dataset yielding transformed images together with their attribute labels.

    Each item is the tuple
    ``(transformed_image, class, color, size, texture, idx)`` built from one
    row of the labels DataFrame.
    """

    def __init__(self, df, data_dir, transform):
        """Store the labels table, the image root and the image transform.

        Args:
            df: DataFrame with 'folder', 'filename', 'class', 'color', 'size'
                and 'texture' columns.
            data_dir: Directory that contains the nested 'SyntheticKonkle'
                image folder.
            transform: Callable applied to each PIL image.
        """
        self.df = df
        # For SyntheticKonkle_224, images are in nested structure
        self.data_dir = os.path.join(data_dir, 'SyntheticKonkle')
        self.transform = transform

    def __len__(self):
        """Return the number of labelled images."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and transform the image at positional index ``idx``.

        An image that cannot be read is replaced by a black 224x224
        placeholder so a long extraction run is not aborted, but the failure
        is reported because placeholders distort the class prototypes.
        Errors raised by the transform itself are not masked.
        """
        row = self.df.iloc[idx]
        img_path = os.path.join(self.data_dir, row['folder'], row['filename'])
        try:
            # The context manager closes the underlying file as soon as the
            # pixel data has been converted into a standalone RGB image.
            with Image.open(img_path) as handle:
                img = handle.convert('RGB')
        except (OSError, ValueError) as exc:
            # OSError covers missing files and unreadable/corrupt image data.
            print(f"Warning: could not load {img_path} ({exc}); using black placeholder")
            img = Image.new('RGB', (224, 224), color='black')
        return self.transform(img), row['class'], row['color'], row['size'], row['texture'], idx

def collate_fn(batch):
    """Turn a list of dataset items into batched fields.

    Each item is a 6-tuple ``(image, class, color, size, texture, idx)``.
    The image tensors are stacked into a single tensor; the remaining fields
    are returned as plain Python lists in batch order.
    """
    # Transpose the batch: one list per field position.
    fields = [[sample[pos] for sample in batch] for pos in range(6)]
    image_list, classes, colors, sizes, textures, idxs = fields
    return torch.stack(image_list), classes, colors, sizes, textures, idxs

In [None]:
def run_class_prototype_test_per_class(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu',
                                       batch_size=32, trials_per_class=250, filter_classes=None):
    """Run a 4-way class test against visual prototypes and return per-class results.

    Every image is embedded once, a prototype is built per class by averaging
    and re-normalizing its embeddings, and each trial asks whether a query
    image is most similar to its own class prototype rather than to the
    prototypes of three distractor classes. Distractor classes are drawn only
    from classes that own at least one image with the same
    (color, size, texture) combination as the query.

    Args:
        model_name: Model identifier passed to ``load_model``.
        seed: Seed for Python, NumPy and torch RNGs; also passed to ``load_model``.
        device: Device on which embeddings are computed.
        batch_size: Batch size used for embedding extraction.
        trials_per_class: Upper bound on trials per class. A class can run
            fewer trials when it has too few eligible attribute combinations
            (each combination contributes at most 10 trials).
        filter_classes: List of class names to test. If None, test all classes.

    Returns:
        dict: class name -> accuracy in [0, 1]. Classes for which no trial
        could be run (no attribute combination with at least three matching
        distractor classes) are omitted rather than reported as 0.0, so they
        cannot drag down aggregate statistics.

    Raises:
        ValueError: If no images remain after filtering.
    """
    n_distractors = 3          # 4-way choice: target + 3 distractors (chance = 25%)
    max_trials_per_combo = 10  # cap per (color, size, texture) combination

    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Load model & transform
    print(f"Loading {model_name}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)

    # Build dataset and extract embeddings
    df = build_synthetic_dataset(filter_classes=filter_classes)
    if len(df) == 0:
        # Fail with a readable message instead of torch.cat() choking on an empty list.
        raise ValueError("No images to evaluate after filtering; check filter_classes against the 'class' column")
    ds = SyntheticImageDataset(df, DATA_DIR, transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print("Extracting image embeddings...")
    all_embs, all_classes, all_colors, all_sizes, all_textures, all_idxs = [], [], [], [], [], []

    with torch.no_grad():
        for imgs, classes, colors, sizes, textures, idxs in tqdm(loader, desc="Processing images"):
            imgs = imgs.to(device)
            feats = extractor.get_img_feature(imgs)
            feats = extractor.norm_features(feats).cpu().float()
            all_embs.append(feats)
            all_classes.extend(classes)
            all_colors.extend(colors)
            all_sizes.extend(sizes)
            all_textures.extend(textures)
            all_idxs.extend(idxs)

    all_embs = torch.cat(all_embs, dim=0)
    print(f"Extracted {len(all_embs)} image embeddings")

    # Group dataset indices by class and by attribute combination
    class_attr_idxs = defaultdict(lambda: defaultdict(list))
    # Maps a dataset index to its row in all_embs
    idx_to_row = {idx: row for row, idx in enumerate(all_idxs)}

    for idx, cls, col, size, texture in zip(all_idxs, all_classes, all_colors, all_sizes, all_textures):
        class_attr_idxs[cls][(col, size, texture)].append(idx)

    # Get unique classes
    unique_classes = list(class_attr_idxs.keys())
    print(f"Testing {len(unique_classes)} classes")

    # Create visual prototypes for each class
    # NOTE(review): a query image's own embedding is part of its class
    # prototype, which slightly favors the target class; leave-one-out
    # prototypes would remove that advantage.
    print("Creating visual prototypes...")
    class_prototypes = {}
    for cls in unique_classes:
        class_idxs = [idx for attr_idxs in class_attr_idxs[cls].values() for idx in attr_idxs]
        class_features = all_embs[[idx_to_row[idx] for idx in class_idxs]]
        prototype = class_features.mean(0)
        # Re-normalize so the dot product below is a cosine similarity;
        # the clamp avoids a division by zero for a degenerate mean.
        class_prototypes[cls] = prototype / prototype.norm().clamp_min(1e-12)

    # Track per-class performance
    class_correct = defaultdict(int)
    class_total = defaultdict(int)

    print(f"Running {trials_per_class} trials per class...")

    # Run trials for each class
    for target_class in tqdm(unique_classes, desc=f"Testing {model_name}"):
        trials_done = 0

        # Walk over the attribute combinations available for this class
        for attr_key, idx_list in class_attr_idxs[target_class].items():
            if trials_done >= trials_per_class:
                break

            # Distractor candidates: other classes owning an image with the SAME attributes
            other_classes_with_attr = [
                other_cls for other_cls in unique_classes
                if other_cls != target_class and attr_key in class_attr_idxs[other_cls]
            ]

            # Need a query image and enough distractor classes with matching attributes
            if not idx_list or len(other_classes_with_attr) < n_distractors:
                continue

            n_trials = min(max_trials_per_combo, trials_per_class - trials_done)
            for _ in range(n_trials):
                # Pick query image from target class
                query_idx = random.choice(idx_list)
                query_features = all_embs[idx_to_row[query_idx]]

                # Pick distractor classes with matching attributes
                distractor_classes = random.sample(other_classes_with_attr, n_distractors)

                # Candidate prototypes: target first, then the distractors
                candidate_classes = [target_class] + distractor_classes
                candidate_prototypes = torch.stack([class_prototypes[cls] for cls in candidate_classes])

                # Similarity between the query and every candidate prototype
                similarities = query_features @ candidate_prototypes.T

                # Correct when the target prototype (index 0) is the most similar
                prediction = similarities.argmax().item()

                class_correct[target_class] += int(prediction == 0)
                class_total[target_class] += 1
                trials_done += 1

    # Per-class accuracy, restricted to classes that actually ran trials
    class_accuracies = {
        cls: class_correct[cls] / class_total[cls]
        for cls in unique_classes
        if class_total.get(cls, 0) > 0
    }

    skipped_classes = [cls for cls in unique_classes if class_total.get(cls, 0) == 0]
    if skipped_classes:
        print(f"Skipped {len(skipped_classes)} classes without attribute-matched distractors: "
              f"{', '.join(sorted(map(str, skipped_classes)))}")

    short_counts = [n for n in class_total.values() if 0 < n < trials_per_class]
    if short_counts:
        print(f"Note: {len(short_counts)} classes ran fewer than {trials_per_class} trials "
              f"(minimum {min(short_counts)})")

    # Print summary
    overall_correct = sum(class_correct.values())
    overall_total = sum(class_total.values())
    overall_acc = overall_correct / overall_total if overall_total > 0 else 0

    print(f"\nOverall: {overall_correct}/{overall_total} = {overall_acc:.3f}")
    print(f"Classes tested: {len(class_accuracies)}")

    return class_accuracies

In [None]:
# Run multiple seeds for statistical analysis - CVCL training classes only
from scipy import stats as scipy_stats  # Student-t critical values for small-sample CIs

n_seeds = 3  # Limited seeds due to potential rate limiting
trials_per_class = 500  # More trials per seed for robust statistics
models_to_test = ['cvcl-resnext', 'clip-res']
max_attempts = 2  # first attempt plus one retry after a suspected rate limit

# Check dataset first
test_df = build_synthetic_dataset(filter_classes=CVCL_TRAINING_CLASSES)
n_classes = len(test_df['class'].unique())
print(f"\nFound {n_classes} CVCL training classes in the dataset")

print(f"\nStarting Visual Prototype evaluation (CVCL Training Classes Only):")
print(f"Configuration: {n_seeds} seeds × {trials_per_class} trials/class × {n_classes} classes")
print(f"Key features:")
print(f"  - Only testing classes that appear in CVCL training data")
print(f"  - Using visual prototypes (averaged class features)")
print(f"  - Distractors have MATCHING size, color, and texture\n")

# model name -> class name -> list of per-seed accuracies
all_results = {model: defaultdict(list) for model in models_to_test}

# Run evaluation
for model_name in models_to_test:
    print(f"\n{'='*60}")
    print(f"Testing {model_name} with visual prototype approach")
    print('='*60)

    for seed in range(n_seeds):
        print(f"\nSeed {seed+1}/{n_seeds} for {model_name}")

        class_acc = None
        for attempt in range(max_attempts):
            try:
                class_acc = run_class_prototype_test_per_class(
                    model_name,
                    seed=seed,
                    trials_per_class=trials_per_class,
                    filter_classes=CVCL_TRAINING_CLASSES  # Filter for CVCL classes
                )
                if attempt > 0:
                    print("  Retry successful!")
                break
            except Exception as e:
                # Best-effort run: report the failure and move on to the next
                # seed instead of aborting the whole sweep.
                print(f"  Error: {e}")
                message = str(e)
                # Heuristic only: model downloads are assumed to fail with
                # "404" or a message mentioning "rate" when throttled.
                rate_limited = "404" in message or "rate" in message.lower()
                if not rate_limited or attempt == max_attempts - 1:
                    break
                print("  Rate limit hit - waiting 60 seconds...")
                time.sleep(60)

        if class_acc is None:
            print(f"  Skipping seed {seed} for {model_name}")
        else:
            # Store results
            for cls, acc in class_acc.items():
                all_results[model_name][cls].append(acc)

            # Print progress
            if len(class_acc) > 0:
                mean_acc = np.mean(list(class_acc.values()))
                print(f"  Mean accuracy across classes: {mean_acc:.3f}")
                print(f"  Classes successfully tested: {len(class_acc)}")

        # Add delay between seeds for CVCL
        if 'cvcl' in model_name and seed < n_seeds - 1:
            print("  Waiting 30 seconds before next seed...")
            time.sleep(30)

# Calculate statistics
stats_results = {}
for model_name in models_to_test:
    stats_results[model_name] = {}
    for cls, accs in all_results[model_name].items():
        n_samples = len(accs)
        if n_samples == 0:
            continue

        if n_samples > 1:
            std = np.std(accs, ddof=1)
            se = std / np.sqrt(n_samples)
            # With only a handful of seeds the normal quantile (1.96) gives
            # intervals that are far too narrow; use the Student-t quantile
            # with n-1 degrees of freedom instead.
            ci95 = scipy_stats.t.ppf(0.975, df=n_samples - 1) * se
        else:
            # A single seed carries no spread information
            std = se = ci95 = 0

        stats_results[model_name][cls] = {
            'mean': np.mean(accs),
            'std': std,
            'se': se,
            'ci95': ci95,
            'n_samples': n_samples,
            # Nominal count: a class may have run fewer than trials_per_class
            # trials in a seed, so this is an upper bound.
            'total_trials': n_samples * trials_per_class,
            'raw': accs
        }

print("\n" + "="*60)
print("VISUAL PROTOTYPE EVALUATION COMPLETE (CVCL TRAINING CLASSES)")
print("="*60)

In [None]:
# Write the per-seed accuracies and the per-class summary statistics to CSV
detailed_df = []
for model_name in models_to_test:
    for cls, stats in stats_results[model_name].items():
        # One record per seed that produced a result for this class
        detailed_df.extend(
            {
                'model': model_name,
                'class': cls,
                'seed': seed_idx,
                'accuracy': acc,
                'n_trials': trials_per_class,
                'test_type': 'visual_prototype_cvcl_training'
            }
            for seed_idx, acc in enumerate(stats['raw'])
        )

if len(detailed_df) == 0:
    print("\nNo results to save")
else:
    detailed_df = pd.DataFrame(detailed_df)
    output_path = os.path.join(RESULTS_DIR, 'class_prototype_cvcl_training_perclass_results.csv')
    detailed_df.to_csv(output_path, index=False)
    print(f"\nSaved detailed results to {output_path}")

    # One summary record per (model, class) pair
    summary_stats = []
    for model_name in models_to_test:
        per_class = stats_results[model_name]
        for cls in per_class:
            stats = per_class[cls]
            record = {
                'model': model_name,
                'class': cls,
                'mean_accuracy': stats['mean'],
                'std': stats['std'],
                'se': stats['se'],
                'ci95': stats['ci95'],
                'n_seeds': stats['n_samples'],
                'total_trials': stats['total_trials']
            }
            summary_stats.append(record)

    summary_df = pd.DataFrame(summary_stats)
    summary_path = os.path.join(RESULTS_DIR, 'class_prototype_cvcl_training_perclass_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Saved summary statistics to {summary_path}")

In [None]:
# Per-class accuracy plot (mean with CI error bars) for the CVCL training classes
if stats_results[models_to_test[0]]:
    # Single axes for the CVCL training classes (should be ~24 classes)
    fig, ax = plt.subplots(figsize=(14, 8))

    # Classes are taken from the first model and shown alphabetically
    classes = sorted(stats_results[models_to_test[0]])
    x_pos = np.arange(len(classes))

    # Per-model styling
    colors = {
        'cvcl-resnext': '#2a9d8f',  # Teal for CVCL
        'clip-res': '#e63946'  # Red for CLIP
    }
    markers = {
        'cvcl-resnext': 'o',
        'clip-res': 's'
    }
    avg_line_styles = {
        'cvcl-resnext': '--',
        'clip-res': '-.'
    }

    # Percent means per model, reused for the points and the average lines
    mean_pct = {
        model_name: [stats_results[model_name][cls]['mean'] * 100 for cls in classes]
        for model_name in models_to_test
    }

    # Points only (linewidth=0), with CI whiskers
    point_style = dict(markersize=8, linewidth=0, capsize=5, capthick=2, alpha=0.9,
                       markeredgecolor='black', markeredgewidth=0.5)
    for model_name in models_to_test:
        ci_pct = [stats_results[model_name][cls]['ci95'] * 100 for cls in classes]
        ax.errorbar(x_pos, mean_pct[model_name], yerr=ci_pct,
                    label=model_name.upper().replace('-', ' '),
                    color=colors[model_name],
                    marker=markers[model_name],
                    **point_style)

    # Chance level for a 4-way choice
    ax.axhline(y=25, color='#ffa500', linestyle=':', alpha=0.8, linewidth=1.5,
               label='Chance Level (25%)')

    # One horizontal line per model at its mean over classes
    for model_name in models_to_test:
        avg_performance = np.mean(mean_pct[model_name])
        ax.axhline(y=avg_performance,
                   color=colors[model_name],
                   linestyle=avg_line_styles[model_name],
                   alpha=0.7,
                   linewidth=2,
                   label=f'{model_name.upper().split("-")[0]} Average ({avg_performance:.1f}%)')

    # Axes formatting
    ax.set_ylabel('Visual Prototype Accuracy (%)', fontsize=12, fontweight='bold')
    ax.set_xlabel('CVCL Training Classes', fontsize=12, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(classes, rotation=45, ha='right', fontsize=10)
    ax.set_ylim(0, 105)
    ax.set_yticks([0, 25, 50, 75, 100])
    ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)
    ax.set_facecolor('#fafafa')

    ax.set_title('Visual Prototype Per-Class Performance (CVCL Training Classes Only)\nControlled Visual Properties (Size, Color, Texture)',
                 fontsize=14, fontweight='bold', pad=15)

    ax.legend(loc='upper left', fontsize=10, frameon=True, fancybox=True, shadow=True,
              framealpha=0.95, ncol=2)

    plt.tight_layout()

    # Save as raster and vector output
    png_path = os.path.join(RESULTS_DIR, 'class_prototype_cvcl_training_perclass.png')
    pdf_path = os.path.join(RESULTS_DIR, 'class_prototype_cvcl_training_perclass.pdf')

    plt.savefig(png_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.savefig(pdf_path, bbox_inches='tight', facecolor='white')
    plt.show()

    print(f"\nSaved plots to:")
    print(f"  - {png_path}")
    print(f"  - {pdf_path}")

In [None]:
# Statistical summary for CVCL training classes
from scipy import stats as scipy_stats

if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    classes = sorted(list(stats_results[models_to_test[0]].keys()))

    # Look the two compared models up defensively so the cell still runs
    # when models_to_test has been changed.
    clip_stats = stats_results.get('clip-res', {})
    cvcl_stats = stats_results.get('cvcl-resnext', {})

    summary_data = []
    for cls in classes:
        row = {'Class': cls}
        for model in models_to_test:
            if cls in stats_results[model]:
                stats = stats_results[model][cls]
                row[f"{model}_mean"] = f"{stats['mean']:.3f}"
                row[f"{model}_ci95"] = f"±{stats['ci95']:.3f}"
                row[f"{model}_trials"] = stats['total_trials']

        # Add difference if both models have results
        if cls in clip_stats and cls in cvcl_stats:
            diff = clip_stats[cls]['mean'] - cvcl_stats[cls]['mean']
            row['difference'] = f"{diff:+.3f}"

        summary_data.append(row)

    summary_df = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("VISUAL PROTOTYPE PER-CLASS PERFORMANCE (CVCL TRAINING CLASSES ONLY)")
    print("="*80)
    print(summary_df.to_string(index=False))

    # Overall statistics
    print("\n" + "="*80)
    print("OVERALL VISUAL PROTOTYPE PERFORMANCE (CVCL TRAINING CLASSES)")
    print("="*80)

    for model in models_to_test:
        all_accs = []
        for cls in classes:
            if cls in stats_results[model]:
                all_accs.extend(stats_results[model][cls]['raw'])

        if len(all_accs) > 0:
            mean = np.mean(all_accs)
            # Sample standard deviation (ddof=1), matching the per-class
            # statistics; a single value has no spread to estimate.
            std = np.std(all_accs, ddof=1) if len(all_accs) > 1 else 0.0
            se = std / np.sqrt(len(all_accs))
            ci95 = 1.96 * se
            print(f"{model}: {mean:.3f} ± {ci95:.3f} (SE: {se:.3f}, n={len(all_accs)} samples)")

    # Statistical test
    # NOTE(review): the pooled per-class/per-seed accuracies are treated as
    # independent samples here, although both models are scored on the same
    # classes; a paired test on per-class means may be more appropriate.
    cvcl_all = []
    clip_all = []
    for cls in classes:
        if cls in cvcl_stats:
            cvcl_all.extend(cvcl_stats[cls]['raw'])
        if cls in clip_stats:
            clip_all.extend(clip_stats[cls]['raw'])

    if len(cvcl_all) > 0 and len(clip_all) > 0:
        t_stat, p_value = scipy_stats.ttest_ind(cvcl_all, clip_all)
        print(f"\nt-test: t={t_stat:.3f}, p={p_value:.6f}")
        if p_value < 0.001:
            print("Result: Highly significant difference (p < 0.001)")
        elif p_value < 0.01:
            print("Result: Significant difference (p < 0.01)")
        elif p_value < 0.05:
            print("Result: Significant difference (p < 0.05)")
        else:
            print("Result: No significant difference")

    print(f"\nNote: These results are specifically for the {len(classes)} classes that appear in CVCL training data.")
    print("Visual prototypes are created by averaging features from all images within each class.")

In [None]:
# Create difference plot for CVCL training classes
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    plt.figure(figsize=(12, 6))

    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    clip_stats = stats_results.get('clip-res', {})
    cvcl_stats = stats_results.get('cvcl-resnext', {})

    # `differences` has one entry per class so bars line up with the x ticks
    # (0 as a placeholder when one model has no result for the class);
    # `paired_differences` holds only genuine CLIP - CVCL differences and is
    # what the summary numbers are computed from.
    differences = []
    paired_differences = []

    for cls in classes:
        if cls in clip_stats and cls in cvcl_stats:
            diff = clip_stats[cls]['mean'] - cvcl_stats[cls]['mean']
            differences.append(diff)
            paired_differences.append(diff)
        else:
            differences.append(0)

    colors_diff = ['#2ecc71' if d > 0 else '#e74c3c' for d in differences]
    bars = plt.bar(range(len(classes)), differences, color=colors_diff, alpha=0.7, edgecolor='black', linewidth=0.5)

    # Add value labels on large differences only, to keep the plot readable
    for i, diff in enumerate(differences):
        if abs(diff) > 0.1:
            plt.text(i, diff + (0.01 if diff > 0 else -0.02), f'{diff:.2f}',
                    ha='center', va='bottom' if diff > 0 else 'top', fontsize=8)

    plt.axhline(y=0, color='black', linestyle='-', linewidth=1)
    plt.xlabel('CVCL Training Classes', fontsize=12, fontweight='bold')
    plt.ylabel('Performance Difference\n(CLIP - CVCL)', fontsize=12, fontweight='bold')
    plt.title('Visual Prototype Performance Differences (CVCL Training Classes Only)\nWith Controlled Visual Properties',
             fontsize=14, fontweight='bold')
    plt.xticks(range(len(classes)), classes, rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3, linestyle='--')

    # Add legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='#2ecc71', alpha=0.7, label='CLIP Better'),
        Patch(facecolor='#e74c3c', alpha=0.7, label='CVCL Better')
    ]
    plt.legend(handles=legend_elements, loc='upper right')

    # Add annotation
    plt.text(0.02, 0.98, f'Visual Prototype Method\n{len(classes)} CVCL training classes',
            transform=plt.gca().transAxes, fontsize=10, fontweight='bold',
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()

    diff_plot_path = os.path.join(RESULTS_DIR, 'class_prototype_cvcl_training_difference.png')
    plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved difference plot to {diff_plot_path}")

    # Print summary of differences (paired classes only, so a class missing
    # from one model is not miscounted as a tie)
    clip_better = sum(1 for d in paired_differences if d > 0)
    cvcl_better = sum(1 for d in paired_differences if d < 0)
    tied = sum(1 for d in paired_differences if d == 0)
    unpaired = len(classes) - len(paired_differences)

    print(f"\nSummary (CVCL Training Classes - Visual Prototype):")
    print(f"  CLIP performs better: {clip_better}/{len(classes)} classes")
    print(f"  CVCL performs better: {cvcl_better}/{len(classes)} classes")
    if tied > 0:
        print(f"  No difference: {tied}/{len(classes)} classes")
    if unpaired > 0:
        print(f"  Not comparable (missing for one model): {unpaired}/{len(classes)} classes")

    if paired_differences:
        # Exact ties are real observations and stay in the average
        avg_diff = np.mean(paired_differences)
        print(f"  Average difference: {avg_diff:.3f}")
    else:
        print("  Average difference: n/a (no class has results for both models)")
    print(f"\nThese classes were seen during CVCL training, making this a direct comparison.")
    print(f"Visual prototypes test pure visual recognition without language.")

In [None]:
# Compare visual prototype vs text-vision results if both exist
import os

# Try to load text-vision results for comparison
textvision_path = os.path.join(RESULTS_DIR, 'Textvision', 'class_textvision_cvcl_training_perclass_summary.csv')

if os.path.exists(textvision_path):
    print("\n" + "="*80)
    print("COMPARING VISUAL PROTOTYPE VS TEXT-VISION METHODS")
    print("="*80)

    # Load text-vision results
    textvision_df = pd.read_csv(textvision_path)

    # Create comparison for each model
    for model_name in models_to_test:
        print(f"\n{model_name.upper()}:")

        # Get visual prototype results straight from this model's own
        # statistics, so the cell does not depend on a `classes` variable
        # that an earlier cell only defines when it produced output.
        model_stats = stats_results.get(model_name, {})
        vp_accs = []
        for cls in sorted(model_stats):
            vp_accs.extend(model_stats[cls]['raw'])

        if len(vp_accs) > 0:
            vp_mean = np.mean(vp_accs)
            print(f"  Visual Prototype: {vp_mean:.3f}")

        # Get text-vision results
        tv_data = textvision_df[textvision_df['model'] == model_name]['mean_accuracy']
        if len(tv_data) > 0:
            tv_mean = tv_data.mean()
            print(f"  Text-Vision:      {tv_mean:.3f}")

            # Compare
            if len(vp_accs) > 0:
                diff = vp_mean - tv_mean
                print(f"  Difference:       {diff:+.3f} (Visual Prototype {'better' if diff > 0 else 'worse'})")

    print("\nInterpretation:")
    print("- Visual Prototype: Tests pure visual recognition using averaged class features")
    print("- Text-Vision: Tests multimodal understanding using text encodings of class names")
    print("- Both use controlled visual properties (matching size, color, texture for distractors)")
else:
    print("\nText-vision results not found for comparison.")
    print(f"Run the Class_TextVision_CVCLTraining_PerClass_Analysis.ipynb notebook first to enable comparison.")