# Same Class Different Size (SCDS) Text-Vision Analysis with Controlled Attributes

This notebook tests size discrimination WITHIN the same class using text-vision alignment.
All trials have matched color and texture, with only size varying.
Text encoding uses size+class format (e.g., "small apple", "large apple").

In [None]:
import os
import sys
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import clip
import time

# Path setup
# REPO_ROOT defaults to the original checkout location. Set the
# NTU_SYNTHETIC_ROOT environment variable to run this notebook from a
# different checkout without editing the cell.
REPO_ROOT = os.environ.get('NTU_SYNTHETIC_ROOT') or r'C:\Users\jbats\Projects\NTU-Synthetic'
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
# Both roots go to the front of sys.path so they take priority over
# same-named installed packages.
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# SyntheticKonkle paths - Using 224x224 resized images
DATA_DIR = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224')
RESULTS_DIR = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'Textvision')
# Create the output folder up front so the later save cells cannot fail on it.
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results will be saved to: {RESULTS_DIR}")

In [None]:
# Dataset setup with proper attribute tracking
def build_synthetic_dataset():
    """Read ``master_labels.csv`` and return the rows usable for size trials.

    The CSV is looked up under ``DATA_DIR`` first and, when it is not there,
    under ``<REPO_ROOT>/data/SyntheticKonkle``. Rows with a missing value in
    any required column, or with a size other than small/medium/large, are
    dropped. A short summary of the remaining data is printed.

    Returns:
        A pandas DataFrame holding the filtered label rows.

    Raises:
        ValueError: If any required column is absent from the CSV.
    """
    needed = ['folder', 'filename', 'class', 'color', 'size', 'texture']
    allowed_sizes = ['small', 'medium', 'large']

    csv_path = os.path.join(DATA_DIR, 'master_labels.csv')
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found, trying alternative path...")
        # Fall back to the original (non-resized) SyntheticKonkle folder.
        csv_path = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')

    print(f"Loading master labels from: {csv_path}")
    labels = pd.read_csv(csv_path)

    absent = [name for name in needed if name not in labels.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    # Drop incomplete rows first, then keep only the three known sizes.
    labels = labels.dropna(subset=needed)
    keep_mask = labels['size'].isin(allowed_sizes)
    labels = labels[keep_mask].copy()

    print(f"Loaded {len(labels)} images")
    print(f"Classes: {labels['class'].nunique()} unique")
    print(f"Colors: {labels['color'].nunique()} unique")
    print(f"Sizes: {labels['size'].nunique()} unique ({sorted(labels['size'].unique())})")
    print(f"Textures: {labels['texture'].nunique()} unique")

    # How many distinct sizes each class actually has.
    sizes_in_class = labels.groupby('class')['size'].nunique()
    print(f"\nAverage sizes per class: {sizes_in_class.mean():.1f}")
    print(f"Min sizes in a class: {sizes_in_class.min()}")
    print(f"Max sizes in a class: {sizes_in_class.max()}")

    return labels

class SyntheticImageDataset(Dataset):
    """Map-style dataset over the rows of the SyntheticKonkle label table.

    Each item is ``(image, class, color, size, texture, idx)``, where ``image``
    is the output of ``transform`` and ``idx`` is the positional row index that
    was requested.

    Args:
        df: Label table with 'folder', 'filename', 'class', 'color', 'size'
            and 'texture' columns.
        data_dir: Dataset root; images are read from its 'SyntheticKonkle'
            sub-folder.
        transform: Callable applied to every PIL image before it is returned.
    """

    # Size of the black placeholder substituted for images that fail to load.
    PLACEHOLDER_SIZE = (224, 224)

    def __init__(self, df, data_dir, transform):
        self.df = df
        # For SyntheticKonkle_224, images are in nested structure
        self.data_dir = os.path.join(data_dir, 'SyntheticKonkle')
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        import warnings

        row = self.df.iloc[idx]
        img_path = os.path.join(self.data_dir, row['folder'], row['filename'])
        try:
            with Image.open(img_path) as handle:
                tensor = self.transform(handle.convert('RGB'))
        except Exception as exc:
            # Best effort: keep the run alive with a black placeholder, but say
            # so, because a placeholder silently skews every trial it is in.
            warnings.warn(
                f"Could not load {img_path} ({exc!r}); substituting a black image",
                RuntimeWarning,
            )
            placeholder = Image.new('RGB', self.PLACEHOLDER_SIZE, color='black')
            tensor = self.transform(placeholder)
        return tensor, row['class'], row['color'], row['size'], row['texture'], idx

def collate_fn(batch):
    """Collate dataset items into one stacked image tensor plus metadata lists.

    Args:
        batch: Sequence of ``(image, class, color, size, texture, idx)`` items.

    Returns:
        ``(imgs, classes, colors, sizes, textures, idxs)``; ``imgs`` is the
        images stacked along a new first dimension and the other five entries
        are plain Python lists in batch order.
    """
    def column(position):
        # Collect one field of every sample, preserving batch order.
        return [sample[position] for sample in batch]

    stacked = torch.stack(column(0))
    return stacked, column(1), column(2), column(3), column(4), column(5)

In [None]:
def _sample_scds_trial(size_dict, valid_sizes):
    """Draw one four-candidate trial from a single class/color/texture group.

    One image is drawn for every size in ``valid_sizes`` and a fourth image is
    drawn from a randomly chosen size. The text prompt names only a size and a
    class, so it cannot tell two candidates of the same size apart; the target
    is therefore drawn from the candidates whose size occurs exactly once.

    Args:
        size_dict: Mapping from size name to a non-empty list of image indices.
        valid_sizes: The size names to draw from.

    Returns:
        ``(candidates, target_position, target_size)`` where ``candidates`` is
        the list of image indices and ``target_position`` indexes into it.
    """
    candidates = [random.choice(size_dict[size]) for size in valid_sizes]
    candidate_sizes = list(valid_sizes)

    # Fourth candidate: another image of a randomly chosen size, preferring one
    # that is not already among the candidates.
    duplicate_size = random.choice(valid_sizes)
    unused = [idx for idx in size_dict[duplicate_size] if idx not in candidates]
    if unused:
        duplicate_idx = random.choice(unused)
    else:
        # No other image of that size exists; repeat one (rare).
        duplicate_idx = random.choice(size_dict[duplicate_size])
    candidates.append(duplicate_idx)
    candidate_sizes.append(duplicate_size)

    # Only candidates with a unique size can be an unambiguous target.
    unambiguous = [pos for pos, size in enumerate(candidate_sizes) if size != duplicate_size]
    target_position = random.choice(unambiguous)
    return candidates, target_position, candidate_sizes[target_position]


def run_scds_text_vision_test_per_class(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu',
                                        batch_size=32, trials_per_class=500):
    """
    Run Same Class Different Size text-vision test with controlled color and texture.

    Every trial shows four images of one class that share color and texture.
    The model is given the text "<size> <class>" and is scored correct when the
    image most similar to that text is the target. Exactly one candidate has
    the target size, so a uniform random guess is right 25% of the time.

    Args:
        model_name: Model identifier passed to ``load_model``. Names containing
            "clip" use the CLIP tokenizer; 'resnext' and 'dino_s_resnext50' are
            skipped because they have no text encoder.
        seed: Seed for ``random``, ``torch`` and ``numpy``.
        device: Torch device used for image and text encoding.
        batch_size: Batch size for image embedding extraction.
        trials_per_class: Upper bound on trials per class. At most 20 trials
            are drawn per color/texture combination, so a class with few
            complete combinations gets fewer trials.

    Returns:
        Dict mapping class name to accuracy in [0, 1]. Classes for which no
        trial could be built are omitted. Empty for models without a text
        encoder.
    """

    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Check if model supports text encoding
    if model_name in ['resnext', 'dino_s_resnext50']:
        print(f"[WARNING] {model_name} has no text encoder, skipping")
        return {}

    # Load model & transform
    print(f"Loading {model_name}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)

    # Build dataset and extract image embeddings
    df = build_synthetic_dataset()
    ds = SyntheticImageDataset(df, DATA_DIR, transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print("Extracting image embeddings...")
    all_img_embs, all_classes, all_colors, all_sizes, all_textures, all_idxs = [], [], [], [], [], []

    with torch.no_grad():
        for imgs, classes, colors, sizes, textures, idxs in tqdm(loader, desc="Processing images"):
            imgs = imgs.to(device)
            feats = extractor.get_img_feature(imgs)
            feats = extractor.norm_features(feats).cpu().float()
            all_img_embs.append(feats)
            all_classes.extend(classes)
            all_colors.extend(colors)
            all_sizes.extend(sizes)
            all_textures.extend(textures)
            all_idxs.extend(idxs)

    all_img_embs = torch.cat(all_img_embs, dim=0)
    print(f"Extracted {len(all_img_embs)} image embeddings")

    # Group images as class -> (color, texture) -> size -> [dataset indices].
    # For SCDS: class, color, texture are fixed; size varies
    class_color_texture_size_idxs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    idx_to_row = {idx: row for row, idx in enumerate(all_idxs)}

    for idx, cls, col, size, texture in zip(all_idxs, all_classes, all_colors, all_sizes, all_textures):
        class_color_texture_size_idxs[cls][(col, texture)][size].append(idx)

    # Sorted so that the order in which classes consume the random stream does
    # not depend on string hashing; otherwise a fixed seed is not reproducible.
    unique_classes = sorted(set(all_classes))
    valid_sizes = ['small', 'medium', 'large']

    print(f"Found {len(unique_classes)} classes")

    # Pre-encode all size+class text combinations
    print("Encoding text labels for all size-class combinations...")

    with torch.no_grad():
        # Format: "small apple", "large apple", etc.
        label_keys = [(cls, size) for cls in unique_classes for size in valid_sizes]
        text_labels = [f"{size} {cls}" for cls, size in label_keys]

        if "clip" in model_name:
            # CLIP text encoding
            tokens = clip.tokenize(text_labels, truncate=True).to(device)
            txt_features = model.encode_text(tokens)
        else:  # CVCL
            # CVCL text encoding with token length
            tokens, token_len = model.tokenize(text_labels)
            tokens = tokens.to(device)
            if isinstance(token_len, torch.Tensor):
                token_len = token_len.to(device)
            txt_features = model.encode_text(tokens, token_len)

        txt_features = extractor.norm_features(txt_features).cpu().float()
        text_features_cache = {key: txt_features[i] for i, key in enumerate(label_keys)}

    print(f"Encoded {len(text_features_cache)} size-class text combinations")

    # Track per-class performance
    class_correct = defaultdict(int)
    class_total = defaultdict(int)

    print(f"Running {trials_per_class} trials per class for SCDS task...")

    # Run trials for each class
    for target_class in tqdm(unique_classes, desc=f"Testing {model_name} SCDS"):
        trials_done = 0

        # For each color-texture combination in this class
        for (color, texture), size_dict in class_color_texture_size_idxs[target_class].items():
            if trials_done >= trials_per_class:
                break

            # A combination is usable only if it has an image of every size.
            if not all(size_dict.get(size) for size in valid_sizes):
                continue

            # Up to 20 trials per combination, never exceeding the class budget.
            n_trials = min(20, trials_per_class - trials_done)

            for _ in range(n_trials):
                candidates, target_position, target_size = _sample_scds_trial(size_dict, valid_sizes)

                # Get image features for all candidates
                cand_features = torch.stack([all_img_embs[idx_to_row[idx]] for idx in candidates]).float()

                # Get text feature for target size+class
                target_text_feature = text_features_cache[(target_class, target_size)].float()

                # Features are normalised, so the dot product ranks candidates
                # by similarity to the text.
                similarities = cand_features @ target_text_feature
                prediction = similarities.argmax().item()

                # Update counts
                class_correct[target_class] += int(prediction == target_position)
                class_total[target_class] += 1
                trials_done += 1

    # Per-class accuracy. Classes without any trial are left out instead of
    # being reported as 0.0, which would read as below-chance performance.
    class_accuracies = {
        cls: class_correct[cls] / class_total[cls]
        for cls in unique_classes
        if class_total[cls] > 0
    }

    # Print summary
    overall_correct = sum(class_correct.values())
    overall_total = sum(class_total.values())
    overall_acc = overall_correct / overall_total if overall_total > 0 else 0

    print(f"\nSCDS Overall: {overall_correct}/{overall_total} = {overall_acc:.3f}")
    print(f"Classes tested: {len(class_accuracies)}")

    # Show top and bottom performers
    sorted_classes = sorted(class_accuracies.items(), key=lambda x: x[1], reverse=True)
    print("\nTop 5 classes for size discrimination:")
    for cls, acc in sorted_classes[:5]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")
    print("\nBottom 5 classes for size discrimination:")
    for cls, acc in sorted_classes[-5:]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")

    return class_accuracies

In [None]:
# Run multiple seeds for statistical analysis
n_seeds = 3  # Limited seeds due to potential rate limiting
trials_per_class = 500  # Consistent with other tests
models_to_test = ['cvcl-resnext', 'clip-res']

# Check dataset first
test_df = build_synthetic_dataset()
n_classes = len(test_df['class'].unique())
n_sizes = len(test_df['size'].unique())
print(f"Found {n_classes} unique classes and {n_sizes} unique sizes")

print(f"\nStarting SCDS Text-Vision evaluation:")
print(f"Configuration: {n_seeds} seeds × {trials_per_class} trials/class × {n_classes} classes")
print(f"Task: Same Class Different Size discrimination")
print(f"Control: Color and texture are held constant within each trial")
print(f"Text format: size + class (e.g., 'small apple', 'large apple')\n")

# all_results[model][class] -> list of per-seed accuracies
all_results = {model: defaultdict(list) for model in models_to_test}


def _record_seed_results(model_name, class_acc):
    """Append one seed's per-class accuracies to ``all_results``."""
    for cls, acc in class_acc.items():
        all_results[model_name][cls].append(acc)


# Run evaluation
for model_name in models_to_test:
    print(f"\n{'='*60}")
    print(f"Testing {model_name} with SCDS text-vision approach")
    print('='*60)

    for seed in range(n_seeds):
        print(f"\nSeed {seed+1}/{n_seeds} for {model_name}")

        try:
            class_acc = run_scds_text_vision_test_per_class(
                model_name,
                seed=seed,
                trials_per_class=trials_per_class
            )

            # Store results
            _record_seed_results(model_name, class_acc)

            # Print progress
            if len(class_acc) > 0:
                mean_acc = np.mean(list(class_acc.values()))
                print(f"  Mean accuracy across classes: {mean_acc:.3f}")
                print(f"  Classes successfully tested: {len(class_acc)}")

        except Exception as e:
            print(f"  Error: {e}")
            if "404" in str(e) or "rate" in str(e).lower():
                print(f"  Rate limit hit - waiting 60 seconds...")
                time.sleep(60)
                # Retry once
                try:
                    class_acc = run_scds_text_vision_test_per_class(
                        model_name, seed=seed, trials_per_class=trials_per_class
                    )
                    _record_seed_results(model_name, class_acc)
                    print(f"  Retry successful!")
                except Exception as retry_error:
                    # `except Exception` (not a bare except) so that Ctrl-C can
                    # still interrupt the run, and the cause is reported.
                    print(f"  Retry error: {retry_error}")
                    print(f"  Retry failed - skipping seed {seed}")
                    continue

        # Add delay between seeds for CVCL
        if 'cvcl' in model_name and seed < n_seeds - 1:
            print("  Waiting 30 seconds before next seed...")
            time.sleep(30)

# Calculate statistics
from scipy import stats as scipy_stats

stats_results = {}
for model_name in models_to_test:
    stats_results[model_name] = {}
    for cls, accs in all_results[model_name].items():
        n_samples = len(accs)
        if n_samples == 0:
            continue

        if n_samples > 1:
            std = np.std(accs, ddof=1)
            se = std / np.sqrt(n_samples)
            # With only a handful of seeds the normal value 1.96 understates
            # the interval; use the Student-t critical value for n-1 degrees
            # of freedom instead.
            ci95 = scipy_stats.t.ppf(0.975, df=n_samples - 1) * se
        else:
            # A single seed gives no spread estimate.
            std = se = ci95 = 0.0

        stats_results[model_name][cls] = {
            'mean': np.mean(accs),
            'std': std,
            'se': se,
            'ci95': ci95,
            'n_samples': n_samples,
            # Nominal count: the requested budget per seed, which is an upper
            # bound on the trials actually run for the class.
            'total_trials': n_samples * trials_per_class,
            'raw': accs
        }

print("\n" + "="*60)
print("SCDS TEXT-VISION EVALUATION COMPLETE")
print("="*60)

In [None]:
# Save detailed results
# One row per (model, class, seed) for the detailed CSV.
detail_rows = []
for model_name in models_to_test:
    for cls, stats in stats_results[model_name].items():
        detail_rows.extend(
            {
                'model': model_name,
                'class': cls,
                'seed': seed_idx,
                'accuracy': acc,
                'n_trials': trials_per_class,
                'test_type': 'scds_text_vision',
            }
            for seed_idx, acc in enumerate(stats['raw'])
        )
detailed_df = detail_rows

if not detail_rows:
    print("\nNo results to save")
else:
    detailed_df = pd.DataFrame(detail_rows)
    output_path = os.path.join(RESULTS_DIR, 'scds_textvision_perclass_results.csv')
    detailed_df.to_csv(output_path, index=False)
    print(f"\nSaved detailed results to {output_path}")

    # Save summary statistics: one row per (model, class) aggregated over seeds.
    summary_stats = []
    for model_name in models_to_test:
        per_class = stats_results[model_name]
        for cls in per_class:
            stats = per_class[cls]
            record = {'model': model_name, 'class': cls}
            record['mean_accuracy'] = stats['mean']
            record['std'] = stats['std']
            record['se'] = stats['se']
            record['ci95'] = stats['ci95']
            record['n_seeds'] = stats['n_samples']
            record['total_trials'] = stats['total_trials']
            summary_stats.append(record)

    summary_df = pd.DataFrame(summary_stats)
    summary_path = os.path.join(RESULTS_DIR, 'scds_textvision_perclass_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Saved summary statistics to {summary_path}")

In [None]:
# Create visualization
# Only classes that every model has results for can be drawn side by side;
# a class missing from one model (e.g. after failed seeds) is left out
# instead of raising a KeyError.
classes = sorted(
    cls for cls in stats_results[models_to_test[0]]
    if all(cls in stats_results[other] for other in models_to_test)
)

if classes:
    from matplotlib.lines import Line2D

    fig = plt.figure(figsize=(14, 11))

    # Two stacked panels with a gap between them for the shared legend.
    ax1 = plt.subplot2grid((20, 1), (0, 0), rowspan=8)
    ax2 = plt.subplot2grid((20, 1), (12, 0), rowspan=8)

    # Split the classes across the two panels.
    mid_point = len(classes) // 2
    classes_first_half = classes[:mid_point]
    classes_second_half = classes[mid_point:]

    # Define colors and markers
    colors = {
        'cvcl-resnext': '#2a9d8f',  # Teal for CVCL
        'clip-res': '#e63946'  # Red for CLIP
    }
    markers = {
        'cvcl-resnext': 'o',
        'clip-res': 's'
    }
    avg_line_styles = {
        'cvcl-resnext': '--',
        'clip-res': '-.'
    }

    # Mean accuracy (in percent) over all plotted classes, per model. Both
    # panels draw the same average lines, so this is computed once.
    overall_means = {
        model_name: np.mean([stats_results[model_name][cls]['mean'] * 100 for cls in classes])
        for model_name in models_to_test
    }

    def plot_on_axis(ax, class_subset, is_first=False):
        """Draw per-class means with CI error bars for ``class_subset`` on ``ax``.

        When ``is_first`` is true the overall average of each model is also
        written as a text label to the right of the panel.
        """
        x_pos = np.arange(len(class_subset))

        for model_name in models_to_test:
            means = [stats_results[model_name][cls]['mean'] * 100 for cls in class_subset]
            errors = [stats_results[model_name][cls]['ci95'] * 100 for cls in class_subset]

            ax.errorbar(x_pos, means, yerr=errors,
                       label=model_name.upper().replace('-', ' '),
                       color=colors[model_name],
                       marker=markers[model_name],
                       markersize=7,
                       linewidth=0,
                       capsize=4,
                       capthick=1.5,
                       alpha=0.9,
                       markeredgecolor='black',
                       markeredgewidth=0.5)

        # Add chance level
        ax.axhline(y=25, color='#ffa500', linestyle=':', alpha=0.8, linewidth=1.5)

        # Add average lines
        for model_name in models_to_test:
            avg_performance = overall_means[model_name]
            ax.axhline(y=avg_performance,
                      color=colors[model_name],
                      linestyle=avg_line_styles[model_name],
                      alpha=0.7,
                      linewidth=2)

            if is_first:
                ax.text(len(class_subset) + 0.8, avg_performance,
                       f'{avg_performance:.1f}%',
                       fontsize=9,
                       color=colors[model_name],
                       va='center',
                       fontweight='bold')

        # Formatting
        ax.set_ylabel('SCDS Text-Vision Accuracy (%)', fontsize=11, fontweight='bold')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(class_subset, rotation=45, ha='right', fontsize=10)
        ax.set_ylim(0, 105)
        ax.set_yticks([0, 25, 50, 75, 100])
        ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
        ax.set_axisbelow(True)
        ax.set_facecolor('#fafafa')

    # Shared legend entries: model markers, average lines, then chance level.
    legend_elements = []
    for model_name in models_to_test:
        legend_elements.append(
            Line2D([0], [0], marker=markers[model_name], color='w',
                  markerfacecolor=colors[model_name], markeredgecolor='black',
                  markersize=8, label=model_name.upper().replace('-', ' '))
        )
    for model_name in models_to_test:
        avg_val = overall_means[model_name]
        legend_elements.append(
            Line2D([0], [0], color=colors[model_name],
                  linestyle=avg_line_styles[model_name], linewidth=2,
                  label=f'{model_name.upper().split("-")[0]} Average ({avg_val:.1f}%)')
        )
    legend_elements.append(
        Line2D([0], [0], color='#ffa500', linestyle=':', linewidth=1.5,
              label='Chance Level (25%)')
    )

    # Plot both halves
    plot_on_axis(ax1, classes_first_half, is_first=True)
    ax1.set_title('SCDS Text-Vision Per-Class Performance - Part 1\nSame Class Different Size (Color & Texture Controlled)',
                 fontsize=13, fontweight='bold', pad=10)

    plot_on_axis(ax2, classes_second_half, is_first=False)
    ax2.set_title('SCDS Text-Vision Per-Class Performance - Part 2',
                 fontsize=13, fontweight='bold', pad=10)
    ax2.set_xlabel('Target Category', fontsize=11, fontweight='bold')

    # Add legend on an invisible axes placed in the gap between the panels.
    legend_ax = fig.add_axes([0.125, 0.44, 0.775, 0.08])
    legend_ax.axis('off')

    legend = legend_ax.legend(handles=legend_elements,
                             loc='center',
                             ncol=3,
                             fontsize=10,
                             frameon=True,
                             fancybox=True,
                             shadow=True,
                             framealpha=0.95,
                             columnspacing=2.5,
                             handlelength=3)

    legend.get_frame().set_facecolor('white')
    legend.get_frame().set_edgecolor('gray')
    legend.get_frame().set_linewidth(1.5)

    plt.tight_layout()
    plt.subplots_adjust(hspace=0.35)

    # Save plots
    png_path = os.path.join(RESULTS_DIR, 'scds_textvision_perclass.png')
    pdf_path = os.path.join(RESULTS_DIR, 'scds_textvision_perclass.pdf')

    plt.savefig(png_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.savefig(pdf_path, bbox_inches='tight', facecolor='white')
    plt.show()

    print(f"\nSaved plots to:")
    print(f"  - {png_path}")
    print(f"  - {pdf_path}")

In [None]:
# Statistical summary
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    from scipy import stats as scipy_stats

    classes = sorted(stats_results[models_to_test[0]].keys())
    # The CLIP-vs-CVCL comparison is tied to these two model names; fall back
    # to an empty mapping so a changed model list does not raise a KeyError.
    clip_stats = stats_results.get('clip-res', {})
    cvcl_stats = stats_results.get('cvcl-resnext', {})

    # One table row per class with each model's mean, CI and trial count.
    summary_data = []
    for cls in classes:
        row = {'Class': cls}
        for model in models_to_test:
            if cls in stats_results[model]:
                stats = stats_results[model][cls]
                row[f"{model}_mean"] = f"{stats['mean']:.3f}"
                row[f"{model}_ci95"] = f"±{stats['ci95']:.3f}"
                row[f"{model}_trials"] = stats['total_trials']

        # Add difference if both models have results
        if cls in clip_stats and cls in cvcl_stats:
            diff = clip_stats[cls]['mean'] - cvcl_stats[cls]['mean']
            row['difference'] = f"{diff:+.3f}"

        summary_data.append(row)

    summary_df = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("SCDS TEXT-VISION PER-CLASS PERFORMANCE SUMMARY")
    print("Task: Same Class Different Size (Color & Texture Controlled)")
    print("="*80)
    print(summary_df.to_string(index=False))

    # Overall statistics
    print("\n" + "="*80)
    print("OVERALL SCDS TEXT-VISION PERFORMANCE")
    print("="*80)

    for model in models_to_test:
        # Pool the per-seed accuracies of every class for this model.
        all_accs = []
        for cls in classes:
            if cls in stats_results[model]:
                all_accs.extend(stats_results[model][cls]['raw'])

        if len(all_accs) > 0:
            mean = np.mean(all_accs)
            # Sample standard deviation (ddof=1), matching the per-class
            # statistics; undefined for a single sample, so report 0 there.
            std = np.std(all_accs, ddof=1) if len(all_accs) > 1 else 0.0
            se = std / np.sqrt(len(all_accs))
            ci95 = 1.96 * se
            print(f"{model}: {mean:.3f} ± {ci95:.3f} (SE: {se:.3f}, n={len(all_accs)} samples)")

    # Statistical test
    cvcl_all = []
    clip_all = []
    for cls in classes:
        if cls in cvcl_stats:
            cvcl_all.extend(cvcl_stats[cls]['raw'])
        if cls in clip_stats:
            clip_all.extend(clip_stats[cls]['raw'])

    # A t-test needs at least two samples per group; with fewer the p-value is
    # NaN, which would otherwise be reported as "No significant difference".
    if len(cvcl_all) > 1 and len(clip_all) > 1:
        t_stat, p_value = scipy_stats.ttest_ind(cvcl_all, clip_all)
        print(f"\nt-test: t={t_stat:.3f}, p={p_value:.6f}")
        if p_value < 0.001:
            print("Result: Highly significant difference (p < 0.001)")
        elif p_value < 0.01:
            print("Result: Significant difference (p < 0.01)")
        elif p_value < 0.05:
            print("Result: Significant difference (p < 0.05)")
        else:
            print("Result: No significant difference")

    print("\nInterpretation:")
    print("This test measures how well models can distinguish sizes within the same object class.")
    print("Color and texture are held constant, so performance reflects pure size discrimination.")
    print("Note: Size discrimination is often harder than color as size is relative.")

In [None]:
# Create difference plot and comparison with other tests
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    from matplotlib.patches import Patch

    plt.figure(figsize=(14, 7))

    classes = sorted(stats_results[models_to_test[0]].keys())
    clip_stats = stats_results.get('clip-res', {})
    cvcl_stats = stats_results.get('cvcl-resnext', {})

    # `differences` holds the bar heights (CLIP - CVCL); `comparable` records
    # which classes have results for both models. A class that cannot be
    # compared gets a zero-height bar but is kept out of the tie count and
    # the average.
    differences = []
    comparable = []
    for cls in classes:
        has_both = cls in clip_stats and cls in cvcl_stats
        comparable.append(has_both)
        if has_both:
            differences.append(clip_stats[cls]['mean'] - cvcl_stats[cls]['mean'])
        else:
            differences.append(0)

    # Green: CLIP ahead, red: CVCL ahead, gray: no difference to show.
    colors_diff = ['#2ecc71' if d > 0 else '#e74c3c' if d < 0 else '#95a5a6' for d in differences]
    bars = plt.bar(range(len(classes)), differences, color=colors_diff, alpha=0.7, edgecolor='black', linewidth=0.5)

    # Add value labels
    for i, diff in enumerate(differences):
        if diff != 0:
            plt.text(i, diff + (0.01 if diff > 0 else -0.02), f'{diff:.2f}',
                    ha='center', va='bottom' if diff > 0 else 'top', fontsize=8)

    plt.axhline(y=0, color='black', linestyle='-', linewidth=1)
    plt.xlabel('Object Class', fontsize=12, fontweight='bold')
    plt.ylabel('Performance Difference\n(CLIP - CVCL)', fontsize=12, fontweight='bold')
    plt.title('SCDS Text-Vision Model Performance Differences by Class\nSize Discrimination within Same Class',
             fontsize=14, fontweight='bold')
    plt.xticks(range(len(classes)), classes, rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3, linestyle='--')

    # Add legend
    legend_elements = [
        Patch(facecolor='#2ecc71', alpha=0.7, label='CLIP Better'),
        Patch(facecolor='#e74c3c', alpha=0.7, label='CVCL Better')
    ]
    plt.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()

    diff_plot_path = os.path.join(RESULTS_DIR, 'scds_textvision_difference.png')
    plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved difference plot to {diff_plot_path}")

    # Print summary of differences, counting only classes both models cover.
    compared = [d for d, ok in zip(differences, comparable) if ok]
    clip_better = sum(1 for d in compared if d > 0)
    cvcl_better = sum(1 for d in compared if d < 0)
    tied = sum(1 for d in compared if d == 0)
    not_compared = len(classes) - len(compared)

    print(f"\nSummary for SCDS (Size Discrimination):")
    print(f"  CLIP performs better: {clip_better}/{len(classes)} classes")
    print(f"  CVCL performs better: {cvcl_better}/{len(classes)} classes")
    if tied > 0:
        print(f"  No difference: {tied}/{len(classes)} classes")
    if not_compared > 0:
        print(f"  Not compared (results missing for one model): {not_compared}/{len(classes)} classes")

    if compared:
        # True ties count towards the average; uncompared classes do not.
        avg_diff = np.mean(compared)
        print(f"  Average difference: {avg_diff:.3f}")
    else:
        print("  Average difference: n/a (no class has results for both models)")
    print(f"\nNote: Size discrimination is often challenging as size is a relative concept.")
    print(f"Performance may be lower than color discrimination due to this complexity.")