# Same Class Different Texture (SCDT) Text-Vision Analysis with Controlled Attributes

This notebook tests texture discrimination WITHIN the same class using text-vision alignment.
All trials have matched color and size, with only texture varying (smooth vs bumpy).
Text encoding uses texture+class format (e.g., "smooth apple", "bumpy apple").

In [None]:
import os
import sys
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import clip
import time

# Path setup
# The repository root defaults to the original author's checkout but can be
# redirected by setting the NTU_SYNTHETIC_ROOT environment variable, so the
# notebook also runs on machines with a different directory layout.
REPO_ROOT = os.environ.get('NTU_SYNTHETIC_ROOT', r'C:\Users\jbats\Projects\NTU-Synthetic')
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
# Both roots go to the front of sys.path so the project modules imported below
# are found ahead of anything else on the path.
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# SyntheticKonkle paths - Using 224x224 resized images
DATA_DIR = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224')
RESULTS_DIR = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'Textvision')
# Create the output folder up front so later cells can write CSVs and plots.
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results will be saved to: {RESULTS_DIR}")

In [None]:
# Dataset setup with proper attribute tracking
def build_synthetic_dataset():
    """Read master_labels.csv and return the rows usable for texture trials.

    The CSV is looked up under DATA_DIR first; when it is not there the copy
    in the original ``SyntheticKonkle`` data folder is used instead.  Rows
    with a missing value in any required column are dropped, and only rows
    whose texture is ``smooth`` or ``bumpy`` are kept.  A short summary of the
    remaining data is printed.

    Returns:
        A pandas DataFrame (a copy) holding the filtered label rows.

    Raises:
        ValueError: if a required column is absent from the CSV.
    """
    master_csv = os.path.join(DATA_DIR, 'master_labels.csv')
    csv_found = os.path.exists(master_csv)

    if not csv_found:
        print(f"Warning: {master_csv} not found, trying alternative path...")
        # Fall back to the original SyntheticKonkle folder
        master_csv = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')

    print(f"Loading master labels from: {master_csv}")
    labels = pd.read_csv(master_csv)

    # Every one of these columns is needed downstream
    required_cols = ['folder', 'filename', 'class', 'color', 'size', 'texture']
    absent = [name for name in required_cols if name not in labels.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    # Drop incomplete rows, then keep only the two supported textures
    labels = labels.dropna(subset=required_cols)
    texture_ok = labels['texture'].isin(['smooth', 'bumpy'])
    labels = labels[texture_ok].copy()

    print(f"Loaded {len(labels)} images")
    for title, column in (('Classes', 'class'), ('Colors', 'color'), ('Sizes', 'size')):
        print(f"{title}: {labels[column].nunique()} unique")
    texture_names = sorted(labels['texture'].unique())
    print(f"Textures: {labels['texture'].nunique()} unique ({texture_names})")

    # How many classes offer both textures, and how many only one
    textures_in_class = labels.groupby('class')['texture'].nunique()
    print(f"\nClasses with both textures: {(textures_in_class == 2).sum()}")
    print(f"Classes with only one texture: {(textures_in_class == 1).sum()}")

    # How many (class, color, size) cells offer both textures
    textures_in_cell = labels.groupby(['class', 'color', 'size'])['texture'].nunique()
    n_full_cells = int((textures_in_cell == 2).sum())
    print(f"Class-color-size combinations with both textures: {n_full_cells}")

    return labels

class SyntheticImageDataset(Dataset):
    """Dataset over the rows of a label DataFrame.

    Each item is ``(transformed_image, class, color, size, texture, idx)``
    where ``idx`` is the positional row index that was requested.

    Args:
        df: DataFrame with at least the columns ``folder``, ``filename``,
            ``class``, ``color``, ``size`` and ``texture``.
        data_dir: Root directory; images are read from its
            ``SyntheticKonkle`` sub-folder.
        transform: Callable applied to the PIL image before it is returned.
    """

    def __init__(self, df, data_dir, transform):
        self.df = df
        # For SyntheticKonkle_224, images are in nested structure
        self.data_dir = os.path.join(data_dir, 'SyntheticKonkle')
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.data_dir, row['folder'], row['filename'])
        try:
            # The context manager closes the file handle once the pixels have
            # been copied into the converted image.
            with Image.open(img_path) as handle:
                img = handle.convert('RGB')
        except Exception as e:
            # Best-effort fallback: keep the run going with a black
            # placeholder, but say so -- a silent placeholder would end up in
            # the embeddings and skew the results without anyone noticing.
            import warnings
            warnings.warn(
                f"Could not load image {img_path!r} ({e}); using a black placeholder",
                RuntimeWarning,
                stacklevel=2,
            )
            img = Image.new('RGB', (224, 224), color='black')
        # The transform runs outside the try block so that a failing
        # transform surfaces as an error instead of being mistaken for a
        # missing file.
        return self.transform(img), row['class'], row['color'], row['size'], row['texture'], idx

def collate_fn(batch):
    """Merge dataset items into one batch.

    Each item is a 6-tuple ``(image, class, color, size, texture, idx)``.
    The images are stacked into a single tensor; every other field is
    returned as a plain Python list in batch order.
    """
    def field(position):
        # Pull the same slot out of every sample in the batch
        return [sample[position] for sample in batch]

    stacked_images = torch.stack(field(0))
    return stacked_images, field(1), field(2), field(3), field(4), field(5)

In [None]:
def _encode_texture_class_texts(model_name, model, extractor, classes, textures, device):
    """Encode every "<texture> <class>" prompt; return {(class, texture): feature}."""
    label_keys = [(cls, texture) for cls in classes for texture in textures]
    # Format: "smooth apple", "bumpy apple", etc.
    text_labels = [f"{texture} {cls}" for cls, texture in label_keys]

    with torch.no_grad():
        if "clip" in model_name:
            # CLIP text encoding
            tokens = clip.tokenize(text_labels, truncate=True).to(device)
            txt_features = model.encode_text(tokens)
        else:  # CVCL
            # CVCL text encoding with token length
            tokens, token_len = model.tokenize(text_labels)
            tokens = tokens.to(device)
            if isinstance(token_len, torch.Tensor):
                token_len = token_len.to(device)
            txt_features = model.encode_text(tokens, token_len)
        txt_features = extractor.norm_features(txt_features).cpu().float()

    return {key: txt_features[i] for i, key in enumerate(label_keys)}


def run_scdt_text_vision_test_per_class(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu', 
                                        batch_size=32, trials_per_class=500, trials_per_combo=20):
    """
    Run the Same Class Different Texture text-vision test with controlled
    color and size, and return per-class accuracy.

    Every trial fixes one (class, color, size) combination and shows four
    candidate images of it: 2 smooth + 2 bumpy.  One texture is picked as the
    target, the prompt "<texture> <class>" is encoded, and the candidate with
    the highest image-text similarity is the model's choice.

    Scoring: a trial is correct when the chosen image has the target texture.
    The two same-texture candidates share class, color and size, so the prompt
    cannot tell them apart; demanding the one exact image would put chance at
    25% and the ceiling near 50%.  Texture-level scoring gives the 50% chance
    level of a binary smooth-vs-bumpy decision.

    Args:
        model_name: Model identifier passed to ``load_model``.  Names
            containing "clip" use the CLIP tokenizer; others are treated as
            CVCL.  'resnext' and 'dino_s_resnext50' are skipped (no text
            encoder).
        seed: Seed for ``random``, ``numpy`` and ``torch``.
        device: Torch device string.
        batch_size: Batch size for image embedding extraction.
        trials_per_class: Upper bound on trials per class.
        trials_per_combo: Upper bound on trials drawn from a single
            (color, size) combination of a class.

    Returns:
        dict mapping class name -> accuracy in [0, 1].  Classes for which no
        valid trial could be built are omitted, so they do not appear as a
        misleading 0.0.  Empty when the model has no text encoder or the
        dataset has no images.
    """

    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Check if model supports text encoding
    if model_name in ['resnext', 'dino_s_resnext50']:
        print(f"[WARNING] {model_name} has no text encoder, skipping")
        return {}

    # Load model & transform
    print(f"Loading {model_name}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)

    # Build dataset and extract image embeddings
    df = build_synthetic_dataset()
    ds = SyntheticImageDataset(df, DATA_DIR, transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print("Extracting image embeddings...")
    all_img_embs, all_classes, all_colors, all_sizes, all_textures = [], [], [], [], []

    with torch.no_grad():
        for imgs, classes, colors, sizes, textures, _ in tqdm(loader, desc="Processing images"):
            imgs = imgs.to(device)
            feats = extractor.get_img_feature(imgs)
            feats = extractor.norm_features(feats).cpu().float()
            all_img_embs.append(feats)
            all_classes.extend(classes)
            all_colors.extend(colors)
            all_sizes.extend(sizes)
            all_textures.extend(textures)

    if not all_img_embs:
        # torch.cat would raise on an empty list; nothing to evaluate anyway
        print("No images available - nothing to evaluate")
        return {}

    all_img_embs = torch.cat(all_img_embs, dim=0)
    print(f"Extracted {len(all_img_embs)} image embeddings")

    # Group embedding rows by class -> (color, size) -> texture.
    # For SCDT: class, color, size are fixed; texture varies.
    groups = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for row_pos, (cls, col, size, texture) in enumerate(
            zip(all_classes, all_colors, all_sizes, all_textures)):
        groups[cls][(col, size)][texture].append(row_pos)

    # Sorted so the trial sequence is identical for a given seed; iterating a
    # plain set of strings would depend on Python's per-process hash seed.
    unique_classes = sorted(set(all_classes))
    valid_textures = ['smooth', 'bumpy']

    print(f"Found {len(unique_classes)} classes")

    # Pre-encode all texture+class text combinations
    print("Encoding text labels for all texture-class combinations...")
    text_features_cache = _encode_texture_class_texts(
        model_name, model, extractor, unique_classes, valid_textures, device)
    print(f"Encoded {len(text_features_cache)} texture-class text combinations")

    # Track per-class performance
    class_correct = defaultdict(int)
    class_total = defaultdict(int)

    print(f"Running {trials_per_class} trials per class for SCDT task...")

    candidate_textures = ['smooth', 'smooth', 'bumpy', 'bumpy']

    for target_class in tqdm(unique_classes, desc=f"Testing {model_name} SCDT"):
        trials_done = 0

        # For each color-size combination in this class
        for texture_dict in groups[target_class].values():
            if trials_done >= trials_per_class:
                break

            # Need at least 2 images of each texture (2 smooth + 2 bumpy)
            smooth_pool = texture_dict.get('smooth', [])
            bumpy_pool = texture_dict.get('bumpy', [])
            if len(smooth_pool) < 2 or len(bumpy_pool) < 2:
                continue

            n_trials = min(trials_per_combo, trials_per_class - trials_done)

            for _ in range(n_trials):
                # Four candidates: 2 smooth images followed by 2 bumpy images
                candidates = random.sample(smooth_pool, 2) + random.sample(bumpy_pool, 2)

                # Pick the target texture via a random candidate slot
                target_texture = candidate_textures[random.randint(0, 3)]

                # Shuffle presentation order so that argmax tie-breaking
                # (first maximum wins) cannot favour one texture
                order = list(range(4))
                random.shuffle(order)
                shuffled_candidates = [candidates[i] for i in order]
                shuffled_textures = [candidate_textures[i] for i in order]

                # Image features for the candidates and the target prompt feature
                cand_features = all_img_embs[shuffled_candidates].float()
                target_text_feature = text_features_cache[(target_class, target_texture)].float()

                # Similarity of every candidate image to the prompt
                similarities = cand_features @ target_text_feature
                prediction = similarities.argmax().item()

                # Correct when the chosen image carries the target texture
                class_correct[target_class] += int(shuffled_textures[prediction] == target_texture)
                class_total[target_class] += 1
                trials_done += 1

    # Per-class accuracy, only for classes that actually had trials
    class_accuracies = {
        cls: class_correct[cls] / class_total[cls]
        for cls in unique_classes
        if class_total[cls] > 0
    }

    # Print summary
    overall_correct = sum(class_correct.values())
    overall_total = sum(class_total.values())
    overall_acc = overall_correct / overall_total if overall_total > 0 else 0

    print(f"\nSCDT Overall: {overall_correct}/{overall_total} = {overall_acc:.3f}")
    print(f"Classes tested: {len(class_accuracies)}")
    n_untested = len(unique_classes) - len(class_accuracies)
    if n_untested:
        print(f"Classes without a valid trial (omitted): {n_untested}")

    # Show top and bottom performers
    sorted_classes = sorted(class_accuracies.items(), key=lambda x: x[1], reverse=True)
    print("\nTop 5 classes for texture discrimination:")
    for cls, acc in sorted_classes[:5]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")
    print("\nBottom 5 classes for texture discrimination:")
    for cls, acc in sorted_classes[-5:]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")

    return class_accuracies

In [None]:
# Experiment configuration: several seeds per model for statistical analysis
n_seeds = 3  # Limited seeds due to potential rate limiting
trials_per_class = 500  # Consistent with other tests
models_to_test = ['cvcl-resnext', 'clip-res']

# Load the labels once up front to report what will be evaluated
test_df = build_synthetic_dataset()
n_classes = test_df['class'].nunique()
n_textures = test_df['texture'].nunique()
print(f"Found {n_classes} unique classes and {n_textures} unique textures")

# Announce the run setup as one banner
banner_lines = [
    f"\nStarting SCDT Text-Vision evaluation:",
    f"Configuration: {n_seeds} seeds × {trials_per_class} trials/class × {n_classes} classes",
    f"Task: Same Class Different Texture discrimination",
    f"Control: Color and size are held constant within each trial",
    f"Text format: texture + class (e.g., 'smooth apple', 'bumpy apple')",
    f"Note: With only 2 textures, each trial has 2 smooth + 2 bumpy images\n",
]
print("\n".join(banner_lines))

# Per model: class name -> list of accuracies, one entry per successful seed
all_results = {model: defaultdict(list) for model in models_to_test}

# Run evaluation
for model_name in models_to_test:
    print(f"\n{'='*60}")
    print(f"Testing {model_name} with SCDT text-vision approach")
    print('='*60)
    
    for seed in range(n_seeds):
        print(f"\nSeed {seed+1}/{n_seeds} for {model_name}")
        
        class_acc = None
        try:
            class_acc = run_scdt_text_vision_test_per_class(
                model_name, 
                seed=seed, 
                trials_per_class=trials_per_class
            )
        except Exception as e:
            print(f"  Error: {e}")
            # Errors that look like rate limiting get one retry after a pause
            if "404" in str(e) or "rate" in str(e).lower():
                print("  Rate limit hit - waiting 60 seconds...")
                time.sleep(60)
                try:
                    class_acc = run_scdt_text_vision_test_per_class(
                        model_name, seed=seed, trials_per_class=trials_per_class
                    )
                    print("  Retry successful!")
                except Exception as retry_error:
                    # `except Exception` (not a bare except) so Ctrl-C still
                    # interrupts the run; the cause is reported, not hidden.
                    print(f"  Retry error: {retry_error}")
                    print(f"  Retry failed - skipping seed {seed}")
                    continue
        
        # Store and report results the same way for first attempts and retries
        if class_acc:
            for cls, acc in class_acc.items():
                all_results[model_name][cls].append(acc)
            
            mean_acc = np.mean(list(class_acc.values()))
            print(f"  Mean accuracy across classes: {mean_acc:.3f}")
            print(f"  Classes successfully tested: {len(class_acc)}")
        
        # Add delay between seeds for CVCL
        if 'cvcl' in model_name and seed < n_seeds - 1:
            print("  Waiting 30 seconds before next seed...")
            time.sleep(30)

# Summarise the per-seed accuracies of every (model, class) pair
stats_results = {}
for model_name in models_to_test:
    per_class = {}
    for cls, accs in all_results[model_name].items():
        n_samples = len(accs)
        if n_samples == 0:
            continue

        if n_samples > 1:
            # Sample standard deviation across seeds (ddof=1)
            spread = np.std(accs, ddof=1)
            sem = spread / np.sqrt(n_samples)
            half_width = 1.96 * spread / np.sqrt(n_samples)
        else:
            # A single seed carries no spread information
            spread = sem = half_width = 0

        per_class[cls] = {
            'mean': np.mean(accs),
            'std': spread,
            'se': sem,
            'ci95': half_width,
            'n_samples': n_samples,
            # Nominal count: seeds times the configured trials per class
            'total_trials': n_samples * trials_per_class,
            'raw': accs
        }
    stats_results[model_name] = per_class

rule = "=" * 60
print("\n" + rule)
print("SCDT TEXT-VISION EVALUATION COMPLETE")
print(rule)

In [None]:
# Write per-seed results and per-class summaries to CSV
detailed_df = [
    {
        'model': model_name,
        'class': cls,
        'seed': seed_idx,
        'accuracy': acc,
        'n_trials': trials_per_class,
        'test_type': 'scdt_text_vision'
    }
    for model_name in models_to_test
    for cls, stats in stats_results[model_name].items()
    for seed_idx, acc in enumerate(stats['raw'])
]

if not detailed_df:
    print("\nNo results to save")
else:
    # One row per (model, class, seed position)
    detailed_df = pd.DataFrame(detailed_df)
    output_path = os.path.join(RESULTS_DIR, 'scdt_textvision_perclass_results.csv')
    detailed_df.to_csv(output_path, index=False)
    print(f"\nSaved detailed results to {output_path}")

    # One row per (model, class) with the aggregated statistics
    summary_stats = [
        {
            'model': model_name,
            'class': cls,
            'mean_accuracy': stats['mean'],
            'std': stats['std'],
            'se': stats['se'],
            'ci95': stats['ci95'],
            'n_seeds': stats['n_samples'],
            'total_trials': stats['total_trials']
        }
        for model_name in models_to_test
        for cls, stats in stats_results[model_name].items()
    ]

    summary_df = pd.DataFrame(summary_stats)
    summary_path = os.path.join(RESULTS_DIR, 'scdt_textvision_perclass_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Saved summary statistics to {summary_path}")

In [None]:
# Create visualization: per-class accuracy for every model, split over two
# stacked panels with a shared legend in the gap between them.
if len(stats_results[models_to_test[0]]) > 0:
    from matplotlib.lines import Line2D

    fig = plt.figure(figsize=(14, 11))
    
    # Create subplots (rows 8-11 of the 20-row grid stay free for the legend)
    ax1 = plt.subplot2grid((20, 1), (0, 0), rowspan=8)
    ax2 = plt.subplot2grid((20, 1), (12, 0), rowspan=8)
    
    # Prepare data: the class list comes from the first model
    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    mid_point = len(classes) // 2
    classes_first_half = classes[:mid_point]
    classes_second_half = classes[mid_point:]
    
    # Define colors and markers
    colors = {
        'cvcl-resnext': '#2a9d8f',  # Teal for CVCL
        'clip-res': '#e63946'  # Red for CLIP
    }
    markers = {
        'cvcl-resnext': 'o',
        'clip-res': 's'
    }
    avg_line_styles = {
        'cvcl-resnext': '--',
        'clip-res': '-.'
    }
    
    def _percent(model_name, cls, key):
        """Return a class statistic in percent, or NaN when the model has no
        result for that class (so the point is simply not drawn)."""
        entry = stats_results[model_name].get(cls)
        return entry[key] * 100 if entry is not None else float('nan')
    
    # Overall average per model, computed once over the classes it has
    # results for; models without any result get no average line.
    all_classes_means = {}
    for model_name in models_to_test:
        available = [stats_results[model_name][cls]['mean'] * 100
                     for cls in classes if cls in stats_results[model_name]]
        if available:
            all_classes_means[model_name] = np.mean(available)
    
    def plot_on_axis(ax, class_subset, is_first=False):
        """Draw the per-class points, chance line and average lines on `ax`.

        `is_first` additionally writes the numeric average next to each
        average line.
        """
        x_pos = np.arange(len(class_subset))
        
        for model_name in models_to_test:
            means = [_percent(model_name, cls, 'mean') for cls in class_subset]
            errors = [_percent(model_name, cls, 'ci95') for cls in class_subset]
            
            ax.errorbar(x_pos, means, yerr=errors,
                       label=model_name.upper().replace('-', ' '),
                       color=colors[model_name],
                       marker=markers[model_name],
                       markersize=7,
                       linewidth=0,
                       capsize=4,
                       capthick=1.5,
                       alpha=0.9,
                       markeredgecolor='black',
                       markeredgewidth=0.5)
        
        # Add chance level (50% for binary texture choice)
        ax.axhline(y=50, color='#ffa500', linestyle=':', alpha=0.8, linewidth=1.5)
        
        # Add average lines
        for model_name, avg_performance in all_classes_means.items():
            ax.axhline(y=avg_performance,
                      color=colors[model_name],
                      linestyle=avg_line_styles[model_name],
                      alpha=0.7,
                      linewidth=2)
            
            if is_first:
                ax.text(len(class_subset) + 0.8, avg_performance,
                       f'{avg_performance:.1f}%',
                       fontsize=9,
                       color=colors[model_name],
                       va='center',
                       fontweight='bold')
        
        # Formatting
        ax.set_ylabel('SCDT Text-Vision Accuracy (%)', fontsize=11, fontweight='bold')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(class_subset, rotation=45, ha='right', fontsize=10)
        ax.set_ylim(0, 105)
        ax.set_yticks([0, 25, 50, 75, 100])
        ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
        ax.set_axisbelow(True)
        ax.set_facecolor('#fafafa')
    
    # Legend handles are built here, once, instead of being written to a
    # global from inside the plotting helper.
    legend_elements = []
    for model_name in models_to_test:
        legend_elements.append(
            Line2D([0], [0], marker=markers[model_name], color='w',
                  markerfacecolor=colors[model_name], markeredgecolor='black',
                  markersize=8, label=model_name.upper().replace('-', ' '))
        )
    for model_name, avg_val in all_classes_means.items():
        legend_elements.append(
            Line2D([0], [0], color=colors[model_name],
                  linestyle=avg_line_styles[model_name], linewidth=2,
                  label=f'{model_name.upper().split("-")[0]} Average ({avg_val:.1f}%)')
        )
    legend_elements.append(
        Line2D([0], [0], color='#ffa500', linestyle=':', linewidth=1.5,
              label='Chance Level (50%)')  # Note: 50% for binary texture
    )
    
    # Plot both halves
    plot_on_axis(ax1, classes_first_half, is_first=True)
    ax1.set_title('SCDT Text-Vision Per-Class Performance - Part 1\nSame Class Different Texture (Color & Size Controlled)',
                 fontsize=13, fontweight='bold', pad=10)
    
    plot_on_axis(ax2, classes_second_half, is_first=False)
    ax2.set_title('SCDT Text-Vision Per-Class Performance - Part 2',
                 fontsize=13, fontweight='bold', pad=10)
    ax2.set_xlabel('Target Category', fontsize=11, fontweight='bold')
    
    # Add legend on an invisible axes placed between the two panels
    legend_ax = fig.add_axes([0.125, 0.44, 0.775, 0.08])
    legend_ax.axis('off')
    
    legend = legend_ax.legend(handles=legend_elements,
                             loc='center',
                             ncol=3,
                             fontsize=10,
                             frameon=True,
                             fancybox=True,
                             shadow=True,
                             framealpha=0.95,
                             columnspacing=2.5,
                             handlelength=3)
    
    legend.get_frame().set_facecolor('white')
    legend.get_frame().set_edgecolor('gray')
    legend.get_frame().set_linewidth(1.5)
    
    plt.tight_layout()
    plt.subplots_adjust(hspace=0.35)
    
    # Save plots
    png_path = os.path.join(RESULTS_DIR, 'scdt_textvision_perclass.png')
    pdf_path = os.path.join(RESULTS_DIR, 'scdt_textvision_perclass.pdf')
    
    plt.savefig(png_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.savefig(pdf_path, bbox_inches='tight', facecolor='white')
    plt.show()
    
    print(f"\nSaved plots to:")
    print(f"  - {png_path}")
    print(f"  - {pdf_path}")

In [None]:
# Statistical summary: per-class table, overall accuracy per model, and a
# t-test between the CVCL and CLIP accuracies.
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    from scipy import stats as scipy_stats

    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    
    # The difference column and the t-test are specific to these two models;
    # .get keeps the cell working when one of them was not evaluated.
    cvcl_stats = stats_results.get('cvcl-resnext', {})
    clip_stats = stats_results.get('clip-res', {})
    
    summary_data = []
    for cls in classes:
        row = {'Class': cls}
        for model in models_to_test:
            if cls in stats_results[model]:
                stats = stats_results[model][cls]
                row[f"{model}_mean"] = f"{stats['mean']:.3f}"
                row[f"{model}_ci95"] = f"±{stats['ci95']:.3f}"
                row[f"{model}_trials"] = stats['total_trials']
        
        # Add difference if both models have results
        if cls in clip_stats and cls in cvcl_stats:
            diff = clip_stats[cls]['mean'] - cvcl_stats[cls]['mean']
            row['difference'] = f"{diff:+.3f}"
        
        summary_data.append(row)
    
    summary_df = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("SCDT TEXT-VISION PER-CLASS PERFORMANCE SUMMARY")
    print("Task: Same Class Different Texture (Color & Size Controlled)")
    print("Note: Only 2 textures (smooth vs bumpy), so binary discrimination")
    print("="*80)
    print(summary_df.to_string(index=False))
    
    # Overall statistics
    print("\n" + "="*80)
    print("OVERALL SCDT TEXT-VISION PERFORMANCE")
    print("="*80)
    
    for model in models_to_test:
        # Pool every per-seed, per-class accuracy of this model
        all_accs = [acc
                    for cls in classes if cls in stats_results[model]
                    for acc in stats_results[model][cls]['raw']]
        
        if len(all_accs) > 0:
            mean = np.mean(all_accs)
            # Sample standard deviation (ddof=1), matching the per-class
            # statistics; undefined for a single value, so fall back to 0.
            std = np.std(all_accs, ddof=1) if len(all_accs) > 1 else 0.0
            se = std / np.sqrt(len(all_accs))
            ci95 = 1.96 * se
            print(f"{model}: {mean:.3f} ± {ci95:.3f} (SE: {se:.3f}, n={len(all_accs)} samples)")
    
    # Statistical test
    cvcl_all = [acc for cls in classes if cls in cvcl_stats
                for acc in cvcl_stats[cls]['raw']]
    clip_all = [acc for cls in classes if cls in clip_stats
                for acc in clip_stats[cls]['raw']]
    
    # A t-test needs at least two observations per group; with fewer it
    # yields NaN, which would be reported as "no significant difference".
    if len(cvcl_all) > 1 and len(clip_all) > 1:
        t_stat, p_value = scipy_stats.ttest_ind(cvcl_all, clip_all)
        print(f"\nt-test: t={t_stat:.3f}, p={p_value:.6f}")
        if p_value < 0.001:
            print("Result: Highly significant difference (p < 0.001)")
        elif p_value < 0.01:
            print("Result: Significant difference (p < 0.01)")
        elif p_value < 0.05:
            print("Result: Significant difference (p < 0.05)")
        else:
            print("Result: No significant difference")
    
    print("\nInterpretation:")
    print("This test measures texture discrimination (smooth vs bumpy) within the same object class.")
    print("Color and size are held constant, so performance reflects pure texture understanding.")
    print("Note: With only 2 textures, this is essentially a binary discrimination task.")
    print("Chance level is 50% (not 25%) due to the binary nature.")

In [None]:
# Create comparison across all property tests if CSVs exist
print("\n" + "="*80)
print("COMPARISON ACROSS ALL PROPERTY DISCRIMINATION TESTS")
print("="*80)

# (display name, summary CSV written by the corresponding notebook)
test_types = [
    ('Class', 'class_textvision_perclass_summary.csv'),
    ('Color (SCDC)', 'scdc_textvision_perclass_summary.csv'),
    ('Size (SCDS)', 'scds_textvision_perclass_summary.csv'),
    ('Texture (SCDT)', 'scdt_textvision_perclass_summary.csv')
]

# Bar colors are defined here so this cell does not depend on the
# visualization cell having run; unknown models get matplotlib's default.
comparison_colors = {
    'cvcl-resnext': '#2a9d8f',
    'clip-res': '#e63946'
}

# model -> {test name -> mean accuracy over classes, in percent}
comparison_results = {}

for test_name, csv_file in test_types:
    csv_path = os.path.join(RESULTS_DIR, csv_file)
    if os.path.exists(csv_path):
        summary_table = pd.read_csv(csv_path)
        for model in models_to_test:
            model_data = summary_table[summary_table['model'] == model]
            if len(model_data) > 0:
                mean_acc = model_data['mean_accuracy'].mean()
                comparison_results.setdefault(model, {})[test_name] = mean_acc * 100

if comparison_results:
    # Create comparison plot
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Keep only the tests for which at least one model has data
    test_names = [name for name, _ in test_types if any(name in model_res for model_res in comparison_results.values())]
    x = np.arange(len(test_names))
    width = 0.35
    
    for i, model in enumerate(models_to_test):
        if model in comparison_results:
            values = [comparison_results[model].get(test, 0) for test in test_names]
            offset = width * (i - 0.5)
            bars = ax.bar(x + offset, values, width, 
                         label=model.upper().replace('-', ' '),
                         color=comparison_colors.get(model), alpha=0.8)
            
            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                       f'{height:.1f}%', ha='center', va='bottom', fontsize=9)
    
    ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
    ax.set_xlabel('Test Type', fontsize=12, fontweight='bold')
    ax.set_title('Text-Vision Performance Across Different Property Discrimination Tests',
                fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(test_names)
    ax.grid(axis='y', alpha=0.3)
    
    # Add horizontal lines for reference
    ax.axhline(y=25, color='orange', linestyle='--', alpha=0.5, label='4-way chance')
    ax.axhline(y=50, color='red', linestyle='--', alpha=0.5, label='Binary chance')
    
    # The legend is built after the reference lines so their labels are
    # included; a legend only lists artists that exist when it is created.
    ax.legend()
    
    plt.tight_layout()
    comparison_plot_path = os.path.join(RESULTS_DIR, 'all_properties_textvision_comparison.png')
    plt.savefig(comparison_plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"\nSaved comparison plot to {comparison_plot_path}")
    
    # Print summary table
    print("\nPerformance Summary (%):\n")
    print(f"{'Test Type':<15} {'CVCL':>10} {'CLIP':>10} {'Difference':>12}")
    print("-" * 50)
    for test in test_names:
        cvcl_val = comparison_results.get('cvcl-resnext', {}).get(test, 0)
        clip_val = comparison_results.get('clip-res', {}).get(test, 0)
        diff = clip_val - cvcl_val
        print(f"{test:<15} {cvcl_val:>10.1f} {clip_val:>10.1f} {diff:>+12.1f}")
    
    print("\nKey Insights:")
    print("- Class discrimination: Easiest task (different objects)")
    print("- Color discrimination: Visual property within same class")
    print("- Size discrimination: Often harder due to relative nature")
    print("- Texture discrimination: Binary (smooth vs bumpy), 50% chance level")
else:
    print("\nNo comparison data available. Run other tests to compare.")