# Same Class Different Color (SCDC) Text-Vision Analysis with Controlled Attributes

This notebook tests color discrimination WITHIN the same class using text-vision alignment.
All trials have matched size and texture, with only color varying.
Text encoding uses color+class format (e.g., "red apple", "green apple").

In [1]:
import os
import sys
import random
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import clip
import time

# Path setup
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# SyntheticKonkle paths - Using 224x224 resized images
DATA_DIR = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224')
RESULTS_DIR = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'Textvision')
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results will be saved to: {RESULTS_DIR}")

  from pkg_resources import packaging


Data directory: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\Textvision


In [2]:
# Dataset setup with proper attribute tracking
def build_synthetic_dataset():
    """Load the master labels CSV with all visual properties."""
    # Use the master_labels.csv which has all the attribute information
    master_csv = os.path.join(DATA_DIR, 'master_labels.csv')
    
    if not os.path.exists(master_csv):
        print(f"Warning: {master_csv} not found, trying alternative path...")
        # Try the original SyntheticKonkle folder
        master_csv = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
    
    print(f"Loading master labels from: {master_csv}")
    df = pd.read_csv(master_csv)
    
    # Ensure all required columns are present
    required_cols = ['folder', 'filename', 'class', 'color', 'size', 'texture']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Clean the data
    df = df.dropna(subset=required_cols)
    
    print(f"Loaded {len(df)} images")
    print(f"Classes: {df['class'].nunique()} unique")
    print(f"Colors: {df['color'].nunique()} unique")
    print(f"Sizes: {df['size'].nunique()} unique")
    print(f"Textures: {df['texture'].nunique()} unique")
    
    # Check color distribution within classes
    color_per_class = df.groupby('class')['color'].nunique()
    print(f"\nAverage colors per class: {color_per_class.mean():.1f}")
    print(f"Min colors in a class: {color_per_class.min()}")
    print(f"Max colors in a class: {color_per_class.max()}")
    
    return df

class SyntheticImageDataset(Dataset):
    def __init__(self, df, data_dir, transform):
        self.df = df
        # For SyntheticKonkle_224, images are in nested structure
        self.data_dir = os.path.join(data_dir, 'SyntheticKonkle')
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.data_dir, row['folder'], row['filename'])
        try:
            img = Image.open(img_path).convert('RGB')
            return self.transform(img), row['class'], row['color'], row['size'], row['texture'], idx
        except Exception as e:
            # Return a black image if file not found
            img = Image.new('RGB', (224, 224), color='black')
            return self.transform(img), row['class'], row['color'], row['size'], row['texture'], idx

def collate_fn(batch):
    imgs = torch.stack([b[0] for b in batch])
    classes = [b[1] for b in batch]
    colors = [b[2] for b in batch]
    sizes = [b[3] for b in batch]
    textures = [b[4] for b in batch]
    idxs = [b[5] for b in batch]
    return imgs, classes, colors, sizes, textures, idxs

In [3]:
def run_scdc_text_vision_test_per_class(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu', 
                                        batch_size=32, trials_per_class=500):
    """
    Run Same Class Different Color text-vision test with controlled size and texture.
    Returns per-class accuracy results.
    """
    
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Check if model supports text encoding
    if model_name in ['resnext', 'dino_s_resnext50']:
        print(f"[WARNING] {model_name} has no text encoder, skipping")
        return {}

    # Load model & transform
    print(f"Loading {model_name}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    
    # Build dataset and extract image embeddings
    df = build_synthetic_dataset()
    ds = SyntheticImageDataset(df, DATA_DIR, transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print("Extracting image embeddings...")
    all_img_embs, all_classes, all_colors, all_sizes, all_textures, all_idxs = [], [], [], [], [], []
    
    with torch.no_grad():
        for imgs, classes, colors, sizes, textures, idxs in tqdm(loader, desc="Processing images"):
            imgs = imgs.to(device)
            feats = extractor.get_img_feature(imgs)
            feats = extractor.norm_features(feats).cpu().float()
            all_img_embs.append(feats)
            all_classes.extend(classes)
            all_colors.extend(colors)
            all_sizes.extend(sizes)
            all_textures.extend(textures)
            all_idxs.extend(idxs)
    
    all_img_embs = torch.cat(all_img_embs, dim=0)
    print(f"Extracted {len(all_img_embs)} image embeddings")

    # Group images by class, size, texture, and color
    # For SCDC: class, size, texture are fixed; color varies
    class_size_texture_color_idxs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    idx_to_row = {idx: i for i, idx in enumerate(all_idxs)}
    
    for i, (idx, cls, col, size, texture) in enumerate(zip(all_idxs, all_classes, all_colors, all_sizes, all_textures)):
        class_size_texture_color_idxs[cls][(size, texture)][col].append(idx)

    # Get unique classes and colors
    unique_classes = list(set(all_classes))
    unique_colors = list(set(all_colors))
    
    print(f"Found {len(unique_classes)} classes and {len(unique_colors)} colors")
    
    # Pre-encode all color+class text combinations
    print("Encoding text labels for all color-class combinations...")
    text_features_cache = {}
    
    with torch.no_grad():
        # Create all color+class combinations
        text_labels = []
        label_keys = []
        for cls in unique_classes:
            for color in unique_colors:
                # Format: "red apple", "green apple", etc.
                label = f"{color} {cls}"
                text_labels.append(label)
                label_keys.append((cls, color))
        
        # Encode in batches for efficiency
        if "clip" in model_name:
            # CLIP text encoding
            tokens = clip.tokenize(text_labels, truncate=True).to(device)
            txt_features = model.encode_text(tokens)
            txt_features = extractor.norm_features(txt_features).cpu().float()
            for i, (cls, color) in enumerate(label_keys):
                text_features_cache[(cls, color)] = txt_features[i]
        else:  # CVCL
            # CVCL text encoding with token length
            tokens, token_len = model.tokenize(text_labels)
            tokens = tokens.to(device)
            if isinstance(token_len, torch.Tensor):
                token_len = token_len.to(device)
            txt_features = model.encode_text(tokens, token_len)
            txt_features = extractor.norm_features(txt_features).cpu().float()
            for i, (cls, color) in enumerate(label_keys):
                text_features_cache[(cls, color)] = txt_features[i]
    
    print(f"Encoded {len(text_features_cache)} color-class text combinations")

    # Track per-class performance
    class_correct = defaultdict(int)
    class_total = defaultdict(int)
    
    print(f"Running {trials_per_class} trials per class for SCDC task...")
    
    # Run trials for each class
    for target_class in tqdm(unique_classes, desc=f"Testing {model_name} SCDC"):
        trials_done = 0
        
        # For each size-texture combination in this class
        for (size, texture), color_dict in class_size_texture_color_idxs[target_class].items():
            if trials_done >= trials_per_class:
                break
            
            # Need at least 4 different colors for this class-size-texture combination
            available_colors = list(color_dict.keys())
            if len(available_colors) < 4:
                continue
            
            # Run multiple trials for this combination
            n_trials = min(20, trials_per_class - trials_done)  # More trials per combination
            
            for _ in range(n_trials):
                # Pick target color and 3 distractor colors
                selected_colors = random.sample(available_colors, 4)
                target_color = selected_colors[0]
                distractor_colors = selected_colors[1:4]
                
                # Pick one image for each color (all same class, size, texture)
                target_idx = random.choice(color_dict[target_color])
                distractor_idxs = [random.choice(color_dict[col]) for col in distractor_colors]
                
                # Create 4-way choice: target + 3 distractors
                candidates = [target_idx] + distractor_idxs
                
                # Get image features for all candidates
                cand_features = torch.stack([all_img_embs[idx_to_row[idx]] for idx in candidates]).float()
                
                # Get text feature for target color+class
                target_text_feature = text_features_cache[(target_class, target_color)].float()
                
                # Compute similarity with text encoding
                similarities = cand_features @ target_text_feature
                
                # Check if model correctly identifies target (index 0)
                prediction = similarities.argmax().item()
                
                # Update counts
                class_correct[target_class] += int(prediction == 0)
                class_total[target_class] += 1
                trials_done += 1
                
                if trials_done >= trials_per_class:
                    break
    
    # Calculate per-class accuracy
    class_accuracies = {}
    for cls in unique_classes:
        if class_total[cls] > 0:
            class_accuracies[cls] = class_correct[cls] / class_total[cls]
        else:
            class_accuracies[cls] = 0.0
    
    # Print summary
    overall_correct = sum(class_correct.values())
    overall_total = sum(class_total.values())
    overall_acc = overall_correct / overall_total if overall_total > 0 else 0
    
    print(f"\nSCDC Overall: {overall_correct}/{overall_total} = {overall_acc:.3f}")
    print(f"Classes tested: {len([c for c in class_accuracies if class_total[c] > 0])}")
    
    # Show top and bottom performers
    sorted_classes = sorted(class_accuracies.items(), key=lambda x: x[1], reverse=True)
    print("\nTop 5 classes for color discrimination:")
    for cls, acc in sorted_classes[:5]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")
    print("\nBottom 5 classes for color discrimination:")
    for cls, acc in sorted_classes[-5:]:
        print(f"  {cls}: {acc:.3f} ({class_total[cls]} trials)")
    
    return class_accuracies

In [4]:
# Run multiple seeds for statistical analysis
n_seeds = 3  # Limited seeds due to potential rate limiting
trials_per_class = 500  # Consistent with class discrimination test
models_to_test = ['cvcl-resnext', 'clip-res']

# Check dataset first
test_df = build_synthetic_dataset()
n_classes = len(test_df['class'].unique())
n_colors = len(test_df['color'].unique())
print(f"Found {n_classes} unique classes and {n_colors} unique colors")

print(f"\nStarting SCDC Text-Vision evaluation:")
print(f"Configuration: {n_seeds} seeds × {trials_per_class} trials/class × {n_classes} classes")
print(f"Task: Same Class Different Color discrimination")
print(f"Control: Size and texture are held constant within each trial")
print(f"Text format: color + class (e.g., 'red apple')\n")

all_results = {model: defaultdict(list) for model in models_to_test}

# Run evaluation
for model_name in models_to_test:
    print(f"\n{'='*60}")
    print(f"Testing {model_name} with SCDC text-vision approach")
    print('='*60)
    
    for seed in range(n_seeds):
        print(f"\nSeed {seed+1}/{n_seeds} for {model_name}")
        
        try:
            class_acc = run_scdc_text_vision_test_per_class(
                model_name, 
                seed=seed, 
                trials_per_class=trials_per_class
            )
            
            # Store results
            for cls, acc in class_acc.items():
                all_results[model_name][cls].append(acc)
            
            # Print progress
            if len(class_acc) > 0:
                mean_acc = np.mean(list(class_acc.values()))
                print(f"  Mean accuracy across classes: {mean_acc:.3f}")
                print(f"  Classes successfully tested: {len(class_acc)}")
                
        except Exception as e:
            print(f"  Error: {e}")
            if "404" in str(e) or "rate" in str(e).lower():
                print(f"  Rate limit hit - waiting 60 seconds...")
                time.sleep(60)
                # Retry once
                try:
                    class_acc = run_scdc_text_vision_test_per_class(
                        model_name, seed=seed, trials_per_class=trials_per_class
                    )
                    for cls, acc in class_acc.items():
                        all_results[model_name][cls].append(acc)
                    print(f"  Retry successful!")
                except:
                    print(f"  Retry failed - skipping seed {seed}")
                    continue
        
        # Add delay between seeds for CVCL
        if 'cvcl' in model_name and seed < n_seeds - 1:
            print("  Waiting 30 seconds before next seed...")
            time.sleep(30)

# Calculate statistics
stats_results = {}
for model_name in models_to_test:
    stats_results[model_name] = {}
    for cls, accs in all_results[model_name].items():
        if len(accs) > 0:
            n_samples = len(accs)
            stats_results[model_name][cls] = {
                'mean': np.mean(accs),
                'std': np.std(accs, ddof=1) if n_samples > 1 else 0,
                'se': np.std(accs, ddof=1) / np.sqrt(n_samples) if n_samples > 1 else 0,
                'ci95': 1.96 * np.std(accs, ddof=1) / np.sqrt(n_samples) if n_samples > 1 else 0,
                'n_samples': n_samples,
                'total_trials': n_samples * trials_per_class,
                'raw': accs
            }

print("\n" + "="*60)
print("SCDC TEXT-VISION EVALUATION COMPLETE")
print("="*60)

Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Found 67 unique classes and 11 unique colors

Starting SCDC Text-Vision evaluation:
Configuration: 3 seeds × 500 trials/class × 67 classes
Task: Same Class Different Color discrimination
Control: Size and texture are held constant within each trial
Text format: color + class (e.g., 'red apple')


Testing cvcl-resnext with SCDC text-vision approach

Seed 1/3 for cvcl-resnext
Loading cvcl-resnext...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`


Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


Processing images: 100%|██████████| 247/247 [00:20<00:00, 12.11it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing cvcl-resnext SCDC: 100%|██████████| 67/67 [00:00<00:00, 533.78it/s]


SCDC Overall: 1992/7920 = 0.252
Classes tested: 67

Top 5 classes for color discrimination:
  apple: 0.375 (120 trials)
  meat: 0.367 (120 trials)
  grill: 0.358 (120 trials)
  phone: 0.340 (100 trials)
  keyboard: 0.333 (120 trials)

Bottom 5 classes for color discrimination:
  lantern: 0.192 (120 trials)
  babushkadolls: 0.192 (120 trials)
  seashell: 0.183 (120 trials)
  dumbell: 0.142 (120 trials)
  bell: 0.092 (120 trials)
  Mean accuracy across classes: 0.252
  Classes successfully tested: 67
  Waiting 30 seconds before next seed...






Seed 2/3 for cvcl-resnext
Loading cvcl-resnext...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding_seed_1.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding_seed_1.ckpt`


Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


Processing images: 100%|██████████| 247/247 [00:27<00:00,  8.94it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing cvcl-resnext SCDC: 100%|██████████| 67/67 [00:00<00:00, 496.24it/s]


SCDC Overall: 1904/7920 = 0.240
Classes tested: 67

Top 5 classes for color discrimination:
  phone: 0.390 (100 trials)
  sodacan: 0.333 (120 trials)
  grill: 0.325 (120 trials)
  ornament: 0.317 (120 trials)
  dresser: 0.300 (120 trials)

Bottom 5 classes for color discrimination:
  sippycup: 0.183 (120 trials)
  bell: 0.158 (120 trials)
  seashell: 0.158 (120 trials)
  trophy: 0.150 (120 trials)
  cookie: 0.142 (120 trials)
  Mean accuracy across classes: 0.241
  Classes successfully tested: 67
  Waiting 30 seconds before next seed...






Seed 3/3 for cvcl-resnext
Loading cvcl-resnext...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding_seed_2.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding_seed_2.ckpt`


Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


Processing images: 100%|██████████| 247/247 [00:20<00:00, 12.21it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing cvcl-resnext SCDC: 100%|██████████| 67/67 [00:00<00:00, 513.35it/s]


SCDC Overall: 1847/7920 = 0.233
Classes tested: 67

Top 5 classes for color discrimination:
  phone: 0.360 (100 trials)
  handbag: 0.317 (120 trials)
  handheldgame: 0.317 (120 trials)
  grill: 0.308 (120 trials)
  doorknob: 0.300 (120 trials)

Bottom 5 classes for color discrimination:
  helmet: 0.167 (120 trials)
  ring: 0.167 (120 trials)
  fan: 0.158 (120 trials)
  suitcase: 0.158 (120 trials)
  toothpaste: 0.100 (120 trials)
  Mean accuracy across classes: 0.233
  Classes successfully tested: 67

Testing clip-res with SCDC text-vision approach

Seed 1/3 for clip-res
Loading clip-res...





Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Processing images: 100%|██████████| 247/247 [00:17<00:00, 14.03it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing clip-res SCDC: 100%|██████████| 67/67 [00:00<00:00, 445.14it/s]


SCDC Overall: 7648/7920 = 0.966
Classes tested: 67

Top 5 classes for color discrimination:
  ornament: 1.000 (120 trials)
  dresser: 1.000 (120 trials)
  camera: 1.000 (120 trials)
  suitcase: 1.000 (120 trials)
  doll: 1.000 (120 trials)

Bottom 5 classes for color discrimination:
  trophy: 0.900 (120 trials)
  axe: 0.883 (120 trials)
  pitcher: 0.875 (120 trials)
  tennisracquet: 0.850 (40 trials)
  basket: 0.767 (120 trials)
  Mean accuracy across classes: 0.965
  Classes successfully tested: 67

Seed 2/3 for clip-res
Loading clip-res...





Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


Processing images: 100%|██████████| 247/247 [00:16<00:00, 15.07it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing clip-res SCDC: 100%|██████████| 67/67 [00:00<00:00, 355.40it/s]



SCDC Overall: 7654/7920 = 0.966
Classes tested: 67

Top 5 classes for color discrimination:
  helmet: 1.000 (120 trials)
  ornament: 1.000 (120 trials)
  dresser: 1.000 (120 trials)
  sodacan: 1.000 (120 trials)
  apple: 1.000 (120 trials)

Bottom 5 classes for color discrimination:
  butterfly: 0.900 (120 trials)
  lock: 0.892 (120 trials)
  trophy: 0.875 (120 trials)
  axe: 0.850 (120 trials)
  basket: 0.792 (120 trials)
  Mean accuracy across classes: 0.966
  Classes successfully tested: 67

Seed 3/3 for clip-res
Loading clip-res...
Loading master labels from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\master_labels.csv
Loaded 7881 images
Classes: 67 unique
Colors: 11 unique
Sizes: 4 unique
Textures: 2 unique

Average colors per class: 10.0
Min colors in a class: 10
Max colors in a class: 11
Extracting image embeddings...


Processing images: 100%|██████████| 247/247 [00:17<00:00, 14.50it/s]


Extracted 7881 image embeddings
Found 67 classes and 11 colors
Encoding text labels for all color-class combinations...
Encoded 737 color-class text combinations
Running 500 trials per class for SCDC task...


Testing clip-res SCDC: 100%|██████████| 67/67 [00:00<00:00, 422.68it/s]


SCDC Overall: 7649/7920 = 0.966
Classes tested: 67

Top 5 classes for color discrimination:
  helmet: 1.000 (120 trials)
  microwave: 1.000 (120 trials)
  sodacan: 1.000 (120 trials)
  saddle: 1.000 (120 trials)
  phone: 1.000 (100 trials)

Bottom 5 classes for color discrimination:
  butterfly: 0.908 (120 trials)
  lock: 0.875 (120 trials)
  tennisracquet: 0.850 (40 trials)
  axe: 0.833 (120 trials)
  basket: 0.758 (120 trials)
  Mean accuracy across classes: 0.965
  Classes successfully tested: 67

SCDC TEXT-VISION EVALUATION COMPLETE





In [5]:
# Save detailed results
detailed_df = []
for model_name in models_to_test:
    for cls, stats in stats_results[model_name].items():
        for seed_idx, acc in enumerate(stats['raw']):
            detailed_df.append({
                'model': model_name,
                'class': cls,
                'seed': seed_idx,
                'accuracy': acc,
                'n_trials': trials_per_class,
                'test_type': 'scdc_text_vision'
            })

if len(detailed_df) > 0:
    detailed_df = pd.DataFrame(detailed_df)
    output_path = os.path.join(RESULTS_DIR, 'scdc_textvision_perclass_results.csv')
    detailed_df.to_csv(output_path, index=False)
    print(f"\nSaved detailed results to {output_path}")
    
    # Save summary statistics
    summary_stats = []
    for model_name in models_to_test:
        for cls, stats in stats_results[model_name].items():
            summary_stats.append({
                'model': model_name,
                'class': cls,
                'mean_accuracy': stats['mean'],
                'std': stats['std'],
                'se': stats['se'],
                'ci95': stats['ci95'],
                'n_seeds': stats['n_samples'],
                'total_trials': stats['total_trials']
            })
    
    summary_df = pd.DataFrame(summary_stats)
    summary_path = os.path.join(RESULTS_DIR, 'scdc_textvision_perclass_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Saved summary statistics to {summary_path}")
else:
    print("\nNo results to save")


Saved detailed results to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\Textvision\scdc_textvision_perclass_results.csv
Saved summary statistics to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\Textvision\scdc_textvision_perclass_summary.csv


In [None]:
# Create visualization
if len(stats_results[models_to_test[0]]) > 0:
    fig = plt.figure(figsize=(14, 11))
    
    # Create subplots
    ax1 = plt.subplot2grid((20, 1), (0, 0), rowspan=8)
    ax2 = plt.subplot2grid((20, 1), (12, 0), rowspan=8)
    
    # Prepare data
    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    mid_point = len(classes) // 2
    classes_first_half = classes[:mid_point]
    classes_second_half = classes[mid_point:]
    
    # Define colors and markers
    colors = {
        'cvcl-resnext': '#2a9d8f',  # Teal for CVCL
        'clip-res': '#e63946'  # Red for CLIP
    }
    markers = {
        'cvcl-resnext': 'o',
        'clip-res': 's'
    }
    avg_line_styles = {
        'cvcl-resnext': '--',
        'clip-res': '-.'
    }
    
    legend_elements = []
    
    def plot_on_axis(ax, class_subset, is_first=False):
        x_pos = np.arange(len(class_subset))
        
        for model_name in models_to_test:
            means = [stats_results[model_name][cls]['mean'] * 100 for cls in class_subset]
            errors = [stats_results[model_name][cls]['ci95'] * 100 for cls in class_subset]
            
            ax.errorbar(x_pos, means, yerr=errors,
                       label=model_name.upper().replace('-', ' '),
                       color=colors[model_name],
                       marker=markers[model_name],
                       markersize=7,
                       linewidth=0,
                       capsize=4,
                       capthick=1.5,
                       alpha=0.9,
                       markeredgecolor='black',
                       markeredgewidth=0.5)
        
        # Add chance level
        ax.axhline(y=25, color='#ffa500', linestyle=':', alpha=0.8, linewidth=1.5)
        
        # Calculate overall averages
        all_classes_means = {}
        for model_name in models_to_test:
            all_means = [stats_results[model_name][cls]['mean'] * 100 for cls in classes]
            all_classes_means[model_name] = np.mean(all_means)
        
        # Add average lines
        for model_name in models_to_test:
            avg_performance = all_classes_means[model_name]
            ax.axhline(y=avg_performance,
                      color=colors[model_name],
                      linestyle=avg_line_styles[model_name],
                      alpha=0.7,
                      linewidth=2)
            
            if is_first:
                ax.text(len(class_subset) + 0.8, avg_performance,
                       f'{avg_performance:.1f}%',
                       fontsize=9,
                       color=colors[model_name],
                       va='center',
                       fontweight='bold')
        
        # Formatting
        ax.set_ylabel('SCDC Text-Vision Accuracy (%)', fontsize=11, fontweight='bold')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(class_subset, rotation=45, ha='right', fontsize=10)
        ax.set_ylim(0, 105)
        ax.set_yticks([0, 25, 50, 75, 100])
        ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
        ax.set_axisbelow(True)
        ax.set_facecolor('#fafafa')
        
        # Create legend elements
        global legend_elements
        if is_first:
            from matplotlib.lines import Line2D
            legend_elements = []
            
            for model_name in models_to_test:
                legend_elements.append(
                    Line2D([0], [0], marker=markers[model_name], color='w',
                          markerfacecolor=colors[model_name], markeredgecolor='black',
                          markersize=8, label=model_name.upper().replace('-', ' '))
                )
            
            for model_name in models_to_test:
                avg_val = all_classes_means[model_name]
                legend_elements.append(
                    Line2D([0], [0], color=colors[model_name],
                          linestyle=avg_line_styles[model_name], linewidth=2,
                          label=f'{model_name.upper().split("-")[0]} Average ({avg_val:.1f}%)')
                )
            
            legend_elements.append(
                Line2D([0], [0], color='#ffa500', linestyle=':', linewidth=1.5,
                      label='Chance Level (25%)')
            )
    
    # Plot both halves
    plot_on_axis(ax1, classes_first_half, is_first=True)
    ax1.set_title('SCDC Text-Vision Per-Class Performance - Part 1\nSame Class Different Color (Size & Texture Controlled)',
                 fontsize=13, fontweight='bold', pad=10)
    
    plot_on_axis(ax2, classes_second_half, is_first=False)
    ax2.set_title('SCDC Text-Vision Per-Class Performance - Part 2',
                 fontsize=13, fontweight='bold', pad=10)
    ax2.set_xlabel('Target Category', fontsize=11, fontweight='bold')
    
    # Add legend
    legend_ax = fig.add_axes([0.125, 0.44, 0.775, 0.08])
    legend_ax.axis('off')
    
    legend = legend_ax.legend(handles=legend_elements,
                             loc='center',
                             ncol=3,
                             fontsize=10,
                             frameon=True,
                             fancybox=True,
                             shadow=True,
                             framealpha=0.95,
                             columnspacing=2.5,
                             handlelength=3)
    
    legend.get_frame().set_facecolor('white')
    legend.get_frame().set_edgecolor('gray')
    legend.get_frame().set_linewidth(1.5)
    
    plt.tight_layout()
    plt.subplots_adjust(hspace=0.35)
    
    # Save plots
    png_path = os.path.join(RESULTS_DIR, 'scdc_textvision_perclass.png')
    pdf_path = os.path.join(RESULTS_DIR, 'scdc_textvision_perclass.pdf')
    
    plt.savefig(png_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.savefig(pdf_path, bbox_inches='tight', facecolor='white')
    plt.show()
    
    print(f"\nSaved plots to:")
    print(f"  - {png_path}")
    print(f"  - {pdf_path}")

In [None]:
# Statistical summary
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    
    summary_data = []
    for cls in classes:
        row = {'Class': cls}
        for model in models_to_test:
            if cls in stats_results[model]:
                stats = stats_results[model][cls]
                row[f"{model}_mean"] = f"{stats['mean']:.3f}"
                row[f"{model}_ci95"] = f"±{stats['ci95']:.3f}"
                row[f"{model}_trials"] = stats['total_trials']
        
        # Add difference if both models have results
        if cls in stats_results['clip-res'] and cls in stats_results['cvcl-resnext']:
            diff = stats_results['clip-res'][cls]['mean'] - stats_results['cvcl-resnext'][cls]['mean']
            row['difference'] = f"{diff:+.3f}"
        
        summary_data.append(row)
    
    summary_df = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("SCDC TEXT-VISION PER-CLASS PERFORMANCE SUMMARY")
    print("Task: Same Class Different Color (Size & Texture Controlled)")
    print("="*80)
    print(summary_df.to_string(index=False))
    
    # Overall statistics
    print("\n" + "="*80)
    print("OVERALL SCDC TEXT-VISION PERFORMANCE")
    print("="*80)
    
    for model in models_to_test:
        all_accs = []
        for cls in classes:
            if cls in stats_results[model]:
                all_accs.extend(stats_results[model][cls]['raw'])
        
        if len(all_accs) > 0:
            mean = np.mean(all_accs)
            std = np.std(all_accs)
            se = std / np.sqrt(len(all_accs))
            ci95 = 1.96 * se
            print(f"{model}: {mean:.3f} ± {ci95:.3f} (SE: {se:.3f}, n={len(all_accs)} samples)")
    
    # Statistical test
    from scipy import stats as scipy_stats
    
    cvcl_all = []
    clip_all = []
    for cls in classes:
        if cls in stats_results['cvcl-resnext']:
            cvcl_all.extend(stats_results['cvcl-resnext'][cls]['raw'])
        if cls in stats_results['clip-res']:
            clip_all.extend(stats_results['clip-res'][cls]['raw'])
    
    if len(cvcl_all) > 0 and len(clip_all) > 0:
        t_stat, p_value = scipy_stats.ttest_ind(cvcl_all, clip_all)
        print(f"\nt-test: t={t_stat:.3f}, p={p_value:.6f}")
        if p_value < 0.001:
            print("Result: Highly significant difference (p < 0.001)")
        elif p_value < 0.01:
            print("Result: Significant difference (p < 0.01)")
        elif p_value < 0.05:
            print("Result: Significant difference (p < 0.05)")
        else:
            print("Result: No significant difference")
    
    print("\nInterpretation:")
    print("This test measures how well models can distinguish colors within the same object class.")
    print("Size and texture are held constant, so performance reflects pure color discrimination.")
    print("Higher scores indicate better color-text alignment within object categories.")

In [None]:
# Create difference plot
if len(stats_results) > 0 and len(stats_results[models_to_test[0]]) > 0:
    plt.figure(figsize=(14, 7))
    
    classes = sorted(list(stats_results[models_to_test[0]].keys()))
    differences = []
    
    for cls in classes:
        if cls in stats_results['clip-res'] and cls in stats_results['cvcl-resnext']:
            diff = stats_results['clip-res'][cls]['mean'] - stats_results['cvcl-resnext'][cls]['mean']
            differences.append(diff)
        else:
            differences.append(0)
    
    colors_diff = ['#2ecc71' if d > 0 else '#e74c3c' for d in differences]
    bars = plt.bar(range(len(classes)), differences, color=colors_diff, alpha=0.7, edgecolor='black', linewidth=0.5)
    
    # Add value labels
    for i, (cls, diff) in enumerate(zip(classes, differences)):
        if diff != 0:
            plt.text(i, diff + (0.01 if diff > 0 else -0.02), f'{diff:.2f}',
                    ha='center', va='bottom' if diff > 0 else 'top', fontsize=8)
    
    plt.axhline(y=0, color='black', linestyle='-', linewidth=1)
    plt.xlabel('Object Class', fontsize=12, fontweight='bold')
    plt.ylabel('Performance Difference\n(CLIP - CVCL)', fontsize=12, fontweight='bold')
    plt.title('SCDC Text-Vision Model Performance Differences by Class\nColor Discrimination within Same Class',
             fontsize=14, fontweight='bold')
    plt.xticks(range(len(classes)), classes, rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3, linestyle='--')
    
    # Add legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='#2ecc71', alpha=0.7, label='CLIP Better'),
        Patch(facecolor='#e74c3c', alpha=0.7, label='CVCL Better')
    ]
    plt.legend(handles=legend_elements, loc='upper right')
    
    plt.tight_layout()
    
    diff_plot_path = os.path.join(RESULTS_DIR, 'scdc_textvision_difference.png')
    plt.savefig(diff_plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"\nSaved difference plot to {diff_plot_path}")
    
    # Print summary of differences
    clip_better = sum(1 for d in differences if d > 0)
    cvcl_better = sum(1 for d in differences if d < 0)
    tied = sum(1 for d in differences if d == 0)
    
    print(f"\nSummary for SCDC (Color Discrimination):")
    print(f"  CLIP performs better: {clip_better}/{len(classes)} classes")
    print(f"  CVCL performs better: {cvcl_better}/{len(classes)} classes")
    if tied > 0:
        print(f"  No difference: {tied}/{len(classes)} classes")
    
    avg_diff = np.mean([d for d in differences if d != 0])
    print(f"  Average difference: {avg_diff:.3f}")
    print(f"\nNote: This measures color discrimination ability within the same object class.")
    print(f"Higher values suggest better color-text understanding.")