# Same Class Different Color and Size (SCDCS) Comparison

This notebook compares the CVCL and CLIP models on a prototype-based evaluation in which the distractors belong to the same class as the query but differ in both color AND size.
For example, a big red apple is tested against a small green apple, a medium yellow apple, and a tiny blue apple.

In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------  12.6/12.8 MB 78.5 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 72.9 MB/s  0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import os
import sys
import random
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict

# ─── Path setup ───
# The repository root is four directory levels above the current working directory.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 4)))

# Location of the discover-hidden-visual-concepts checkout inside the repository
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')

# Put both roots at the front of the module search path
# (REPO_ROOT first, DISCOVER_ROOT second).
sys.path[:0] = [REPO_ROOT, DISCOVER_ROOT]

# The project's `src` folder goes at the end of the search path so that the
# project imports that follow can be resolved.
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor
from models.multimodal.multimodal_lit import MultiModalLitModel

# ─── hard-coded paths ───
# KonkLab inputs: the metadata CSV and the root folder of the image tree.
CSV_PATH, IMG_DIR = (
    os.path.join(REPO_ROOT, 'data', 'KonkLab', leaf)
    for leaf in ('testdata.csv', '17-objects')
)
# Shared results file; each evaluation run appends one summary row to it.
MASTER_CSV = os.path.join(
    REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'all_prototype_results.csv'
)

  from pkg_resources import packaging


In [3]:
# Shared Dataset and Helper Functions
class CSImageDataset(Dataset):
    """Dataset returning (img_tensor, class, color, size, idx).

    Metadata is read from a CSV that must contain the columns ``Filename``,
    ``Class``, ``Color`` and ``Size``. The image for a row is loaded from
    ``<img_dir>/<Class>/<Filename>``.

    Args:
        csv_path: Path to the metadata CSV.
        img_dir: Root directory holding one sub-folder per class.
        transform: Callable applied to the RGB ``PIL.Image``; its return value
            is the first element of every sample (presumably a tensor, since
            ``collate_fn`` stacks it with ``torch.stack``).
    """
    def __init__(self, csv_path, img_dir, transform):
        self.df = pd.read_csv(csv_path)
        assert all(col in self.df for col in ['Filename','Class','Color','Size']), \
            "CSV must have Filename, Class, Color and Size columns"
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Number of rows (images) listed in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and transform the image at positional row ``idx``.

        Returns:
            Tuple ``(transformed_image, class, color, size, idx)``.
        """
        row = self.df.iloc[idx]
        cls, fn, col, sz = row['Class'], row['Filename'], row['Color'], row['Size']
        path = os.path.join(self.img_dir, cls, fn)
        # Image.open is lazy and keeps the file handle open; the context manager
        # guarantees it is released. convert() returns an independent copy, so
        # the result stays valid after the source image is closed.
        with Image.open(path) as raw:
            img = raw.convert('RGB')
        return self.transform(img), cls, col, sz, idx

def collate_fn(batch):
    """Combine dataset samples into a single batch.

    Every sample is a 5-tuple ``(img, class, color, size, idx)``. The images
    are stacked along a new leading dimension; the other four fields are
    gathered into plain lists that keep the batch order.

    Returns:
        Tuple ``(imgs, classes, colors, sizes, idxs)``.
    """
    def column(position):
        # Pull one field out of every sample, preserving order.
        return [sample[position] for sample in batch]

    return torch.stack(column(0)), column(1), column(2), column(3), column(4)

def run_scdcs_test(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu',
                   batch_size=64, trials_per_pair=10, max_images=None):
    """
    Run Same Class Different Color and Size (SCDCS) evaluation.

    Tests if model can identify objects against distractors that are:
    - Same class
    - Different color AND different size

    This tests within-class discrimination based on visual attributes (color and size).
    Example: big red apple vs. small green apple, medium yellow apple, tiny blue apple

    Args:
        model_name: Identifier passed to ``load_model`` and ``FeatureExtractor``;
            also written to the ``Model`` column of the summary row.
        seed: Seed for Python's ``random`` and for torch; also forwarded to
            ``load_model``.
        device: Device the image batches are moved to before feature extraction.
        batch_size: Batch size used while extracting embeddings.
        trials_per_pair: Number of 4-way trials run for every eligible
            (class, color, size) group.
        max_images: Currently unused; accepted only so existing callers that
            pass it keep working.

    Returns:
        Tuple ``(class_color_size_results, overall_acc)``:
        ``class_color_size_results`` maps ``"<class>-<color>-<size>"`` to a dict
        with keys ``'correct'``, ``'trials'`` and ``'accuracy'``;
        ``overall_acc`` is the fraction of correct trials over all groups
        (``0.0`` when no trial was run).

    Side effects:
        Prints progress and per-group results, and appends one summary row to
        ``MASTER_CSV`` (the file is created with a header if it does not exist).
        Every call appends, so re-running the same model adds duplicate rows.
    """
    random.seed(seed)
    torch.manual_seed(seed)

    # 1) load model & transform
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    print(f"[ℹ️] Loaded model '{model_name}'")

    # 2) prepare DataLoader & extract embeddings
    ds = CSImageDataset(CSV_PATH, IMG_DIR, transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False,
                       num_workers=0, collate_fn=collate_fn)

    all_embs, all_cls, all_col, all_sz, all_idxs = [], [], [], [], []
    with torch.no_grad():
        for imgs, classes, colors, sizes, idxs in loader:
            feats = extractor.get_img_feature(imgs.to(device))
            feats = extractor.norm_features(feats).cpu()
            feats = feats.float()
            all_embs.append(feats)
            all_cls.extend(classes)
            all_col.extend(colors)
            all_sz.extend(sizes)
            all_idxs.extend(idxs)
    all_embs = torch.cat(all_embs, dim=0)  # [N, D]
    print(f"[ℹ️] Extracted embeddings for {len(all_idxs)} images")

    # Dataset index -> row of `all_embs`, built once so the trial loop does not
    # scan `all_idxs` linearly for every lookup. setdefault keeps the first
    # occurrence, which is what list.index() would return.
    row_of = {}
    for row, ds_idx in enumerate(all_idxs):
        row_of.setdefault(ds_idx, row)

    # 3) group indices by class first, then by (color, size) combinations
    class_groups = defaultdict(lambda: defaultdict(list))
    for idx, cls, col, sz in zip(all_idxs, all_cls, all_col, all_sz):
        class_groups[cls][(col, sz)].append(idx)

    # 4) run evaluation
    class_color_size_results = {}
    total_correct = 0
    total_trials = 0

    print("[ℹ️] Running SCDCS 4-way trials (same class, different color and size)...")
    for cls, color_size_groups in class_groups.items():
        # Every group holds at least one index: it only exists because an
        # index was appended to it above.
        for (col, sz), idx_list in color_size_groups.items():
            # Get pool of objects of SAME class but DIFFERENT color AND size
            pool = []
            for (co2, sz2), other_idxs in color_size_groups.items():
                if co2 != col and sz2 != sz:  # Both color AND size must be different
                    pool.extend(other_idxs)

            if len(pool) < 3:  # Need at least 3 distractors
                continue

            correct = 0
            for _ in range(trials_per_pair):
                # Query image from this (class, color, size) group
                q = random.choice(idx_list)

                # Build prototype from other images with same class, color, and size
                others = [i for i in idx_list if i != q]
                if others:
                    proto = all_embs[[row_of[i] for i in others]].mean(0)
                else:
                    # NOTE(review): for a single-image group the prototype is the
                    # query itself, so the query matches it exactly and the trial
                    # is close to trivially correct. Consider skipping such groups.
                    proto = all_embs[row_of[q]]  # Use query itself if no others
                proto = proto / proto.norm()

                # Pick 3 distractors with same class but different color AND size
                distractors = random.sample(pool, 3)
                cands = [q] + distractors
                feats = all_embs[[row_of[i] for i in cands]]
                sims = feats @ proto
                guess = cands[sims.argmax().item()]

                correct += int(guess == q)
                total_correct += int(guess == q)
                total_trials += 1

            # Guarded like overall_acc below so trials_per_pair=0 does not raise.
            acc = correct / trials_per_pair if trials_per_pair else 0.0
            key = f"{cls}-{col}-{sz}"
            class_color_size_results[key] = {
                'correct': correct,
                'trials': trials_per_pair,
                'accuracy': acc
            }
            print(f"{col:>8s}/{sz:>6s} • {cls:20s}: {correct}/{trials_per_pair} ({acc:.1%})")

    overall_acc = total_correct / total_trials if total_trials else 0.0
    print(f"\n[✅] Overall accuracy: {total_correct}/{total_trials} ({overall_acc:.1%})")

    # 5) save results
    summary_df = pd.DataFrame([{
        'Model': model_name,
        'Test': 'Same-Class-Different-Color-Size',
        'Correct': total_correct,
        'Trials': total_trials,
        'Accuracy': overall_acc
    }])

    os.makedirs(os.path.dirname(MASTER_CSV), exist_ok=True)
    if os.path.exists(MASTER_CSV):
        # Append without a header so the existing file keeps a single header row.
        summary_df.to_csv(MASTER_CSV, mode='a', header=False, index=False, float_format='%.4f')
    else:
        summary_df.to_csv(MASTER_CSV, index=False, float_format='%.4f')

    return class_color_size_results, overall_acc

## CVCL Test

In [4]:
# Evaluate CVCL on the SCDCS task, then report per-group and overall accuracy
cvcl_results, cvcl_overall = run_scdcs_test(model_name='cvcl-resnext')

print("\nCVCL Results by Class-Color-Size:")
for key in cvcl_results:
    res = cvcl_results[key]  # dict with 'correct', 'trials' and 'accuracy'
    print(f"{key:30s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCVCL Overall Accuracy: {cvcl_overall:.1%}")

Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`


[ℹ️] Loaded model 'cvcl-resnext'
[ℹ️] Extracted embeddings for 1005 images
[ℹ️] Running SCDCS 4-way trials (same class, different color and size)...
Multicolored/Medium • muffins             : 0/10 (0.0%)
  Orange/Medium • muffins             : 0/10 (0.0%)
  Orange/ Large • muffins             : 2/10 (20.0%)
  Yellow/ Large • muffins             : 10/10 (100.0%)
  Yellow/Medium • muffins             : 10/10 (100.0%)
Multicolored/ Large • muffins             : 6/10 (60.0%)
Multicolored/ Large • pitcher             : 10/10 (100.0%)
Multicolored/Medium • tennisracquet       : 10/10 (100.0%)
   Green/Medium • tennisracquet       : 10/10 (100.0%)
  Yellow/ Large • phone               : 10/10 (100.0%)
    Blue/ Small • phone               : 10/10 (100.0%)
  Purple/Medium • headband            : 10/10 (100.0%)
    Grey/Medium • headband            : 10/10 (100.0%)
  Orange/Medium • headband            : 10/10 (100.0%)
     Red/ Small • headband            : 10/10 (100.0%)
Multicolored/ Small 

## CLIP Test

In [5]:
# Evaluate CLIP on the SCDCS task, then report per-group and overall accuracy
clip_results, clip_overall = run_scdcs_test(model_name='clip-resnext')

print("\nCLIP Results by Class-Color-Size:")
for key in clip_results:
    res = clip_results[key]  # dict with 'correct', 'trials' and 'accuracy'
    print(f"{key:30s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCLIP Overall Accuracy: {clip_overall:.1%}")

[ℹ️] Loaded model 'clip-resnext'


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


[ℹ️] Extracted embeddings for 1005 images
[ℹ️] Running SCDCS 4-way trials (same class, different color and size)...
Multicolored/Medium • muffins             : 0/10 (0.0%)
  Orange/Medium • muffins             : 10/10 (100.0%)
  Orange/ Large • muffins             : 10/10 (100.0%)
  Yellow/ Large • muffins             : 10/10 (100.0%)
  Yellow/Medium • muffins             : 10/10 (100.0%)
Multicolored/ Large • muffins             : 0/10 (0.0%)
Multicolored/ Large • pitcher             : 10/10 (100.0%)
Multicolored/Medium • tennisracquet       : 10/10 (100.0%)
   Green/Medium • tennisracquet       : 10/10 (100.0%)
  Yellow/ Large • phone               : 10/10 (100.0%)
    Blue/ Small • phone               : 10/10 (100.0%)
  Purple/Medium • headband            : 10/10 (100.0%)
    Grey/Medium • headband            : 10/10 (100.0%)
  Orange/Medium • headband            : 10/10 (100.0%)
     Red/ Small • headband            : 10/10 (100.0%)
Multicolored/ Small • headband            : 10/10