# Different Class Different Color and Size (DCDCS) Comparison

This notebook compares the CVCL and CLIP models on a prototype-based evaluation in which every distractor differs from the query in class, color, AND size.
For example, a big red apple is tested against a small blue car, a medium green ball, and a tiny yellow book.

In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------- ----------------- 7.3/12.8 MB 56.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 61.8 MB/s  0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import os
import sys
import random
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict

# ─── Path setup ───
# The repository root is four directory levels above the current working directory.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 4)))

# Checkout of discover-hidden-visual-concepts inside the repository.
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')

# Each entry is pushed to the front of sys.path, so the last one inserted wins:
# the resulting search order is REPO_ROOT, DISCOVER_ROOT, DISCOVER_ROOT/src.
for _search_dir in (os.path.join(DISCOVER_ROOT, 'src'), DISCOVER_ROOT, REPO_ROOT):
    sys.path.insert(0, _search_dir)

# Import from discover-hidden-visual-concepts
from src.utils.model_loader import load_model
from src.models.feature_extractor import FeatureExtractor

# ─── dataset paths ───
# Everything KonkLab-related lives under <repo>/data/KonkLab.
_KONKLAB_DIR = os.path.join(REPO_ROOT, 'data', 'KonkLab')
# CSV index of the test images.
CSV_PATH = os.path.join(_KONKLAB_DIR, 'testdata.csv')
# Root folder of the image files.
IMG_DIR = os.path.join(_KONKLAB_DIR, '17-objects')
# Shared results file for the prototype evaluations.
MASTER_CSV = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'all_prototype_results.csv')

  from pkg_resources import packaging


In [3]:
class CSImageDataset(Dataset):
    """Dataset returning (transformed_image, class, color, size, idx).

    The CSV at ``csv_path`` must provide the columns ``Filename``, ``Class``,
    ``Color`` and ``Size``. Each image is read from
    ``<img_dir>/<Class>/<Filename>``, converted to RGB and passed through
    ``transform``.
    """
    def __init__(self, csv_path, img_dir, transform):
        """Load the CSV index and remember where images live and how to transform them.

        Raises:
            AssertionError: if any of the four required columns is missing.
        """
        self.df = pd.read_csv(csv_path)
        assert all(col in self.df for col in ['Filename','Class','Color','Size']), \
            "CSV must have Filename, Class, Color and Size columns"
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Number of rows in the CSV index."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return (transform(image), class, color, size, idx) for the row at position ``idx``."""
        row = self.df.iloc[idx]
        cls, fn, col, sz = row['Class'], row['Filename'], row['Color'], row['Size']
        path = os.path.join(self.img_dir, cls, fn)
        # Open inside a context manager so the image's file handle is released
        # as soon as the RGB copy exists, instead of whenever the object is
        # garbage-collected.
        with Image.open(path) as img:
            rgb = img.convert('RGB')
        return self.transform(rgb), cls, col, sz, idx

def collate_fn(batch):
    """Merge a list of (img, class, color, size, idx) samples into one batch.

    The images are stacked into a single tensor along a new leading dimension;
    the remaining four fields are returned as plain Python lists in sample order.
    """
    images, classes, colors, sizes, idxs = [], [], [], [], []
    for sample in batch:
        images.append(sample[0])
        classes.append(sample[1])
        colors.append(sample[2])
        sizes.append(sample[3])
        idxs.append(sample[4])
    return torch.stack(images), classes, colors, sizes, idxs

def run_dcdcs_test(model_name, seed=0, device='cuda' if torch.cuda.is_available() else 'cpu',
                   batch_size=64, trials_per_pair=10, num_distractors=3, min_group_size=1):
    """
    Run Different Class Different Color and Size (DCDCS) evaluation.
    Tests if model can identify objects against distractors that differ in class, color, AND size.
    Example: big red apple vs. small blue car, medium green ball, tiny yellow book

    Args:
        model_name: Name passed to ``load_model`` / ``FeatureExtractor``; also written
            to the ``Model`` column of the results CSV.
        seed: Seed for Python's ``random`` module and for torch; also forwarded to ``load_model``.
        device: Device on which image features are extracted.
        batch_size: Batch size used while extracting embeddings.
        trials_per_pair: Number of trials run for every (class, color, size) group.
        num_distractors: Number of distractors sampled per trial (default 3, the
            original hard-coded value). Groups whose distractor pool is smaller
            than this are skipped.
        min_group_size: Groups with fewer images than this are skipped. With the
            default of 1 a single-image group is still evaluated, and its
            prototype is then the query's own embedding; pass 2 to exclude
            such groups so the query never serves as its own prototype.

    Returns:
        ``(results, overall_acc)`` where ``results`` maps ``"<class>-<color>-<size>"``
        to a dict with keys ``'correct'``, ``'trials'`` and ``'accuracy'``, and
        ``overall_acc`` is total correct / total trials (0.0 if no trial ran).

    Side effects:
        Appends one summary row to ``MASTER_CSV`` (creating the file and its
        directory if needed), so every call adds another row.
    """
    random.seed(seed)
    torch.manual_seed(seed)

    # 1) load model & transform
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)

    # 2) prepare DataLoader & extract embeddings
    ds = CSImageDataset(CSV_PATH, IMG_DIR, transform)
    # Using single-process data loading to avoid worker issues
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False,
                       num_workers=0, collate_fn=collate_fn)

    all_embs, all_cls, all_col, all_sz, all_idxs = [], [], [], [], []
    with torch.no_grad():
        for imgs, classes, colors, sizes, idxs in loader:
            feats = extractor.get_img_feature(imgs.to(device))
            feats = extractor.norm_features(feats).cpu()
            all_embs.append(feats)
            all_cls.extend(classes)
            all_col.extend(colors)
            all_sz.extend(sizes)
            all_idxs.extend(idxs)
    all_embs = torch.cat(all_embs, dim=0)  # [N, D]

    # Dataset index -> row in all_embs. Built once so the trial loop does not
    # rescan all_idxs with list.index() for every lookup. setdefault keeps the
    # first occurrence, matching list.index() semantics.
    row_of = {}
    for pos, ds_idx in enumerate(all_idxs):
        row_of.setdefault(ds_idx, pos)

    # 3) group indices by (class, color, size)
    group_map = defaultdict(list)
    for idx, cls, col, sz in zip(all_idxs, all_cls, all_col, all_sz):
        group_map[(cls, col, sz)].append(idx)

    # 4) run evaluation
    results = {}
    total_correct = 0
    total_trials = 0

    for (cls, col, sz), idx_list in group_map.items():
        if len(idx_list) < min_group_size:
            continue

        # Get pool of objects with different class AND color AND size
        pool = []
        for (c2, co2, sz2), other_idxs in group_map.items():
            if c2 != cls and co2 != col and sz2 != sz:  # All attributes must be different
                pool.extend(other_idxs)

        if len(pool) < num_distractors:  # Not enough distractors to build a trial
            continue

        correct = 0
        for _ in range(trials_per_pair):
            # Query image
            q = random.choice(idx_list)
            # Build prototype from the other members of the same group
            others = [i for i in idx_list if i != q]
            if others:
                proto = all_embs[[row_of[i] for i in others]].mean(0)
            else:
                # Single-image group: the only available prototype is the query
                # itself (see min_group_size to exclude this case).
                proto = all_embs[row_of[q]]
            proto = proto / proto.norm()

            # Pick distractors with different class, color, AND size
            distractors = random.sample(pool, num_distractors)
            cands = [q] + distractors
            feats = all_embs[[row_of[i] for i in cands]]
            sims = feats @ proto
            guess = cands[sims.argmax().item()]

            hit = int(guess == q)
            correct += hit
            total_correct += hit
            total_trials += 1

        # Guard against trials_per_pair == 0, mirroring the overall_acc guard below.
        acc = correct / trials_per_pair if trials_per_pair else 0.0
        results[f"{cls}-{col}-{sz}"] = {
            'correct': correct,
            'trials': trials_per_pair,
            'accuracy': acc
        }

    overall_acc = total_correct / total_trials if total_trials else 0.0

    # 5) save results
    os.makedirs(os.path.dirname(MASTER_CSV), exist_ok=True)
    row = pd.DataFrame([{
        'Model': model_name,
        'Test': 'Different-Class-Color-Size',
        'Correct': total_correct,
        'Trials': total_trials,
        'Accuracy': overall_acc
    }])
    # Write the header when the file is new OR exists but is still empty;
    # otherwise append the bare row.
    needs_header = (not os.path.exists(MASTER_CSV)) or os.path.getsize(MASTER_CSV) == 0
    row.to_csv(MASTER_CSV, mode='a', header=needs_header, index=False, float_format='%.4f')

    return results, overall_acc

## CVCL Test

In [4]:
# Run CVCL evaluation
cvcl_results, cvcl_overall = run_dcdcs_test('cvcl-resnext')

# Pad every key to the longest one (never below the original 30 columns) so that
# keys longer than 30 characters no longer break the column alignment.
cvcl_key_width = max(30, max(map(len, cvcl_results), default=0))

print("\nCVCL Results by Class-Color-Size:")
for key, res in cvcl_results.items():
    print(f"{key:{cvcl_key_width}s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCVCL Overall Accuracy: {cvcl_overall:.1%}")

Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`



CVCL Results by Class-Color-Size:
butterfly-Multicolored-Medium : 10/10 (100.0%)
butterfly-Yellow-Medium       : 7/10 (70.0%)
butterfly-Red-Medium          : 10/10 (100.0%)
muffins-Multicolored-Medium   : 10/10 (100.0%)
muffins-Orange-Medium         : 10/10 (100.0%)
muffins-Orange-Large          : 10/10 (100.0%)
muffins-Yellow-Large          : 10/10 (100.0%)
muffins-Yellow-Medium         : 10/10 (100.0%)
muffins-Multicolored-Large    : 10/10 (100.0%)
pitcher-Yellow-Medium         : 9/10 (90.0%)
pitcher-Multicolored-Medium   : 5/10 (50.0%)
pitcher-Green-Medium          : 10/10 (100.0%)
pitcher-Multicolored-Large    : 10/10 (100.0%)
pitcher-Grey-Medium           : 8/10 (80.0%)
pitcher-Orange-Medium         : 10/10 (100.0%)
pitcher-Blue-Medium           : 4/10 (40.0%)
tennisracquet-Multicolored-Small: 10/10 (100.0%)
tennisracquet-Grey-Small      : 10/10 (100.0%)
tennisracquet-Multicolored-Medium: 10/10 (100.0%)
tennisracquet-Pink-Small      : 10/10 (100.0%)
tennisracquet-Green-Medium    

## CLIP Test

In [5]:
# Run CLIP evaluation
clip_results, clip_overall = run_dcdcs_test('clip-resnext')

# Pad every key to the longest one (never below the original 30 columns) so that
# keys longer than 30 characters no longer break the column alignment.
clip_key_width = max(30, max(map(len, clip_results), default=0))

print("\nCLIP Results by Class-Color-Size:")
for key, res in clip_results.items():
    print(f"{key:{clip_key_width}s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCLIP Overall Accuracy: {clip_overall:.1%}")

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)



CLIP Results by Class-Color-Size:
butterfly-Multicolored-Medium : 10/10 (100.0%)
butterfly-Yellow-Medium       : 10/10 (100.0%)
butterfly-Red-Medium          : 10/10 (100.0%)
muffins-Multicolored-Medium   : 10/10 (100.0%)
muffins-Orange-Medium         : 10/10 (100.0%)
muffins-Orange-Large          : 10/10 (100.0%)
muffins-Yellow-Large          : 10/10 (100.0%)
muffins-Yellow-Medium         : 10/10 (100.0%)
muffins-Multicolored-Large    : 10/10 (100.0%)
pitcher-Yellow-Medium         : 10/10 (100.0%)
pitcher-Multicolored-Medium   : 10/10 (100.0%)
pitcher-Green-Medium          : 10/10 (100.0%)
pitcher-Multicolored-Large    : 10/10 (100.0%)
pitcher-Grey-Medium           : 10/10 (100.0%)
pitcher-Orange-Medium         : 10/10 (100.0%)
pitcher-Blue-Medium           : 10/10 (100.0%)
tennisracquet-Multicolored-Small: 10/10 (100.0%)
tennisracquet-Grey-Small      : 10/10 (100.0%)
tennisracquet-Multicolored-Medium: 10/10 (100.0%)
tennisracquet-Pink-Small      : 10/10 (100.0%)
tennisracquet-Green-