In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------  12.6/12.8 MB 78.5 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 66.7 MB/s  0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Class Text-Vision Comparison - CVCL Training Classes Only

This notebook compares CVCL and CLIP models using text-based prototypes for class discrimination.
**Only tests on the 25 classes that appear in CVCL's training data.**
Instead of averaging image features to create prototypes, we use text descriptions.
The task remains 4-way forced choice classification with 4000 trials.

In [2]:
import os
import sys
import random
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from collections import defaultdict
import clip

# Path setup - Use absolute paths to avoid any confusion
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths
CVCL_CLASSES_PATH = os.path.join(REPO_ROOT, 'data', 'CVCL_Konkle_Overlap', 'CVCLKonkMatches.csv')
CSV_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
IMG_DIR = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
MASTER_CSV = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'cvcl_training_text_vision_results.csv')

  from pkg_resources import packaging


In [3]:
# Load CVCL training classes
cvcl_df = pd.read_csv(CVCL_CLASSES_PATH)
CVCL_TRAINING_CLASSES = cvcl_df['Class'].str.strip().tolist()

print(f"CVCL Training Classes ({len(CVCL_TRAINING_CLASSES)}):")
for cls in CVCL_TRAINING_CLASSES:
    print(f"  {cls}")

CVCL Training Classes (24):
  ball
  butterfly
  phone
  bagel
  basket
  bell
  fan
  seashell
  bird
  stool
  train
  ring
  tricycle
  toothpaste
  pen
  tree
  apple
  cookie
  bread
  pumpkin
  camera
  rabbit
  pillow
  horse


In [4]:
# Quick test to check if GPU is available
import torch
import time

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    
    # Test GPU speed
    x = torch.randn(32, 3, 224, 224).cuda()
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(10):
        _ = x * 2
    torch.cuda.synchronize()
    print(f"GPU test time: {time.time() - start:.3f}s")
else:
    print("WARNING: Running on CPU will be VERY slow!")
    print("If you have a GPU, make sure CUDA is properly installed")

PyTorch version: 2.3.1
CUDA available: True
CUDA device: NVIDIA GeForce RTX 4070 SUPER
CUDA version: 11.8
GPU test time: 0.019s


In [5]:
# Dataset class that filters to CVCL training classes only
class CVCLClassImageDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform, cvcl_classes):
        print(f"[DEBUG] Loading CSV from: {csv_path}")
        print(f"[DEBUG] Image directory: {img_dir}")
        
        self.df = pd.read_csv(csv_path)
        print(f"[DEBUG] CSV loaded with {len(self.df)} rows")
        
        # Filter to only CVCL training classes
        self.df = self.df[self.df['class'].isin(cvcl_classes)].reset_index(drop=True)
        print(f"[DEBUG] Filtered to {len(self.df)} rows for CVCL training classes")
        
        # Handle missing ball and bread in master_labels.csv
        missing_classes = set(cvcl_classes) - set(self.df['class'].unique())
        if missing_classes:
            print(f"[DEBUG] Adding missing classes: {missing_classes}")
            for cls in missing_classes:
                # Add images from ball_color and bread_color folders
                folder = f"{cls}_color"
                folder_path = os.path.join(img_dir, folder)
                if os.path.exists(folder_path):
                    image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]
                    for img_file in image_files:
                        # Parse filename to extract metadata
                        parts = img_file.replace('.png', '').split('_')
                        if len(parts) >= 5:
                            new_row = {
                                'folder': folder,
                                'filename': img_file,
                                'class': cls,
                                'color': '_'.join(parts[4:]),
                                'size': parts[1],
                                'texture': parts[2],
                                'variant': parts[3]
                            }
                            self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)
        
        print(f"[DEBUG] Final dataset has {len(self.df)} rows")
        print(f"[DEBUG] Classes in dataset: {sorted(self.df['class'].unique())}")
        
        assert 'filename' in self.df and 'class' in self.df and 'folder' in self.df, \
            "CSV needs filename, class, and folder columns"
        
        self.img_dir = img_dir
        self.transform = transform
        
        # Pre-compute paths
        self.paths = [os.path.join(img_dir, row['folder'], row['filename']) 
                      for _, row in self.df.iterrows()]
        
        # Pre-filter to only valid images
        valid_indices = []
        for idx, path in enumerate(self.paths):
            if os.path.exists(path):
                valid_indices.append(idx)
        
        print(f"[DEBUG] Found {len(valid_indices)} valid images")
        
        # Filter dataframe and paths to only valid images
        self.df = self.df.iloc[valid_indices].reset_index(drop=True)
        self.paths = [self.paths[i] for i in valid_indices]
        print(f"Dataset initialized with {len(self.paths)} valid images from {len(self.df['class'].unique())} CVCL classes")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        cls = row['class']
        path = self.paths[idx]
        try:
            img = Image.open(path).convert('RGB')
        except Exception as e:
            print(f"[ERROR] Failed to load image at index {idx}: {path}")
            img = Image.new('RGB', (224, 224), color='white')
        return self.transform(img), cls, idx

def collate_fn(batch):
    imgs = torch.stack([b[0] for b in batch])
    classes = [b[1] for b in batch]
    idxs = [b[2] for b in batch]
    return imgs, classes, idxs

In [6]:
def run_cvcl_class_text_vision_test(model_name, seed=0, device=None, batch_size=16, 
                                    trials_per_class=None, max_trials=4000):
    """
    Run 4-way classification test using text encoders on CVCL training classes only.
    Uses text descriptions as prototypes instead of averaging image features.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    
    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if device == 'cuda' and not torch.cuda.is_available():
        print("[ERROR] CUDA requested but not available! Falling back to CPU.")
        device = 'cpu'
    
    if device == 'cpu':
        print("[WARNING] Running on CPU - this will be SLOW!")
        print("Reducing batch size to 4 for CPU")
        batch_size = 4
    else:
        print(f"[INFO] Using GPU: {torch.cuda.get_device_name(0)}")

    # Check if model supports text encoding
    if model_name in ['resnext', 'dino_s_resnext50']:
        print(f"[WARNING] {model_name} has no text encoder, skipping")
        return {}, 0.0

    # 1) Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    import time
    start_time = time.time()
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    print(f"[INFO] Model loaded in {time.time() - start_time:.1f}s")

    # 2) Load dataset filtered to CVCL training classes
    print(f"[DEBUG] Creating dataset with CVCL training classes only")
    ds = CVCLClassImageDataset(CSV_PATH, IMG_DIR, transform, CVCL_TRAINING_CLASSES)
    print(f"[INFO] Dataset: {len(ds)} images from CVCL training classes")
    
    if len(ds) == 0:
        print("[ERROR] Dataset is empty! No valid images found.")
        return {}, 0.0
    
    # Use multiple workers only if not on Windows
    num_workers = 0 if os.name == 'nt' else 2
    
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False,
                   num_workers=num_workers, collate_fn=collate_fn, 
                   pin_memory=(device=='cuda'))
    
    # 3) Extract image embeddings
    print(f"[INFO] Extracting embeddings (batch_size={batch_size})...")
    all_img_embs, all_classes, all_idxs = [], [], []
    
    from tqdm import tqdm
    start_time = time.time()
    
    with torch.no_grad():
        for imgs, classes, idxs in tqdm(dl, desc="Extracting embeddings"):
            imgs = imgs.to(device, non_blocking=True)
            feats = extractor.get_img_feature(imgs)
            feats = extractor.norm_features(feats).cpu()
            
            all_img_embs.append(feats)
            all_classes.extend(classes)
            all_idxs.extend(idxs)
    
    if not all_img_embs:
        print("[ERROR] No embeddings extracted!")
        return {}, 0.0
                
    all_img_embs = torch.cat(all_img_embs, dim=0)
    print(f"[INFO] Extracted {len(all_idxs)} embeddings in {time.time() - start_time:.1f}s")

    # 4) Encode text labels for CVCL training classes
    unique_classes = list(set(all_classes))
    print(f"[INFO] Encoding {len(unique_classes)} CVCL training class labels...")
    
    class_text_features = {}
    with torch.no_grad():
        if "clip" in model_name:
            tokens = clip.tokenize(unique_classes, truncate=True).to(device)
            txt_features = model.encode_text(tokens)
            txt_features = extractor.norm_features(txt_features).cpu()
            for i, cls in enumerate(unique_classes):
                class_text_features[cls] = txt_features[i]
        else:  # CVCL
            tokens, token_len = model.tokenize(unique_classes)
            tokens = tokens.to(device)
            if isinstance(token_len, torch.Tensor):
                token_len = token_len.to(device)
            txt_features = model.encode_text(tokens, token_len)
            txt_features = extractor.norm_features(txt_features).cpu()
            for i, cls in enumerate(unique_classes):
                class_text_features[cls] = txt_features[i]
    
    print(f"[INFO] Text encoding complete")
    
    # 5) Build mappings
    idx2class = {i:c for i,c in zip(all_idxs, all_classes)}
    idx2row = {i:r for r,i in enumerate(all_idxs)}
    class2idxs = defaultdict(list)
    for i,c in idx2class.items():
        class2idxs[c].append(i)

    # 6) Run trials - exactly max_trials
    class_results = {}
    total_correct = 0
    total_trials = 0
    
    # Get valid classes (those with at least 1 image)
    valid_classes = [c for c in class2idxs if len(class2idxs[c]) >= 1]
    n_classes = len(valid_classes)
    
    # Calculate trials per class
    if trials_per_class is None:
        trials_per_class = max_trials // n_classes
        extra_trials = max_trials % n_classes
    else:
        extra_trials = 0
    
    print(f"[INFO] Running {max_trials} total trials across {n_classes} CVCL training classes")
    print(f"[INFO] Base trials per class: {trials_per_class}, extra trials for first {extra_trials} classes")
    print(f"[INFO] Using text-based prototypes (not visual averaging)")
    
    # Run trials for each class
    for class_idx, cls in enumerate(valid_classes):
        if total_trials >= max_trials:
            break
            
        idxs = class2idxs[cls]
        
        # Add extra trial for first few classes to reach exactly max_trials
        current_trials = trials_per_class + (1 if class_idx < extra_trials else 0)
        current_trials = min(current_trials, max_trials - total_trials)
        
        correct = 0
        txt_feature = class_text_features[cls].unsqueeze(0)
        
        for trial in range(current_trials):
            # Pick query from this class
            q = random.choice(idxs)
            
            # Pick distractors from other classes
            others = [i for i in all_idxs if idx2class[i] != cls]
            if len(others) < 3:
                continue
            distractors = random.sample(others, 3)
            
            # 4-way classification
            candidates = [q] + distractors
            cand_features = torch.stack([all_img_embs[idx2row[i]] for i in candidates])
            cand_features = cand_features.unsqueeze(0)
            
            # Compute similarity using text prototype
            txt_feature_expanded = txt_feature.unsqueeze(1)
            similarity = (100.0 * cand_features @ txt_feature_expanded.transpose(-2, -1)).softmax(dim=1)
            similarity = similarity.squeeze()
            
            # Predict (query is at index 0)
            if similarity.argmax().item() == 0:
                correct += 1
                total_correct += 1
            total_trials += 1

        acc = correct / current_trials if current_trials > 0 else 0
        class_results[cls] = {'correct': correct, 'trials': current_trials, 'accuracy': acc}
        
        if total_trials % 500 == 0:
            print(f"[PROGRESS] {total_trials}/{max_trials} trials completed")

    # Final progress update
    print(f"[FINAL] Completed {total_trials} trials")
    
    overall_acc = total_correct / total_trials if total_trials else 0.0
    
    print(f"\n[RESULTS] Accuracy: {total_correct}/{total_trials} ({overall_acc:.1%})")
    
    # Save results
    summary_df = pd.DataFrame([{
        'Model': model_name,
        'Test': 'Class-TextVision-CVCLTraining',
        'Dataset': 'SyntheticKonkle_224',
        'Num_Classes': n_classes,
        'Correct': total_correct,
        'Trials': total_trials,
        'Accuracy': overall_acc
    }])
    
    os.makedirs(os.path.dirname(MASTER_CSV), exist_ok=True)
    if os.path.exists(MASTER_CSV):
        summary_df.to_csv(MASTER_CSV, mode='a', header=False, index=False, float_format='%.4f')
    else:
        summary_df.to_csv(MASTER_CSV, index=False, float_format='%.4f')

    return class_results, overall_acc

## CVCL Text-Vision Test (CVCL Training Classes Only)

In [7]:
# Run CVCL text-vision classification on its training classes
cvcl_results, cvcl_overall = run_cvcl_class_text_vision_test('cvcl-resnext', max_trials=4000)

print("\nCVCL Results per Class (Top 20):")
for cls, res in sorted(cvcl_results.items(), key=lambda x: x[1]['accuracy'], reverse=True)[:20]:
    print(f"{cls:20s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCVCL Overall Accuracy on Training Classes: {cvcl_overall:.1%}")

[INFO] Using GPU: NVIDIA GeForce RTX 4070 SUPER
[INFO] Loading cvcl-resnext on cuda...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`


[INFO] Model loaded in 2.4s
[DEBUG] Creating dataset with CVCL training classes only
[DEBUG] Loading CSV from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle\master_labels.csv
[DEBUG] Image directory: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
[DEBUG] CSV loaded with 7882 rows
[DEBUG] Filtered to 2699 rows for CVCL training classes
[DEBUG] Adding missing classes: {'ball'}
[DEBUG] Final dataset has 2832 rows
[DEBUG] Classes in dataset: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
[DEBUG] Found 2809 valid images
Dataset initialized with 2809 valid images from 24 CVCL classes
[INFO] Dataset: 2809 images from CVCL training classes
[INFO] Extracting embeddings (batch_size=16)...


Extracting embeddings: 100%|██████████| 176/176 [00:08<00:00, 21.41it/s]


[INFO] Extracted 2809 embeddings in 8.2s
[INFO] Encoding 24 CVCL training class labels...
[INFO] Text encoding complete
[INFO] Running 4000 total trials across 24 CVCL training classes
[INFO] Base trials per class: 166, extra trials for first 16 classes
[INFO] Using text-based prototypes (not visual averaging)
[PROGRESS] 4000/4000 trials completed
[FINAL] Completed 4000 trials

[RESULTS] Accuracy: 1198/4000 (29.9%)

CVCL Results per Class (Top 20):
ball                : 137/166 (82.5%)
butterfly           : 101/167 (60.5%)
phone               : 94/167 (56.3%)
ring                : 85/167 (50.9%)
basket              : 81/167 (48.5%)
apple               : 61/167 (36.5%)
bell                : 60/167 (35.9%)
pillow              : 51/167 (30.5%)
seashell            : 46/167 (27.5%)
horse               : 45/166 (27.1%)
tree                : 44/166 (26.5%)
toothpaste          : 42/166 (25.3%)
bird                : 40/167 (24.0%)
rabbit              : 39/166 (23.5%)
tricycle            : 38/16

## CLIP Text-Vision Test (CVCL Training Classes Only)

In [8]:
# Run CLIP text-vision classification on CVCL training classes
clip_results, clip_overall = run_cvcl_class_text_vision_test('clip-resnext', max_trials=4000)

print("\nCLIP Results per Class (Top 20):")
for cls, res in sorted(clip_results.items(), key=lambda x: x[1]['accuracy'], reverse=True)[:20]:
    print(f"{cls:20s}: {res['correct']}/{res['trials']} ({res['accuracy']:.1%})")
print(f"\nCLIP Overall Accuracy on CVCL Training Classes: {clip_overall:.1%}")

[INFO] Using GPU: NVIDIA GeForce RTX 4070 SUPER
[INFO] Loading clip-resnext on cuda...
[INFO] Model loaded in 1.3s
[DEBUG] Creating dataset with CVCL training classes only
[DEBUG] Loading CSV from: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle\master_labels.csv
[DEBUG] Image directory: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
[DEBUG] CSV loaded with 7882 rows
[DEBUG] Filtered to 2699 rows for CVCL training classes
[DEBUG] Adding missing classes: {'ball'}
[DEBUG] Final dataset has 2832 rows
[DEBUG] Classes in dataset: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
[DEBUG] Found 2809 valid images
Dataset initialized with 2809 valid images from 24 CVCL classes
[INFO] Dataset: 2809 images from CVCL training classes
[INFO] Extracting embeddings (batch_size=16).

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Extracting embeddings: 100%|██████████| 176/176 [00:06<00:00, 27.02it/s]


[INFO] Extracted 2809 embeddings in 6.5s
[INFO] Encoding 24 CVCL training class labels...
[INFO] Text encoding complete
[INFO] Running 4000 total trials across 24 CVCL training classes
[INFO] Base trials per class: 166, extra trials for first 16 classes
[INFO] Using text-based prototypes (not visual averaging)
[PROGRESS] 4000/4000 trials completed
[FINAL] Completed 4000 trials

[RESULTS] Accuracy: 3425/4000 (85.6%)

CLIP Results per Class (Top 20):
bird                : 167/167 (100.0%)
phone               : 167/167 (100.0%)
rabbit              : 166/166 (100.0%)
tree                : 166/166 (100.0%)
apple               : 165/167 (98.8%)
ball                : 164/166 (98.8%)
butterfly           : 163/167 (97.6%)
horse               : 162/166 (97.6%)
pumpkin             : 162/167 (97.0%)
pen                 : 161/167 (96.4%)
tricycle            : 158/166 (95.2%)
bell                : 154/167 (92.2%)
pillow              : 154/167 (92.2%)
ring                : 153/167 (91.6%)
bagel      

## Comparison Summary

In [9]:
# Detailed per-class comparison
print("\n" + "="*60)
print("DETAILED CLASS-BY-CLASS COMPARISON")
print("="*60)

comparison_data = []
for cls in CVCL_TRAINING_CLASSES:
    if cls in cvcl_results and cls in clip_results:
        cvcl_acc = cvcl_results[cls]['accuracy']
        clip_acc = clip_results[cls]['accuracy']
        diff = cvcl_acc - clip_acc
        
        comparison_data.append({
            'Class': cls,
            'CVCL_Accuracy': cvcl_acc,
            'CLIP_Accuracy': clip_acc,
            'Difference': diff,
            'Better_Model': 'CVCL' if diff > 0 else ('CLIP' if diff < 0 else 'Tie')
        })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('Difference', ascending=False)

print("\nClasses where CVCL text encoder performs better:")
cvcl_better = comparison_df[comparison_df['Difference'] > 0.05]
if len(cvcl_better) > 0:
    for _, row in cvcl_better.iterrows():
        print(f"  {row['Class']:20s}: CVCL {row['CVCL_Accuracy']:.1%} vs CLIP {row['CLIP_Accuracy']:.1%} (+{row['Difference']:.1%})")
else:
    print("  None")

print("\nClasses where CLIP text encoder performs better:")
clip_better = comparison_df[comparison_df['Difference'] < -0.05]
if len(clip_better) > 0:
    for _, row in clip_better.head(10).iterrows():  # Show top 10
        print(f"  {row['Class']:20s}: CLIP {row['CLIP_Accuracy']:.1%} vs CVCL {row['CVCL_Accuracy']:.1%} (+{abs(row['Difference']):.1%})")
else:
    print("  None")

print("\nSummary:")
print(f"  CVCL text encoder better on {len(cvcl_better)} classes")
print(f"  CLIP text encoder better on {len(clip_better)} classes")
print(f"  Similar performance on {len(comparison_df) - len(cvcl_better) - len(clip_better)} classes")


DETAILED CLASS-BY-CLASS COMPARISON

Classes where CVCL text encoder performs better:
  None

Classes where CLIP text encoder performs better:
  ball                : CLIP 98.8% vs CVCL 82.5% (+16.3%)
  cookie              : CLIP 49.1% vs CVCL 16.2% (+32.9%)
  toothpaste          : CLIP 60.2% vs CVCL 25.3% (+34.9%)
  butterfly           : CLIP 97.6% vs CVCL 60.5% (+37.1%)
  fan                 : CLIP 59.9% vs CVCL 19.2% (+40.7%)
  ring                : CLIP 91.6% vs CVCL 50.9% (+40.7%)
  bread               : CLIP 63.5% vs CVCL 21.6% (+41.9%)
  phone               : CLIP 100.0% vs CVCL 56.3% (+43.7%)
  seashell            : CLIP 80.2% vs CVCL 27.5% (+52.7%)
  bell                : CLIP 92.2% vs CVCL 35.9% (+56.3%)

Summary:
  CVCL text encoder better on 0 classes
  CLIP text encoder better on 23 classes
  Similar performance on 1 classes
