# DCDS Text-Vision Test - CVCL Training Classes Only\n\nDifferent Class Different Size\n\n**This version only tests on the 25 classes that appear in CVCL's training data.**

In [1]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import random
from datetime import datetime
import clip
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Path setup
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths
CVCL_CLASSES_PATH = os.path.join(REPO_ROOT, 'data', 'CVCL_Konkle_Overlap', 'CVCLKonkMatches.csv')
DATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
METADATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
RESULTS_PATH = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'cvcl_training_text_vision_results.csv')

print(f"Data path: {DATA_PATH}")
print(f"CVCL classes file: {CVCL_CLASSES_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

  from pkg_resources import packaging


Data path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
CVCL classes file: C:\Users\jbats\Projects\NTU-Synthetic\data\CVCL_Konkle_Overlap\CVCLKonkMatches.csv
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv


In [2]:
# Load CVCL training classes
cvcl_df = pd.read_csv(CVCL_CLASSES_PATH)
CVCL_TRAINING_CLASSES = cvcl_df['Class'].str.strip().tolist()

print(f"CVCL Training Classes ({len(CVCL_TRAINING_CLASSES)}):")
for cls in CVCL_TRAINING_CLASSES:
    print(f"  {cls}")

CVCL Training Classes (24):
  ball
  butterfly
  phone
  bagel
  basket
  bell
  fan
  seashell
  bird
  stool
  train
  ring
  tricycle
  toothpaste
  pen
  tree
  apple
  cookie
  bread
  pumpkin
  camera
  rabbit
  pillow
  horse


In [3]:
# Load and prepare data - FILTERED TO CVCL TRAINING CLASSES
def load_cvcl_synthetickonkle_data():
    """Load SyntheticKonkle dataset filtered to CVCL training classes"""
    # Read metadata
    df = pd.read_csv(METADATA_PATH)
    
    # Filter to only CVCL training classes
    df = df[df['class'].isin(CVCL_TRAINING_CLASSES)].copy()
    
    # Handle missing ball and bread
    missing_classes = set(CVCL_TRAINING_CLASSES) - set(df['class'].unique())
    if missing_classes:
        print(f"Adding missing classes from folders: {missing_classes}")
        for cls in missing_classes:
            folder = f"{cls}_color"
            folder_path = os.path.join(DATA_PATH, folder)
            if os.path.exists(folder_path):
                image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]
                for img_file in image_files:
                    # Parse filename to extract metadata
                    # Format: ball_large_bumpy_01_black.png
                    # Parts: [class, size, texture, variant, color]
                    parts = img_file.replace('.png', '').split('_')
                    if len(parts) >= 5:
                        new_row = {
                            'folder': folder,
                            'filename': img_file,
                            'class': cls,
                            'size': parts[1],  # This is the size (large/medium/small)
                            'texture': parts[2],  # This is the texture (bumpy/smooth)
                            'variant': parts[3],  # This is the variant (01/02)
                            'color': '_'.join(parts[4:])  # Rest is color
                        }
                        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    
    # Build full paths
    df['image_path'] = df.apply(lambda row: os.path.join(DATA_PATH, row['folder'], row['filename']), axis=1)
    
    # Filter to only entries with valid metadata
    df = df[df['color'].notna() & (df['color'] != '')].copy()
    df = df[df['size'].notna() & (df['size'] != '')].copy()
    df = df[df['texture'].notna() & (df['texture'] != '')].copy()
    
    # Standardize names (lowercase)
    df['color'] = df['color'].str.lower().str.strip()
    df['size'] = df['size'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()
    
    # Filter to only valid size values (remove any texture values that got mixed in)
    valid_sizes = ['small', 'medium', 'large']
    df = df[df['size'].isin(valid_sizes)].copy()
    
    print(f"Loaded {len(df)} images from {df['class'].nunique()} CVCL training classes")
    print(f"Classes: {sorted(df['class'].unique())}")
    print(f"Unique colors: {df['color'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()} - {sorted(df['size'].unique())}")
    print(f"Unique textures: {df['texture'].nunique()} - {sorted(df['texture'].unique())}")
    
    return df

# Load data
data_df = load_cvcl_synthetickonkle_data()
print("\nSample data:")
print(data_df[['class', 'color', 'size', 'texture']].head())

Adding missing classes from folders: {'ball'}
Loaded 2823 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 3 - ['large', 'medium', 'small']
Unique textures: 3 - ['bumpy', 'bumpy_01', 'smooth']

Sample data:
   class   color   size texture
0  apple     red  large   bumpy
1  apple   green  large   bumpy
2  apple    blue  large   bumpy
3  apple  yellow  large   bumpy
4  apple  orange  large   bumpy


In [5]:
def run_dcds_test(model_name='cvcl-resnext', seed=0, device=None, num_trials=4000):
    """Run DCDS text-vision test on CVCL training classes only
    
    Args:
        model_name: Model to test ('cvcl-resnext' or 'clip-resnext')
        seed: Random seed for reproducibility
        device: Device to use (None for auto-detect)
        num_trials: Total number of trials to run
    """
    # Set seeds
    random.seed(seed)
    torch.manual_seed(seed)
    
    print(f"\n{'='*60}")
    print(f"Running DCDS Text-Vision Test with {model_name}")
    print(f"CVCL Training Classes Only")
    print(f"(Different Class Different Size - Controlled Color/Texture)")
    print(f"{'='*60}")
    
    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    print(f"Using device: {device}")
    
    # Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    model.eval()
    
    # Load and prepare data (already filtered to CVCL training classes)
    df = load_cvcl_synthetickonkle_data()
    
    # Create combination columns
    df['color_texture'] = df['color'] + '_' + df['texture']
    df['class_size'] = df['class'] + '_' + df['size']
    df['full_combo'] = df['class'] + '_' + df['color'] + '_' + df['texture'] + '_' + df['size']
    
    print(f"Loaded {len(df)} images with size, color, and texture annotations")
    print(f"Unique classes: {df['class'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()}")
    print(f"Size values: {sorted(df['size'].unique())}")
    print(f"Unique color-texture combinations: {df['color_texture'].nunique()}")
    
    # Find color-texture combinations with at least 3 different class-size pairs
    ct_groups = df.groupby('color_texture')
    valid_ct = []
    for ct, group in ct_groups:
        unique_class_sizes = group['class_size'].unique()
        unique_classes = group['class'].unique()
        unique_sizes = group['size'].unique()
        # Need at least 3 different class-size combinations for 4-way choice
        # We'll duplicate one if needed
        if len(unique_class_sizes) >= 3 and len(unique_classes) >= 2 and len(unique_sizes) >= 2:
            valid_ct.append(ct)
    
    if len(valid_ct) == 0:
        print("ERROR: No color-texture combinations have enough class-size diversity.")
        print("Cannot run DCDS test with strict controls.")
        return [], 0.0
    
    print(f"\nFound {len(valid_ct)} CVCL training class color-texture combinations with sufficient diversity")
    
    # Pre-compute image embeddings
    print("\nExtracting image embeddings...")
    image_embeddings = {}
    skipped_images = []
    
    # Get all relevant images
    df_valid = df[df['color_texture'].isin(valid_ct)]
    all_image_paths = df_valid['image_path'].unique().tolist()
    batch_size = 16
    
    for i in tqdm(range(0, len(all_image_paths), batch_size), desc="Extracting embeddings"):
        batch_paths = all_image_paths[i:i+batch_size]
        batch_images = []
        
        for img_path in batch_paths:
            if img_path not in image_embeddings:  # Skip if already processed
                try:
                    img = Image.open(img_path).convert('RGB')
                    img_processed = transform(img).unsqueeze(0).to(device)
                    batch_images.append((img_path, img_processed))
                except Exception as e:
                    # Skip corrupted/invalid images
                    skipped_images.append(img_path)
                    continue
        
        if batch_images:
            paths = [p for p, _ in batch_images]
            imgs = torch.cat([img for _, img in batch_images], dim=0)
            
            with torch.no_grad():
                embeddings = extractor.get_img_feature(imgs)
                embeddings = extractor.norm_features(embeddings)
            
            for path, emb in zip(paths, embeddings):
                image_embeddings[path] = emb.cpu().float()
    
    print(f"Extracted embeddings for {len(image_embeddings)} images")
    if skipped_images:
        print(f"Skipped {len(skipped_images)} corrupted/invalid images")
    
    # Prepare for trials
    correct_count = 0
    trial_results = []
    
    # Calculate trials per color-texture combination
    trials_per_ct = num_trials // len(valid_ct)
    remaining_trials = num_trials % len(valid_ct)
    
    print(f"\nRunning {num_trials} trials across {len(valid_ct)} color-texture combinations...")
    
    # Run trials
    for ct_idx, ct in enumerate(tqdm(valid_ct, desc="Processing combinations")):
        # Get all images for this color-texture combination
        ct_data = df_valid[df_valid['color_texture'] == ct]
        
        # Group by class-size
        class_size_groups = ct_data.groupby('class_size').agg({
            'image_path': list,
            'class': 'first',
            'size': 'first'
        }).to_dict('index')
        
        available_class_sizes = list(class_size_groups.keys())
        
        if len(available_class_sizes) < 3:  # Changed from 4 to 3
            continue
        
        # Determine number of trials for this combination
        n_trials = trials_per_ct + (1 if ct_idx < remaining_trials else 0)
        
        for trial in range(n_trials):
            # Select class-size pairs for 4-way choice
            if len(available_class_sizes) == 3:
                # Use all 3 pairs plus duplicate one for 4-way choice
                selected_pairs = available_class_sizes.copy()
                # Add a duplicate to make 4 options
                selected_pairs.append(random.choice(available_class_sizes))
            elif len(available_class_sizes) >= 4:
                # Try to select 4 diverse pairs
                selected_pairs = []
                used_classes = set()
                used_sizes = set()
                
                # Shuffle to get random selection
                shuffled_pairs = available_class_sizes.copy()
                random.shuffle(shuffled_pairs)
                
                for pair in shuffled_pairs:
                    pair_class = class_size_groups[pair]['class']
                    pair_size = class_size_groups[pair]['size']
                    
                    # Try to get diverse classes and sizes
                    if len(selected_pairs) < 4:
                        if pair_class not in used_classes or pair_size not in used_sizes or len(selected_pairs) < 4:
                            selected_pairs.append(pair)
                            used_classes.add(pair_class)
                            used_sizes.add(pair_size)
                
                if len(selected_pairs) < 4:
                    # If we can't get 4 diverse pairs, just take any 4
                    selected_pairs = random.sample(available_class_sizes, 4)
            else:
                continue
            
            # First pair is the query
            query_pair = selected_pairs[0]
            query_data = class_size_groups[query_pair]
            
            # Select random query image from valid images
            valid_query_paths = [p for p in query_data['image_path'] if p in image_embeddings]
            if not valid_query_paths:
                continue
            query_img_path = random.choice(valid_query_paths)
            query_class = query_data['class']
            query_size = query_data['size']
            
            # Shuffle for candidate order
            random.shuffle(selected_pairs)
            correct_idx = selected_pairs.index(query_pair)
            
            # Create text prompts - ONLY size + class, no color/texture
            candidate_texts = []
            for pair in selected_pairs:
                pair_data = class_size_groups[pair]
                text_prompt = f"{pair_data['size']} {pair_data['class'].lower()}"
                candidate_texts.append(text_prompt)
            
            # Encode text prompts
            with torch.no_grad():
                if "clip" in model_name:
                    tokens = clip.tokenize(candidate_texts, truncate=True).to(device)
                    txt_features = model.encode_text(tokens)
                    txt_features = extractor.norm_features(txt_features)
                else:  # CVCL
                    tokens, token_len = model.tokenize(candidate_texts)
                    tokens = tokens.to(device)
                    if isinstance(token_len, torch.Tensor):
                        token_len = token_len.to(device)
                    txt_features = model.encode_text(tokens, token_len)
                    txt_features = extractor.norm_features(txt_features)
            
            # Get query image embedding
            query_embedding = image_embeddings[query_img_path].unsqueeze(0).to(device)
            
            # Calculate similarity
            query_embedding = query_embedding.float()
            txt_features = txt_features.float()
            
            similarity = (100.0 * query_embedding @ txt_features.transpose(-2, -1)).softmax(dim=1)
            
            # Get prediction
            pred_idx = similarity.argmax(dim=1).item()
            
            # Check if correct
            is_correct = (pred_idx == correct_idx)
            if is_correct:
                correct_count += 1
            
            # Store trial result
            trial_results.append({
                'trial': len(trial_results) + 1,
                'query_class': query_class,
                'query_size': query_size,
                'color_texture': ct,
                'query_img': os.path.basename(query_img_path),
                'correct_idx': correct_idx,
                'predicted_idx': pred_idx,
                'correct': is_correct,
                'candidate_texts': candidate_texts,
                'similarity_scores': similarity.cpu().numpy().tolist()
            })
    
    # Calculate accuracy
    accuracy = correct_count / len(trial_results) if trial_results else 0
    
    print(f"\n{'='*60}")
    print(f"Results for {model_name} - DCDS Text-Vision Test:")
    print(f"Total trials: {len(trial_results)}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'='*60}")
    
    # Save results
    results_row = {
        'Model': model_name,
        'Test': 'DCDS-TextVision-CVCLTraining',
        'Dataset': 'SyntheticKonkle_224',
        'Correct': correct_count,
        'Trials': len(trial_results),
        'Accuracy': accuracy
    }
    
    # Append to results file
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    if os.path.exists(RESULTS_PATH):
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame()
    
    results_df = pd.concat([results_df, pd.DataFrame([results_row])], ignore_index=True)
    results_df.to_csv(RESULTS_PATH, index=False, float_format='%.4f')
    print(f"\nResults saved to {RESULTS_PATH}")
    
    return trial_results, accuracy

## Run CVCL DCDS Text-Vision Test

In [6]:
# Run CVCL test
cvcl_trials, cvcl_accuracy = run_dcds_test('cvcl-resnext', seed=0, num_trials=4000)


Running DCDS Text-Vision Test with cvcl-resnext
CVCL Training Classes Only
(Different Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading cvcl-resnext on cuda...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`


Adding missing classes from folders: {'ball'}
Loaded 2823 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 3 - ['large', 'medium', 'small']
Unique textures: 3 - ['bumpy', 'bumpy_01', 'smooth']
Loaded 2823 images with size, color, and texture annotations
Unique classes: 24
Unique sizes: 3
Size values: ['large', 'medium', 'small']
Unique color-texture combinations: 29

Found 22 CVCL training class color-texture combinations with sufficient diversity

Extracting image embeddings...


Extracting embeddings: 100%|██████████| 176/176 [00:08<00:00, 21.74it/s]


Extracted embeddings for 2799 images
Skipped 14 corrupted/invalid images

Running 4000 trials across 22 color-texture combinations...


Processing combinations: 100%|██████████| 22/22 [00:35<00:00,  1.61s/it]


Results for cvcl-resnext - DCDS Text-Vision Test:
Total trials: 4000
Correct: 1233
Accuracy: 0.3083 (30.83%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv





## Run CLIP DCDS Text-Vision Test

In [7]:
# Run CLIP test
clip_trials, clip_accuracy = run_dcds_test('clip-resnext', seed=0, num_trials=4000)


Running DCDS Text-Vision Test with clip-resnext
CVCL Training Classes Only
(Different Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading clip-resnext on cuda...
Adding missing classes from folders: {'ball'}
Loaded 2823 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 3 - ['large', 'medium', 'small']
Unique textures: 3 - ['bumpy', 'bumpy_01', 'smooth']
Loaded 2823 images with size, color, and texture annotations
Unique classes: 24
Unique sizes: 3
Size values: ['large', 'medium', 'small']
Unique color-texture combinations: 29

Found 22 CVCL training class color-texture combinations with sufficient diversity

Extracting image embeddings...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Extracting embeddings: 100%|██████████| 176/176 [00:06<00:00, 26.89it/s]


Extracted embeddings for 2799 images
Skipped 14 corrupted/invalid images

Running 4000 trials across 22 color-texture combinations...


Processing combinations: 100%|██████████| 22/22 [00:20<00:00,  1.07it/s]


Results for clip-resnext - DCDS Text-Vision Test:
Total trials: 4000
Correct: 3087
Accuracy: 0.7718 (77.18%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv





## Compare Results

In [8]:
# Display comparison
print("\n" + "="*60)
print("DCDS TEXT-VISION TEST COMPARISON - CVCL TRAINING CLASSES")
print("="*60)
print(f"\nResults:")
print(f"  CVCL Accuracy: {cvcl_accuracy:.4f} ({cvcl_accuracy*100:.2f}%)")
print(f"  CLIP Accuracy: {clip_accuracy:.4f} ({clip_accuracy*100:.2f}%)")
print(f"\nDifference: {abs(cvcl_accuracy - clip_accuracy):.4f} ({abs(cvcl_accuracy - clip_accuracy)*100:.2f}%)")
if cvcl_accuracy > clip_accuracy:
    print(f"CVCL performs better by {(cvcl_accuracy - clip_accuracy)*100:.2f}% on its training classes")
elif clip_accuracy > cvcl_accuracy:
    print(f"CLIP performs better by {(clip_accuracy - cvcl_accuracy)*100:.2f}% even on CVCL's training classes")
else:
    print("Both models perform equally")


DCDS TEXT-VISION TEST COMPARISON - CVCL TRAINING CLASSES

Results:
  CVCL Accuracy: 0.3083 (30.83%)
  CLIP Accuracy: 0.7718 (77.18%)

Difference: 0.4635 (46.35%)
CLIP performs better by 46.35% even on CVCL's training classes
