# SCDCST Text-Vision Test - CVCL Training Classes Only\n\nSame Class Different Color, Size and Texture\n\n**This version only tests on the 24 classes that appear in CVCL's training data.**

In [2]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import random
from datetime import datetime
import clip
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Path setup
# The repository root defaults to the original checkout location. Set the
# NTU_SYNTHETIC_ROOT environment variable to run the notebook from another
# machine or directory without editing this cell.
REPO_ROOT = os.environ.get('NTU_SYNTHETIC_ROOT', r'C:\Users\jbats\Projects\NTU-Synthetic')

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
# (these imports only resolve once the sys.path entries above are in place)
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths (all derived from REPO_ROOT)
CVCL_CLASSES_PATH = os.path.join(REPO_ROOT, 'data', 'CVCL_Konkle_Overlap', 'CVCLKonkMatches.csv')
DATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
METADATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
RESULTS_PATH = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'cvcl_training_text_vision_results.csv')

print(f"Data path: {DATA_PATH}")
print(f"CVCL classes file: {CVCL_CLASSES_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

Data path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
CVCL classes file: C:\Users\jbats\Projects\NTU-Synthetic\data\CVCL_Konkle_Overlap\CVCLKonkMatches.csv
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv


In [3]:
# Read the class list from the CSV at CVCL_CLASSES_PATH; names come from its
# 'Class' column with surrounding whitespace removed.
cvcl_df = pd.read_csv(CVCL_CLASSES_PATH)
class_column = cvcl_df['Class']
CVCL_TRAINING_CLASSES = class_column.str.strip().tolist()

# Echo the classes so the notebook output records exactly what was tested
n_training_classes = len(CVCL_TRAINING_CLASSES)
print(f"CVCL Training Classes ({n_training_classes}):")
for training_class in CVCL_TRAINING_CLASSES:
    print(f"  {training_class}")

CVCL Training Classes (24):
  ball
  butterfly
  phone
  bagel
  basket
  bell
  fan
  seashell
  bird
  stool
  train
  ring
  tricycle
  toothpaste
  pen
  tree
  apple
  cookie
  bread
  pumpkin
  camera
  rabbit
  pillow
  horse


In [4]:
# Load and prepare data - FILTERED TO CVCL TRAINING CLASSES
def load_cvcl_synthetickonkle_data():
    """Load SyntheticKonkle metadata restricted to the CVCL training classes.

    Reads ``METADATA_PATH`` and keeps the rows whose ``class`` is listed in
    ``CVCL_TRAINING_CLASSES``. Any training class with no metadata rows is
    recovered from the ``<class>_color`` folder under ``DATA_PATH`` (only that
    folder is searched) by parsing the ``.png`` filenames found there:
    underscore-separated fields 1, 2 and 3 become size, texture and variant,
    and fields 4 onward are re-joined with ``_`` as the color. Files with
    fewer than five fields are ignored.

    Rows with a missing or empty color, size or texture are dropped, and those
    three columns are lower-cased and stripped.

    Returns:
        pandas.DataFrame: the metadata columns plus ``image_path``
        (``DATA_PATH/folder/filename``).

    Side effects:
        Prints which classes were recovered from folders and a short summary
        of the loaded data.
    """
    # Read metadata
    df = pd.read_csv(METADATA_PATH)

    # Filter to only CVCL training classes
    df = df[df['class'].isin(CVCL_TRAINING_CLASSES)].copy()

    # Recover training classes that have no rows in the metadata file
    missing_classes = set(CVCL_TRAINING_CLASSES) - set(df['class'].unique())
    if missing_classes:
        print(f"Adding missing classes from folders: {missing_classes}")
        recovered_rows = []
        # Sorted iteration over classes and files keeps the row order (and so
        # any seeded sampling done on this frame) independent of set-hash and
        # directory-listing order.
        for cls in sorted(missing_classes, key=str):
            folder = f"{cls}_color"
            folder_path = os.path.join(DATA_PATH, folder)
            if not os.path.isdir(folder_path):
                continue
            for img_file in sorted(os.listdir(folder_path)):
                if not img_file.endswith('.png'):
                    continue
                # Parse filename to extract metadata
                parts = img_file.replace('.png', '').split('_')
                if len(parts) < 5:
                    continue
                recovered_rows.append({
                    'folder': folder,
                    'filename': img_file,
                    'class': cls,
                    'color': '_'.join(parts[4:]),
                    'size': parts[1],
                    'texture': parts[2],
                    'variant': parts[3]
                })
        if recovered_rows:
            # Single concat: appending one row at a time re-copies the frame per file
            df = pd.concat([df, pd.DataFrame(recovered_rows)], ignore_index=True)

    # Build full paths (plain list build also works when the frame is empty)
    df['image_path'] = [
        os.path.join(DATA_PATH, folder, filename)
        for folder, filename in zip(df['folder'], df['filename'])
    ]

    # Filter to only entries with valid metadata
    df = df[df['color'].notna() & (df['color'] != '')].copy()
    df = df[df['size'].notna() & (df['size'] != '')].copy()
    df = df[df['texture'].notna() & (df['texture'] != '')].copy()

    # Standardize names (lowercase)
    df['color'] = df['color'].str.lower().str.strip()
    df['size'] = df['size'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()

    print(f"Loaded {len(df)} images from {df['class'].nunique()} CVCL training classes")
    print(f"Classes: {sorted(df['class'].unique())}")
    print(f"Unique colors: {df['color'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()}")
    print(f"Unique textures: {df['texture'].nunique()}")

    return df

# Build the filtered dataset at notebook level and show its first rows
data_df = load_cvcl_synthetickonkle_data()
preview_columns = ['class', 'color', 'size', 'texture']
print("\nSample data:")
print(data_df[preview_columns].head())

Adding missing classes from folders: {'ball'}
Loaded 2832 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 4
Unique textures: 4

Sample data:
   class   color   size texture
0  apple     red  large   bumpy
1  apple   green  large   bumpy
2  apple    blue  large   bumpy
3  apple  yellow  large   bumpy
4  apple  orange  large   bumpy


In [5]:
def run_scdcst_test(model_name='cvcl-resnext', seed=0, device=None, num_trials=4000):
    """Run SCDCST text-vision test on CVCL training classes only

    Same Class Different Color, Size and Texture test. Each trial pairs one
    query image with four text prompts that describe the same class under
    different color/size/texture combinations; the trial is correct when the
    prompt most similar to the image describes the image's own combination.

    Text format uses natural English ordering: "{size} {color} {texture} {class}"
    Example: "large red smooth apple", "small blue bumpy apple", "medium green smooth apple"

    Args:
        model_name: Name passed to ``load_model``. Names containing "clip" are
            tokenized with ``clip.tokenize``; every other name uses the
            model's own ``tokenize`` / ``encode_text(tokens, token_len)``.
        seed: Seed for Python's ``random`` and for ``torch``; also forwarded
            to ``load_model``.
        device: Torch device string. Defaults to 'cuda' when available,
            otherwise 'cpu'.
        num_trials: Target number of trials, split as evenly as possible
            across the qualifying classes. Trials whose query combination has
            no readable image are skipped, so fewer trials may be run.

    Returns:
        float: Fraction of executed trials answered correctly. 0.0 when no
        class qualifies (nothing is written in that case) or no trial ran.

    Side effects:
        Prints progress and appends one summary row to the CSV at
        ``RESULTS_PATH``.
    """
    # Set seeds
    random.seed(seed)
    torch.manual_seed(seed)

    print(f"\n{'='*60}")
    print(f"Running SCDCST Text-Vision Test with {model_name}")
    print(f"CVCL Training Classes Only")
    print(f"(Same Class Different Color, Size & Texture)")
    print(f"Text format: {{size}} {{color}} {{texture}} {{class}} (natural English order)")
    print(f"{'='*60}")

    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print(f"Using device: {device}")

    # Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    model.eval()

    # Load filtered data - CVCL training classes only
    df = load_cvcl_synthetickonkle_data()

    # Filter to only valid size and texture values
    valid_sizes = ['small', 'medium', 'large']
    valid_textures = ['smooth', 'bumpy']
    df = df[df['size'].isin(valid_sizes) & df['texture'].isin(valid_textures)].copy()

    # Create combination column
    df['color_size_texture'] = df['color'] + '_' + df['size'] + '_' + df['texture']

    # Find classes with at least 3 different color-size-texture combinations
    valid_classes = [
        class_name
        for class_name, group in df.groupby('class')
        if len(group['color_size_texture'].unique()) >= 3
    ]

    if len(valid_classes) == 0:
        print("ERROR: No CVCL training classes have 3+ different color-size-texture combinations.")
        return 0.0

    print(f"\nFound {len(valid_classes)} CVCL training classes with 3+ color-size-texture combinations")

    # Pre-compute image embeddings
    print("\nExtracting image embeddings...")
    image_embeddings = {}
    skipped_images = []

    # Get all relevant images
    df_valid = df[df['class'].isin(valid_classes)]
    all_image_paths = df_valid['image_path'].unique().tolist()
    batch_size = 16

    for i in tqdm(range(0, len(all_image_paths), batch_size), desc="Extracting embeddings"):
        batch_paths = all_image_paths[i:i+batch_size]
        batch_images = []

        for img_path in batch_paths:
            try:
                # The context manager releases the file handle as soon as the
                # pixels have been converted, instead of waiting for GC.
                with Image.open(img_path) as img:
                    img_processed = transform(img.convert('RGB')).unsqueeze(0).to(device)
                batch_images.append((img_path, img_processed))
            except Exception:
                # Best effort: unreadable images are recorded and left out of the trials
                skipped_images.append(img_path)
                continue

        if batch_images:
            paths = [p for p, _ in batch_images]
            imgs = torch.cat([img for _, img in batch_images], dim=0)

            with torch.no_grad():
                embeddings = extractor.get_img_feature(imgs)
                embeddings = extractor.norm_features(embeddings)

            for path, emb in zip(paths, embeddings):
                image_embeddings[path] = emb.cpu().float()

    print(f"Extracted embeddings for {len(image_embeddings)} images")
    if skipped_images:
        print(f"Skipped {len(skipped_images)} corrupted/invalid images")

    # Prepare for trials
    correct_count = 0
    trial_results = []

    # Calculate trials per class (valid_classes is non-empty here); the first
    # `remaining_trials` classes each take one extra trial.
    trials_per_class, remaining_trials = divmod(num_trials, len(valid_classes))

    print(f"\nRunning {num_trials} trials across {len(valid_classes)} classes...")

    # Run trials
    for class_idx, class_name in enumerate(tqdm(valid_classes, desc="Processing classes")):
        # Get all images for this class
        class_data = df_valid[df_valid['class'] == class_name]

        # Group by color-size-texture
        cst_groups = class_data.groupby('color_size_texture').agg({
            'image_path': list,
            'color': 'first',
            'size': 'first',
            'texture': 'first'
        }).to_dict('index')

        available_cst = list(cst_groups.keys())

        if len(available_cst) < 3:
            continue

        # Determine number of trials for this class
        n_trials = trials_per_class + (1 if class_idx < remaining_trials else 0)

        for _ in range(n_trials):
            if len(trial_results) >= num_trials:
                break

            # Select color-size-texture combinations for 4-way choice
            if len(available_cst) == 3:
                # Only three combinations exist: use all of them in random
                # order (so the query picked below is not always the same
                # combination) and repeat one to fill the fourth slot.
                selected_cst = random.sample(available_cst, 3)
                selected_cst.append(random.choice(available_cst))
            else:
                # Four or more combinations: pick 4 distinct ones
                selected_cst = random.sample(available_cst, 4)

            # First combination is the query
            query_cst = selected_cst[0]
            query_data = cst_groups[query_cst]

            # Select random query image from valid images
            valid_query_paths = [p for p in query_data['image_path'] if p in image_embeddings]
            if not valid_query_paths:
                continue
            query_img_path = random.choice(valid_query_paths)
            query_color = query_data['color']
            query_size = query_data['size']
            query_texture = query_data['texture']

            # Shuffle for candidate order
            random.shuffle(selected_cst)
            correct_idx = selected_cst.index(query_cst)

            # Create text prompts - NATURAL ENGLISH ORDER: {size} {color} {texture} {class}
            candidate_texts = []
            for cst in selected_cst:
                cst_data = cst_groups[cst]
                # Natural English order: size → color → texture → noun
                text_prompt = f"{cst_data['size']} {cst_data['color']} {cst_data['texture']} {class_name.lower()}"
                candidate_texts.append(text_prompt)

            # Encode text prompts
            with torch.no_grad():
                if "clip" in model_name:
                    tokens = clip.tokenize(candidate_texts, truncate=True).to(device)
                    txt_features = model.encode_text(tokens)
                    txt_features = extractor.norm_features(txt_features)
                else:  # CVCL
                    tokens, token_len = model.tokenize(candidate_texts)
                    tokens = tokens.to(device)
                    if isinstance(token_len, torch.Tensor):
                        token_len = token_len.to(device)
                    txt_features = model.encode_text(tokens, token_len)
                    txt_features = extractor.norm_features(txt_features)

            # Get query image embedding
            query_embedding = image_embeddings[query_img_path].unsqueeze(0).to(device)

            # Calculate similarity
            query_embedding = query_embedding.float()
            txt_features = txt_features.float()

            similarity = (100.0 * query_embedding @ txt_features.transpose(-2, -1)).softmax(dim=1)

            # Get prediction
            pred_idx = similarity.argmax(dim=1).item()

            # Score by combination rather than by slot: when a combination was
            # duplicated to fill the fourth slot, either copy of the query's
            # prompt is a correct answer.
            is_correct = (selected_cst[pred_idx] == query_cst)
            if is_correct:
                correct_count += 1

            # Store trial result
            trial_results.append({
                'trial': len(trial_results) + 1,
                'query_class': class_name,
                'query_color': query_color,
                'query_size': query_size,
                'query_texture': query_texture,
                'correct_idx': correct_idx,
                'predicted_idx': pred_idx,
                'correct': is_correct
            })

    # Calculate accuracy (float in every case, matching the early-return value)
    accuracy = correct_count / len(trial_results) if trial_results else 0.0

    print(f"\n{'='*60}")
    print(f"Results for {model_name} - SCDCST Text-Vision Test:")
    print(f"Total trials: {len(trial_results)}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'='*60}")

    # Save results
    results_row = {
        'Model': model_name,
        'Test': 'SCDCST-TextVision-CVCLTraining',
        'Dataset': 'SyntheticKonkle_224',
        'Correct': correct_count,
        'Trials': len(trial_results),
        'Accuracy': accuracy
    }

    # Append to results file (each call adds one row; earlier rows are kept)
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    if os.path.exists(RESULTS_PATH):
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame()

    results_df = pd.concat([results_df, pd.DataFrame([results_row])], ignore_index=True)
    results_df.to_csv(RESULTS_PATH, index=False, float_format='%.4f')
    print(f"\nResults saved to {RESULTS_PATH}")

    return accuracy

## Run CVCL SCDCST Text-Vision Test

In [6]:
# Evaluate the CVCL model on the SCDCST text-vision task
cvcl_accuracy = run_scdcst_test(model_name='cvcl-resnext', seed=0, num_trials=4000)


Running SCDCST Text-Vision Test with cvcl-resnext
CVCL Training Classes Only
(Same Class Different Color, Size & Texture)
Text format: {size} {color} {texture} {class} (natural English order)
Using device: cuda
[INFO] Loading cvcl-resnext on cuda...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`


Adding missing classes from folders: {'ball'}
Loaded 2832 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 4
Unique textures: 4

Found 24 CVCL training classes with 3+ color-size-texture combinations

Extracting image embeddings...


Extracting embeddings: 100%|██████████| 176/176 [00:08<00:00, 21.42it/s]


Extracted embeddings for 2800 images
Skipped 14 corrupted/invalid images

Running 4000 trials across 24 classes...


Processing classes: 100%|██████████| 24/24 [00:36<00:00,  1.53s/it]


Results for cvcl-resnext - SCDCST Text-Vision Test:
Total trials: 4000
Correct: 1162
Accuracy: 0.2905 (29.05%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv





## Run CLIP SCDCST Text-Vision Test

In [7]:
# Evaluate the CLIP model on the same task with the same seed and trial count
clip_accuracy = run_scdcst_test(model_name='clip-resnext', seed=0, num_trials=4000)


Running SCDCST Text-Vision Test with clip-resnext
CVCL Training Classes Only
(Same Class Different Color, Size & Texture)
Text format: {size} {color} {texture} {class} (natural English order)
Using device: cuda
[INFO] Loading clip-resnext on cuda...
Adding missing classes from folders: {'ball'}
Loaded 2832 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 4
Unique textures: 4

Found 24 CVCL training classes with 3+ color-size-texture combinations

Extracting image embeddings...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Extracting embeddings: 100%|██████████| 176/176 [00:06<00:00, 27.80it/s]


Extracted embeddings for 2800 images
Skipped 14 corrupted/invalid images

Running 4000 trials across 24 classes...


Processing classes: 100%|██████████| 24/24 [00:19<00:00,  1.22it/s]


Results for clip-resnext - SCDCST Text-Vision Test:
Total trials: 4000
Correct: 3599
Accuracy: 0.8998 (89.98%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv





## Compare Results

In [8]:
# Print both accuracies side by side, then say which model scored higher
banner = "=" * 60
print("\n" + banner)
print("SCDCST TEXT-VISION TEST COMPARISON - CVCL TRAINING CLASSES")
print(banner)
print("\nResults:")
print(f"  CVCL Accuracy: {cvcl_accuracy:.4f} ({cvcl_accuracy*100:.2f}%)")
print(f"  CLIP Accuracy: {clip_accuracy:.4f} ({clip_accuracy*100:.2f}%)")

# Absolute gap between the two accuracies, computed once and reused below
accuracy_gap = abs(cvcl_accuracy - clip_accuracy)
print(f"\nDifference: {accuracy_gap:.4f} ({accuracy_gap*100:.2f}%)")
if cvcl_accuracy > clip_accuracy:
    print(f"CVCL performs better by {accuracy_gap*100:.2f}% on its training classes")
elif clip_accuracy > cvcl_accuracy:
    print(f"CLIP performs better by {accuracy_gap*100:.2f}% even on CVCL's training classes")
else:
    print("Both models perform equally")


SCDCST TEXT-VISION TEST COMPARISON - CVCL TRAINING CLASSES

Results:
  CVCL Accuracy: 0.2905 (29.05%)
  CLIP Accuracy: 0.8998 (89.98%)

Difference: 0.6093 (60.93%)
CLIP performs better by 60.93% even on CVCL's training classes
