# SCDT Text-Vision Test - CVCL Training Classes Only

Same Class Different Texture

**This version only tests on the 24 classes that appear in CVCL's training data.**

In [1]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import random
from datetime import datetime
import clip
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Path setup
# The root defaults to the original author's checkout. Set the
# NTU_SYNTHETIC_ROOT environment variable to run this notebook against a
# checkout located somewhere else without editing the code.
REPO_ROOT = os.environ.get('NTU_SYNTHETIC_ROOT', r'C:\Users\jbats\Projects\NTU-Synthetic')

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
# (the path entries above must be registered before these imports run)
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths
# Class list, image folders, per-image metadata, and the results CSV that
# run_scdt_test appends to.
CVCL_CLASSES_PATH = os.path.join(REPO_ROOT, 'data', 'CVCL_Konkle_Overlap', 'CVCLKonkMatches.csv')
DATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
METADATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
RESULTS_PATH = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'cvcl_training_text_vision_results.csv')

print(f"Data path: {DATA_PATH}")
print(f"CVCL classes file: {CVCL_CLASSES_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

  from pkg_resources import packaging


Data path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
CVCL classes file: C:\Users\jbats\Projects\NTU-Synthetic\data\CVCL_Konkle_Overlap\CVCLKonkMatches.csv
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\cvcl_training_text_vision_results.csv


In [2]:
# Load CVCL training classes
# Read the overlap table and keep its 'Class' column, whitespace-trimmed,
# as a plain Python list for the membership tests further down.
cvcl_df = pd.read_csv(CVCL_CLASSES_PATH)
CVCL_TRAINING_CLASSES = list(cvcl_df['Class'].str.strip())

# Echo the class list, one name per line, under a header with the count.
print(f"CVCL Training Classes ({len(CVCL_TRAINING_CLASSES)}):")
for cls in CVCL_TRAINING_CLASSES:
    print(f"  {cls}")

CVCL Training Classes (24):
  ball
  butterfly
  phone
  bagel
  basket
  bell
  fan
  seashell
  bird
  stool
  train
  ring
  tricycle
  toothpaste
  pen
  tree
  apple
  cookie
  bread
  pumpkin
  camera
  rabbit
  pillow
  horse


In [3]:
# Load and prepare data - FILTERED TO CVCL TRAINING CLASSES
def load_cvcl_synthetickonkle_data():
    """Load SyntheticKonkle metadata filtered to CVCL training classes.

    Reads ``METADATA_PATH``, keeps rows whose ``class`` is listed in
    ``CVCL_TRAINING_CLASSES``, and, for any listed class that has no metadata
    rows, recovers rows by parsing the PNG filenames found in
    ``DATA_PATH/<class>_color``. The ``color``, ``size`` and ``texture``
    columns are lower-cased and stripped; rows with missing or blank values,
    or with a texture/size outside the accepted vocabularies, are dropped.

    Returns:
        pandas.DataFrame with the metadata columns plus ``image_path`` (the
        joined ``DATA_PATH``/folder/filename path). A summary is printed.
    """
    # Read metadata
    df = pd.read_csv(METADATA_PATH)

    # Filter to only CVCL training classes
    df = df[df['class'].isin(CVCL_TRAINING_CLASSES)].copy()

    # Classes without metadata rows are rebuilt from their image folders
    missing_classes = set(CVCL_TRAINING_CLASSES) - set(df['class'].unique())
    if missing_classes:
        print(f"Adding missing classes from folders: {missing_classes}")
        recovered_rows = []
        # Sorted iteration keeps the row order (and therefore any seeded
        # sampling done on the result) stable across runs and platforms;
        # os.listdir returns entries in arbitrary order.
        for cls in sorted(missing_classes):
            folder = f"{cls}_color"
            folder_path = os.path.join(DATA_PATH, folder)
            if not os.path.isdir(folder_path):
                print(f"  Warning: no image folder for class '{cls}' at {folder_path}")
                continue
            for img_file in sorted(os.listdir(folder_path)):
                stem, ext = os.path.splitext(img_file)
                if ext.lower() != '.png':
                    continue
                # Parse filename to extract metadata
                # Format: ball_large_bumpy_01_black.png
                # Parts: [class, size, texture, variant, color]
                parts = stem.split('_')
                if len(parts) < 5:
                    continue
                recovered_rows.append({
                    'folder': folder,
                    'filename': img_file,
                    'class': cls,
                    'size': parts[1],  # large/medium/small
                    'texture': parts[2],  # bumpy/smooth
                    'variant': parts[3],  # 01/02
                    'color': '_'.join(parts[4:]),  # rest is color (may contain '_')
                })
        # One concat for all recovered rows instead of one per image
        if recovered_rows:
            df = pd.concat([df, pd.DataFrame(recovered_rows)], ignore_index=True)

    # Build full paths (a plain list also works when the frame is empty)
    df['image_path'] = [
        os.path.join(DATA_PATH, folder, filename)
        for folder, filename in zip(df['folder'], df['filename'])
    ]

    # Normalise the attribute columns first, then drop blanks, so that
    # whitespace-only values are rejected as well as empty ones.
    attribute_cols = ['color', 'size', 'texture']
    df = df.dropna(subset=attribute_cols).copy()
    for col in attribute_cols:
        df[col] = df[col].astype(str).str.lower().str.strip()
    df = df[(df[attribute_cols] != '').all(axis=1)].copy()

    # Filter to only valid texture and size values
    valid_textures = ['smooth', 'bumpy']
    valid_sizes = ['small', 'medium', 'large']
    df = df[df['texture'].isin(valid_textures) & df['size'].isin(valid_sizes)].copy()

    print(f"Loaded {len(df)} images from {df['class'].nunique()} CVCL training classes")
    print(f"Classes: {sorted(df['class'].unique())}")
    print(f"Unique colors: {df['color'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()} - {sorted(df['size'].unique())}")
    print(f"Unique textures: {df['texture'].nunique()} - {sorted(df['texture'].unique())}")

    return df

# Build the filtered metadata table once at notebook level and preview
# the first rows of its attribute columns.
data_df = load_cvcl_synthetickonkle_data()
print("\nSample data:")
print(data_df.loc[:, ['class', 'color', 'size', 'texture']].head())

Adding missing classes from folders: {'ball'}
Loaded 2817 images from 24 CVCL training classes
Classes: ['apple', 'bagel', 'ball', 'basket', 'bell', 'bird', 'bread', 'butterfly', 'camera', 'cookie', 'fan', 'horse', 'pen', 'phone', 'pillow', 'pumpkin', 'rabbit', 'ring', 'seashell', 'stool', 'toothpaste', 'train', 'tree', 'tricycle']
Unique colors: 12
Unique sizes: 3 - ['large', 'medium', 'small']
Unique textures: 2 - ['bumpy', 'smooth']

Sample data:
   class   color   size texture
0  apple     red  large   bumpy
1  apple   green  large   bumpy
2  apple    blue  large   bumpy
3  apple  yellow  large   bumpy
4  apple  orange  large   bumpy


In [None]:
def _encode_text_prompts(model_name, model, extractor, texts, device):
    """Encode a list of text prompts into normalized text features.

    Uses ``clip.tokenize`` when "clip" is in ``model_name``; otherwise uses the
    model's own ``tokenize`` method (the CVCL path), which also returns token
    lengths. Returns the output of ``extractor.norm_features``, one row per
    prompt.
    """
    with torch.no_grad():
        if "clip" in model_name:
            tokens = clip.tokenize(texts, truncate=True).to(device)
            txt_features = model.encode_text(tokens)
        else:  # CVCL
            tokens, token_len = model.tokenize(texts)
            tokens = tokens.to(device)
            if isinstance(token_len, torch.Tensor):
                token_len = token_len.to(device)
            txt_features = model.encode_text(tokens, token_len)
        return extractor.norm_features(txt_features)


def run_scdt_test(model_name='cvcl-resnext', seed=0, device=None, num_trials=4000):
    """Run SCDT text-vision test on CVCL training classes only
    Same Class Different Texture - allows different colors/sizes for variety

    Test design:
    - Four images of the SAME CLASS are drawn with a 3-1 texture split
      (3 of one texture, 1 of the other); colors and sizes can vary
    - One of the four is picked as the query image
    - Text format: "{texture} {class}" (e.g., "smooth apple", "bumpy apple")
    - The query image embedding is compared with the four candidate prompts

    Scoring: only two distinct prompt strings exist per trial (three of the
    four candidates share one), so identical prompts receive identical scores
    and ``argmax`` reports the first of the tied slots. A trial therefore
    counts as correct when the predicted slot's prompt equals the query's
    prompt, not when the slot indices happen to coincide.

    Side effects: prints progress and appends one summary row to the CSV at
    ``RESULTS_PATH``.

    Args:
        model_name: Model to test ('cvcl-resnext' or 'clip-resnext')
        seed: Random seed for reproducibility
        device: Device to use (None for auto-detect)
        num_trials: Total number of trials to run

    Returns:
        (trial_results, accuracy): a list of per-trial dicts and the fraction
        of correct trials; ([], 0.0) when no class has both textures with at
        least 4 images each.
    """
    # Set seeds
    random.seed(seed)
    torch.manual_seed(seed)

    print(f"\n{'='*60}")
    print(f"Running SCDT Text-Vision Test with {model_name}")
    print(f"CVCL Training Classes Only")
    print(f"(Same Class Different Texture - Varied Colors/Sizes)")
    print(f"Text format: {{texture}} {{class}}")
    print(f"Note: Using 3-1 split (3 of one texture, 1 of the other)")
    print(f"{'='*60}")

    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print(f"Using device: {device}")

    # Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    model.eval()

    # Load filtered data - CVCL training classes only
    df = load_cvcl_synthetickonkle_data()

    # Filter to only valid texture values
    valid_textures = ['smooth', 'bumpy']
    df = df[df['texture'].isin(valid_textures)].copy()

    print(f"Loaded {len(df)} images with texture annotations")
    print(f"Unique classes: {df['class'].nunique()}")
    print(f"Unique textures: {sorted(df['texture'].unique())}")

    # Keep classes that have both textures with at least 4 images each
    # (enough for the 3-1 split with some variety)
    valid_classes = []
    for class_name, group in df.groupby('class'):
        texture_counts = group.groupby('texture').size()
        if len(texture_counts) == 2 and texture_counts.min() >= 4:
            valid_classes.append(class_name)

    if len(valid_classes) == 0:
        print("ERROR: No CVCL training classes have both textures with enough images.")
        return [], 0.0

    print(f"\nFound {len(valid_classes)} CVCL classes with both textures")
    print(f"Classes: {sorted(valid_classes)}")

    # Pre-compute image embeddings
    print("\nExtracting image embeddings...")
    image_embeddings = {}
    skipped_images = []

    # Get all relevant images from valid classes
    df_valid = df[df['class'].isin(valid_classes)]
    all_image_paths = df_valid['image_path'].unique().tolist()
    batch_size = 16

    for i in tqdm(range(0, len(all_image_paths), batch_size), desc="Extracting embeddings"):
        batch_paths = all_image_paths[i:i+batch_size]
        batch_images = []

        for img_path in batch_paths:
            try:
                # Context manager closes the file handle once the image has
                # been decoded and transformed.
                with Image.open(img_path) as img:
                    img_processed = transform(img.convert('RGB')).unsqueeze(0).to(device)
            except Exception as e:
                # Best-effort: unreadable images are skipped, but the reason
                # is reported instead of being discarded.
                skipped_images.append(img_path)
                tqdm.write(f"  Skipping {img_path}: {e}")
                continue
            batch_images.append((img_path, img_processed))

        if batch_images:
            paths = [p for p, _ in batch_images]
            imgs = torch.cat([img for _, img in batch_images], dim=0)

            with torch.no_grad():
                embeddings = extractor.get_img_feature(imgs)
                embeddings = extractor.norm_features(embeddings)

            for path, emb in zip(paths, embeddings):
                image_embeddings[path] = emb.cpu().float()

    print(f"Extracted embeddings for {len(image_embeddings)} images")
    if skipped_images:
        print(f"Skipped {len(skipped_images)} corrupted/invalid images")

    # Prepare for trials
    correct_count = 0
    trial_results = []

    # Calculate trials per class
    trials_per_class = num_trials // len(valid_classes)
    remaining_trials = num_trials % len(valid_classes)

    print(f"\nRunning {num_trials} trials across {len(valid_classes)} classes...")
    print(f"Trials per class: {trials_per_class}, with {remaining_trials} getting 1 extra")

    # Run trials
    for class_idx, class_name in enumerate(tqdm(valid_classes, desc="Processing classes")):
        # Get all images for this class
        class_data = df_valid[df_valid['class'] == class_name]

        # Image paths per texture, restricted to those with an embedding
        images_by_texture = {
            texture: [
                p for p in class_data[class_data['texture'] == texture]['image_path'].tolist()
                if p in image_embeddings
            ]
            for texture in valid_textures
        }

        # Only two distinct prompts exist per class, so encode them once here
        # rather than re-encoding the same strings on every trial.
        class_prompts = [f"{texture} {class_name.lower()}" for texture in valid_textures]
        encoded_prompts = _encode_text_prompts(model_name, model, extractor, class_prompts, device)
        prompt_features = dict(zip(class_prompts, encoded_prompts))

        # Determine number of trials for this class
        n_trials = trials_per_class + (1 if class_idx < remaining_trials else 0)

        for _ in range(n_trials):
            if len(trial_results) >= num_trials:
                break

            # Randomly choose which texture gets 3 images vs 1
            if random.random() < 0.5:
                majority_texture, minority_texture = 'smooth', 'bumpy'
            else:
                majority_texture, minority_texture = 'bumpy', 'smooth'
            majority_images = images_by_texture[majority_texture]
            minority_images = images_by_texture[minority_texture]

            # Need at least 3 majority and 1 minority
            if len(majority_images) < 3 or len(minority_images) < 1:
                continue

            # Select 3 different images from majority texture (can be different colors/sizes)
            selected_majority = random.sample(majority_images, 3)
            # Select 1 from minority texture
            selected_minority = random.sample(minority_images, 1)

            # Build candidates list with (image_path, texture) tuples
            candidates = [(img_path, majority_texture) for img_path in selected_majority]
            candidates += [(img_path, minority_texture) for img_path in selected_minority]

            # Select query from candidates
            query_idx = random.randint(0, 3)
            query_img_path, query_texture = candidates[query_idx]

            # Create text prompts
            candidate_texts = [f"{texture} {class_name.lower()}" for _, texture in candidates]

            # Shuffle for random presentation
            shuffled_order = list(range(4))
            random.shuffle(shuffled_order)
            shuffled_texts = [candidate_texts[i] for i in shuffled_order]
            correct_idx = shuffled_order.index(query_idx)

            # Look up the pre-encoded features in presentation order
            txt_features = torch.stack([prompt_features[text] for text in shuffled_texts]).float()

            # Get query image embedding
            query_embedding = image_embeddings[query_img_path].unsqueeze(0).to(device).float()

            # Calculate similarity
            similarity = (100.0 * query_embedding @ txt_features.transpose(-2, -1)).softmax(dim=1)

            # Get prediction
            pred_idx = similarity.argmax(dim=1).item()

            # Identical prompts tie, and argmax returns the first tied slot,
            # so compare the chosen prompt with the query's prompt rather
            # than comparing slot indices.
            is_correct = (shuffled_texts[pred_idx] == shuffled_texts[correct_idx])
            if is_correct:
                correct_count += 1

            # Store trial result
            trial_results.append({
                'trial': len(trial_results) + 1,
                'query_class': class_name,
                'query_texture': query_texture,
                'query_img': os.path.basename(query_img_path),
                'correct_idx': correct_idx,
                'predicted_idx': pred_idx,
                'predicted_text': shuffled_texts[pred_idx],
                'correct': is_correct,
                'candidate_texts': shuffled_texts,
                'similarity_scores': similarity.cpu().numpy().tolist()
            })

    # Calculate accuracy
    accuracy = correct_count / len(trial_results) if trial_results else 0

    print(f"\n{'='*60}")
    print(f"Results for {model_name} - SCDT Text-Vision Test:")
    print(f"Total trials: {len(trial_results)}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'='*60}")

    # Save results
    results_row = {
        'Model': model_name,
        'Test': 'SCDT-TextVision-CVCLTraining',
        'Dataset': 'SyntheticKonkle_224',
        'Correct': correct_count,
        'Trials': len(trial_results),
        'Accuracy': accuracy
    }

    # Append to results file
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    if os.path.exists(RESULTS_PATH):
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame()

    results_df = pd.concat([results_df, pd.DataFrame([results_row])], ignore_index=True)
    results_df.to_csv(RESULTS_PATH, index=False, float_format='%.4f')
    print(f"\nResults saved to {RESULTS_PATH}")

    return trial_results, accuracy

## Run CVCL SCDT Text-Vision Test

In [None]:
# Evaluate the CVCL model: 4000 trials with seed 0
cvcl_trials, cvcl_accuracy = run_scdt_test(
    model_name='cvcl-resnext',
    seed=0,
    num_trials=4000,
)

## Run CLIP SCDT Text-Vision Test

In [None]:
# Evaluate the CLIP model with the same seed and trial count
clip_trials, clip_accuracy = run_scdt_test(
    model_name='clip-resnext',
    seed=0,
    num_trials=4000,
)

## Compare Results

In [None]:
# Side-by-side summary of the two accuracies computed in the cells above
banner = "=" * 60
print("\n" + banner)
print("SCDT TEXT-VISION TEST COMPARISON - CVCL TRAINING CLASSES")
print(banner)
print(f"\nTest: Same Class Different Texture (4-way forced choice)")
print(f"Control: Colors and sizes can vary (not mentioned in text)")
print(f"Text format: '{{texture}} {{class}}'")
print(f"\nResults:")
print(f"  CVCL Accuracy: {cvcl_accuracy:.4f} ({cvcl_accuracy*100:.2f}%)")
print(f"  CLIP Accuracy: {clip_accuracy:.4f} ({clip_accuracy*100:.2f}%)")

# Absolute gap between the two models, shown as a fraction and as a percentage
accuracy_gap = abs(cvcl_accuracy - clip_accuracy)
print(f"\nDifference: {accuracy_gap:.4f} ({accuracy_gap*100:.2f}%)")

# Name the better model, or report a tie
if cvcl_accuracy > clip_accuracy:
    print(f"CVCL performs better by {accuracy_gap*100:.2f}% on its training classes")
elif clip_accuracy > cvcl_accuracy:
    print(f"CLIP performs better by {accuracy_gap*100:.2f}% even on CVCL's training classes")
else:
    print("Both models perform equally")