In [2]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import random
from datetime import datetime
import clip
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Path setup - Use absolute paths to avoid any confusion
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths
DATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
IMG_PATH = os.path.join(DATA_PATH, )
METADATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
RESULTS_PATH = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'text_vision_results.csv')

print(f"Data path: {DATA_PATH}")
print(f"Image path: {IMG_PATH}")
print(f"Metadata path: {METADATA_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

  from pkg_resources import packaging


Data path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
Image path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
Metadata path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle\master_labels.csv
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv


In [3]:
# Load and prepare data
def load_konklab_data():
    """Load KonkLab dataset with metadata for size testing"""
    # Read metadata
    df = pd.read_csv(METADATA_PATH)
    
    # Standardize column names
    # SyntheticKonkle already has lowercase 'color' column
    
    # Build full paths
    df['image_path'] = df.apply(lambda row: os.path.join(DATA_PATH, row['folder'], row['filename']), axis=1)
    
    # Filter to only entries with valid size, color, and texture information
    df = df[
        df['size'].notna() & (df['size'] != '') &
        df['color'].notna() & (df['color'] != '') &
        df['texture'].notna() & (df['texture'] != '')
    ].copy()
    
    # Standardize values (lowercase)
    df['size'] = df['size'].str.lower().str.strip()
    df['color'] = df['color'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()
    
    # Create combination columns
    df['class_color_texture'] = df['class'] + '_' + df['color'] + '_' + df['texture']
    df['full_combo'] = df['class'] + '_' + df['color'] + '_' + df['texture'] + '_' + df['size']
    
    print(f"Loaded {len(df)} images with size, color, and texture annotations")
    print(f"Unique classes: {df['class'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()}")
    print(f"Size values: {sorted(df['size'].unique())}")
    
    # Find class-color-texture combinations that have multiple sizes (needed for SCDS test)
    cct_size_counts = df.groupby('class_color_texture')['size'].nunique()
    valid_cct = cct_size_counts[cct_size_counts >= 3].index.tolist()  # Changed from 4 to 3
    
    print(f"\nClass-Color-Texture combinations with 3 sizes: {len(valid_cct)}")
    if len(valid_cct) > 0:
        print(f"Examples: {valid_cct[:3]}")
        # Show size distribution for first example
        if len(valid_cct) > 0:
            example = valid_cct[0]
            sizes = df[df['class_color_texture'] == example]['size'].unique()
            print(f"  {example} has sizes: {sorted(sizes)}")
    
    return df, valid_cct

# Load data
data_df, valid_combinations = load_konklab_data()
print("\nSample data:")
print(data_df[['class', 'color', 'texture', 'size', 'class_color_texture']].head())

Loaded 7882 images with size, color, and texture annotations
Unique classes: 67
Unique sizes: 4
Size values: ['bumpy', 'large', 'medium', 'small']

Class-Color-Texture combinations with 3 sizes: 1286
Examples: ['abacus_black_bumpy', 'abacus_black_smooth', 'abacus_blue_bumpy']
  abacus_black_bumpy has sizes: ['large', 'medium', 'small']

Sample data:
    class   color texture   size  class_color_texture
0  abacus     red   bumpy  large     abacus_red_bumpy
1  abacus   green   bumpy  large   abacus_green_bumpy
2  abacus    blue   bumpy  large    abacus_blue_bumpy
3  abacus  yellow   bumpy  large  abacus_yellow_bumpy
4  abacus  orange   bumpy  large  abacus_orange_bumpy


In [4]:
def run_scds_text_vision_test(model_name='cvcl-resnext', seed=0, device=None, num_trials=4000):
    """Run Same Class Different Size text-vision test with controlled color/texture
    
    Args:
        model_name: Model to test ('cvcl-resnext' or 'clip-res')
        seed: Random seed for reproducibility
        device: Device to use (None for auto-detect)
        num_trials: Total number of trials to run
    """
    # Set seeds to match original test methodology
    random.seed(seed)
    torch.manual_seed(seed)
    
    print(f"\n{'='*60}")
    print(f"Running SCDS Text-Vision Test with {model_name}")
    print(f"(Same Class Different Size - Controlled Color/Texture)")
    print(f"{'='*60}")
    
    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if device == 'cuda' and not torch.cuda.is_available():
        print("[ERROR] CUDA requested but not available! Falling back to CPU.")
        device = 'cpu'
    
    print(f"Using device: {device}")
    
    # Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    model.eval()
    
    # Load and prepare data
    df = pd.read_csv(METADATA_PATH)
    
    # Standardize column names
    # SyntheticKonkle already has lowercase 'color' column
    
    # Build full paths
    df['image_path'] = df.apply(lambda row: os.path.join(DATA_PATH, row['folder'], row['filename']), axis=1)
    
    # Filter to entries with all annotations
    df = df[
        df['size'].notna() & (df['size'] != '') &
        df['color'].notna() & (df['color'] != '') &
        df['texture'].notna() & (df['texture'] != '')
    ].copy()
    
    # Standardize values
    df['size'] = df['size'].str.lower().str.strip()
    df['color'] = df['color'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()
    
    # Create combination columns
    df['class_color_texture'] = df['class'] + '_' + df['color'] + '_' + df['texture']
    df['full_combo'] = df['class'] + '_' + df['color'] + '_' + df['texture'] + '_' + df['size']
    
    # Find class-color-texture combinations with at least 3 different sizes (small, medium, large)
    cct_groups = df.groupby('class_color_texture')
    valid_cct = []
    for cct, group in cct_groups:
        unique_sizes = group['size'].unique()
        if len(unique_sizes) >= 3:  # Changed from 4 to 3
            valid_cct.append(cct)
    
    if len(valid_cct) == 0:
        print("ERROR: No class-color-texture combinations have 3+ different sizes.")
        print("Cannot run SCDS test with strict controls.")
        return [], 0.0
    
    print(f"\nFound {len(valid_cct)} class-color-texture combinations with 3+ sizes")
    
    # Pre-compute image embeddings for efficiency
    print("\nExtracting image embeddings...")
    image_embeddings = {}
    skipped_images = []
    
    # Get all relevant images
    df_valid = df[df['class_color_texture'].isin(valid_cct)]
    all_image_paths = df_valid['image_path'].unique().tolist()
    batch_size = 16
    
    for i in tqdm(range(0, len(all_image_paths), batch_size), desc="Extracting embeddings"):
        batch_paths = all_image_paths[i:i+batch_size]
        batch_images = []
        
        for img_path in batch_paths:
            try:
                img = Image.open(img_path).convert('RGB')
                img_processed = transform(img).unsqueeze(0).to(device)
                batch_images.append((img_path, img_processed))
            except Exception as e:
                # Skip corrupted/invalid images
                skipped_images.append(img_path)
                continue
        
        if batch_images:
            paths = [p for p, _ in batch_images]
            imgs = torch.cat([img for _, img in batch_images], dim=0)
            
            with torch.no_grad():
                embeddings = extractor.get_img_feature(imgs)
                embeddings = extractor.norm_features(embeddings)
            
            for path, emb in zip(paths, embeddings):
                image_embeddings[path] = emb.cpu().float()
    
    print(f"Extracted embeddings for {len(image_embeddings)} images")
    if skipped_images:
        print(f"Skipped {len(skipped_images)} corrupted/invalid images")
    
    # Prepare for trials
    correct_count = 0
    trial_results = []
    
    # Calculate trials per combination
    trials_per_cct = num_trials // len(valid_cct)
    remaining_trials = num_trials % len(valid_cct)
    
    print(f"\nRunning {num_trials} trials across {len(valid_cct)} combinations...")
    print(f"Trials per combination: {trials_per_cct}, with {remaining_trials} getting 1 extra")
    
    # Run trials
    for cct_idx, cct in enumerate(tqdm(valid_cct, desc="Processing combinations")):
        # Get all images for this class-color-texture combination
        cct_data = df_valid[df_valid['class_color_texture'] == cct]
        
        # Get available sizes for this combination
        size_groups = cct_data.groupby('size')['image_path'].apply(list).to_dict()
        available_sizes = list(size_groups.keys())
        
        if len(available_sizes) < 3:  # Changed from 4 to 3
            continue
        
        # Parse class from combination string
        class_name = cct.split('_')[0]
        
        # Determine number of trials for this combination
        n_trials = trials_per_cct + (1 if cct_idx < remaining_trials else 0)
        
        for trial in range(n_trials):
            # For 4-way choice, we'll use 3 sizes + duplicate one
            if len(available_sizes) == 3:
                # Use all 3 sizes plus duplicate one for 4-way choice
                selected_sizes = available_sizes.copy()
                # Add a duplicate of a random size to make 4 options
                selected_sizes.append(random.choice(available_sizes))
            else:
                # If somehow we have more than 3 sizes, select 4
                selected_sizes = random.sample(available_sizes, min(4, len(available_sizes)))
            
            # First size is the query
            query_size = selected_sizes[0]
            
            # Select random query image from valid images
            valid_query_paths = [p for p in size_groups[query_size] if p in image_embeddings]
            if not valid_query_paths:
                continue
            query_img_path = random.choice(valid_query_paths)
            
            # Shuffle for candidate order
            random.shuffle(selected_sizes)
            correct_idx = selected_sizes.index(query_size)
            
            # Create text prompts - ONLY size + class, no color/texture
            candidate_texts = [f"{size} {class_name.lower()}" for size in selected_sizes]
            
            # Encode text prompts
            with torch.no_grad():
                if "clip" in model_name:
                    tokens = clip.tokenize(candidate_texts, truncate=True).to(device)
                    txt_features = model.encode_text(tokens)
                    txt_features = extractor.norm_features(txt_features)
                else:  # CVCL
                    tokens, token_len = model.tokenize(candidate_texts)
                    tokens = tokens.to(device)
                    if isinstance(token_len, torch.Tensor):
                        token_len = token_len.to(device)
                    txt_features = model.encode_text(tokens, token_len)
                    txt_features = extractor.norm_features(txt_features)
            
            # Get query image embedding
            query_embedding = image_embeddings[query_img_path].unsqueeze(0).to(device)
            
            # Calculate similarity
            query_embedding = query_embedding.float()
            txt_features = txt_features.float()
            
            similarity = (100.0 * query_embedding @ txt_features.transpose(-2, -1)).softmax(dim=1)
            
            # Get prediction
            pred_idx = similarity.argmax(dim=1).item()
            
            # Check if correct
            is_correct = (pred_idx == correct_idx)
            if is_correct:
                correct_count += 1
            
            # Store trial result
            trial_results.append({
                'trial': len(trial_results) + 1,
                'query_class': class_name,
                'query_size': query_size,
                'class_color_texture': cct,
                'query_img': os.path.basename(query_img_path),
                'correct_idx': correct_idx,
                'predicted_idx': pred_idx,
                'correct': is_correct,
                'candidate_texts': candidate_texts,
                'similarity_scores': similarity.cpu().numpy().tolist()
            })
    
    # Calculate accuracy
    accuracy = correct_count / len(trial_results) if trial_results else 0
    
    print(f"\n{'='*60}")
    print(f"Results for {model_name} - SCDS Text-Vision Test:")
    print(f"Total trials: {len(trial_results)}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'='*60}")
    
    # Save results
    results_row = {
        'Model': model_name,
        'Test': 'SCDS-TextVision-Controlled',
        'Dataset': 'SyntheticKonkle',
        'Correct': correct_count,
        'Trials': len(trial_results),
        'Accuracy': accuracy
    }
    
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    if os.path.exists(RESULTS_PATH):
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame()
    
    results_df = pd.concat([results_df, pd.DataFrame([results_row])], ignore_index=True)
    results_df.to_csv(RESULTS_PATH, index=False, float_format='%.4f')
    print(f"\nResults saved to {RESULTS_PATH}")
    
    return trial_results, accuracy

## Run CVCL SCDS Text-Vision Test

In [5]:
# Run CVCL test with seed=0 (matching original tests)
cvcl_trials, cvcl_accuracy = run_scds_text_vision_test('cvcl-resnext', seed=0, num_trials=4000)


Running SCDS Text-Vision Test with cvcl-resnext
(Same Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading cvcl-resnext on cuda...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`



Found 1286 class-color-texture combinations with 3+ sizes

Extracting image embeddings...


Extracting embeddings: 100%|██████████| 480/480 [00:21<00:00, 22.54it/s]


Extracted embeddings for 7657 images
Skipped 22 corrupted/invalid images

Running 4000 trials across 1286 combinations...
Trials per combination: 3, with 142 getting 1 extra


Processing combinations: 100%|██████████| 1286/1286 [00:38<00:00, 33.60it/s]


Results for cvcl-resnext - SCDS Text-Vision Test:
Total trials: 4000
Correct: 2103
Accuracy: 0.5258 (52.58%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv





## Run CLIP SCDS Text-Vision Test

In [6]:
# Run CLIP test with seed=0 (matching original tests)
clip_trials, clip_accuracy = run_scds_text_vision_test('clip-resnext', seed=0, num_trials=4000)


Running SCDS Text-Vision Test with clip-resnext
(Same Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading clip-resnext on cuda...

Found 1286 class-color-texture combinations with 3+ sizes

Extracting image embeddings...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Extracting embeddings: 100%|██████████| 480/480 [00:16<00:00, 28.72it/s]


Extracted embeddings for 7657 images
Skipped 22 corrupted/invalid images

Running 4000 trials across 1286 combinations...
Trials per combination: 3, with 142 getting 1 extra


Processing combinations: 100%|██████████| 1286/1286 [00:21<00:00, 60.13it/s]


Results for clip-resnext - SCDS Text-Vision Test:
Total trials: 4000
Correct: 387
Accuracy: 0.0968 (9.68%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv





## Compare Results

In [8]:
# Display comparison
print("\n" + "="*60)
print("SCDS TEXT-VISION TEST COMPARISON (CONTROLLED)")
print("="*60)
print(f"\nTest: Same Class Different Size (4-way forced choice)")
print(f"Control: Color and Texture held constant (not mentioned in text)")
print(f"\nResults:")
print(f"  CVCL Accuracy: {cvcl_accuracy:.4f} ({cvcl_accuracy*100:.2f}%)")
print(f"  CLIP Accuracy: {clip_accuracy:.4f} ({clip_accuracy*100:.2f}%)")
print(f"\nDifference: {abs(cvcl_accuracy - clip_accuracy):.4f} ({abs(cvcl_accuracy - clip_accuracy)*100:.2f}%)")
if cvcl_accuracy > clip_accuracy:
    print(f"CVCL performs better by {(cvcl_accuracy - clip_accuracy)*100:.2f}%")
elif clip_accuracy > cvcl_accuracy:
    print(f"CLIP performs better by {(clip_accuracy - cvcl_accuracy)*100:.2f}%")
else:
    print("Both models perform equally")

print("\n" + "="*60)
print("\nAnalysis:")
print("- This is a PURE size discrimination test")
print("- Color and texture are visually controlled but not mentioned in text")
print("- Tests whether models can map size concepts from vision to language")
print("- Lower accuracy indicates difficulty with size understanding")


SCDS TEXT-VISION TEST COMPARISON (CONTROLLED)

Test: Same Class Different Size (4-way forced choice)
Control: Color and Texture held constant (not mentioned in text)

Results:
  CVCL Accuracy: 0.5258 (52.58%)
  CLIP Accuracy: 0.0968 (9.68%)

Difference: 0.4290 (42.90%)
CVCL performs better by 42.90%


Analysis:
- This is a PURE size discrimination test
- Color and texture are visually controlled but not mentioned in text
- Tests whether models can map size concepts from vision to language
- Lower accuracy indicates difficulty with size understanding


## Analysis Notes

### SCDS Text-Vision Test Characteristics (Controlled Version):
- **Visual Control**: All 4 candidates have same class, color, and texture
- **Size Variation**: ONLY size differs between candidates
- **Text Prompts**: Only mention size + class (e.g., "large apple", "small apple")
- **NOT mentioned**: Color and texture are controlled but excluded from text

