In [1]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import random
from datetime import datetime
import clip
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Path setup - Use absolute paths to avoid any confusion
REPO_ROOT = r'C:\Users\jbats\Projects\NTU-Synthetic'

# Add discover-hidden-visual-concepts to path
DISCOVER_ROOT = os.path.join(REPO_ROOT, 'discover-hidden-visual-concepts')
sys.path.insert(0, DISCOVER_ROOT)
sys.path.insert(0, REPO_ROOT)

# Import from discover-hidden-visual-concepts repo
sys.path.append(os.path.join(DISCOVER_ROOT, 'src'))
from utils.model_loader import load_model
from models.feature_extractor import FeatureExtractor

# Paths
DATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle_224', 'SyntheticKonkle')
IMG_PATH = os.path.join(DATA_PATH, )
METADATA_PATH = os.path.join(REPO_ROOT, 'data', 'SyntheticKonkle', 'master_labels.csv')
RESULTS_PATH = os.path.join(REPO_ROOT, 'PatrickProject', 'Chart_Generation', 'text_vision_results.csv')

print(f"Data path: {DATA_PATH}")
print(f"Image path: {IMG_PATH}")
print(f"Metadata path: {METADATA_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

  from pkg_resources import packaging


Data path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
Image path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle_224\SyntheticKonkle
Metadata path: C:\Users\jbats\Projects\NTU-Synthetic\data\SyntheticKonkle\master_labels.csv
Results will be saved to: C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv


In [2]:
# Load and prepare data
def load_konklab_data():
    """Load KonkLab dataset with metadata for size testing"""
    # Read metadata
    df = pd.read_csv(METADATA_PATH)
    
    # Standardize column names
    # SyntheticKonkle already has lowercase 'color' column
    
    # Build full paths
    df['image_path'] = df.apply(lambda row: os.path.join(DATA_PATH, row['folder'], row['filename']), axis=1)
    
    # Filter to only entries with valid size, color, and texture information
    df = df[
        df['size'].notna() & (df['size'] != '') &
        df['color'].notna() & (df['color'] != '') &
        df['texture'].notna() & (df['texture'] != '')
    ].copy()
    
    # Standardize values (lowercase)
    df['size'] = df['size'].str.lower().str.strip()
    df['color'] = df['color'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()
    
    # Create combination columns
    df['color_texture'] = df['color'] + '_' + df['texture']
    df['class_size'] = df['class'] + '_' + df['size']
    df['full_combo'] = df['class'] + '_' + df['color'] + '_' + df['texture'] + '_' + df['size']
    
    print(f"Loaded {len(df)} images with size, color, and texture annotations")
    print(f"Unique classes: {df['class'].nunique()}")
    print(f"Unique sizes: {df['size'].nunique()}")
    print(f"Size values: {sorted(df['size'].unique())}")
    print(f"Unique color-texture combinations: {df['color_texture'].nunique()}")
    
    # Find color-texture combinations that have multiple class-size pairs
    # For 4-way choice with 3 sizes, we need at least 3-4 class-size combinations
    ct_groups = df.groupby('color_texture')['class_size'].nunique()
    valid_ct = ct_groups[ct_groups >= 3].index.tolist()  # Changed from 4 to 3
    
    print(f"\nColor-Texture combinations with 3+ class-size pairs: {len(valid_ct)}")
    if len(valid_ct) > 0:
        print(f"Examples: {valid_ct[:3]}")
        # Show class-size distribution for first example
        if len(valid_ct) > 0:
            example = valid_ct[0]
            class_sizes = df[df['color_texture'] == example]['class_size'].unique()[:4]
            print(f"  {example} has class-size pairs like: {class_sizes}")
    
    return df, valid_ct

# Load data
data_df, valid_color_textures = load_konklab_data()
print("\nSample data:")
print(data_df[['class', 'size', 'color', 'texture', 'class_size']].head())

Loaded 7882 images with size, color, and texture annotations
Unique classes: 67
Unique sizes: 4
Size values: ['bumpy', 'large', 'medium', 'small']
Unique color-texture combinations: 38

Color-Texture combinations with 3+ class-size pairs: 22
Examples: ['black_bumpy', 'black_smooth', 'blue_bumpy']
  black_bumpy has class-size pairs like: ['abacus_large' 'abacus_medium' 'abacus_small' 'apple_large']

Sample data:
    class   size   color texture    class_size
0  abacus  large     red   bumpy  abacus_large
1  abacus  large   green   bumpy  abacus_large
2  abacus  large    blue   bumpy  abacus_large
3  abacus  large  yellow   bumpy  abacus_large
4  abacus  large  orange   bumpy  abacus_large


In [3]:
def run_dcds_text_vision_test(model_name='cvcl-resnext', seed=0, device=None, num_trials=4000):
    """Run Different Class Different Size text-vision test with controlled color/texture
    
    Args:
        model_name: Model to test ('cvcl-resnext' or 'clip-res')
        seed: Random seed for reproducibility
        device: Device to use (None for auto-detect)
        num_trials: Total number of trials to run
    """
    # Set seeds to match original test methodology
    random.seed(seed)
    torch.manual_seed(seed)
    
    print(f"\n{'='*60}")
    print(f"Running DCDS Text-Vision Test with {model_name}")
    print(f"(Different Class Different Size - Controlled Color/Texture)")
    print(f"{'='*60}")
    
    # Device selection
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    if device == 'cuda' and not torch.cuda.is_available():
        print("[ERROR] CUDA requested but not available! Falling back to CPU.")
        device = 'cpu'
    
    print(f"Using device: {device}")
    
    # Load model
    print(f"[INFO] Loading {model_name} on {device}...")
    model, transform = load_model(model_name, seed=seed, device=device)
    extractor = FeatureExtractor(model_name, model, device)
    model.eval()
    
    # Load and prepare data
    df = pd.read_csv(METADATA_PATH)
    
    # Standardize column names
    # SyntheticKonkle already has lowercase 'color' column
    
    # Build full paths
    df['image_path'] = df.apply(lambda row: os.path.join(DATA_PATH, row['folder'], row['filename']), axis=1)
    
    # Filter to entries with all annotations
    df = df[
        df['size'].notna() & (df['size'] != '') &
        df['color'].notna() & (df['color'] != '') &
        df['texture'].notna() & (df['texture'] != '')
    ].copy()
    
    # Standardize values
    df['size'] = df['size'].str.lower().str.strip()
    df['color'] = df['color'].str.lower().str.strip()
    df['texture'] = df['texture'].str.lower().str.strip()
    
    # Create combination columns
    df['color_texture'] = df['color'] + '_' + df['texture']
    df['class_size'] = df['class'] + '_' + df['size']
    df['full_combo'] = df['class'] + '_' + df['color'] + '_' + df['texture'] + '_' + df['size']
    
    # Find color-texture combinations with at least 3 different class-size pairs
    ct_groups = df.groupby('color_texture')
    valid_ct = []
    for ct, group in ct_groups:
        unique_class_sizes = group['class_size'].unique()
        unique_classes = group['class'].unique()
        unique_sizes = group['size'].unique()
        # Need at least 3 different class-size combinations for 4-way choice
        # We'll duplicate one if needed
        if len(unique_class_sizes) >= 3 and len(unique_classes) >= 2 and len(unique_sizes) >= 2:
            valid_ct.append(ct)
    
    if len(valid_ct) == 0:
        print("ERROR: No color-texture combinations have enough class-size diversity.")
        print("Cannot run DCDS test with strict controls.")
        return [], 0.0
    
    print(f"\nFound {len(valid_ct)} color-texture combinations with sufficient diversity")
    
    # Pre-compute image embeddings
    print("\nExtracting image embeddings...")
    image_embeddings = {}
    skipped_images = []
    
    # Get all relevant images
    df_valid = df[df['color_texture'].isin(valid_ct)]
    all_image_paths = df_valid['image_path'].unique().tolist()
    batch_size = 16
    
    for i in tqdm(range(0, len(all_image_paths), batch_size), desc="Extracting embeddings"):
        batch_paths = all_image_paths[i:i+batch_size]
        batch_images = []
        
        for img_path in batch_paths:
            try:
                img = Image.open(img_path).convert('RGB')
                img_processed = transform(img).unsqueeze(0).to(device)
                batch_images.append((img_path, img_processed))
            except Exception as e:
                # Skip corrupted/invalid images
                skipped_images.append(img_path)
                continue
        
        if batch_images:
            paths = [p for p, _ in batch_images]
            imgs = torch.cat([img for _, img in batch_images], dim=0)
            
            with torch.no_grad():
                embeddings = extractor.get_img_feature(imgs)
                embeddings = extractor.norm_features(embeddings)
            
            for path, emb in zip(paths, embeddings):
                image_embeddings[path] = emb.cpu().float()
    
    print(f"Extracted embeddings for {len(image_embeddings)} images")
    if skipped_images:
        print(f"Skipped {len(skipped_images)} corrupted/invalid images")
    
    # Prepare for trials
    correct_count = 0
    trial_results = []
    
    # Calculate trials per color-texture combination
    trials_per_ct = num_trials // len(valid_ct)
    remaining_trials = num_trials % len(valid_ct)
    
    print(f"\nRunning {num_trials} trials across {len(valid_ct)} color-texture combinations...")
    
    # Run trials
    for ct_idx, ct in enumerate(tqdm(valid_ct, desc="Processing combinations")):
        # Get all images for this color-texture combination
        ct_data = df_valid[df_valid['color_texture'] == ct]
        
        # Group by class-size
        class_size_groups = ct_data.groupby('class_size').agg({
            'image_path': list,
            'class': 'first',
            'size': 'first'
        }).to_dict('index')
        
        available_class_sizes = list(class_size_groups.keys())
        
        if len(available_class_sizes) < 3:  # Changed from 4 to 3
            continue
        
        # Determine number of trials for this combination
        n_trials = trials_per_ct + (1 if ct_idx < remaining_trials else 0)
        
        for trial in range(n_trials):
            # Select class-size pairs for 4-way choice
            if len(available_class_sizes) == 3:
                # Use all 3 pairs plus duplicate one for 4-way choice
                selected_pairs = available_class_sizes.copy()
                # Add a duplicate to make 4 options
                selected_pairs.append(random.choice(available_class_sizes))
            elif len(available_class_sizes) >= 4:
                # Try to select 4 diverse pairs
                selected_pairs = []
                used_classes = set()
                used_sizes = set()
                
                # Shuffle to get random selection
                shuffled_pairs = available_class_sizes.copy()
                random.shuffle(shuffled_pairs)
                
                for pair in shuffled_pairs:
                    pair_class = class_size_groups[pair]['class']
                    pair_size = class_size_groups[pair]['size']
                    
                    # Try to get diverse classes and sizes
                    if len(selected_pairs) < 4:
                        if pair_class not in used_classes or pair_size not in used_sizes or len(selected_pairs) < 4:
                            selected_pairs.append(pair)
                            used_classes.add(pair_class)
                            used_sizes.add(pair_size)
                
                if len(selected_pairs) < 4:
                    # If we can't get 4 diverse pairs, just take any 4
                    selected_pairs = random.sample(available_class_sizes, 4)
            else:
                continue
            
            # First pair is the query
            query_pair = selected_pairs[0]
            query_data = class_size_groups[query_pair]
            
            # Select random query image from valid images
            valid_query_paths = [p for p in query_data['image_path'] if p in image_embeddings]
            if not valid_query_paths:
                continue
            query_img_path = random.choice(valid_query_paths)
            query_class = query_data['class']
            query_size = query_data['size']
            
            # Shuffle for candidate order
            random.shuffle(selected_pairs)
            correct_idx = selected_pairs.index(query_pair)
            
            # Create text prompts - ONLY size + class, no color/texture
            candidate_texts = []
            for pair in selected_pairs:
                pair_data = class_size_groups[pair]
                text_prompt = f"{pair_data['size']} {pair_data['class'].lower()}"
                candidate_texts.append(text_prompt)
            
            # Encode text prompts
            with torch.no_grad():
                if "clip" in model_name:
                    tokens = clip.tokenize(candidate_texts, truncate=True).to(device)
                    txt_features = model.encode_text(tokens)
                    txt_features = extractor.norm_features(txt_features)
                else:  # CVCL
                    tokens, token_len = model.tokenize(candidate_texts)
                    tokens = tokens.to(device)
                    if isinstance(token_len, torch.Tensor):
                        token_len = token_len.to(device)
                    txt_features = model.encode_text(tokens, token_len)
                    txt_features = extractor.norm_features(txt_features)
            
            # Get query image embedding
            query_embedding = image_embeddings[query_img_path].unsqueeze(0).to(device)
            
            # Calculate similarity
            query_embedding = query_embedding.float()
            txt_features = txt_features.float()
            
            similarity = (100.0 * query_embedding @ txt_features.transpose(-2, -1)).softmax(dim=1)
            
            # Get prediction
            pred_idx = similarity.argmax(dim=1).item()
            
            # Check if correct
            is_correct = (pred_idx == correct_idx)
            if is_correct:
                correct_count += 1
            
            # Store trial result
            trial_results.append({
                'trial': len(trial_results) + 1,
                'query_class': query_class,
                'query_size': query_size,
                'color_texture': ct,
                'query_img': os.path.basename(query_img_path),
                'correct_idx': correct_idx,
                'predicted_idx': pred_idx,
                'correct': is_correct,
                'candidate_texts': candidate_texts,
                'similarity_scores': similarity.cpu().numpy().tolist()
            })
    
    # Calculate accuracy
    accuracy = correct_count / len(trial_results) if trial_results else 0
    
    print(f"\n{'='*60}")
    print(f"Results for {model_name} - DCDS Text-Vision Test:")
    print(f"Total trials: {len(trial_results)}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"{'='*60}")
    
    # Save results
    results_row = {
        'Model': model_name,
        'Test': 'DCDS-TextVision-Controlled',
        'Dataset': 'SyntheticKonkle',
        'Correct': correct_count,
        'Trials': len(trial_results),
        'Accuracy': accuracy
    }
    
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    if os.path.exists(RESULTS_PATH):
        results_df = pd.read_csv(RESULTS_PATH)
    else:
        results_df = pd.DataFrame()
    
    results_df = pd.concat([results_df, pd.DataFrame([results_row])], ignore_index=True)
    results_df.to_csv(RESULTS_PATH, index=False, float_format='%.4f')
    print(f"\nResults saved to {RESULTS_PATH}")
    
    return trial_results, accuracy

## Run CVCL DCDS Text-Vision Test

In [4]:
# Run CVCL test with seed=0 (matching original tests)
cvcl_trials, cvcl_accuracy = run_dcds_text_vision_test('cvcl-resnext', seed=0, num_trials=4000)


Running DCDS Text-Vision Test with cvcl-resnext
(Different Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading cvcl-resnext on cuda...
Loading checkpoint from C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt


Lightning automatically upgraded your loaded checkpoint from v1.5.8 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jbats\.cache\huggingface\hub\models--wkvong--cvcl_s_dino_resnext50_embedding\snapshots\f50eaa0c50a6076a5190b1dd52aeeb6c3e747045\cvcl_s_dino_resnext50_embedding.ckpt`



Found 20 color-texture combinations with sufficient diversity

Extracting image embeddings...


Extracting embeddings: 100%|██████████| 491/491 [00:21<00:00, 23.31it/s]


Extracted embeddings for 7823 images
Skipped 22 corrupted/invalid images

Running 4000 trials across 20 color-texture combinations...


Processing combinations: 100%|██████████| 20/20 [00:36<00:00,  1.82s/it]


Results for cvcl-resnext - DCDS Text-Vision Test:
Total trials: 4000
Correct: 1161
Accuracy: 0.2903 (29.03%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv





## Run CLIP DCDS Text-Vision Test

In [5]:
# Run CLIP test with seed=0 (matching original tests)
clip_trials, clip_accuracy = run_dcds_text_vision_test('clip-resnext', seed=0, num_trials=4000)


Running DCDS Text-Vision Test with clip-resnext
(Different Class Different Size - Controlled Color/Texture)
Using device: cuda
[INFO] Loading clip-resnext on cuda...

Found 20 color-texture combinations with sufficient diversity

Extracting image embeddings...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Extracting embeddings: 100%|██████████| 491/491 [00:17<00:00, 27.53it/s]


Extracted embeddings for 7823 images
Skipped 22 corrupted/invalid images

Running 4000 trials across 20 color-texture combinations...


Processing combinations: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]


Results for clip-resnext - DCDS Text-Vision Test:
Total trials: 4000
Correct: 3210
Accuracy: 0.8025 (80.25%)

Results saved to C:\Users\jbats\Projects\NTU-Synthetic\PatrickProject\Chart_Generation\text_vision_results.csv





## Compare Results

In [7]:
# Display comparison
print("\n" + "="*60)
print("DCDS TEXT-VISION TEST COMPARISON (CONTROLLED)")
print("="*60)
print(f"\nTest: Different Class Different Size (4-way forced choice)")
print(f"Control: Color and Texture held constant (not mentioned in text)")
print(f"\nResults:")
print(f"  CVCL Accuracy: {cvcl_accuracy:.4f} ({cvcl_accuracy*100:.2f}%)")
print(f"  CLIP Accuracy: {clip_accuracy:.4f} ({clip_accuracy*100:.2f}%)")
print(f"\nDifference: {abs(cvcl_accuracy - clip_accuracy):.4f} ({abs(cvcl_accuracy - clip_accuracy)*100:.2f}%)")
if cvcl_accuracy > clip_accuracy:
    print(f"CVCL performs better by {(cvcl_accuracy - clip_accuracy)*100:.2f}%")
elif clip_accuracy > cvcl_accuracy:
    print(f"CLIP performs better by {(clip_accuracy - cvcl_accuracy)*100:.2f}%")
else:
    print("Both models perform equally")

print("\n" + "="*60)
print("\nAnalysis:")
print("- Tests size discrimination when class also varies")
print("- Color and texture are controlled to isolate size+class discrimination")
print("- Should be easier than SCDS since both size AND class provide cues")
print("- Performance shows how well models combine size and class understanding")


DCDS TEXT-VISION TEST COMPARISON (CONTROLLED)

Test: Different Class Different Size (4-way forced choice)
Control: Color and Texture held constant (not mentioned in text)

Results:
  CVCL Accuracy: 0.2903 (29.03%)
  CLIP Accuracy: 0.8025 (80.25%)

Difference: 0.5122 (51.23%)
CLIP performs better by 51.23%


Analysis:
- Tests size discrimination when class also varies
- Color and texture are controlled to isolate size+class discrimination
- Should be easier than SCDS since both size AND class provide cues
- Performance shows how well models combine size and class understanding


## Analysis Notes

### DCDS Text-Vision Test Characteristics (Controlled Version):
- **Visual Control**: All 4 candidates have same color and texture (when possible)
- **Variation**: Both class AND size differ between candidates
- **Text Prompts**: Only mention size + class (e.g., "large apple", "small car")
- **NOT mentioned**: Color and texture are controlled but excluded from text

