### Data loading and feature extraction

Creating DataFrame for training

In [None]:
import json
import pandas as pd
import torch
import torchvision.models as models

# Load the COCO train2014 caption annotations
with open('/kaggle/input/coco-image-caption/annotations_trainval2014/annotations/captions_train2014.json', 'r') as f:
    coco_data = json.load(f)

# Convert to DataFrames
annotations_df = pd.DataFrame(coco_data['annotations'])
images_df = pd.DataFrame(coco_data['images'])

# One row per caption: join each annotation to its image record.
# Both frames carry an 'id' column, hence the suffixes.
df = annotations_df.merge(images_df, left_on='image_id', right_on='id',
                          suffixes=('_ann', '_img'))

# Select relevant columns
df = df[['image_id', 'file_name', 'caption', 'height', 'width']]

# One row per image: collect all captions of an image into a list
captions_grouped = annotations_df.groupby('image_id')['caption'].apply(list).reset_index()
captions_grouped.columns = ['image_id', 'captions']

grouped_df = captions_grouped.merge(images_df, left_on='image_id', right_on='id')
# Fixed: the original selected from an undefined `final_df` (NameError)
grouped_df = grouped_df[['image_id', 'file_name', 'captions', 'height', 'width']]
print(grouped_df.head())


Pretrained VGG-16 and ResNet loading

In [None]:
# Load ImageNet-pretrained backbones through the `weights=` enum API; the legacy
# `pretrained=True` flag is deprecated in torchvision and was used inconsistently here.
vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16.classifier = vgg16.classifier[:-1]  # Removes layer [6]
resnet101 = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
resnet152 = models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V1)

# Switch to inference mode so the extracted features are deterministic
# (train-mode layers such as dropout / batch-norm would otherwise stay active).
vgg16.eval()
resnet101.eval()
resnet152.eval()

# NOTE(review): the ResNets keep their final fc layer here, so they presumably emit
# class logits rather than pooled features — confirm this is intended.

In [None]:
import torchvision.transforms as transforms
from tqdm import tqdm
from PIL import Image

# Prefer the GPU when one is visible, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move every backbone onto the selected device in one pass
vgg16, resnet101, resnet152 = (
    backbone.to(device) for backbone in (vgg16, resnet101, resnet152)
)

# Preprocessing applied to every image before it is fed to a backbone:
# resize -> 224x224 centre crop -> tensor -> per-channel normalisation
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

def extract_features(image_path,model):
    """Run one image through `model` and return the output as a numpy array.

    Args:
        image_path: path of the image file to read.
        model: network to evaluate; it must already live on the module-level `device`.

    Returns:
        numpy array holding the model output for a batch of one image.
    """
    img = Image.open(image_path).convert('RGB')
    # Use the module-level `device` instead of a hard-coded .cuda(), so the
    # CPU fallback chosen when no GPU is available actually works.
    img = transform(img).unsqueeze(0).to(device)

    # No gradients are needed for pure feature extraction
    with torch.no_grad():
        features = model(img)

    return features.cpu().numpy()

import numpy as np  # required by np.save below; this cell never imported it (NameError)

# Extract and save features for all COCO images
# Outer keys are the model objects themselves; each inner dict maps image_id -> features
features_dict = {vgg16:{},resnet101:{},resnet152:{}}

# `total` lets tqdm show a real progress bar, since itertuples() has no length
for row in tqdm(grouped_df[['file_name','image_id']].itertuples(index=False),
                total=len(grouped_df)):
    img_filename = row.file_name
    img_id = row.image_id
    img_path = f'/kaggle/input/coco-image-caption/train2014/train2014/{img_filename}'
    for model in features_dict.keys():
        features = extract_features(img_path, model)
        features_dict[model][img_id] = features

# Save extracted features (each file holds one {image_id: features} dict)
np.save('coco_vgg16_features.npy', features_dict[vgg16])
np.save('coco_resnet101_features.npy', features_dict[resnet101])
np.save('coco_resnet152_features.npy', features_dict[resnet152])



Feature extraction and saving

Build complete dataset

In [10]:
import json
import pandas as pd
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
from PIL import Image
import numpy as np
import os

# ============================================================================
# 1. LOAD KARPATHY SPLIT
# ============================================================================

print("Loading Karpathy split...")
karpathy_file = '/kaggle/input/karpathy-splits/dataset_coco.json'

# Stop immediately with a clear message when the split file is absent
if not os.path.exists(karpathy_file):
    raise FileNotFoundError("Karpathy split not found at: {}".format(karpathy_file))

# Parse the whole JSON document into memory
with open(karpathy_file, 'r') as f:
    karpathy_data = json.loads(f.read())

print("✓ Loaded Karpathy split: {} total images".format(len(karpathy_data['images'])))

# ============================================================================
# 2. ORGANIZE DATA BY SPLIT
# ============================================================================

def organize_by_split(karpathy_data):
    """Bucket the Karpathy image records into train / val / test lists.

    Records whose split is 'restval' are folded into 'train'; records with any
    other unrecognised split are ignored.

    Args:
        karpathy_data: dict with an 'images' list; each entry provides 'split',
            'cocoid', 'filename' and 'sentences' (each sentence has a 'raw' text).

    Returns:
        dict with keys 'train', 'val', 'test', each a list of dicts with keys
        'image_id', 'file_name', 'split' and 'captions'.
    """
    splits = {name: [] for name in ('train', 'val', 'test')}

    for record in karpathy_data['images']:
        # 'restval' images are treated as extra training data
        target = 'train' if record['split'] == 'restval' else record['split']
        if target not in splits:
            continue

        # File names may point at either the train2014 or the val2014 folder
        splits[target].append({
            'image_id': record['cocoid'],
            'file_name': record['filename'],
            'split': target,
            'captions': [sentence['raw'] for sentence in record['sentences']],
        })

    return splits

print("\nOrganizing data by Karpathy split...")
splits_data = organize_by_split(karpathy_data)

# Image count per split, framed by 70-character rules
print(
    f"\n{'='*70}",
    f"KARPATHY SPLIT DISTRIBUTION",
    f"{'='*70}",
    f"Train images: {len(splits_data['train']):,}",
    f"Val images:   {len(splits_data['val']):,}",
    f"Test images:  {len(splits_data['test']):,}",
    f"{'='*70}",
    sep="\n",
)

# One DataFrame per split
train_df, val_df, test_df = (
    pd.DataFrame(splits_data[split_key]) for split_key in ('train', 'val', 'test')
)

# Count, per split, how many file names refer to each COCO folder
print("\nAnalyzing image sources...")
train_sources = train_df['file_name'].str.contains('train2014').sum()
train_val_sources = train_df['file_name'].str.contains('val2014').sum()
val_sources = val_df['file_name'].str.contains('train2014').sum()
val_val_sources = val_df['file_name'].str.contains('val2014').sum()
test_sources = test_df['file_name'].str.contains('train2014').sum()
test_val_sources = test_df['file_name'].str.contains('val2014').sum()

print(
    f"Train split:",
    f"  From train2014/: {train_sources:,} images",
    f"  From val2014/:   {train_val_sources:,} images",
    f"Val split:",
    f"  From train2014/: {val_sources:,} images",
    f"  From val2014/:   {val_val_sources:,} images",
    f"Test split:",
    f"  From train2014/: {test_sources:,} images",
    f"  From val2014/:   {test_val_sources:,} images",
    sep="\n",
)

# Show the first file name of every split as a quick sanity check
print(
    "\nSample filenames:",
    f"Train: {train_df['file_name'].iloc[0]}",
    f"Val:   {val_df['file_name'].iloc[0]}",
    f"Test:  {test_df['file_name'].iloc[0]}",
    sep="\n",
)

# ============================================================================
# 3. LOAD PRETRAINED MODELS
# ============================================================================

print(f"\n{'='*70}", "LOADING PRETRAINED MODELS", f"{'='*70}", sep="\n")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# VGG16: dropping the last classifier layer leaves the 4096-dim fc7 output
print("\n[1/2] Loading VGG16...")
vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16.classifier = vgg16.classifier[:-1]
vgg16 = vgg16.to(device).eval()  # .eval() returns the module itself
print("✓ VGG16 loaded (fc7: 4096-dim)")

# ResNet101: keep every child module except the final FC layer
print("\n[2/2] Loading ResNet101...")
resnet101 = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
resnet101 = torch.nn.Sequential(*tuple(resnet101.children())[:-1]).to(device).eval()
print("✓ ResNet101 loaded (avgpool: 2048-dim)")

print(f"\n{'='*70}")

# ============================================================================
# 4. IMAGE PREPROCESSING
# ============================================================================

# resize -> 224x224 centre crop -> tensor -> per-channel normalisation
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# ============================================================================
# 5. FEATURE EXTRACTION FUNCTION
# ============================================================================

def extract_features(image_path, model):
    """Extract features from a single image.

    Args:
        image_path: path of the image file to read.
        model: network to evaluate; it must already live on the module-level `device`.

    Returns:
        numpy array with the model output, or None when reading or processing
        the image fails (the failure is reported on stdout, not raised).
    """
    try:
        # The context manager releases the file handle as soon as pixels are decoded
        with Image.open(image_path) as raw_image:
            img = raw_image.convert('RGB')
        img = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            features = model(img)
            # Flatten if needed (for ResNet): squeeze() drops every size-1 axis,
            # so outputs of rank > 2 lose their batch axis while 2-D outputs keep it.
            if len(features.shape) > 2:
                features = features.squeeze()

        return features.cpu().numpy()
    except Exception as e:
        # Best-effort by design: keep the long extraction run alive, but say
        # which image failed instead of silently swallowing the error.
        print(f"⚠ Feature extraction failed for {image_path}: {e!r}")
        return None

# ============================================================================
# 6. HELPER FUNCTION TO FIND IMAGE PATH
# ============================================================================

def get_image_path(filename, base_paths):
    """
    Resolve an image file name to an existing path on disk.

    The folder is chosen from the file name itself, because the Karpathy split
    mixes images from the train2014 and val2014 folders.

    Args:
        filename: e.g., 'COCO_train2014_000000123456.jpg' or 'COCO_val2014_000000123456.jpg'
        base_paths: dict with 'train2014' and 'val2014' keys

    Returns:
        The full path when the file exists; None when the name matches neither
        folder or the file is not present.
    """
    # First marker found in the name wins; 'train2014' is checked before 'val2014'
    folder = next(
        (marker for marker in ('train2014', 'val2014') if marker in filename),
        None,
    )
    if folder is None:
        return None

    candidate = f"{base_paths[folder]}/{filename}"
    return candidate if os.path.exists(candidate) else None

# ============================================================================
# 7. EXTRACT AND SAVE FEATURES FOR EACH SPLIT
# ============================================================================

def extract_and_save_split_features(df, split_name, base_paths, models_dict):
    """Extract features for all images in a split and save them.

    Args:
        df: DataFrame with 'image_id' and 'file_name' columns, one row per image.
        split_name: label used in log output and in the saved file names.
        base_paths: dict with 'train2014' and 'val2014' keys (see get_image_path).
        models_dict: dict mapping a model name to the network used for extraction.

    Returns:
        dict mapping each model name to a {image_id: features} dict.

    Side effects:
        Writes one 'coco_{split_name}_{model_name}_features.npy' file per model
        into the current working directory and prints a progress report.
    """
    print(f"\n{'='*70}")
    print(f"EXTRACTING {split_name.upper()} FEATURES ({len(df):,} images)")
    print(f"{'='*70}")

    features_by_model = {model_name: {} for model_name in models_dict}
    missing_images = []      # file names that could not be located on disk
    failed_extractions = []  # (file name, model name) pairs where extraction returned None
    processed = 0            # images that were found on disk and attempted

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{split_name}"):
        img_id = row['image_id']
        img_filename = row['file_name']

        # Find the correct image path (from train2014 or val2014)
        img_path = get_image_path(img_filename, base_paths)

        if img_path is None:
            missing_images.append(img_filename)
            continue

        # Extract features with each model
        for model_name, model in models_dict.items():
            features = extract_features(img_path, model)
            if features is None:
                # Previously dropped without a trace; record it for the report
                failed_extractions.append((img_filename, model_name))
            else:
                features_by_model[model_name][img_id] = features

        processed += 1

    # Report statistics
    print(f"\n✓ Processed: {processed:,} images")
    if missing_images:
        print(f"⚠ Missing: {len(missing_images):,} images")
        if len(missing_images) <= 10:
            print(f"  Missing files: {missing_images}")
        else:
            print(f"  First 5 missing: {missing_images[:5]}")
    if failed_extractions:
        print(f"⚠ Failed extractions: {len(failed_extractions):,} (image, model) pairs")
        print(f"  First 5 failed: {failed_extractions[:5]}")

    # Save features for each model (np.save stores the dict as a pickled object)
    print(f"\nSaving features...")
    for model_name, model_features in features_by_model.items():
        save_path = f'coco_{split_name}_{model_name}_features.npy'
        np.save(save_path, model_features)
        feature_shape = next(iter(model_features.values())).shape if model_features else 'N/A'
        print(f"  ✓ {save_path}: {len(model_features):,} images, shape={feature_shape}")

    return features_by_model

# ============================================================================
# 8. SETUP IMAGE PATHS (BOTH train2014 AND val2014)
# ============================================================================

# Image folders for both COCO 2014 subsets
BASE_PATHS = dict(
    train2014='/kaggle/input/coco2014/train2014/train2014',
    val2014='/kaggle/input/coco2014/val2014/val2014',
)

# Verify paths exist
print(f"\n{'='*70}", "VERIFYING IMAGE DIRECTORIES", f"{'='*70}", sep="\n")

for folder, path in BASE_PATHS.items():
    if os.path.exists(path):
        # Count only the .jpg entries of the directory
        num_files = sum(1 for entry in os.listdir(path) if entry.endswith('.jpg'))
        print(f"✓ {folder}: {path}")
        print(f"  Files: {num_files:,} .jpg images")
    else:
        print(f"⚠ Warning: Path not found: {path}")

# ============================================================================
# 9. RUN FEATURE EXTRACTION
# ============================================================================

# Models to use, keyed by the name that ends up in the saved file names
models_dict = dict(vgg16=vgg16, resnet101=resnet101)

# Extract features for each split
print(f"\n{'='*70}", "STARTING FEATURE EXTRACTION", f"{'='*70}", sep="\n")

# Extract and save features for the training split.
# (A second copy of this entire cell had been pasted inside the argument list of
# this call, which made the cell a SyntaxError; the duplicate is removed.)
train_features = extract_and_save_split_features(train_df, 'train', BASE_PATHS, models_dict)
# Same extraction for the validation and test splits
val_features = extract_and_save_split_features(val_df, 'val', BASE_PATHS, models_dict)
test_features = extract_and_save_split_features(test_df, 'test', BASE_PATHS, models_dict)

# ============================================================================
# 10. SAVE DATAFRAMES WITH METADATA
# ============================================================================

print(f"\n{'='*70}", "SAVING CAPTION METADATA", f"{'='*70}", sep="\n")

# Pickle keeps the caption lists as real Python lists
for split_frame, pickle_path in (
    (train_df, 'train_captions.pkl'),
    (val_df, 'val_captions.pkl'),
    (test_df, 'test_captions.pkl'),
):
    split_frame.to_pickle(pickle_path)
print("✓ Saved .pkl files")

# CSV copies without the index column
for split_frame, csv_path in (
    (train_df, 'train_captions.csv'),
    (val_df, 'val_captions.csv'),
    (test_df, 'test_captions.csv'),
):
    split_frame.to_csv(csv_path, index=False)
print("✓ Saved .csv files")

# ============================================================================
# 11. FINAL SUMMARY
# ============================================================================

# Closing banner
print(
    f"\n{'='*70}",
    "FEATURE EXTRACTION COMPLETE! ✓",
    f"{'='*70}",
    sep="\n",
)

# Number of images (DataFrame rows) per split and overall
print(
    f"\nDataset sizes:",
    f"  Train: {len(train_df):,} images",
    f"  Val:   {len(val_df):,} images",
    f"  Test:  {len(test_df):,} images",
    f"  TOTAL: {len(train_df) + len(val_df) + len(test_df):,} images",
    sep="\n",
)

# Files written by the steps above
print(
    f"\nSaved files:",
    f"  Captions:",
    f"    - train_captions.pkl / .csv",
    f"    - val_captions.pkl / .csv",
    f"    - test_captions.pkl / .csv",
    f"  Features:",
    f"    - coco_train_vgg16_features.npy",
    f"    - coco_train_resnet101_features.npy",
    f"    - coco_val_vgg16_features.npy",
    f"    - coco_val_resnet101_features.npy",
    f"    - coco_test_vgg16_features.npy",
    f"    - coco_test_resnet101_features.npy",
    sep="\n",
)

# ============================================================================
# 12. VERIFICATION
# ============================================================================

print(f"\n{'='*70}", "VERIFICATION", f"{'='*70}", sep="\n")

# Reload the training artefacts from disk; each .npy file holds a pickled dict,
# hence allow_pickle=True followed by .item() to unwrap it
loaded_vgg_features = np.load('coco_train_vgg16_features.npy', allow_pickle=True).item()
loaded_resnet_features = np.load('coco_train_resnet101_features.npy', allow_pickle=True).item()
loaded_df = pd.read_pickle('train_captions.pkl')

# Shape of the first stored feature of each model
first_vgg_feature = list(loaded_vgg_features.values())[0]
first_resnet_feature = list(loaded_resnet_features.values())[0]

print(
    f"\nTrain set verification:",
    f"  VGG16 features: {len(loaded_vgg_features):,} images",
    f"    Shape: {first_vgg_feature.shape}",
    f"  ResNet101 features: {len(loaded_resnet_features):,} images",
    f"    Shape: {first_resnet_feature.shape}",
    f"  Captions DataFrame: {len(loaded_df):,} rows",
    sep="\n",
)

# Show the first image of the reloaded frame with all of its captions
sample_row = loaded_df.iloc[0]
print(f"\nSample image (ID={sample_row['image_id']}):")
print(f"  File: {sample_row['file_name']}")
print(f"  Captions:")
for number, cap in enumerate(sample_row['captions'], start=1):
    print(f"    {number}. {cap}")

print(
    f"\n{'='*70}",
    "NEXT STEP: Build vocabulary from training captions",
    f"{'='*70}",
    sep="\n",
)

Loading Karpathy split...
✓ Loaded Karpathy split: 123287 total images

Organizing data by Karpathy split...

KARPATHY SPLIT DISTRIBUTION
Train images: 113,287
Val images:   5,000
Test images:  5,000

Analyzing image sources...
Train split:
  From train2014/: 82,783 images
  From val2014/:   30,504 images
Val split:
  From train2014/: 0 images
  From val2014/:   5,000 images
Test split:
  From train2014/: 0 images
  From val2014/:   5,000 images

Sample filenames:
Train: COCO_val2014_000000522418.jpg
Val:   COCO_val2014_000000184613.jpg
Test:  COCO_val2014_000000391895.jpg

LOADING PRETRAINED MODELS
Device: cuda

[1/2] Loading VGG16...
✓ VGG16 loaded (fc7: 4096-dim)

[2/2] Loading ResNet101...
✓ ResNet101 loaded (avgpool: 2048-dim)


VERIFYING IMAGE DIRECTORIES
✓ train2014: /kaggle/input/coco2014/train2014/train2014
  Files: 82,783 .jpg images
✓ val2014: /kaggle/input/coco2014/val2014/val2014
  Files: 40,504 .jpg images

STARTING FEATURE EXTRACTION

EXTRACTING TRAIN FEATURES (113,287 images)

train: 100%|██████████| 113287/113287 [1:23:46<00:00, 22.54it/s]



✓ Processed: 113,287 images

Saving features...
  ✓ coco_train_vgg16_features.npy: 113,287 images, shape=(1, 4096)
  ✓ coco_train_resnet101_features.npy: 113,287 images, shape=(2048,)

EXTRACTING VAL FEATURES (5,000 images)


val: 100%|██████████| 5000/5000 [03:44<00:00, 22.30it/s]



✓ Processed: 5,000 images

Saving features...
  ✓ coco_val_vgg16_features.npy: 5,000 images, shape=(1, 4096)
  ✓ coco_val_resnet101_features.npy: 5,000 images, shape=(2048,)

EXTRACTING TEST FEATURES (5,000 images)


test: 100%|██████████| 5000/5000 [03:43<00:00, 22.42it/s]



✓ Processed: 5,000 images

Saving features...
  ✓ coco_test_vgg16_features.npy: 5,000 images, shape=(1, 4096)
  ✓ coco_test_resnet101_features.npy: 5,000 images, shape=(2048,)

SAVING CAPTION METADATA
✓ Saved .pkl files
✓ Saved .csv files

FEATURE EXTRACTION COMPLETE! ✓

Dataset sizes:
  Train: 113,287 images
  Val:   5,000 images
  Test:  5,000 images
  TOTAL: 123,287 images

Saved files:
  Captions:
    - train_captions.pkl / .csv
    - val_captions.pkl / .csv
    - test_captions.pkl / .csv
  Features:
    - coco_train_vgg16_features.npy
    - coco_train_resnet101_features.npy
    - coco_val_vgg16_features.npy
    - coco_val_resnet101_features.npy
    - coco_test_vgg16_features.npy
    - coco_test_resnet101_features.npy

VERIFICATION

Train set verification:
  VGG16 features: 113,287 images
    Shape: (1, 4096)
  ResNet101 features: 113,287 images
    Shape: (2048,)
  Captions DataFrame: 113,287 rows

Sample image (ID=522418):
  File: COCO_val2014_000000522418.jpg
  Captions:
    1.

Vocabulary creation (9,221 words)


In [None]:
import pandas as pd
import numpy as np
import pickle
import re
from collections import Counter
from tqdm import tqdm

# ============================================================================
# VOCABULARY BUILDER FOR IMAGE CAPTIONING
# ============================================================================
# This script builds a vocabulary from training captions, matching the paper's
# approach: ~9,221 words + special tokens
# ============================================================================

class Vocabulary:
    """Two-way word <-> index mapping for image captioning.

    Indices 0-3 are reserved for the special tokens <PAD>, <START>, <END> and
    <UNK>; regular words receive consecutive indices starting at 4.
    """

    def __init__(self):
        pad, start, end, unk = '<PAD>', '<START>', '<END>', '<UNK>'

        # The special tokens occupy the first four indices
        self.word2idx = {pad: 0, start: 1, end: 2, unk: 3}
        self.idx2word = {index: token for token, index in self.word2idx.items()}
        self.word_counts = Counter()

        self.PAD_TOKEN = pad
        self.START_TOKEN = start
        self.END_TOKEN = end
        self.UNK_TOKEN = unk

        self.idx = 4  # Next available index

    def add_word(self, word):
        """Register `word` under the next free index; known words are left untouched."""
        if word in self.word2idx:
            return
        slot = self.idx
        self.word2idx[word] = slot
        self.idx2word[slot] = word
        self.idx = slot + 1

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.word2idx)

    def __call__(self, word):
        """Return the index of `word`, or the <UNK> index when it is not known."""
        unk_index = self.word2idx[self.UNK_TOKEN]
        return self.word2idx.get(word, unk_index)


def tokenize_caption(caption):
    """
    Tokenize a caption:
    - Convert to lowercase
    - Replace punctuation with spaces, keeping apostrophes and hyphens
    - Split on whitespace
    - Strip apostrophes/hyphens from the edges of each token, so they survive
      only inside words ("dog's", "well-known") and never as tokens of their
      own ("-", "--") or as quote marks glued to a word ("'stop'")
    
    Args:
        caption: str, raw caption text
    
    Returns:
        list of tokens (possibly empty)
    """
    # Lowercase, then blank out everything that is not a word character,
    # whitespace, an apostrophe, or a hyphen.
    cleaned = re.sub(r'[^\w\s\'-]', ' ', caption.lower())
    
    tokens = []
    # str.split() with no argument already collapses runs of whitespace and
    # never yields empty strings.
    for raw_token in cleaned.split():
        # Apostrophes and hyphens are only meaningful inside a word; at the
        # edges they are leftover punctuation (dashes, quotes).
        word = raw_token.strip("'-")
        if word:
            tokens.append(word)
    
    return tokens


def build_vocabulary(train_captions_df, vocab_size=9221, min_word_freq=5):
    """
    Build vocabulary from training captions.
    
    Every caption is tokenized with tokenize_caption and counted into
    vocab.word_counts. The counter keeps ALL observed words, including those
    that do not make it into the vocabulary. Words seen at least
    min_word_freq times are ranked by frequency (ties keep first-seen order)
    and the most frequent ones are added after the special tokens.
    
    Args:
        train_captions_df: pandas DataFrame with 'captions' column; each cell
            is an iterable of caption strings
        vocab_size: maximum TOTAL vocabulary size, including the special
            tokens the Vocabulary starts with
        min_word_freq: minimum frequency for a word to be included
    
    Returns:
        Vocabulary object
    """
    print(f"{'='*70}")
    print("BUILDING VOCABULARY")
    print(f"{'='*70}")
    
    vocab = Vocabulary()
    # A fresh Vocabulary already contains its special tokens; they count
    # towards vocab_size.
    num_special = len(vocab)
    
    # Count all words in training captions
    print("\n[1/3] Tokenizing and counting words...")
    # Only the running total is needed for the report, so the tokens
    # themselves are not kept in memory.
    total_tokens = 0
    
    for captions in tqdm(train_captions_df['captions'],
                         total=len(train_captions_df),
                         desc="Processing"):
        # Process each caption for this image (usually 5 captions per image)
        for caption in captions:
            tokens = tokenize_caption(caption)
            total_tokens += len(tokens)
            vocab.word_counts.update(tokens)
    
    print(f"✓ Total tokens processed: {total_tokens:,}")
    print(f"✓ Unique words found: {len(vocab.word_counts):,}")
    
    # Filter by minimum frequency
    print(f"\n[2/3] Filtering words by frequency (min_freq={min_word_freq})...")
    filtered_words = {word: count for word, count in vocab.word_counts.items() 
                      if count >= min_word_freq}
    print(f"✓ Words with freq >= {min_word_freq}: {len(filtered_words):,}")
    
    # Sort by frequency and take the top words that still fit
    print(f"\n[3/3] Building vocabulary (size={vocab_size})...")
    most_common = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)
    
    # Slots left once the special tokens are accounted for. Clamp at zero:
    # a negative bound would slice from the END of the list and let almost
    # every word through.
    regular_slots = max(vocab_size - num_special, 0)
    top_words = most_common[:regular_slots]
    
    # Add to vocabulary
    for word, _count in tqdm(top_words, desc="Adding words"):
        vocab.add_word(word)
    
    print("\n✓ Vocabulary built successfully!")
    print(f"  Total vocabulary size: {len(vocab)} words")
    print(f"  Special tokens: {[vocab.PAD_TOKEN, vocab.START_TOKEN, vocab.END_TOKEN, vocab.UNK_TOKEN]}")
    print(f"  Regular words: {len(vocab) - num_special}")
    
    return vocab


def analyze_vocabulary(vocab, train_df):
    """
    Print summary statistics for a vocabulary and the training captions.
    
    Reports the 20 most frequent words in vocab.word_counts, the share of
    counted tokens whose word is present in vocab.word2idx, and statistics
    on tokenized caption lengths. Empty inputs are reported rather than
    raising (no division by zero, no min()/max() on an empty list).
    
    Args:
        vocab: Vocabulary object with word_counts already populated
        train_df: training DataFrame with a 'captions' column; each cell is
            an iterable of caption strings
    
    Returns:
        None; all results are printed to stdout.
    """
    print(f"\n{'='*70}")
    print("VOCABULARY STATISTICS")
    print(f"{'='*70}")
    
    # Most common words
    print("\nTop 20 most common words:")
    for i, (word, count) in enumerate(vocab.word_counts.most_common(20), 1):
        print(f"  {i:2d}. '{word}': {count:,} occurrences")
    
    # Calculate coverage: tokens whose word made it into the vocabulary
    total_tokens = sum(vocab.word_counts.values())
    vocab_coverage = sum(count for word, count in vocab.word_counts.items() 
                         if word in vocab.word2idx)
    # An empty counter would otherwise divide by zero.
    coverage_pct = (vocab_coverage / total_tokens) * 100 if total_tokens else 0.0
    
    print("\nVocabulary Coverage:")
    print(f"  Total tokens in training: {total_tokens:,}")
    print(f"  Tokens covered by vocab: {vocab_coverage:,}")
    print(f"  Coverage: {coverage_pct:.2f}%")
    
    # Caption length statistics
    print("\nCaption Length Statistics:")
    caption_lengths = [
        len(tokenize_caption(caption))
        for captions in train_df['captions']
        for caption in captions
    ]
    
    if not caption_lengths:
        # min()/max() and the percentages below are undefined without data.
        print("  No captions found; skipping length statistics")
        return
    
    print(f"  Min length: {min(caption_lengths)} words")
    print(f"  Max length: {max(caption_lengths)} words")
    print(f"  Mean length: {np.mean(caption_lengths):.2f} words")
    print(f"  Median length: {np.median(caption_lengths):.0f} words")
    print(f"  Std dev: {np.std(caption_lengths):.2f} words")
    
    # Distribution of caption lengths over half-open bins (lower, upper]
    print("\n  Length distribution:")
    length_bins = [0, 5, 10, 15, 20, 25, 100]
    num_captions = len(caption_lengths)
    binned = 0
    for lower, upper in zip(length_bins, length_bins[1:]):
        count = sum(1 for length in caption_lengths if lower < length <= upper)
        binned += count
        pct = (count / num_captions) * 100
        print(f"    {lower+1}-{upper} words: {count:,} ({pct:.1f}%)")
    
    # Captions with no tokens, or longer than the last bin edge, fall outside
    # every bin; report them so the percentages account for all captions.
    unbinned = num_captions - binned
    if unbinned:
        pct = (unbinned / num_captions) * 100
        print(f"    outside 1-{length_bins[-1]} words: {unbinned:,} ({pct:.1f}%)")


def save_vocabulary(vocab, filepath='vocabulary.pkl'):
    """Pickle `vocab` to `filepath` (overwriting it) and print a confirmation."""
    serialized = pickle.dumps(vocab)
    with open(filepath, 'wb') as sink:
        sink.write(serialized)
    print(f"\n✓ Vocabulary saved to: {filepath}")


def load_vocabulary(filepath='vocabulary.pkl'):
    """Unpickle and return the object stored at `filepath`, printing its size.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    project produced itself (e.g. via save_vocabulary).
    """
    with open(filepath, 'rb') as source:
        restored = pickle.loads(source.read())
    print(f"✓ Vocabulary loaded from: {filepath}")
    print(f"  Size: {len(restored)} words")
    return restored


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # End-to-end driver: load captions, build / analyze / save the vocabulary,
    # then print a few sanity checks. Section banners are emitted with a
    # single print(..., sep="\n") call per banner.
    print("\n" + "=" * 70, "IMAGE CAPTIONING - VOCABULARY BUILDER", "=" * 70, sep="\n")

    # --- Load training captions (a pickled DataFrame in the working dir) ---
    print("\nLoading training captions...")
    train_df = pd.read_pickle('train_captions.pkl')
    print(f"✓ Loaded {len(train_df):,} training images")

    # --- Build, analyze, persist ---
    # vocab_size is the TOTAL size: 9,217 regular words + 4 special tokens.
    vocab = build_vocabulary(train_df, vocab_size=9221, min_word_freq=5)
    analyze_vocabulary(vocab, train_df)
    save_vocabulary(vocab, 'vocabulary.pkl')

    # --- Verification: round-trip the first caption of the first image ---
    print("\n" + "=" * 70, "VERIFICATION - Sample Tokenization", "=" * 70, sep="\n")

    sample_caption = train_df.iloc[0]['captions'][0]
    print("\nOriginal caption:", f"  '{sample_caption}'", sep="\n")

    tokens = tokenize_caption(sample_caption)
    print("\nTokenized:", f"  {tokens}", sep="\n")

    # Calling the vocabulary maps a word to its index (<UNK> index if unseen).
    indices = list(map(vocab, tokens))
    print("\nWord indices:", f"  {indices}", sep="\n")

    reconstructed = [vocab.idx2word[position] for position in indices]
    print("\nReconstructed:", f"  {reconstructed}", sep="\n")

    # --- Special tokens and their indices ---
    print("\n" + "=" * 70, "SPECIAL TOKENS", "=" * 70, sep="\n")
    print(
        f"  {vocab.PAD_TOKEN}: {vocab(vocab.PAD_TOKEN)}",
        f"  {vocab.START_TOKEN}: {vocab(vocab.START_TOKEN)}",
        f"  {vocab.END_TOKEN}: {vocab(vocab.END_TOKEN)}",
        f"  {vocab.UNK_TOKEN}: {vocab(vocab.UNK_TOKEN)}",
        sep="\n",
    )

    # --- A word outside the vocabulary must map to the <UNK> index ---
    print("\nTest unknown word:")
    unknown_word = "xyzabc123notinvocab"
    print(f"  '{unknown_word}' -> index {vocab(unknown_word)} ({vocab.UNK_TOKEN})")

    print("\n" + "=" * 70, "VOCABULARY BUILDING COMPLETE! ✓", "=" * 70, sep="\n")
    print(
        "\nNext steps:",
        "  1. Use vocabulary.pkl in your training script",
        "  2. Convert captions to sequences using vocab(word)",
        "  3. Pad sequences to max length (recommend 15-20 based on paper)",
        sep="\n",
    )

### 

### 