# Data loading and feature extraction

#### Load Karpathy split and organize the COCO data according to it

In [None]:
import json
import pandas as pd
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
from PIL import Image
import numpy as np
import os


# Location of the Karpathy split annotation file (Kaggle dataset mount).
karpathy_file = '/kaggle/input/karpathy-splits/dataset_coco.json'

# Fail early with an explicit message instead of a bare open() error.
if not os.path.exists(karpathy_file):
    raise FileNotFoundError(f"Karpathy split not found at: {karpathy_file}")

# JSON text is UTF-8 by specification; pin the encoding so that decoding the
# captions does not depend on the platform's locale default.
with open(karpathy_file, 'r', encoding='utf-8') as f:
    karpathy_data = json.load(f)


def organize_by_split(karpathy_data):
    """Group the Karpathy image entries into train / val / test lists.

    Entries tagged ``'restval'`` are folded into the training set. Entries
    with any other split tag are ignored.

    Args:
        karpathy_data: Parsed annotation dict with an ``'images'`` list; each
            entry provides ``'split'``, ``'cocoid'``, ``'filename'`` and
            ``'sentences'`` (each sentence carrying a ``'raw'`` string).

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'``; each value is a
        list of dicts with ``'image_id'``, ``'file_name'`` and ``'captions'``.
    """
    known_splits = ('train', 'val', 'test')
    grouped = {name: [] for name in known_splits}

    for entry in karpathy_data['images']:
        # 'restval' images are used as additional training data
        target = 'train' if entry['split'] == 'restval' else entry['split']
        if target not in known_splits:
            continue

        raw_captions = [sentence['raw'] for sentence in entry['sentences']]
        grouped[target].append({
            'image_id': entry['cocoid'],
            'file_name': entry['filename'],
            'captions': raw_captions,
        })

    return grouped

# Bucket the annotation entries by split, then build one DataFrame per split.
splits_data = organize_by_split(karpathy_data)

train_df, val_df, test_df = (
    pd.DataFrame(splits_data[split_key])
    for split_key in ('train', 'val', 'test')
)

# Show the first training rows as a quick sanity check.
print(train_df.head())

Load the pretrained feature extractor models

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device: {device}")

# VGG16 - fc7 features (4096-dim) - matching the paper
# Loaded with the ImageNet-1k (V1) pretrained weights.

vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16.classifier = vgg16.classifier[:-1]  # drop the final classifier layer so the output is the fc7 activation
vgg16 = vgg16.to(device)
vgg16.eval()  # inference mode; the model is only used as a fixed feature extractor

# ResNet101
# Loaded with the ImageNet-1k (V1) pretrained weights.

resnet101 = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
resnet101 = torch.nn.Sequential(*list(resnet101.children())[:-1])  # keep every child module except the last one (the FC layer)
resnet101 = resnet101.to(device)
resnet101.eval()  # inference mode; the model is only used as a fixed feature extractor

## Feature extraction

In [None]:
# image preprocessing
# Resize, center-crop to 224x224, convert to a tensor, then normalize each
# channel with the mean/std below.
# NOTE(review): these look like the standard ImageNet statistics expected by
# the torchvision pretrained weights - confirm.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225])
])


def extract_features(image_path, model):
    """Run one image through ``model`` and return its features as a NumPy array.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``transform`` and moved to the module-level ``device`` as a
    batch of one. The forward pass runs with gradients disabled.

    Args:
        image_path: Path of the image file to load.
        model: Callable network applied to the single-image batch.

    Returns:
        The model output with the batch axis removed, as a CPU NumPy array.
        Remaining singleton axes are squeezed too, so a ``(1, D)`` output and
        a ``(1, D, 1, 1)`` output both come back as ``(D,)``.
        ``None`` if loading the image or the forward pass raised; a warning
        describing the failure is emitted in that case.
    """
    import warnings

    try:
        # Close the file handle as soon as the RGB copy has been decoded.
        with Image.open(image_path) as raw_image:
            img = raw_image.convert('RGB')
        img = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            features = model(img)

        # Always drop the batch axis so every model yields a per-image
        # array; previously a (1, D) output kept its leading axis while a
        # (1, D, 1, 1) output was squeezed down to (D,).
        features = features.squeeze(0)
        if features.dim() > 1:
            features = features.squeeze()

        return features.cpu().numpy()
    except Exception as e:
        # Best-effort extraction: skip the image, but say why it was skipped.
        warnings.warn(
            f"Feature extraction failed for {image_path}: {e!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None


def get_image_path(filename, base_paths):
    """Resolve a COCO file name to its full path on disk.

    The folder is chosen from the file name: names containing ``train2014``
    map to ``base_paths['train2014']``, otherwise names containing
    ``val2014`` map to ``base_paths['val2014']``.

    Args:
        filename: Image file name.
        base_paths: Mapping from folder tag to its directory.

    Returns:
        ``"<directory>/<filename>"`` if that path exists, else ``None``.
        ``None`` is also returned when the name contains neither folder tag.
    """
    # 'train2014' takes precedence over 'val2014', as tested in this order
    for folder in ('train2014', 'val2014'):
        if folder in filename:
            break
    else:
        return None

    candidate = f"{base_paths[folder]}/{filename}"
    return candidate if os.path.exists(candidate) else None



def extract_and_save_split_features(df, split_name, base_paths, models_dict):
    """Extract features for every image of one split with every model.

    Despite its name, this function does not write anything to disk; it only
    builds and returns the in-memory feature dictionaries.

    Args:
        df: DataFrame with ``'image_id'`` and ``'file_name'`` columns.
        split_name: Label used for the progress bar and the summary line.
        base_paths: Folder mapping forwarded to ``get_image_path``.
        models_dict: Mapping from model name to the model to run.

    Returns:
        Dict mapping each model name to a dict ``{image_id: features}``.
        Images that cannot be located, or whose extraction returned ``None``
        for a given model, are absent from that model's dict.
    """
    features_by_model = {model_name: {} for model_name in models_dict}
    missing_images = []
    failed_extractions = 0
    processed = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{split_name}"):
        img_id = row['image_id']
        img_filename = row['file_name']

        img_path = get_image_path(img_filename, base_paths)

        if img_path is None:
            missing_images.append(img_filename)
            continue

        # extract features with each model
        for model_name, model in models_dict.items():
            features = extract_features(img_path, model)
            if features is None:
                failed_extractions += 1
                continue
            features_by_model[model_name][img_id] = features

        processed += 1

    # Surface the counters that were previously computed and then discarded.
    print(
        f"{split_name}: processed {processed} images, "
        f"{len(missing_images)} not found on disk, "
        f"{failed_extractions} failed extractions"
    )

    # The callers assign this result; without the return they received None.
    return features_by_model

# Directories holding the COCO 2014 images, keyed by the folder tag that
# get_image_path searches for inside each file name.
BASE_PATHS = {
    'train2014': '/kaggle/input/coco2014/train2014/train2014',
    'val2014': '/kaggle/input/coco2014/val2014/val2014'
}


# Feature extractors to run on every image, keyed by model name.
models_dict = {
    'vgg16': vgg16,
    'resnet101': resnet101,
}

# Start feature extraction: one pass per split, running every model in
# models_dict on each image of that split.
train_features = extract_and_save_split_features(train_df, 'train', BASE_PATHS, models_dict)
val_features = extract_and_save_split_features(val_df, 'val', BASE_PATHS, models_dict)
test_features = extract_and_save_split_features(test_df, 'test', BASE_PATHS, models_dict)

Save the caption metadata

In [None]:
# Pickle copies of the caption tables; the vocabulary cell reads
# 'train_captions.pkl' back with pd.read_pickle.
train_df.to_pickle('train_captions.pkl')
val_df.to_pickle('val_captions.pkl')
test_df.to_pickle('test_captions.pkl')

# CSV copies of the same tables, written without the index column.
# NOTE(review): the 'captions' column holds lists, which CSV presumably stores
# as their string form - confirm before reading these files back.
train_df.to_csv('train_captions.csv', index=False)
val_df.to_csv('val_captions.csv', index=False)
test_df.to_csv('test_captions.csv', index=False)

# Vocabulary


In [None]:
import pandas as pd
import numpy as np
import pickle
import re
from collections import Counter
from tqdm import tqdm


# Vocabulary is capped at 9,221 entries in total, the 4 special tokens included (following the paper's approach)

class Vocabulary:
    """Bidirectional word <-> index mapping with four reserved special tokens.

    Indices 0-3 are taken by ``<PAD>``, ``<START>``, ``<END>`` and ``<UNK>``
    (in that order); regular words receive consecutive indices from 4 on.
    Calling the instance with a word returns its index, or the ``<UNK>``
    index when the word is not in the vocabulary.
    """

    def __init__(self):
        # raw word frequencies, filled in by whoever builds the vocabulary
        self.word_counts = Counter()

        # special tokens
        self.PAD_TOKEN = '<PAD>'
        self.START_TOKEN = '<START>'
        self.END_TOKEN = '<END>'
        self.UNK_TOKEN = '<UNK>'

        # the special tokens occupy the first indices, in this fixed order
        reserved = (
            self.PAD_TOKEN,
            self.START_TOKEN,
            self.END_TOKEN,
            self.UNK_TOKEN,
        )
        self.word2idx = {token: slot for slot, token in enumerate(reserved)}
        self.idx2word = {slot: token for slot, token in enumerate(reserved)}
        self.idx = len(reserved)  # next available index

    def add_word(self, word):
        """Register ``word`` under the next free index; no-op if already known."""
        if word in self.word2idx:
            return
        slot = self.idx
        self.word2idx[word] = slot
        self.idx2word[slot] = word
        self.idx = slot + 1

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.word2idx)

    def __call__(self, word):
        """Return the index of ``word``, or the ``<UNK>`` index if unknown."""
        unknown_index = self.word2idx[self.UNK_TOKEN]
        return self.word2idx.get(word, unknown_index)


def tokenize_caption(caption):
    """
    Split a caption into lowercase word tokens.

    - Lowercase the text
    - Replace every character that is not a word character, whitespace,
      apostrophe or hyphen with a space
    - Split on whitespace (which never yields empty tokens)

    """
    # keeps alphanumeric, apostrophes, and hyphens
    cleaned = re.sub(r'[^\w\s\'-]', ' ', caption.lower())
    return cleaned.split()


def build_vocabulary(train_captions_df, vocab_size=9221, min_word_freq=5):
    """Build a ``Vocabulary`` from the training captions.

    Every caption is tokenized with ``tokenize_caption`` and counted into
    ``vocab.word_counts``. Words seen at least ``min_word_freq`` times are
    ranked by count (ties keep first-seen order) and added until the
    vocabulary holds ``vocab_size`` entries, special tokens included.

    Args:
        train_captions_df: DataFrame whose ``'captions'`` column holds a list
            of caption strings per row.
        vocab_size: Maximum total vocabulary size, special tokens included.
        min_word_freq: Minimum count a word needs to be eligible.

    Returns:
        The populated ``Vocabulary``.
    """
    vocab = Vocabulary()

    # count all words in training captions
    for _, row in tqdm(train_captions_df.iterrows(),
                       total=len(train_captions_df),
                       desc="Processing"):
        for caption in row['captions']:
            vocab.word_counts.update(tokenize_caption(caption))

    # filter by minimum frequency, most frequent first
    frequent = [
        (word, count)
        for word, count in vocab.word_counts.items()
        if count >= min_word_freq
    ]
    frequent.sort(key=lambda item: item[1], reverse=True)

    # Slots left once the special tokens are accounted for. Clamp at zero: a
    # negative slice bound would select almost every word instead of none.
    remaining_slots = max(vocab_size - len(vocab), 0)

    for word, _ in tqdm(frequent[:remaining_slots], desc="Adding words"):
        vocab.add_word(word)

    return vocab


def save_vocabulary(vocab, filepath='vocabulary.pkl'):
    """Pickle ``vocab`` to ``filepath``, overwriting any existing file."""
    with open(filepath, mode='wb') as sink:
        pickle.dump(vocab, sink)


def load_vocabulary(filepath='vocabulary.pkl'):
    """Unpickle and return the object stored at ``filepath``.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    with open(filepath, mode='rb') as source:
        return pickle.load(source)




# load training captions
# (the pickle written by the caption-metadata cell)
train_df = pd.read_pickle('train_captions.pkl')

# Build the vocabulary from the training captions only, capped at 9221
# entries and restricted to words seen at least 5 times.
vocab = build_vocabulary(
    train_df, 
    vocab_size=9221,  
    min_word_freq=5  
)

# NOTE(review): analyze_vocabulary is not defined in the visible part of this
# file - confirm it is defined elsewhere, otherwise this call raises NameError.
analyze_vocabulary(vocab, train_df)

# Persist the vocabulary for later cells.
save_vocabulary(vocab, 'vocabulary.pkl')

# Dataset and DataLoaders

Dataset Objects

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import pickle
import random
from typing import Tuple, List


class CaptionDataset(Dataset):
    """
    Dataset for image captioning that serves pre-extracted features and captions.

    Only rows of ``captions_df`` whose ``image_id`` is a key of
    ``features_dict`` are exposed; rows without features are skipped.

    Args:
        captions_df: DataFrame with columns ['image_id', 'file_name', 'captions']
        features_dict: Dictionary mapping image_id -> feature vector (NumPy array)
        vocabulary: Vocabulary object
        max_length: Maximum number of caption words kept (default: 15, matching paper)
        training: If True, a random caption is drawn on every access;
            otherwise the first caption of the image is always used
    """

    def __init__(self,
                 captions_df: pd.DataFrame,
                 features_dict: dict,
                 vocabulary,
                 max_length: int = 15,
                 training: bool = True):

        self.captions_df = captions_df.reset_index(drop=True)
        self.features_dict = features_dict
        self.vocab = vocabulary
        self.max_length = max_length
        self.training = training

        # Positions (in the re-indexed frame) of the rows that have features.
        # __len__ and __getitem__ rely on this list; it was previously never
        # created, so the dataset failed on first use.
        if 'image_id' in self.captions_df.columns:
            image_ids = self.captions_df['image_id']
        else:
            # an empty frame built from an empty list has no columns at all
            image_ids = []
        self.valid_indices = [
            position
            for position, image_id in enumerate(image_ids)
            if image_id in features_dict
        ]

    def __len__(self):
        """Number of images that have both captions and features."""
        return len(self.valid_indices)

    def tokenize_caption(self, caption: str) -> List[str]:
        """Lowercase ``caption``, blank out punctuation other than apostrophes
        and hyphens, and split it on whitespace."""
        import re
        caption = caption.lower()
        caption = re.sub(r'[^\w\s\'-]', ' ', caption)
        return caption.split()

    def caption_to_sequence(self, caption: str) -> Tuple[torch.Tensor, int]:
        """
        Convert caption to sequence of word indices with <START> and <END>. Pad to max length using <PAD>

        Returns:
            A long tensor of ``max_length + 2`` indices and the unpadded
            length (words kept plus START and END).
        """
        # keep at most max_length words; START and END are added on top
        words = self.tokenize_caption(caption)[:self.max_length]

        # convert to indices and add START + END
        tokens = [self.vocab.word2idx[self.vocab.START_TOKEN]]
        tokens.extend(self.vocab(word) for word in words)
        tokens.append(self.vocab.word2idx[self.vocab.END_TOKEN])

        actual_length = len(tokens)

        # pad to max_length + 2 (for START and END)
        missing = self.max_length + 2 - actual_length
        if missing > 0:
            tokens.extend([self.vocab.word2idx[self.vocab.PAD_TOKEN]] * missing)

        return torch.tensor(tokens, dtype=torch.long), actual_length

    def __getitem__(self, idx):
        """
        returns:
            image_features: float tensor built from the stored feature array
            caption: Tensor of shape (max_length + 2,) - padded caption sequence
            caption_length: int - actual caption length including START/END
        """
        df_idx = self.valid_indices[idx]
        row = self.captions_df.iloc[df_idx]

        # Get image features
        image_features = torch.from_numpy(self.features_dict[row['image_id']]).float()

        # get caption: random one while training, first one otherwise
        captions = row['captions']
        caption = random.choice(captions) if self.training else captions[0]

        # Convert caption to sequence
        caption_seq, caption_length = self.caption_to_sequence(caption)

        return image_features, caption_seq, caption_length


class CaptionDatasetAllCaptions(Dataset):
    """
    Dataset with one sample per (image, caption) pair that also carries the
    full list of reference captions of the image.

    Images whose ``image_id`` is not a key of ``features_dict`` are left out.

    Args:
        captions_df: DataFrame with columns ['image_id', 'file_name', 'captions']
        features_dict: Dictionary mapping image_id -> feature vector
        vocabulary: Vocabulary object
        max_length: Maximum caption length
    """

    def __init__(self,
                 captions_df: pd.DataFrame,
                 features_dict: dict,
                 vocabulary,
                 max_length: int = 15):

        self.captions_df = captions_df.reset_index(drop=True)
        self.features_dict = features_dict
        self.vocab = vocabulary
        self.max_length = max_length

        # flatten to one record per caption, skipping images without features
        self.data = [
            {
                'image_id': row['image_id'],
                'caption': caption,
                'all_captions': row['captions'],
            }
            for _, row in self.captions_df.iterrows()
            if row['image_id'] in self.features_dict
            for caption in row['captions']
        ]

    def tokenize_caption(self, caption: str) -> List[str]:
        """Lowercase, blank out punctuation except apostrophes/hyphens, split."""
        import re
        cleaned = re.sub(r'[^\w\s\'-]', ' ', caption.lower())
        return cleaned.split()

    def caption_to_sequence(self, caption: str) -> Tuple[torch.Tensor, int]:
        """Encode ``caption`` as START + word indices + END, padded with PAD
        up to ``max_length + 2``; also return the unpadded length."""
        lookup = self.vocab.word2idx
        kept_words = self.tokenize_caption(caption)[:self.max_length]

        ids = [
            lookup[self.vocab.START_TOKEN],
            *(self.vocab(word) for word in kept_words),
            lookup[self.vocab.END_TOKEN],
        ]
        actual_length = len(ids)

        pad_count = self.max_length + 2 - actual_length
        if pad_count > 0:
            ids += [lookup[self.vocab.PAD_TOKEN]] * pad_count

        return torch.tensor(ids, dtype=torch.long), actual_length

    def __len__(self):
        """Total number of (image, caption) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return (features, caption sequence, caption length, all captions)."""
        record = self.data[idx]

        feature_array = self.features_dict[record['image_id']]
        image_features = torch.from_numpy(feature_array).float()

        # convert caption to sequence
        caption_seq, caption_length = self.caption_to_sequence(record['caption'])

        return image_features, caption_seq, caption_length, record['all_captions']

## Helper functions and main dataloader entrypoint

In [None]:
def collate_fn(batch):
    """
    Merge a list of samples into batched tensors.

    Args:
        batch: List of (image_features, caption, length) tuples

    Returns:
        images: the feature tensors stacked along a new first axis
        captions: the caption tensors stacked along a new first axis
        lengths: long tensor holding one length per sample
    """
    # transpose the list of samples into three parallel tuples
    feature_rows, caption_rows, length_values = zip(*batch)

    return (
        torch.stack(feature_rows, dim=0),
        torch.stack(caption_rows, dim=0),
        torch.tensor(length_values, dtype=torch.long),
    )


def collate_fn_eval(batch):
    """
    Merge evaluation samples into batched tensors, keeping the reference captions.

    Args:
        batch: List of (image_features, caption, length, all_captions) tuples

    Returns:
        images: the feature tensors stacked along a new first axis
        captions: the caption tensors stacked along a new first axis
        lengths: long tensor holding one length per sample
        all_captions: List with the reference captions of each sample
    """
    # transpose the list of samples into four parallel tuples
    feature_rows, caption_rows, length_values, references = zip(*batch)

    return (
        torch.stack(feature_rows, dim=0),
        torch.stack(caption_rows, dim=0),
        torch.tensor(length_values, dtype=torch.long),
        list(references),
    )


def create_dataloaders(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    train_features: dict,
    val_features: dict,
    test_features: dict,
    vocabulary,
    batch_size: int = 64,
    max_length: int = 15,
    num_workers: int = 4,
    shuffle_train: bool = True
):
    """
    Build the train, validation, and test dataloaders and print a summary.

    The training dataset is created with ``training=True``; the validation
    and test datasets use ``training=False`` and are never shuffled. All
    loaders share ``batch_size``, ``num_workers``, ``collate_fn`` and
    ``pin_memory=True``.

    Args:
        train_df, val_df, test_df: DataFrames with captions
        train_features, val_features, test_features: Feature dictionaries
        vocabulary: Vocabulary object
        batch_size: Batch size used by all three loaders
        max_length: Maximum caption length
        num_workers: Number of worker processes for data loading
        shuffle_train: Whether to shuffle training data

    Returns:
        train_loader, val_loader, test_loader
    """

    def make_dataset(frame, features, is_training):
        # one CaptionDataset per split, sharing vocabulary and max_length
        return CaptionDataset(
            frame,
            features,
            vocabulary,
            max_length=max_length,
            training=is_training
        )

    def make_loader(dataset, shuffle):
        # identical loader settings for every split except shuffling
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            collate_fn=collate_fn,
            pin_memory=True
        )

    # create datasets
    train_dataset = make_dataset(train_df, train_features, True)
    val_dataset = make_dataset(val_df, val_features, False)
    test_dataset = make_dataset(test_df, test_features, False)

    # create dataloaders
    train_loader = make_loader(train_dataset, shuffle_train)
    val_loader = make_loader(val_dataset, False)
    test_loader = make_loader(test_dataset, False)

    summary_lines = (
        f"\nDataLoader Configuration:",
        f"  Batch size: {batch_size}",
        f"  Num workers: {num_workers}",
        f"  Max caption length: {max_length}",
        f"\nDataLoader Sizes:",
        f"  Train batches: {len(train_loader)}",
        f"  Val batches: {len(val_loader)}",
        f"  Test batches: {len(test_loader)}",
    )
    for line in summary_lines:
        print(line)

    return train_loader, val_loader, test_loader


### 

### 