### Step 1: Setup
In this step, we set up the basic execution environment. We define the GPU device if available, mount the project directory, download necessary datasets, and configure matplotlib settings. The autoreload extension is enabled to automatically reload modules if changes occur.

In [None]:
# =============================================================================
# Step 1: Setup Cell
# =============================================================================
import os
import sys
import time
import json
import numpy as np
import matplotlib.pyplot as plt
import torch

# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# Add the assignment folder to sys.path so that the cgudl modules can be imported.
# FOLDERNAME is relative to /home/jovyan/ and already ends with '/'.
FOLDERNAME = 'cgudl/cgudl-assignment3/'
# Only trips if FOLDERNAME is edited to None; it does not check that the path exists.
assert FOLDERNAME is not None, "[!] Enter the foldername."
sys.path.append('/home/jovyan/{}'.format(FOLDERNAME))

# IPython magics: switch into the datasets folder, run the download script,
# then return to the assignment root. $FOLDERNAME is expanded by IPython.
# Keep comments on their own lines; text after a magic is part of its argument.
%cd /home/jovyan/$FOLDERNAME/cgudl/datasets/
!bash get_datasets.sh
%cd /home/jovyan/$FOLDERNAME

# Render figures inline and set default figure size, interpolation and colormap.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Enable autoreload (mode 2) so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    Each element's error is |x - y| / max(1e-8, |x| + |y|); the 1e-8 floor
    keeps the division defined when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)


### Step 2: Load COCO Dataset
In this step, we load the COCO dataset using load_coco_data(pca_features=True) provided by cgudl. This function returns pre-extracted and PCA-compressed image features and captions (features typically have shape [N, 512] and captions [N, T]), along with a vocabulary dictionary for further decoding.

In [None]:

# =============================================================================
# Step 2: Load COCO Dataset
# =============================================================================
from cgudl.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cgudl.image_utils import image_from_url

# Fetch the pre-extracted, PCA-compressed image features together with the captions.
data = load_coco_data(pca_features=True)

# Quick sanity check: report the shapes of the training arrays.
for array_key, label in (
    ('train_features', "Train features shape:"),
    ('train_captions', "Train captions shape:"),
):
    print(label, data[array_key].shape)

# Keep the vocabulary mappings and their size as notebook-level names;
# later cells read word_to_idx, idx_to_word and vocab_size directly.
word_to_idx = data['word_to_idx']
vocab_size = len(word_to_idx)
print("Vocab size:", vocab_size)
idx_to_word = data['idx_to_word']



### Step 3: Data Inspection and Visualization
In this step, we sample a small batch of data using sample_coco_minibatch and then use image_from_url and decode_captions to display the images along with their corresponding captions. This helps students visually verify that the data has been loaded correctly and understand its format.

In [None]:
# =============================================================================
# Step 3: Data Inspection and Visualization
# =============================================================================
# Number of examples to draw for inspection (e.g. 32).
batch_size = 32
# NOTE(review): the original comment said batch_size=None samples a single image;
# that depends on sample_coco_minibatch and is not verifiable from this notebook.
captions_sample, features_sample, urls_sample = sample_coco_minibatch(
    data, batch_size=batch_size
)

# Show every sampled image with its decoded caption as the title.
for idx, (tokens, image_url) in enumerate(zip(captions_sample, urls_sample)):
    try:
        picture = image_from_url(image_url)
        if picture is not None:
            plt.imshow(picture)
            plt.axis('off')
            plt.title(decode_captions(tokens, data['idx_to_word']))
            plt.show()
        else:
            print(f"Image {idx}: URL not found, skip.")
    except Exception as err:
        # Best-effort display: report the failure and move on to the next image.
        print(f"Error displaying image {idx} from URL: {image_url}\n", err)


### Step 4: Model Definition (Pure RNN Version) - TODO Version
In this step, we define the model: the encoder and the decoder.

The SimpleEncoder maps the pre-extracted features (e.g., 512-dim) via a linear layer and BatchNorm to the embedding space.

The DecoderRNN uses a pure RNN (nn.RNN) to generate captions. In training, we split the captions into input (captions_in = captions[:, :-1]) and target (captions_out = captions[:, 1:]). The sample() method uses greedy search to generate captions.
The core parts are marked with TODO, which students must fill in.

In [None]:

# =============================================================================
# Step 4: Model Definition (Pure RNN Version) - TODO Version
# =============================================================================
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

# ----- (4-1) Custom Dataset (Keep unchanged) -----
class CocoFeatureCaptionDataset(Dataset):
    """Map-style dataset that pairs each feature row with its caption row.

    Both inputs are converted to tensors once, at construction time:
    features as float32 and captions as int64.
    """

    def __init__(self, features, captions):
        self.features = torch.tensor(features, dtype=torch.float)
        self.captions = torch.tensor(captions, dtype=torch.int64)

    def __len__(self):
        # The dataset length is the number of feature rows.
        return self.features.size(0)

    def __getitem__(self, idx):
        feature_row = self.features[idx]
        caption_row = self.captions[idx]
        return feature_row, caption_row

# Wrap the training split in the dataset and serve it in shuffled mini-batches
# using two worker processes.
# NOTE(review): batch_size is read when this cell runs. The only assignment above
# is `batch_size = 32` in Step 3; the "Parameter Settings" cell that lists
# batch_size comes later, so re-run this cell after changing batch_size there.
train_dataset = CocoFeatureCaptionDataset(data['train_features'], data['train_captions'])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)



### Parameter Settings
The "Parameter Settings" section is used to define and organize the various hyperparameters required by the model. This part allows you to adjust key parameters that control the model architecture, training process, and data handling, such as the input feature dimension, embedding size, RNN hidden state size, number of layers, the maximum sequence length for caption generation, learning rate, batch size, and number of training epochs. By centralizing these settings, users can conveniently modify the configuration for model tuning and experimental comparisons.

In [None]:

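# ----- Parameter Settings (TODO version) -----
# TODO: Set your hyperparameters here.
# input_dim = data['train_features'].shape[1]  # e.g., 512
# embed_size
# hidden_size   (should equal embed_size: DecoderRNN.sample() uses the encoder output as the RNN's initial hidden state)
# num_layers
# max_seq_length = data['train_captions'].shape[1]  # e.g., 17
# lr
# batch_size    (re-run the DataLoader cell in 4-1 after changing this)
# num_epochs    (required by the training loop in Step 5)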


In [None]:
# ----- (4-2) SimpleEncoder -----
class SimpleEncoder(nn.Module):
    """Encoder skeleton: project pre-extracted features into the embedding space.

    ``forward`` reads ``self.linear`` and ``self.bn``; creating those two
    layers in ``__init__`` is the exercise (see the TODO markers).
    """

    def __init__(self, input_dim, embed_size):
        super().__init__()
        # TODO: Define a Linear layer to map input_dim to embed_size

        # TODO: Define a BatchNorm1d layer with num_features=embed_size

    def forward(self, features):
        """Apply the linear projection, then batch normalisation."""
        return self.bn(self.linear(features))


In [None]:

# ----- (4-3) DecoderRNN -----
class DecoderRNN(nn.Module):
    """Caption decoder built on a plain ``nn.RNN``.

    Layers: token embedding -> RNN (batch_first) -> linear projection to
    vocabulary scores.

    NOTE: ``sample`` feeds the encoded features directly to the RNN as its
    initial hidden state, so the feature width must equal ``hidden_size``.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """
        captions include <START> and <END> tokens, with shape (batch, T).
        We define:
          captions_in = captions[:, :-1]  (input)
          captions_out = captions[:, 1:]   (target)

        max_seq_length is the number of tokens ``sample`` generates per caption.
        """
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, features, captions_in):
        """
        features: (batch, embed_size)
        captions_in: (batch, T-1) (input portion, without the last token)
        Returns:
            outputs: (batch, T-1, vocab_size)
        """
        # TODO: Use the Embedding layer to convert captions_in to embeddings; shape should be (batch, T-1, embed_size)
        pass
        # TODO: Initialize the RNN hidden state using features (repeat for num_layers)
        pass
        # TODO: Pass embeddings and hidden state through the RNN layer, then apply the Linear layer to produce vocab scores
        pass
        return outputs

    def sample(self, features, states=None):
        """
        Use greedy search to generate caption without teacher forcing.

        features: encoded image features, one row per caption to generate.
        states: optional initial RNN hidden state of shape
            (num_layers, batch, hidden_size). When None (the default) the
            features are repeated once per RNN layer and used instead.
        Returns:
            sampled_ids: (batch, max_seq_length) tensor of token ids. Generation
            always runs for max_seq_length steps; it does not stop at <END>.
        """
        batch_size = features.size(0)
        # Look up the start-token id in the notebook-level vocabulary
        # ('<START>' or '<start>'); fall back to 1 if neither key is present.
        start_token = word_to_idx.get('<START>', word_to_idx.get('<start>', 1))
        # Build the first input on the features' own device rather than the
        # notebook-global `device`, so sampling works wherever the inputs live.
        predicted = torch.full(
            (batch_size,), start_token, dtype=torch.long, device=features.device
        )
        # Honour a caller-supplied hidden state instead of overwriting it.
        if states is None:
            states = features.unsqueeze(0).repeat(self.rnn.num_layers, 1, 1)

        sampled_ids = []
        for _ in range(self.max_seq_length):
            inputs = self.embed(predicted).unsqueeze(1)  # (batch, 1, embed_size)
            hiddens, states = self.rnn(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))
            # Greedy choice: the highest-scoring token becomes the next input.
            _, predicted = outputs.max(1)
            sampled_ids.append(predicted)
        return torch.stack(sampled_ids, 1)

In [None]:
# ----- (4-4) ImageCaptioningModel -----
class ImageCaptioningModel(nn.Module):
    """Skeleton that combines the encoder and the decoder into one module.

    ``sample`` reads ``self.encoder`` and ``self.decoder``; creating them in
    ``__init__`` and wiring up ``forward`` are the exercises (see the TODOs).
    """

    def __init__(self, input_dim, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        super().__init__()
        # TODO: Instantiate SimpleEncoder and DecoderRNN, and assign them to self.encoder and self.decoder

    def forward(self, features, captions, lengths=None):
        """
        captions: (batch, T) containing <START> and <END> tokens.
        """
        # TODO: Split captions into captions_in and pass features and captions_in to the decoder; return the outputs.
        # Placeholder result until the TODO above is implemented.
        return None

    def sample(self, features):
        """Encode the features, then let the decoder generate captions from them."""
        encoded = self.encoder(features)
        return self.decoder.sample(encoded)


In [None]:
# Build the captioning model from the hyperparameters above and move it to `device`.
model = ImageCaptioningModel(
    input_dim, embed_size, hidden_size, vocab_size, num_layers, max_seq_length
).to(device)

### Step 5: Training Loop
In this step, we run the training loop using tqdm to track progress. For each batch, we compute the loss and update the model parameters. After every epoch, the average loss is recorded and sample captions are generated and printed.

In [None]:
# =============================================================================
# Step 5: Training Loop (using torch autograd and tqdm)
# =============================================================================
from tqdm import tqdm

# Set loss and optimizer
# According to cgudl data, the padding token is '<NULL>'
pad_token = word_to_idx.get('<NULL>', 0)
# Target positions holding the padding token are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Mean training loss of each epoch, in order; plotted in Step 6.
loss_history = []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
        for features, captions in train_loader:
            features = features.to(device)
            captions = captions.to(device)
            optimizer.zero_grad()
            # The model receives the full captions; the targets are the same
            # captions shifted left by one token (captions[:, 1:]).
            outputs = model(features, captions, None)  # outputs: (batch, T-1, vocab_size)
            targets = captions[:, 1:]
            # Flatten batch and time so each token position is one classification.
            loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()
            # Read the scalar once and reuse it for the running sum and the bar.
            batch_loss = loss.item()
            epoch_loss += batch_loss
            pbar.set_postfix({'loss': batch_loss})
            pbar.update(1)
    epoch_loss /= len(train_loader)
    loss_history.append(epoch_loss)
    # Report the epoch 1-based so it matches the progress-bar label above.
    print(f"Epoch {epoch+1} loss: {epoch_loss:.4f}")

    # Sample and display generated captions after each epoch
    model.eval()
    with torch.no_grad():
        # Take one batch from a fresh iterator over the training loader.
        sample_features, _ = next(iter(train_loader))
        sample_features = sample_features.to(device)
        sampled_ids = model.sample(sample_features)  # (batch, max_seq_length)
        sampled_ids = sampled_ids.cpu().numpy()
        captions_generated = decode_captions(sampled_ids, idx_to_word)
        print("Sample captions:")
        for cap in captions_generated[:4]:
            print(cap)


### Step 6: Loss Plot and Sample Image Display
In the final step, we plot the training loss curve using matplotlib and display sample images from the validation set along with the generated and ground truth captions.

In [None]:

# =============================================================================
# Step 6: Plot Loss Curve and Display Sample Images with Captions
# =============================================================================
# Training-loss curve: one marker per recorded epoch.
plt.figure()
plt.plot(loss_history, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss History')
plt.grid(True)
plt.show()

# Generate captions for a validation mini-batch and show them beside the ground truth.
model.eval()
with torch.no_grad():
    gt_captions, features, urls = sample_coco_minibatch(
        data, split='val', batch_size=batch_size
    )
    features = torch.tensor(features, dtype=torch.float32).to(device)
    sampled_ids = model.sample(features).cpu().numpy()  # (batch, max_seq_length)
    generated_captions = decode_captions(sampled_ids, idx_to_word)
    gt_text = decode_captions(gt_captions, data['idx_to_word'])

    for truth, guess, image_url in zip(gt_text, generated_captions, urls):
        try:
            picture = image_from_url(image_url)
            if picture is not None:
                plt.imshow(picture)
                plt.axis('off')
                plt.title("GT: " + truth + "\nGen: " + guess)
                plt.show()
            else:
                print("Image not found:", image_url)
        except Exception as err:
            # Best-effort display: report the failure and continue with the next image.
            print(f"Error displaying image {image_url}:\n", err)
