In [1]:
import torch
import json

# Load the extracted features (list of tensors)
all_features = torch.load('image_features.pt', weights_only=True)  # This is a list of tensors
image_name_to_index = torch.load('image_name_to_index.pt', weights_only=True)

# Load the captions dictionary
with open('captions_dict.json', 'r') as f:
    caption_sequences = json.load(f)

# Load the vocabulary
with open('vocabulary.json', 'r') as vocab_file:
    vocabulary = json.load(vocab_file)

# Debug prints to confirm loading
print(f"Loaded {len(all_features)} features.")
print(f"Loaded {len(caption_sequences)} captions.")
print(f"Loaded {len(vocabulary)} words in the vocabulary.")

Loaded 8091 features.
Loaded 8091 captions.
Loaded 8517 words in the vocabulary.


In [2]:
def pad_captions(captions, max_length=21):
    # Convert list of caption sequences into a tensor
    caption_tensors = [torch.tensor(caption) for caption in captions]

    padded_captions = []
    for tensor in caption_tensors:
        # If the caption is shorter than max_length, pad with zeros
        if len(tensor) < max_length:
            padding = torch.zeros(max_length - len(tensor), dtype=torch.long)
            padded_captions.append(torch.cat((tensor, padding)))
        else:
            # If longer than max_length, truncate it
            padded_captions.append(tensor[:max_length])

    return torch.stack(padded_captions)

In [3]:
# Separate features and captions
feature_caption_pairs = []
for image_id, captions in caption_sequences.items():
    image_feature = all_features[image_name_to_index[image_id]]  # Get the feature for the image
    for caption in captions:
        feature_caption_pairs.append((image_feature, caption))  # Create pairs

# Now feature_caption_pairs contains tuples of (feature, caption)


In [20]:
import random

def data_generator(dataset, batch_size, max_length=21):
    while True:
        # Create a shuffled index list
        indices = list(range(len(dataset)))  # Indices of the dataset
        random.shuffle(indices)  # Shuffle indices
        
        for start in range(0, len(indices), batch_size):
            end = min(start + batch_size, len(indices))
            batch_indices = indices[start:end]  # Get shuffled indices for the current batch

            # Separate features and captions using the shuffled indices
            batch_pairs = [dataset[i] for i in batch_indices]  # Get pairs from the dataset
            
            batch_features = torch.stack([pair[0] for pair in batch_pairs])
            batch_captions = [pair[1] for pair in batch_pairs]

            # Pad the captions
            padded_batch_captions = pad_captions(batch_captions, max_length)

            yield batch_features, padded_batch_captions

In [21]:
# Test the loading part
print(f"Loaded {len(all_features)} features.")
print(f"Loaded {len(caption_sequences)} captions.")
print(f"Toatal image caption pairs {len(feature_caption_pairs)}")
# Test the data generator
batch_size = 32
data_gen = data_generator(feature_caption_pairs, batch_size)

# Retrieve a batch
batch_features, padded_batch_captions = next(data_gen)
print(f"Batch features shape: {len(batch_features)}")
print(f"Padded captions shape: {padded_batch_captions.shape}")
print(f"Example caption (first caption in the batch): {padded_batch_captions[0]}")


Loaded 8091 features.
Loaded 8091 captions.
Toatal image caption pairs 38008
Batch features shape: 32
Padded captions shape: torch.Size([32, 21])
Example caption (first caption in the batch): tensor([   1,   74,  202,    1,    2,   37,   78, 1269,   77, 1264,    1,  923,
           0,    0,    0,    0,    0,    0,    0,    0,    0])


In [22]:
import torch
from torch.utils.data import random_split

# Total number of feature-caption pairs
total_pairs = len(feature_caption_pairs)

# Define split ratio (80% train, 20% test)
train_size = int(0.8 * total_pairs)
test_size = total_pairs - train_size

# Split the data
train_data, test_data = random_split(feature_caption_pairs, [train_size, test_size])

# Now train_data and test_data contain the splits
print(f"Training data size: {len(train_data)}")
print(f"Testing data size: {len(test_data)}")

Training data size: 30406
Testing data size: 7602


In [44]:
import torch
import torch.nn as nn

class ShowAndTellModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_units, feature_size):
        super(ShowAndTellModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + feature_size, lstm_units, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(lstm_units, vocab_size)

    def forward(self, image_features, captions):
        # Embed the captions
        caption_embeddings = self.embedding(captions)
        
        # Repeat the image features for each time step
        image_features_repeated = image_features.unsqueeze(1).repeat(1, captions.size(1), 1)
        
        # Combine the image features with the caption embeddings
        combined_input = torch.cat((image_features_repeated, caption_embeddings), dim=2)
        
        # Pass through LSTM
        lstm_out, _ = self.lstm(combined_input)
        
        batch_size, seq_len, hidden_size = lstm_out.size()
        
        
        # Get the output for the last time step
        output = self.fc(lstm_out)
        
        return output


In [45]:
vocab_size = len(vocabulary) + 1
embedding_dim = 256
lstm_units = 512
feature_size = 1000

In [46]:
model = ShowAndTellModel(vocab_size, embedding_dim, lstm_units, feature_size)

In [47]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [48]:
num_epochs = 10
batch_size = 32

train_gen = data_generator(train_data, batch_size)

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    for step in range(len(train_data) // batch_size):  # Define steps for one epoch
        batch_features, padded_batch_captions = next(train_gen)  # Get the next batch

        optimizer.zero_grad()  # Clear previous gradients

        # Forward pass: Predict the caption sequence
        outputs = model(batch_features, padded_batch_captions)  # Pass both features and captions

        # Reshape the outputs and target captions for loss calculation
        outputs = outputs.view(-1, outputs.size(-1))  # Shape: [batch_size * seq_len, vocab_size]
        target_captions = padded_batch_captions.view(-1)  # Shape: [batch_size * seq_len]

        # Compute the loss
        loss = criterion(outputs, target_captions)  # Use the reshaped outputs and target captions
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the weights

        running_loss += loss.item()

    # Print statistics for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / (len(train_data) // batch_size):.4f}")

Epoch [1/10], Loss: 0.5854
Epoch [2/10], Loss: 0.0651
Epoch [3/10], Loss: 0.0126
Epoch [4/10], Loss: 0.0005
Epoch [5/10], Loss: 0.0002
Epoch [6/10], Loss: 0.0001


KeyboardInterrupt: 