In [None]:
# Setup dependencies (as taken from assignment 6)
import os
import math
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models

from torchsummary import summary
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

#Additional Setup to use Tensorboard
!pip install -q tensorflow
%load_ext tensorboard

# Load raw data here and convert it to PyTorch datasets

# Define LSTM here

In [None]:
class LSTM(nn.Module):
    """
    LSTM-based captioning model.

    Image features are projected to the initial hidden state of an LSTM. The
    LSTM consumes embedded caption tokens and its outputs are projected to a
    score for every token in the vocabulary.
    """

    def __init__(self, num_tokens, feature_size, embed_size, hidden_size):
        """
        Creates a captioning model.

        Args:
            num_tokens: number of different tokens, also known as the
                vocabulary size.
            feature_size: image feature dimension extracted from images with a
                feature extractor model.
            embed_size: vector dimension of word embeddings.
            hidden_size: LSTM hidden state size.
        """
        super().__init__()

        # Embedding layer, embeds each token into a vector
        self.embedding = nn.Embedding(num_tokens, embed_size)

        # Feature projector, projects image features to a hidden sized vector
        self.feature_projector = nn.Linear(feature_size, hidden_size)

        # LSTM layer, which will try to guess the next word.
        # batch_first=True: inputs/outputs are laid out as (N, M, ...).
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # Output projector, projects hidden states to token scores
        self.output_projector = nn.Linear(hidden_size, num_tokens)

    def forward(self, features, captions):
        """
        Forward pass.

        Args:
            features: extracted image features with a shape of (N, D) where N
                is batch size and D is number of features.
            captions: ground truth image captions with a shape of (N, M) where
                M is max caption length. If an image caption is shorter than
                the max caption length it should be padded with <NULL> tokens.

        Returns:
            scores: token scores with a shape of (N, M, K) where K is number of
                tokens (num_tokens).
        """
        # 1. Embed captions
        embeds = self.embedding(captions)

        # 2. Create the initial hidden state from the image features. The
        #    leading dimension added by unsqueeze(0) is the (single) LSTM
        #    layer dimension expected by nn.LSTM.
        h0 = self.feature_projector(features).unsqueeze(0)

        # 3. Create the initial cell state with zeros
        c0 = torch.zeros_like(h0)

        # 4. Calculate the LSTM output for every time step
        output, _ = self.lstm(embeds, (h0, c0))

        # 5. Project the output to token scores
        return self.output_projector(output)

    @torch.no_grad()
    def sample(self, features, start_idx, max_length):
        """
        Greedily generates captions for each given image features.

        Args:
            features: image features with a shape of (N, D) where N is batch
                size and D is number of features.
            start_idx: start token (numerical representation of <START>).
            max_length: max sampling iteration.

        Returns:
            captions: generated token indices with a shape of
                (N, max_length + 1); the first column is the <START> token.
        """
        # Initial states: hidden state is the projected image feature, cell
        # state is zero. They are set up once, before the sampling loop.
        h = self.feature_projector(features).unsqueeze(0)
        c = torch.zeros_like(h)

        # Set the first token of every caption to the <START> token
        token = torch.full(
            (len(features), 1),
            int(start_idx),
            dtype=torch.long,
            device=features.device,
        )

        # Keep tokens, they are our generated captions and each caption
        # starts with the <START> token
        captions = [token]

        for _ in range(max_length):
            # Embed the current token and advance the LSTM by one step
            embeds = self.embedding(token)
            output, (h, c) = self.lstm(embeds, (h, c))

            # Project output to token scores
            scores = self.output_projector(output)

            # The highest scoring token is the prediction; it becomes the
            # input token of the next step
            token = torch.argmax(scores, dim=2)
            captions.append(token)

        # Concatenate the <START> token and the predicted tokens
        return torch.cat(captions, dim=1)

# train and validation loop

In [None]:
N = 7   # we use one week's worth of data
T = 24  # we use one day of previous data for training
# TODO: set D to the length of each training sample. The original assignment
# had no value, which made this cell a SyntaxError; None is a placeholder.
D = None

In [None]:
# Train the captioning model and log train/validation loss to TensorBoard.
#
# NOTE(review): this cell uses `num_tokens`, `train_features`, `device`,
# `NULL_IDX`, `train_loader`, `val_loader`, `train_dataset` and `val_dataset`,
# none of which are defined in the visible cells -- presumably they come from
# the data-loading step; confirm they exist before running.

# Create a writer to write to TensorBoard
writer = SummaryWriter()

# Some hyperparams
epochs = 80
lr_decay = 1  # multiplicative per-epoch decay factor; 1 keeps the lr constant

# Number of different words in our vocabulary including special tokens <START>,
# <END>, <NULL> and <UNK>
#num_tokens = len(coco['vocab']['idx_to_token']) needs editing

# Number of image features
feature_size = train_features.shape[-1]

# Word embedding vector size
embed_size = 512

# LSTM hidden state dimension
hidden_size = 1024

model = LSTM(num_tokens, feature_size, embed_size, hidden_size)
model = model.to(device)

# Create loss function, optimizer and learning rate scheduler.
# The loss is summed over tokens (padding ignored) and normalised by the
# dataset size when it is logged.
criterion = nn.CrossEntropyLoss(ignore_index=NULL_IDX, reduction='sum')
optimizer = optim.Adam(model.parameters(), 1e-3)
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: lr_decay ** epoch)

for epoch in tqdm(range(epochs)):

    # Train one epoch
    model.train()
    train_loss = 0.0

    # Iterate through batches
    for features, captions in train_loader:
        # Move data to target device
        features, captions = features.to(device), captions.to(device)

        # Define input and target captions
        # Input captions will be fed into LSTM and target captions are used for
        # loss calculation as LSTM tries to guess next word
        input_captions, target_captions = captions[:, :-1], captions[:, 1:]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        # CrossEntropyLoss expects the class dimension second: (N, K, M)
        output = model(features, input_captions)
        loss = criterion(output.transpose(1, 2), target_captions)
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float: adding the tensor itself would keep
        # every batch's autograd graph alive until the end of the epoch.
        train_loss += loss.item()

    # Write train loss to TensorBoard
    writer.add_scalars('Captioning Loss',
                        {'Train': train_loss / len(train_dataset)},
                        epoch)

    # Step learning rate scheduler after each epoch
    lr_scheduler.step()

    # Validate loss
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        # Iterate through batches
        for features, captions in val_loader:
            # Move data to target device
            features, captions = features.to(device), captions.to(device)

            # Define input and target captions
            # Input captions will be fed into LSTM and target captions are used for
            # loss calculation as LSTM tries to guess next word
            input_captions, target_captions = captions[:, :-1], captions[:, 1:]
            output = model(features, input_captions)
            loss = criterion(output.transpose(1, 2), target_captions)

            val_loss += loss.item()

    # Write validation loss to TensorBoard
    writer.add_scalars('Captioning Loss',
                        {'Validation': val_loss / len(val_dataset)},
                        epoch)

print('\nFinished.')
writer.flush()
writer.close()

In [None]:
# Put the model into evaluation mode (nn.Module.eval()).
model.eval()

In [None]:
# Run this cell once and wait for it to time out.
# Then run it a second time; the TensorBoard dashboard should appear.

%tensorboard --logdir runs/ --host localhost