In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch.optim as optim
import os
import csv
import json

In [None]:
# Colab-only step: attach Google Drive so files under /content/drive can be read.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class DecoderRNN(nn.Module):
    """LSTM-cell caption decoder trained with teacher forcing.

    The image feature vector is the input of the first time step. Every later
    step ``t`` receives the embedding of the ground-truth token at position
    ``t - 1`` and emits the logits that are scored against the token at
    position ``t``, so the network never sees the token it has to predict.

    Args:
        embed_size: Width of the token embeddings. ``features`` given to
            ``forward`` must have the same width, because both are fed to the
            same LSTM cell.
        hidden_size: Width of the LSTM hidden and cell states.
        vocab_size: Number of tokens; width of the produced logits.
        num_layers: Kept for backward compatibility only. ``nn.LSTMCell`` is a
            single cell and has no notion of layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        # The third positional argument of nn.LSTMCell is ``bias``; passing
        # ``num_layers`` there would silently switch the bias terms on/off.
        self.lstm_cell = nn.LSTMCell(embed_size, hidden_size)
        self.fc_out = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embed_size)
        self.dropout = nn.Dropout(0.5)
        # Not applied in forward(): the module returns raw logits.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, captions):
        """Run the decoder over a batch of captions.

        Args:
            features: Float tensor of shape ``(batch, embed_size)``.
            captions: Integer token ids of shape ``(batch, seq_len)``, located
                on the same device as the module's parameters.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)`` on the device of
            ``features``.
        """
        batch_size = features.size(0)
        seq_len = captions.size(1)

        # Create the recurrent state next to the input instead of forcing
        # CUDA, so the module works on whatever device it was moved to.
        hidden_state = features.new_zeros((batch_size, self.hidden_size))
        cell_state = features.new_zeros((batch_size, self.hidden_size))

        # Embed every caption token once up front.
        captions_embed = self.dropout(self.embed(captions))

        step_logits = []
        for t in range(seq_len):
            if t == 0:
                # The first step is driven by the image feature vector.
                step_input = features
            else:
                # Teacher forcing: feed the previous ground-truth token, not
                # the token this step is asked to predict.
                step_input = captions_embed[:, t - 1, :]

            hidden_state, cell_state = self.lstm_cell(step_input, (hidden_state, cell_state))
            step_logits.append(self.fc_out(hidden_state))

        if not step_logits:
            # Zero-length captions: keep the documented output rank.
            return features.new_empty((batch_size, 0, self.vocab_size))

        return torch.stack(step_logits, dim=1)


In [None]:
class JsonData(Dataset):
    """Dataset backed by two JSON files: a vocabulary and the training samples.

    The samples file is read into ``self.data`` and indexed by the stringified
    integer index; each entry must provide a ``'feature'`` and a ``'caption'``
    value that ``torch.tensor`` can convert.
    (presumably the keys run "0" .. "N-1" without gaps; verify against the
    file that produced the JSON)
    """

    def __init__(self, vocab_path, json_path):
        # Both files are loaded eagerly and kept in memory.
        with open(vocab_path, 'r') as vocab_file:
            self.vocabulary = json.load(vocab_file)
        with open(json_path, 'r') as samples_file:
            self.data = json.load(samples_file)

    def __len__(self):
        """Number of top-level keys in the samples file."""
        return len(self.data.keys())

    def __getitem__(self, idx):
        """Return ``{'feature': tensor, 'caption': tensor}`` for sample ``idx``."""
        record = self.data[str(idx)]
        feature_tensor = torch.tensor(record['feature'])
        caption_tensor = torch.tensor(record['caption'])
        return {'feature': feature_tensor, 'caption': caption_tensor}
def custom_collate(batch):
    """Collate function that hands back the first sample of ``batch`` as-is.

    Meant for a DataLoader with ``batch_size=1``: the single sample is returned
    without the extra leading batch axis the default collate would add
    (per the original note, e.g. 1x5x2024 -> 5x2024).
    """
    first_sample = batch[0]
    return first_sample


# Locations of the training samples and the vocabulary on the mounted Drive.
json_data_path = "/content/drive/MyDrive/Colab Notebooks/Data/IC_dataset/train_data.json"
vocab_path = r"/content/drive/MyDrive/Colab Notebooks/Data/IC_dataset/vocabulary.json"

# Load both JSON files into an in-memory dataset.
train_data = JsonData(vocab_path, json_data_path)

# One dataset item per step, in file order; custom_collate returns that item
# directly instead of stacking it into a batch of size 1.
train_dataloader = DataLoader(
    train_data,
    batch_size=1,
    shuffle=False,
    collate_fn=custom_collate,
)
# collate_fn=custom_collate

In [None]:
# train_data.__getitem__(0)
# train_data.data.keys()
# len(data['feature'][0])
# feature = torch.tensor(data['feature'])
# caption = torch.tensor(data['caption'])

In [None]:
# Hyperparameters for the caption decoder and its optimiser.
feature_size = 1024
hidden_size = feature_size  # LSTM state width is tied to the feature width
learning_rate = 3e-4
num_epochs = 1

# One output class per entry of the loaded vocabulary.
vocab_size = len(train_data.vocabulary.keys())

In [None]:
# torch.backends.cudnn.benchmark = True
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

# Display whether CUDA is available as the cell output.
cuda_ready

True

In [None]:
# Build the decoder on the device selected earlier instead of hard-coding
# CUDA, so this cell also runs on a CPU-only runtime. The embedding width is
# feature_size (image features and token embeddings share the LSTM input) and
# the state width is hidden_size.
model = DecoderRNN(feature_size, hidden_size, vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Per-batch training losses are appended here by the training loop.
losses = []

# Enable training-mode behaviour (dropout active).
model.train()

# Sanity check: report, per parameter tensor, whether it lives on a GPU.
for param in model.parameters():
    print(param.is_cuda)


True
True
True
True
True
True
True


In [None]:

# Number of full passes over the training data. Previously this value was
# assigned but never used, so only a single pass was ever run.
num_epoch = 4

for epoch in range(num_epoch):
    for batch in train_dataloader:
        # Move the sample to the device selected earlier rather than forcing CUDA.
        features = batch['feature'].to(device)
        captions = batch['caption'].to(device)

        optimizer.zero_grad()
        output = model(features, captions)

        # Flatten (rows, seq_len, vocab) logits and (rows, seq_len) targets so
        # CrossEntropyLoss scores every time step.
        loss = criterion(output.reshape(-1, output.size(-1)), captions.reshape(-1))
        loss.backward()
        optimizer.step()

        # Store a plain float: keeping the loss tensor would hold on to a CUDA
        # tensor and its autograd graph for every batch.
        losses.append(loss.item())

# Persist the trained weights once training has finished.
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/Data/IC_dataset/trained_model.pth')

In [None]:
# Show the most recent losses; a negative slice works for any run length,
# whereas a fixed start index only matches one particular number of batches.
losses[-15:]

[tensor(0.6165, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.3661, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.4195, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.3177, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.1250, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.1309, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0698, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.4014, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(2.0305, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward0>)]

In [None]:
# # for tensorboard
# writer = SummaryWriter("runs/flickr")
# step = 0
# # initialize model, loss etc
# model = DecoderRNN(feature_size,feature_size,len(train_data.vocabulary)).to (device)
# criterion = nn.CrossEntropyLoss (ignore_index=dataset.vocab.stoi["<PAD>"])
# optimizer = optim. Adam (model.parameters(), lr=learning_rate)
# if load_model:
#   step =load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
# model.train()
# for epoch in range(num_epochs):
#   if save_model:
#     checkpoint = {
#       "state_dict": model.state_dict(),
#       "optimizer": optimizer.state_dict(),
#       "step": step,
#     }
#     save_checkpoint(checkpoint)
  # for idx, (imgs, captions) in enumerate (train_loader):
  #   imgs=imgs.to (device)
  #   captions=captions.to (device)
  #   outputs = model(imgs, captions [:-1])
  #   loss = criterion (outputs.reshape(-1, outputs.shape[2]), caption.reshape(-1))
  #   writer.add_scalar("Training loss", loss.item(), global_step=step)
  #   step += 1
  #   optimizer.zero_grad()
  #   loss.backward (loss)
  #   optimizer.step()