# Model Structure: Transfer Learned ResNet200D + DistilBert.

# Import Dependencies

In [None]:
%%capture
import PIL
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import tqdm.notebook as tqdm
from sklearn.model_selection import ShuffleSplit
import albumentations as A
from albumentations.pytorch import ToTensorV2

import random
import copy
import cv2
import math

import numpy as np
import collections
import pandas as pd
import matplotlib.pyplot as plt

!pip install livelossplot
import livelossplot

!pip install timm
import timm

!pip install transformers
import transformers

import nltk 
nltk.download('punkt')
import re
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Load in the Dataset (Flickr8k)

In [None]:
# Read the Flickr8k caption table, keep only the first row per image and
# index the result by image file name.
images_dataframe = (
    pd.read_csv('../input/flickr8k/captions.txt')
    .drop_duplicates(subset = 'image')
    .set_index("image")
)

Helpers for Reproducibility:

In [None]:
import os
import random
# One fixed seed shared by every random number generator used below.
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current torch seed.

    Written to be passed as ``worker_init_fn`` to a DataLoader. The seed is
    taken from ``torch.initial_seed()`` and reduced modulo 2**32 because
    ``np.random.seed`` only accepts 32-bit values.

    Args:
        worker_id: Worker index supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2**32
    # NumPy is imported as ``np`` in this file; the bare name ``numpy`` is
    # not defined and raised NameError here.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Append COCO Captions to the Train Dataset (Flickr8k is too small; Flickr30k is full of duplicates)

In [None]:
%%capture
!wget http://images.cocodataset.org/zips/val2014.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip val2014.zip
!unzip annotations_trainval2014.zip
!rm -f val2014.zip
!rm -f annotations_trainval2014.zip

In [None]:
# Locations of the COCO val2014 caption annotations and image folder
# (relative to the working directory).
Captions_Path = './annotations/captions_val2014.json'
COCO_Path = "./val2014/"

In [None]:
def process_coco(captions_path=None):
    '''
    Converts the COCO dataset to the Flickr8k Format

    Args:
        captions_path: Path to a COCO captions JSON file with ``images`` and
            ``annotations`` lists. Defaults to the module-level
            ``Captions_Path``.

    Returns:
        dict mapping image file name -> caption string. Only one caption is
        kept per image: when an image id has several annotations, the last
        one in the file wins.

    Raises:
        KeyError: if an image has no annotation with a matching ``image_id``.
    '''
    if captions_path is None:
        captions_path = Captions_Path
    # Use a context manager so the file handle is closed deterministically.
    with open(captions_path, encoding="utf-8") as handle:
        dataset = json.load(handle)
    id_to_file = {image['id']: image['file_name'] for image in dataset['images']}
    # Later annotations for the same image id overwrite earlier ones.
    id_to_caption = {ann['image_id']: ann['caption'] for ann in dataset['annotations']}
    return {
        file_name: id_to_caption[image_id]
        for image_id, file_name in id_to_file.items()
    }

In [None]:
def construct_dataframe(Image2Label):
    """Build a caption table indexed by image file name.

    Args:
        Image2Label: Mapping of image file name -> caption.

    Returns:
        DataFrame with a single ``caption`` column; the index is named
        ``image`` and holds the mapping's keys in iteration order.
    """
    file_names = list(Image2Label)
    captions = [Image2Label[name] for name in file_names]
    table = pd.DataFrame({'image': file_names, 'caption': captions})
    return table.set_index('image')

In [None]:
# Convert the COCO captions to the Flickr8k layout and add them to the
# caption table.
Image2Label = process_coco()
COCO_dataframe = construct_dataframe(Image2Label)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows in the same order and keeps both indexes.
images_dataframe = pd.concat([images_dataframe, COCO_dataframe])

# Custom Dataloader

In [None]:
# HYPER PARAMETERS 
# Folder holding the Flickr8k images.
BASE_IMAGE_PATH = "../input/flickr8k/Images/"
# 200-dimensional GloVe vectors used to initialise the word embeddings.
GLOVE_PATH = '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
# Fraction of rows given to the training split; the remainder is held out.
TRAIN_SIZE = 0.9995
# Side length (pixels) images are cropped / resized to.
IMAGE_SIZE = 256

Transforms

In [None]:
# Training-time augmentation pipeline (albumentations): crop, flip, blur/noise,
# geometric distortion, colour changes, compression artefacts, dropout and an
# affine jitter, followed by normalisation and conversion to a tensor.
# NOTE(review): A.Flip, A.IAASharpen and A.Cutout are deprecated or removed in
# newer albumentations releases — confirm the installed version supports them.
train_transforms = A.Compose([
    A.RandomResizedCrop(IMAGE_SIZE, IMAGE_SIZE, scale=(0.6, 0.6), p=1),
    A.Flip(p = 0.7),
    # Exactly one blur / noise transform is picked when this group fires.
    A.OneOf([
        A.MotionBlur(blur_limit=(3, 5)),
        A.MedianBlur(blur_limit=5),
        A.GaussianBlur(blur_limit=(3, 5)),
        A.GaussNoise(var_limit=(5.0, 30.0)),
        A.MultiplicativeNoise(),
    ], p=0.7),
    # One geometric distortion.
    A.OneOf([
        A.OpticalDistortion(distort_limit=1.0),
        A.GridDistortion(num_steps=5, distort_limit=1.),
        A.ElasticTransform(alpha=3),
    #    A.GridDropout()
    ], p=0.7),
    A.CLAHE(clip_limit=4.0, p=0.7),
    #A.IAAPiecewiseAffine(p=0.5),
    A.IAASharpen(p=0.5),
    #A.RandomGamma(gamma_limit=(70, 130), p=0.3),
    A.ColorJitter(p = 0.7),
    # One quality-degrading transform.
    A.OneOf([
        A.ImageCompression(),
        A.Downscale(scale_min=0.7, scale_max=0.95),
        #A.RandomGridShuffle(),
    ], p=0.2),
    A.OneOf([
        A.ToGray(),
        A.ToSepia()
    ]),
    #A.OneOf([
    #    A.RandomRain(),
    #    A.RandomFog(),
    #    A.RandomShadow(),
    #    A.RandomSnow(),
    #    A.RandomSunFlare()
    #]),
    # Holes are at most 10% of the image side in each direction.
    A.CoarseDropout(max_holes=8, max_height=int(IMAGE_SIZE * 0.1),
                       max_width=int(IMAGE_SIZE* 0.1), p=0.5),
    A.Cutout(num_holes = 32),
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=45, border_mode=0, p=0.85),
    # NOTE(review): a single scalar mean/std is used here, while test_transforms
    # calls A.Normalize() with its defaults — train and validation inputs are
    # therefore normalised differently; confirm this is intended.
    A.Normalize(mean=0.482288, std=0.22085),
    ToTensorV2()
])

# Evaluation-time pipeline: deterministic resize, normalisation with the
# library's default statistics, then conversion to a tensor.
# NOTE(review): the defaults of A.Normalize() differ from the scalar mean/std
# passed in train_transforms — confirm which statistics the model expects.
test_transforms = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.Normalize(),
    ToTensorV2()
])

In [None]:
def process_vocab(dataframe):
    '''
    Builds the vocabulary from the ``caption`` column of ``dataframe``.

    Every caption is lower-cased and split with nltk.word_tokenize; the
    distinct tokens are returned as a sorted list.
    '''
    unique_words = set()
    for caption in dataframe.caption.values:
        unique_words.update(nltk.word_tokenize(str.lower(caption)))
    return sorted(unique_words)

In [None]:
# Sorted list of every distinct token in the combined caption table.
vocabulary = process_vocab(images_dataframe)

In [None]:
class ImageCaptioningDataset(torch.utils.data.Dataset):
  '''
  Serves (image tensor, cleaned caption) pairs from a caption table.

  transforms: albumentations-style callable, invoked as transforms(image = ...)
              and expected to return a dict with an 'image' entry.
  dataframe:  table indexed by image file name with a 'caption' column.
  device:     device every image tensor is moved to before it is returned.

  Files whose name starts with 'COCO' are read from COCO_Path, all others from
  BASE_IMAGE_PATH.
  '''
  def __init__(self, transforms, dataframe, device):
    self.dataframe = dataframe
    self.transforms = transforms
    self.device = device
    self.COCO_Base = COCO_Path
    self.Flickr_base = BASE_IMAGE_PATH
  def __len__(self):
    return len(self.dataframe)
  @staticmethod
  def _clean_caption(caption):
    '''Strips, lower-cases and removes every non-word, non-space character.'''
    caption = str.lower(str.strip(caption))
    return re.sub(r'[^\w\s]', '', caption)
  def _image_path(self, index):
    '''Returns the on-disk path for an image file name.'''
    base = self.COCO_Base if index.startswith('COCO') else self.Flickr_base
    return base + index
  def __getitem__(self, idx):
    '''
    Returns (image, caption) for row ``idx``.

    Raises FileNotFoundError when the image cannot be read.
    '''
    index = str.strip(self.dataframe.index[idx])
    caption = self._clean_caption(self.dataframe.iloc[idx]['caption'])
    # Load in Image
    image_path = self._image_path(index)
    image = cv2.imread(image_path)
    if image is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads BGR; convert to RGB before augmenting.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = self.transforms(image = image)['image']
    image = image.to(self.device)

    return image, caption

In [None]:
# Draw a single seeded shuffle split and keep the row positions of both parts.
splitter = ShuffleSplit(n_splits = 1, test_size = 1 - TRAIN_SIZE, train_size = TRAIN_SIZE, random_state = 42)
train_idx, test_idx = next(splitter.split(images_dataframe))

In [None]:
# Materialise the two splits by positional row index.
train_dataframe = images_dataframe.iloc[train_idx]
val_dataframe = images_dataframe.iloc[test_idx]

In [None]:
# Training data gets the augmenting pipeline, validation data the plain one.
TrainDataset = ImageCaptioningDataset(train_transforms,  train_dataframe, device)
ValDataset = ImageCaptioningDataset(test_transforms, val_dataframe, device)

In [None]:
# Batches are (image tensor batch, tuple of caption strings).
# NOTE(review): num_workers is left at its default, so loading happens in the
# main process and worker_init_fn is presumably never invoked — confirm.
TrainDataloader = torch.utils.data.DataLoader(TrainDataset, batch_size = TRAIN_BATCH_SIZE, shuffle = True, worker_init_fn = seed_worker)
ValDataloader = torch.utils.data.DataLoader(ValDataset, batch_size = TEST_BATCH_SIZE, shuffle = False, worker_init_fn = seed_worker)

# Image Encoding

In [None]:
class ConvBlock(nn.Module):
  '''
  Conv2d -> SiLU -> BatchNorm2d (the activation is applied before the norm).
  '''
  def __init__(self, in_features, out_features, kernel_size, padding, groups):
    super().__init__()
    self.conv = nn.Conv2d(
        in_channels = in_features,
        out_channels = out_features,
        kernel_size = kernel_size,
        padding = padding,
        groups = groups,
    )
    self.bn = nn.BatchNorm2d(num_features = out_features)
    self.act1 = nn.SiLU(inplace = True)
  def forward(self, x):
    activated = self.act1(self.conv(x))
    return self.bn(activated)

In [None]:
class DownSampleConvBlock(nn.Module):
  '''
  Strided Conv2d -> SiLU -> BatchNorm2d; the stride sets the downsampling factor.
  '''
  def __init__(self, in_features, out_features, kernel_size, padding, groups, stride):
    super().__init__()
    self.conv = nn.Conv2d(
        in_channels = in_features,
        out_channels = out_features,
        kernel_size = kernel_size,
        stride = stride,
        padding = padding,
        groups = groups,
    )
    self.bn = nn.BatchNorm2d(num_features = out_features)
    self.act1 = nn.SiLU(inplace = True)
  def forward(self, x):
    activated = self.act1(self.conv(x))
    return self.bn(activated)

In [None]:
class RegularSE(nn.Module):
  '''
  Squeeze-and-Excitation gate: rescales each channel of the input by a
  learned value in (0, 1).
  '''
  def __init__(self, in_dim, inner_dim):
    super().__init__()
    self.in_dim, self.inner_dim = in_dim, inner_dim
    self.Squeeze = nn.Linear(in_dim, inner_dim)
    self.SiLU = nn.SiLU(inplace = True)
    self.Excite = nn.Linear(inner_dim, in_dim)
  def forward(self, x):
    '''
    Averages x over its last two axes, passes the result through
    Squeeze -> SiLU -> Excite -> sigmoid and multiplies x by that gate.
    '''
    pooled = x.mean(dim = -1).mean(dim = -1)
    hidden = self.SiLU(self.Squeeze(pooled))
    gate = torch.sigmoid(self.Excite(hidden))
    # Restore two trailing singleton axes so the gate broadcasts over them.
    return gate[..., None, None] * x


In [None]:
class ResBottleNeck(nn.Module):
  '''
  Residual bottleneck: 1x1 squeeze -> 3x3 depthwise conv -> SE gate -> 1x1
  expand, added back to the input through a learnable scale ``gamma`` that
  starts at zero.
  '''
  def __init__(self, in_features, inner_features, device):
    super().__init__()
    self.device = device
    self.in_features = in_features
    self.inner_features = inner_features

    # 1x1 conv down to the inner width.
    self.squeeze = ConvBlock(in_features, inner_features, 1, 0, 1)
    # groups == channels makes the 3x3 conv depthwise.
    self.process = ConvBlock(inner_features, inner_features, 3, 1, inner_features)
    self.SE = RegularSE(inner_features, inner_features // 4)
    # 1x1 conv back to the input width.
    self.expand = ConvBlock(inner_features, in_features, 1, 0, 1)
    self.gamma = nn.Parameter(torch.zeros(1, device = device))
  def forward(self, x):
    '''
    x: Tensor(B, C, H, W)
    '''
    branch = self.expand(self.SE(self.process(self.squeeze(x))))
    return branch * self.gamma + x


In [None]:
class InvertedResidualBlock(nn.Module):
  '''
  Residual block: 1x1 conv to ``inner_features`` -> 3x3 depthwise conv -> SE
  gate -> 1x1 conv back to ``in_features``, added to the input through a
  learnable scale ``gamma`` that starts at zero.
  '''
  def __init__(self, in_features, inner_features, device):
    super().__init__()
    self.device = device
    self.in_features = in_features
    self.inner_features = inner_features
    self.squeeze = ConvBlock(in_features, inner_features, 1, 0, 1)
    # groups == channels makes the 3x3 conv depthwise.
    self.depthwise = ConvBlock(inner_features, inner_features, 3, 1, inner_features)
    # The SE hidden width is derived from the block input width.
    self.SE = RegularSE(inner_features, in_features // 16)
    self.expand = ConvBlock(inner_features, in_features, 1, 0, 1)
    self.gamma = nn.Parameter(torch.zeros((1), device = device))
  def forward(self, x):
    branch = self.expand(self.SE(self.depthwise(self.squeeze(x))))
    return branch * self.gamma + x

ResNet200D Transfer Learned

In [None]:
class ResNetBase(nn.Module):
    '''
    Image encoder: a timm ``resnet200d`` created with pretrained = True, with
    its pooling and classifier replaced by identities and all of its
    parameters frozen, followed by a trainable 1x1 ConvBlock that maps the
    2048 backbone channels to ``in_dim``.
    '''
    def __init__(self, in_dim, device):
        super().__init__()
        self.in_dim = in_dim
        self.device = device
        self.model_name = 'resnet200d'
        self.model = timm.create_model(self.model_name, pretrained = True)
        # Drop the pooling / classification head.
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.freeze(self.model)
        self.proj = ConvBlock(2048, self.in_dim, 1, 0, 1)
    def freeze(self, layer):
        '''Turns off gradient tracking for every parameter of ``layer``.'''
        for parameter in layer.parameters():
            parameter.requires_grad = False
    def forward(self, x):
        '''
        Returns the projected feature map flattened to (B, H * W, C).
        '''
        feature_map = self.proj(self.model(x))
        batch, channels, height, width = feature_map.shape
        flat = feature_map.view(batch, channels, height * width)
        return flat.transpose(1, 2)

Model Blocks

In [None]:
class CustomTokens(nn.Module):
    '''
    Word-level tokenizer plus embedding table.

    IDs 0-3 are reserved for <START>, <END>, <PAD> and <UNK>; vocabulary words
    get IDs from 4 upwards in the order given.

    vocabulary: sequence of word strings.
    glove_path: path to a GloVe text file; when falsy the embedding table is
                initialised with Xavier-uniform values only.
    max_length: number of token IDs every sentence is padded / truncated to.
    device:     device used for the tensors this module creates.
    '''
    def __init__(self, vocabulary, glove_path, max_length, device):
        super().__init__()
        self.device = device
        self.glove_path = glove_path
        self.vocab = vocabulary
        self.START = "<START>"
        self.END = "<END>"
        self.PAD = "<PAD>"
        self.UNK = "<UNK>"
        self.START_ID = 0
        self.END_ID = 1
        self.PAD_ID = 2
        self.UNK_ID = 3

        # Vocabulary first, then the special tokens, so the specials win if a
        # vocabulary word collides with one of them.
        self.idx2word = {idx: word for idx, word in enumerate(self.vocab, start = 4)}
        self.idx2word[self.START_ID] = self.START
        self.idx2word[self.END_ID] = self.END
        self.idx2word[self.PAD_ID] = self.PAD
        self.idx2word[self.UNK_ID] = self.UNK

        self.word2idx = {word: idx for idx, word in enumerate(self.vocab, start = 4)}
        self.word2idx[self.START] = self.START_ID
        self.word2idx[self.END] = self.END_ID
        self.word2idx[self.PAD] = self.PAD_ID
        self.word2idx[self.UNK] = self.UNK_ID
        self.dim = 200
        self.max_length = max_length
        if self.glove_path:
            weights = self.load_glove(self.glove_path)
        else:
            # Previously referenced the misspelled ``self.wordx2idx`` and
            # raised AttributeError on this branch.
            weights = self._random_embeddings()
        self.embeddings = nn.Embedding(len(self.word2idx), self.dim, _weight = weights)
    def _random_embeddings(self):
        '''Returns a (num_tokens, dim) Xavier-uniform initialised tensor.'''
        table = torch.zeros((len(self.word2idx), self.dim), device = self.device)
        # xavier_uniform_ is the non-deprecated in-place initialiser.
        return nn.init.xavier_uniform_(table)
    def load_glove(self, glove_path):
        '''
        glove_path: path to the glove file.

        Returns the embedding table with the rows of every word found in the
        file replaced by its GloVe vector; other rows keep their random
        initialisation.
        '''
        embeddings = self._random_embeddings()
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(glove_path, 'r', encoding = 'utf-8') as file:
            for line in tqdm.tqdm(file):
                vals = line.split()
                word = vals[0]
                if word in self.word2idx:
                    embedding = torch.tensor([float(val) for val in vals[1:]], device = self.device)
                    embeddings[self.word2idx[word], :] = embedding
        return embeddings
    def decode(self, x):
        '''Returns the word for token ID ``x``.'''
        return self.idx2word[x]
    def pad_sents(self, x):
        '''
        Pads and Tokenizes Inputs

        x: iterable of token lists. Each sentence is truncated to max_length,
        unknown words map to <UNK>, and short sentences are right-padded with
        <PAD>. Returns an integer tensor of shape (len(x), max_length).

        NOTE(review): <START> / <END> are not inserted here — confirm whether
        callers expect them.
        '''
        tokenized_sents = []
        for sent in x:
            ids = [self.word2idx.get(word, self.UNK_ID) for word in sent[:self.max_length]]
            ids += [self.PAD_ID] * (self.max_length - len(ids))
            tokenized_sents.append(ids)
        return torch.tensor(tokenized_sents, dtype = torch.long, device = self.device)

    def forward(self, x):
        '''
        Tokenizes a Given Input

        x: iterable of sentence strings; returns the padded ID tensor.
        '''
        tokenized = [nltk.word_tokenize(sent) for sent in x]
        return self.pad_sents(tokenized)
    def embed(self, x):
        '''Looks up the embedding vectors for a tensor of token IDs.'''
        return self.embeddings(x)

In [None]:
class Tokenizer(nn.Module):
  '''
  Thin wrapper around a HuggingFace AutoTokenizer that pads / truncates every
  sentence to ``max_length``.
  '''
  def __init__(self, model_name, max_length):
    super().__init__()
    self.model_name = model_name
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    self.tokenizer = hf_tokenizer
    self.num_words = hf_tokenizer.vocab_size
    # The tokenizer's CLS / SEP tokens double as START / END markers.
    self.START, self.END = hf_tokenizer.cls_token, hf_tokenizer.sep_token
    self.START_ID, self.END_ID = hf_tokenizer.cls_token_id, hf_tokenizer.sep_token_id
    self.max_length = max_length
  def forward(self, x):
    '''
    Tokenizes sentences using HuggingFace and returns the tokenizer output
    with PyTorch tensors.
    '''
    return self.tokenizer(
        x,
        add_special_tokens = True,
        padding = 'max_length',
        truncation = True,
        max_length = self.max_length,
        return_tensors = 'pt',
    )

In [None]:
class Attention(nn.Module):
    '''
    Decoder Attention for the LSTM

    Dot-product attention: one query vector per batch item attends over a
    sequence of feature vectors after both are projected to ``inner_dim``.
    '''
    def __init__(self, enc_dim, dec_dim, inner_dim):
        super().__init__()
        self.enc_dim = enc_dim
        self.dec_dim = dec_dim
        self.inner_dim = inner_dim

        self.dec_proj = nn.Linear(self.dec_dim, self.inner_dim)
        self.enc_proj = nn.Linear(self.enc_dim, self.inner_dim)
    def forward(self, encoder, decoder):
        '''
        encoder: query vectors, (B, enc_dim) — the LSTM state per the original notes
        decoder: feature sequence, (B, L, dec_dim) — the CNN output per the original notes

        Returns the attention-weighted sum of the projected features, (B, inner_dim).
        '''
        keys = self.dec_proj(decoder)  # (B, L, inner_dim)
        query = self.enc_proj(encoder).unsqueeze(1)  # (B, 1, inner_dim)
        scores = torch.bmm(query, keys.transpose(1, 2))  # (B, 1, L)
        # Explicit dim: normalise over the L positions.
        att_mat = F.softmax(scores, dim = -1)
        # Squeeze only the singleton query axis so a batch of one keeps its
        # batch dimension.
        return torch.bmm(att_mat, keys).squeeze(1)
        
    

In [None]:
class LSTM(nn.Module):
    '''
    Uses LSTMS to Caption Images

    A single LSTMCell is unrolled over the caption. Its hidden and cell states
    are initialised from the image features averaged over dim 1, and every
    hidden state is mapped to vocabulary logits by a linear layer.

    in_dim:     size of the word embeddings fed to the cell.
    im_dim:     size of the image features and of the LSTM state.
    max_length: maximum number of tokens tokenized / generated.
    device:     device used for tensors created here.
    drop_prob:  dropout applied to the hidden state during training.
    '''
    def __init__(self, in_dim, im_dim, max_length, device, drop_prob = 0.2):
        super().__init__()
        self.im_dim = im_dim
        self.in_dim = in_dim
        self.drop_prob = drop_prob
        self.proj_hidden = nn.Linear(self.im_dim, self.im_dim)
        self.proj_cell = nn.Linear(self.im_dim, self.im_dim)
        self.max_length = max_length
        self.device = device

        #self.Attention = Attention(self.im_dim, self.im_dim, self.im_dim)
        self.LSTMCell = nn.LSTMCell(self.in_dim, self.im_dim)

        #self.model_name = 'distilbert-base-uncased'
        self.tokenizer = CustomTokens(vocabulary, GLOVE_PATH, self.max_length, self.device)
        self.num_classes = len(self.tokenizer.word2idx)
        self.Dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(self.im_dim, self.num_classes)
        self.criterion = nn.CrossEntropyLoss()
    def _initial_state(self, x):
        '''Returns (hidden, cell) projected from x averaged over dim 1.'''
        pooled = torch.mean(x, dim = 1)
        return self.proj_hidden(pooled), self.proj_cell(pooled)
    def forward_train(self, x, GT):
        '''
        Uses LSTMs to Generate Captions

        x:  image features; assumed (B, L, im_dim) — TODO confirm against the encoder.
        GT: list of ground-truth caption strings.

        Teacher-forced training: token l is fed in and token l + 1 is the
        target. Returns a 1-element tensor holding the sum over time steps of
        the mean cross-entropy, with <PAD> targets excluded.

        NOTE(review): the targets are exactly what the tokenizer returns;
        <START> / <END> only appear if the tokenizer inserts them — confirm.
        '''
        # Tokenize Ground Truth
        tokenized_GT = self.tokenizer(GT)
        B, L = tokenized_GT.shape
        # Project the Image down to One LSTM Cell
        hidden_state, cell_state = self._initial_state(x)
        total_loss = torch.zeros((1), device = self.device)
        # Begin Decoding
        for l in range(L - 1):
            input_id = tokenized_GT[:, l]
            GT_id = tokenized_GT[:, l + 1]

            embeddings = self.tokenizer.embed(input_id)
            hidden_state, cell_state = self.LSTMCell(embeddings, (hidden_state, cell_state))
            #attended = self.Attention(copied_h, x)
            #concat = torch.cat([attended, copied_h], dim = -1)
            pred = self.Linear(self.Dropout(hidden_state))
            # Mask Out PAD tokens
            keep = GT_id != self.tokenizer.PAD_ID
            pred = pred[keep]
            GT_id = GT_id[keep]
            # A step where every target is padding contributes nothing.
            if pred.shape[0] != 0:
                total_loss = total_loss + self.criterion(pred, GT_id)
        return total_loss
    def forward(self, x):
        '''
        Greedy decoding.

        x: image features, same layout as in forward_train.

        Returns one string per batch item, starting with the START token and
        followed by the predicted words; a sentence stops growing once END has
        been emitted.
        '''
        # Project Down the X to the hidden_state
        hidden_state, cell_state = self._initial_state(x)
        B = hidden_state.shape[0]
        # Starter Sentences
        current_tokens = torch.full((B,), self.tokenizer.START_ID, dtype = torch.long, device = self.device)
        pred_sentences = [self.tokenizer.START for _ in range(B)]
        finished = [False for _ in range(B)]
        # Begin Decoding
        for _ in range(self.max_length):
            if all(finished):
                # Nothing left to extend; further steps cannot change the output.
                break
            embeddings = self.tokenizer.embed(current_tokens)
            hidden_state, cell_state = self.LSTMCell(embeddings, (hidden_state, cell_state))
            #attended = self.Attention(copied_h, x)
            #concat = torch.cat([attended, copied_h], dim = -1)
            pred = F.softmax(self.Linear(hidden_state), dim = -1)
            _, indices = torch.max(pred, dim = -1)
            # One host transfer per step instead of one .item() per sentence.
            for b, token_id in enumerate(indices.tolist()):
                if finished[b]:
                    continue # Finished Already
                pred_sentences[b] += f" {self.tokenizer.decode(token_id)}"
                if token_id == self.tokenizer.END_ID:
                    finished[b] = True
                else:
                    current_tokens[b] = token_id
        return pred_sentences

In [None]:
class FullModel(nn.Module):
  '''
  Complete captioning network: ResNetBase image encoder feeding the LSTM decoder.
  '''
  def __init__(self, device):
    super().__init__()
    self.device = device
    # Word-embedding width, image-feature width and caption length in tokens.
    self.in_dim, self.im_dim, self.max_length = 200, 2048, 20
    self.image_encoder = ResNetBase(self.im_dim, self.device)
    self.LSTM = LSTM(self.in_dim, self.im_dim, self.max_length, self.device)
  def forward_train(self, images, captioning):
    '''Returns the decoder's training loss for a batch of images and captions.'''
    features = self.image_encoder(images)
    captions = list(captioning)
    return self.LSTM.forward_train(features, captions)
  def forward(self, images):
    '''Returns the decoder's generated caption strings for a batch of images.'''
    features = self.image_encoder(images)
    return self.LSTM(features)

# Custom Training Loop:

In [None]:
class ImageCaptioningSolver(nn.Module):
  '''
  Wraps FullModel together with its Adam optimiser, a per-epoch StepLR decay
  and the training / evaluation loops.
  '''
  def __init__(self, device):
    super().__init__()
    self.device = device
    self.model = FullModel(self.device)
    self.optim = optim.Adam(self.model.parameters(), lr= 1e-3, weight_decay = 1e-3)
    # Multiply the learning rate by 0.97 on every scheduler step.
    self.decay = optim.lr_scheduler.StepLR(self.optim, 1, 0.97)
  def forward(self, x):
    '''
    Captions a batch of images without tracking gradients.
    Note: switches the module to eval mode and leaves it there.
    '''
    self.eval()
    with torch.no_grad():
      return self.model(x)
  def _bleu(self, references, hypotheses):
    '''
    Corpus BLEU between reference caption strings and generated caption strings.

    corpus_bleu expects token lists (one list of references per hypothesis),
    so both sides are split on whitespace; the START / END markers are
    removed from the generated sentences before scoring.
    '''
    tokenizer = self.model.LSTM.tokenizer
    special = {tokenizer.START, tokenizer.END}
    list_of_references = [[str(reference).split()] for reference in references]
    tokenized_hypotheses = [
        [token for token in hypothesis.split() if token not in special]
        for hypothesis in hypotheses
    ]
    return nltk.translate.bleu_score.corpus_bleu(list_of_references, tokenized_hypotheses)
  def evaluate(self, valloader):
    '''
    Evaluates the Model's BLEU score.

    Returns the mean per-batch corpus BLEU, or 0.0 for an empty loader.
    '''
    self.eval()
    total_bleu = 0
    count = 0
    with torch.no_grad():
      for images, labels in tqdm.tqdm(valloader):
        predicted = self.model(images)
        total_bleu += self._bleu(labels, predicted)
        count += 1
        # Free the batch before the next one is loaded.
        del images
        del labels
        del predicted
        torch.cuda.empty_cache()
    return total_bleu / count if count else 0.0
  def training_loop(self, trainloader, valloader, NUM_EPOCHS, display_every = 64):
    '''
    Trains for NUM_EPOCHS epochs.

    Each epoch runs at most ``display_every`` training batches, then scores
    the first validation batch (loss and BLEU), updates the live plot, saves
    the weights with the best validation loss to ./BestVal.pth and those with
    the best BLEU to ./BestAcc.pth, and shows one sample image with its
    generated caption.
    '''
    liveloss = livelossplot.PlotLosses()
    best_val_bleu  = 0
    best_val_loss = 999
    torch.cuda.empty_cache()
    for EPOCH in range(NUM_EPOCHS):
      self.train()
      logs = {}
      total_loss = 0
      count = 0
      for images, labels in trainloader:
        self.optim.zero_grad()
        loss = self.model.forward_train(images, labels)
        loss.backward()
        self.optim.step()
        print(f"Step: {count}, loss: {loss.item()}")
        total_loss += loss.item()
        count += 1
        del images
        del labels
        torch.cuda.empty_cache()
        # An "epoch" is capped at display_every batches.
        if count == display_every:
          break
      # max() guards against an empty training loader.
      logs['loss'] = total_loss / max(count, 1)
      self.decay.step()
      print(f"EPOCH: {EPOCH}, total_loss: {logs['loss']}")

      self.eval()
      sample_caption = ""
      sample_image = None
      with torch.no_grad():
        logs['val_loss'] = 0
        logs['accuracy'] = 0
        count = 0

        for images, labels in valloader:
          loss = self.model.forward_train(images, labels)
          pred_sentences = self.model(images)

          sample_caption = pred_sentences[0]
          sample_image = images[0].cpu()
          # Compute BLEU between GT and Predicted sentences
          bleu = self._bleu(labels, pred_sentences)
          logs['val_loss'] += loss.item()
          logs['accuracy'] += bleu
          count += 1
          del images
          del labels
          torch.cuda.empty_cache()
          # Only the first validation batch is scored each epoch.
          break
        if count:
          logs['val_loss'] /= count
          logs['accuracy'] /= count

      liveloss.update(logs)
      liveloss.send()

      if logs['val_loss'] < best_val_loss:
        best_val_loss = logs['val_loss']
        torch.save(self.model.state_dict(), "./BestVal.pth")
      if logs['accuracy'] > best_val_bleu:
        best_val_bleu = logs['accuracy']
        torch.save(self.model.state_dict(), "./BestAcc.pth")
      if sample_image is not None:
        # (C, H, W) -> (H, W, C) for matplotlib.
        plt.imshow(sample_image.permute(1, 2, 0))
        plt.show()
      print(f"E:{EPOCH}VL{round(logs['val_loss'], 3)}VA{round(logs['accuracy'], 3)}loss:{round(logs['loss'], 3)}, Sample Caption Created: {sample_caption}")

In [None]:
%%capture
# Build the solver (model, optimiser, scheduler) and move its parameters to the target device.
solver = ImageCaptioningSolver(device)
solver.to(device)

In [None]:
#solver.training_loop(TrainDataloader, ValDataloader, 120, display_every = 32)

In [None]:
# Restore previously trained weights.
# NOTE(review): training_loop writes BestVal.pth from self.model.state_dict(),
# whose keys lack the "model." prefix that solver.state_dict() uses. If this
# checkpoint came from that loop it presumably has to be loaded through
# solver.model.load_state_dict instead — confirm against the file.
solver.load_state_dict(torch.load("../input/image-captioned-trained/BestVal.pth", map_location = device))

In [None]:
# Export the solver-level state dict (keys carry the solver's attribute prefixes).
torch.save(solver.state_dict(), "./Captioner.pth")

In [None]:
# Persist the vocabulary list alongside the weights.
torch.save(vocabulary, "./vocab.pth")

In [None]:
!rm -rf val2014
!rm -rf annotations