In [1]:
import gc
gc.collect()

%reset -f

In [2]:
#for cleaning the GPU ram
import torch 
torch.cuda.empty_cache()

In [3]:
# path to notebook folder, use os.path.join to concat 
import os
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

## **This notebook is created for the original NSVD project.** 
### There are 4 main folders in this project as following:
1. Preprocess_dialogs 
2. program generator
3. executor 
4. data foldes
Among these folders we are going to bring number 2 to 3. 

## **Following codes belong to different .py files inside program_genarator**

## **clevrDialog_dataset.py**

In [4]:
# /kaggle/input/nsvd-dataset/caption

In [5]:
import h5py
import json
import os
import numpy as np
import torch
from torch.utils.data import Dataset

In [6]:
def invertDict(_dict):
    return {v: k for k, v in _dict.items()}

In [7]:
class ClevrDialogDataset(Dataset):
    def __init__(self, dataPath, vocabPath, split, indStart=0, indEnd=-1):
        super(ClevrDialogDataset, self).__init__()
        self.data = h5py.File(dataPath, "r")
        with open(vocabPath, "r") as f:
            self.vocab = json.load(f)
        self.vocab["idx_text_to_token"] = invertDict(self.vocab["text_token_to_idx"])
        self.vocab["idx_prog_to_token"] = invertDict(self.vocab["prog_token_to_idx"])
        self.vocab["idx_prog_to_token"] = invertDict(self.vocab["prog_token_to_idx"])
        self.lenVocabText = len(self.vocab["text_token_to_idx"])
        self.lenVocabProg = len(self.vocab["prog_token_to_idx"])

        self.split = split
        self.indStart = indStart
        self.indEnd = indEnd
        self.maxSamples = indEnd - indStart
        self.maxLenProg = 6

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, index):
        raise NotImplementedError

In [8]:
class ClevrDialogCaptionDataset(ClevrDialogDataset):
    def __init__(self, dataPath, vocabPath, split, name, indStart=0, indEnd=-1):
        super(ClevrDialogCaptionDataset, self).__init__(dataPath, vocabPath, split, indStart=indStart, indEnd=indEnd)
        self.captions = torch.LongTensor(np.asarray(self.data["captions"], dtype=np.int64)[indStart: indEnd])
        self.captionsPrgs = torch.LongTensor(np.asarray(self.data["captionProgs"], dtype=np.int64)[indStart: indEnd])
        self.name = name

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        assert idx < len(self)
        caption = self.captions[idx][:16]
        captionPrg = self.captionsPrgs[idx]
        return caption, captionPrg

## **models.py**

In [9]:
import torch
import math
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [10]:
class FC(nn.Module):
    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
        super(FC, self).__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu

        self.linear = nn.Linear(in_size, out_size)

        if use_relu:
            self.relu = nn.ReLU(inplace=True)

        if dropout_r > 0:
            self.dropout = nn.Dropout(dropout_r)

    def forward(self, x):
        x = self.linear(x)

        if self.use_relu:
            x = self.relu(x)

        if self.dropout_r > 0:
            x = self.dropout(x)

        return x


In [11]:
class MLP(nn.Module):
    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
        super(MLP, self).__init__()

        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
        self.linear = nn.Linear(mid_size, out_size)

    def forward(self, x):
        return self.linear(self.fc(x))

In [12]:
class LayerNorm(nn.Module):
    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.eps = eps

        self.a_2 = nn.Parameter(torch.ones(size))
        self.b_2 = nn.Parameter(torch.zeros(size))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [13]:
class MHAtt(nn.Module):
    def __init__(self, opts):
        super(MHAtt, self).__init__()
        self.opts = opts

        self.linear_v = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_k = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_q = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_merge = nn.Linear(opts.hiddenDim, opts.hiddenDim)

        self.dropout = nn.Dropout(opts.dropout)

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)

        v = self.linear_v(v).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        k = self.linear_k(k).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        q = self.linear_q(q).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        atted = self.att(v, k, q, mask)
        atted = atted.transpose(1, 2).contiguous().view(
            n_batches,
            -1,
            self.opts.hiddenDim
        )

        atted = self.linear_merge(atted)

        return atted

    def att(self, value, key, query, mask):
        d_k = query.size(-1)

        scores = torch.matmul(
            query, key.transpose(-2, -1)
        ) / math.sqrt(d_k)

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)

        att_map = F.softmax(scores, dim=-1)
        att_map = self.dropout(att_map)

        return torch.matmul(att_map, value)

In [14]:
class FFN(nn.Module):
    def __init__(self, opts):
        super(FFN, self).__init__()

        self.mlp = MLP(
            in_size=opts.hiddenDim,
            mid_size=opts.FeedForwardSize,
            out_size=opts.hiddenDim,
            dropout_r=opts.dropout,
            use_relu=True
        )

    def forward(self, x):
        return self.mlp(x)


In [15]:
class SA(nn.Module):
    def __init__(self, opts):
        super(SA, self).__init__()
        self.mhatt = MHAtt(opts)
        self.ffn = FFN(opts)

        self.dropout1 = nn.Dropout(opts.dropout)
        self.norm1 = LayerNorm(opts.hiddenDim)

        self.dropout2 = nn.Dropout(opts.dropout)
        self.norm2 = LayerNorm(opts.hiddenDim)

    def forward(self, x, x_mask):
        x = self.norm1(x + self.dropout1(
            self.mhatt(x, x, x, x_mask)
        ))

        x = self.norm2(x + self.dropout2(
            self.ffn(x)
        ))

        return x

In [16]:
class AttFlat(nn.Module):
    def __init__(self, opts):
        super(AttFlat, self).__init__()
        self.opts = opts

        self.mlp = MLP(
            in_size=opts.hiddenDim,
            mid_size=opts.FlatMLPSize,
            out_size=opts.FlatGlimpses,
            dropout_r=opts.dropout,
            use_relu=True
        )
        # FLAT_GLIMPSES = 1
        self.linear_merge = nn.Linear(
            opts.hiddenDim * opts.FlatGlimpses,
            opts.FlatOutSize
        )

    def forward(self, x, x_mask):
        att = self.mlp(x)
        att = att.masked_fill(
            x_mask.squeeze(1).squeeze(1).unsqueeze(2),
            -1e9
        )
        att = F.softmax(att, dim=1)

        att_list = []
        for i in range(self.opts.FlatGlimpses):
            att_list.append(
                torch.sum(att[:, :, i: i + 1] * x, dim=1)
            )

        x_atted = torch.cat(att_list, dim=1)
        x_atted = self.linear_merge(x_atted)

        return x_atted

In [17]:
class CaptionEncoder(nn.Module):
    def __init__(self, opts, textVocabSize):
        super(CaptionEncoder, self).__init__()
        self.embedding = nn.Embedding(textVocabSize, opts.embedDim)
        bidirectional = opts.bidirectional > 0
        self.lstmC = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            bidirectional=bidirectional
        )
        if bidirectional:
            opts.hiddenDim *= 2
            opts.hiddenSizeHead *= 2
            opts.FlatOutSize *= 2

        self.attCap = nn.ModuleList([SA(opts) for _ in range(opts.layers)])
        self.attFlatCap = AttFlat(opts)
        self.fc = nn.Linear(opts.hiddenDim, opts.hiddenDim)

    def forward(self, cap, hist=None):
        capMask = self.make_mask(cap.unsqueeze(2))
        cap = self.embedding(cap)
        cap, (_, _) = self.lstmC(cap)
        capO = cap.detach().clone()

        for attC in self.attCap:
            cap = attC(cap, capMask)
        # (batchSize, 512)
        cap = self.attFlatCap(cap, capMask)
        encOut = self.fc(cap)
        return encOut, capO
    
    # Masking
    def make_mask(self, feature):
        return (torch.sum(
            torch.abs(feature),
            dim=-1
        ) == 0).unsqueeze(1).unsqueeze(2)

In [18]:
from itertools import chain # needed for preprocessing the output of the local decode function -> added by Sepi

In [19]:
class Decoder(nn.Module):
    def __init__(self, opts, progVocabSize, maxLen, startID=1, endID=2):
        super(Decoder, self).__init__()
        self.numLayers = opts.numLayers
        self.bidirectional = opts.bidirectional > 0
        self.maxLen = maxLen
        self.startID = startID
        self.endID = endID

        self.embedding = nn.Embedding(progVocabSize, opts.embedDim)
        self.lstmProg = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=2*opts.hiddenDim if self.bidirectional else opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            # bidirectional=self.bidirectional,
        )
        hiddenDim = opts.hiddenDim
        if self.bidirectional:
            hiddenDim *= 2

        self.fcAtt = nn.Linear(2*hiddenDim, hiddenDim)
        self.fcOut = nn.Linear(hiddenDim, progVocabSize)

    def initPrgHidden(self, encOut):
        hidden = [encOut for _ in range(self.numLayers)]
        hidden = torch.stack(hidden, 0).contiguous()
        return hidden, hidden

    def forwardStep(self, prog, progH, questO):
        #**********************************************our error relates to this prog cause in our case it is not acting as tensor anymore.
        batchSize = prog.size(0)
        inputDim = questO.size(1)
        prog = self.embedding(prog)
        outProg, progH = self.lstmProg(prog, progH)

        att = torch.bmm(outProg, questO.transpose(1, 2))
        att = F.softmax(att.view(-1, inputDim), 1).view(batchSize, -1, inputDim)
        context = torch.bmm(att, questO)
        # (batchSize, progLength, hiddenDim)
        out = F.tanh(self.fcAtt(torch.cat([outProg, context], dim=-1)))

        # (batchSize, progLength, progVocabSize)
        out = self.fcOut(out)
        predSoftmax = F.log_softmax(out, 2)
        return predSoftmax, progH

    def forward(self, prog, encOut, questO):
        progH = self.initPrgHidden(encOut)
        predSoftmax, progH = self.forwardStep(prog, progH, questO)

        return predSoftmax, progH

    def sample(self, encOut, questO):
        batchSize = encOut.size(0)
        cudaFlag = encOut.is_cuda
        progH = self.initPrgHidden(encOut)
        # prog = progCopy[:, 0:3]
        prog = torch.LongTensor(batchSize, 1).fill_(self.startID)
        # prog = torch.cat((progStart, progEnd), -1)
        if cudaFlag:
            prog = prog.cuda()
        outputLogProbs = []
        outputTokens = []
     

        def decode(i, output):
            tokens = output.topk(1, dim=-1)[1].view(batchSize, -1)
            return tokens

        for i in range(self.maxLen):
            predSoftmax, progH = self.forwardStep(prog, progH, questO)
            prog = decode(i, predSoftmax)
            prog_flat = list(chain(*prog))
            flat_list = [item.item() for item in prog_flat]
            outputTokens.append(flat_list)#new
        return outputTokens, outputLogProbs


In [20]:
class SeqToSeqC(nn.Module):
    def __init__(self, encoder, decoder):
        super(SeqToSeqC, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, cap, prog):
        encOut, capO = self.encoder(cap)
        predSoftmax, progHC = self.decoder(prog, encOut, capO)
        return predSoftmax, progHC
   
    def sample(self, cap):
        with torch.no_grad():
            encOut, capO = self.encoder(cap)
        outputTokens, outputLogProbs = self.decoder.sample(encOut, capO)

        outputTokens_t = [[row[i] for row in outputTokens] for i in range(len(outputTokens[0]))]
        #return outputTokens
        return outputTokens_t

## **optim.py**

In [21]:
import torch
import torch.optim as Optim

In [22]:
class WarmupOptimizer(object):
    def __init__(self, lr_base, optimizer, data_size, batch_size):
        self.optimizer = optimizer
        self._step = 0
        self.lr_base = lr_base
        self._rate = 0
        self.data_size = data_size
        self.batch_size = batch_size

    def step(self):
        self._step += 1

        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate

        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def rate(self, step=None):
        if step is None:
            step = self._step

        if step <= int(self.data_size / self.batch_size * 1):
            r = self.lr_base * 1/2.
        else:
            r = self.lr_base

        return r


def get_optim(opts, model, data_size, lr_base=None):
    if lr_base is None:
        lr_base = opts.lr

    if opts.optim == 'adam':
        optim = Optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=0,
                betas=opts.betas,
                eps=opts.eps,

            )
    elif opts.optim == 'rmsprop':
        optim = Optim.RMSprop(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=0,
                eps=opts.eps,
                weight_decay=opts.weight_decay
            )
    else:
        raise ValueError('{} optimizer is not supported'.fromat(opts.optim))
    return WarmupOptimizer(
        lr_base,
        optim,
        data_size,
        opts.batch_size
    )

def adjust_lr(optim, decay_r):
    optim.lr_base *= decay_r


## **option_caption_parser.py**

In [23]:
import argparse
import os
import torch
#import utils_m

In [24]:
#TOTAL_ITER = 5000
TOTAL_ITER = 5000
VALID_EVE = 1000
#VALID_EVE = 1000

In [25]:
class Options_c():#changed optiopn class as Option_c to differentiate it with the one belong to question
    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.initialized = False

    def initialize(self):
        self.parser.add_argument(
            '--mode',
            default="train",
            # required=True,
            type=str,
            choices=['train', 'test'],
            help='The mode of the experiment')

        self.parser.add_argument(
            '--run_dir',
            default="kaggle/working",
            # required=True,
            type=str,
            help='The experiment directory')

        self.parser.add_argument(
            '--load_checkpoint_path',
            default=None,
            type=str,
            help='The path the the pretrained CaptionNet')

        self.parser.add_argument(
            '--res_path',
            default="kaggle/working/res.txt",#***
            # required=True,
            type=str,
            help='Path where to log the predicted caption programs')

        self.parser.add_argument(
            '--gpu_ids',
            default='0',
            type=str,
            help='Id of the gpu to be used')

        self.parser.add_argument(
            '--seed',
            default=42,
            type=int,
            help='The seed used in training')

        self.parser.add_argument(
            '--dataPathTr',
            # required=True,
            default = '/kaggle/input/Small_Tr_Val_Test_Final/cap_tr_half.h5',
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed training data')

        self.parser.add_argument(
            '--dataPathVal',
            default = '/kaggle/input/Small_Tr_Val_Test_Final/cap_val_half.h5',
            # required=True,
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed validation data')

        self.parser.add_argument(
            '--dataPathTest',
            # required=True,
            default ='/kaggle/input/Small_Tr_Val_Test_Final/cap_test_75000.h5',
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed test data')

        self.parser.add_argument(
            '--vocabPath',
            default = '/kaggle/input/caption/vocab_output_caption.json',

            # required=True,
            type=str,
            help='Path to the generated vocabulary')

        self.parser.add_argument(
            '--batch_size',
            default=64,
            type=int,
            help='Batch size')

        self.parser.add_argument(
            '--num_workers',
            default=0,
            type=int,
            help='Number of workers for loading')

        self.parser.add_argument(
            '--num_iters',
            #default=5000,
            default=TOTAL_ITER,
            type=int,
            help='Total number of iterations')

        self.parser.add_argument(
            '--display_every',
            default=5,
            type=int,
            help='Display training information every N iterations')

        self.parser.add_argument(
            '--debug_every',
            default=100,
            type=int,
            help='Display debug message every N iterations')

        self.parser.add_argument(
            '--validate_every',
            default=VALID_EVE,
            type=int,
            help='Validate every N iterations')

        self.parser.add_argument(
            '--shuffle_data',
            default=1,
            type=int,
            help='Activate to shuffle the training data')

        self.parser.add_argument(
            '--optim',
            default='adam',
            type=str,
            help='The name of the optimizer to be used')

        self.parser.add_argument(
            '--lr',
            default=1e-3,
            type=float,
            help='Base learning rate')

        self.parser.add_argument(
            '--betas',
            default='0.9, 0.98',
            type=str,
            help='Adam optimizer\'s betas')

        self.parser.add_argument(
            '--eps',
            default='1e-9',
            type=float,
            help='Adam optimizer\'s epsilon')

        self.parser.add_argument(
            '--lr_decay_marks',
            default='50000, 55000',
            type=str,
            help='Learing rate decay marks')

        self.parser.add_argument(
            '--lr_decay_factor',
            default=0.5,
            type=float,
            help='Learning rate decay factor')

        self.parser.add_argument(
            '--weight_decay',
            default=1e-6,
            type=float,
            help='Weight decay')

        self.parser.add_argument(
            '--embedDim',
            default=300,
            type=int,
            help='Embedding dimension')

        self.parser.add_argument(
            '--hiddenDim',
            default=512,
            type=int,
            help='LSTM hidden dimension')

        self.parser.add_argument(
            '--numLayers',
            default=2,
            type=int,
            help='Number of hidden LSTM layers')

        self.parser.add_argument(
            '--dropout',
            default=0.1,
            type=float,
            help='Dropout value')

        self.parser.add_argument(
            '--multiHead',
            default=8,
            type=int,
            help='Number of attention heads')

        self.parser.add_argument(
            '--hiddenSizeHead',
            default=64,
            type=int,
            help='Dimension of each attention head')

        self.parser.add_argument(
            '--FeedForwardSize',
            default=2048,
            type=int,
            help='Dimension of the feed forward layer')

        self.parser.add_argument(
            '--FlatMLPSize',
            default=512,
            type=int,
            help='MLP flatten size')

        self.parser.add_argument(
            '--FlatGlimpses',
            default=1,
            type=int,
            help='Number of flatten glimpses')

        self.parser.add_argument(
            '--FlatOutSize',
            default=512,
            type=int,
            help='Final attention reduction dimension')

        self.parser.add_argument(
            '--layers',
            default=6,
            type=int,
            help='Number of self attention layers')

        self.parser.add_argument(
            '--bidirectional',
            default=1,
            type=int,
            help='Activate to use bidirectional LSTMs')

        self.initialized = True

    def parse(self):
        # initialize parser
        if not self.initialized:
            self.initialize()
       # self.opts = self.parser.parse_args()
        self.opts, unknown = self.parser.parse_known_args()#this is added by me to fix the error of command line arguments.

        # parse gpu id list
        str_gpu_ids = self.opts.gpu_ids.split(',')
        self.opts.gpu_ids = []
        for str_id in str_gpu_ids:
            if str_id.isdigit() and int(str_id) >= 0:
                self.opts.gpu_ids.append(int(str_id))
        if len(self.opts.gpu_ids) > 0 and torch.cuda.is_available():
            print('\n[INFO] Using {} CUDA device(s) ...'.format(len(self.opts.gpu_ids)))
        else:
            print('\n[INFO] Using cpu ...')
            self.opts.gpu_ids = []

        # parse the optimizer's betas and lr decay marks
        self.opts.betas = [float(beta) for beta in self.opts.betas.split(',')]
        lr_decay_marks = [int(m) for m in self.opts.lr_decay_marks.split(',')]
        for i in range(1, len(lr_decay_marks)):
            assert lr_decay_marks[i] > lr_decay_marks[i-1]
        self.opts.lr_decay_marks = lr_decay_marks

        # print and save options
        args = vars(self.opts)
        print('\n ' + 30*'-' + 'Opts' + 30*'-')
        for k, v in args.items():
            print('%s: %s' % (str(k), str(v)))

        if not os.path.isdir(self.opts.run_dir):
            os.makedirs(self.opts.run_dir)
        filename = 'opts_c.txt'
        file_path = os.path.join(self.opts.run_dir, filename)
        with open(file_path, 'wt') as fout:
            fout.write('| options\n')
            for k, v in sorted(args.items()):
                fout.write('%s: %s\n' % (str(k), str(v)))
        return self.opts


## **train_caption_parser.py**

In [26]:
#from clevrDialog_dataset import ClevrDialogCaptionDataset
#from models import SeqToSeqC, CaptionEncoder, Decoder
#from optim import get_optim, adjust_lr
#from options_caption_parser import Options
import os, json, torch, pickle, copy, time
import numpy as np
import torch.nn as nn
import torch.utils.data as Data
from tensorboardX import SummaryWriter

In [27]:
import sys
class Execution:
    def __init__(self, opts):
        self.opts = opts

        self.loss_fn = torch.nn.NLLLoss().cuda()
        print("[INFO] Loading dataset ...")

        self.dataset_tr = ClevrDialogCaptionDataset(
            opts.dataPathTr, opts.vocabPath, "train", "Captions Tr")

        self.dataset_val = ClevrDialogCaptionDataset(
            opts.dataPathVal, opts.vocabPath, "val", "Captions Val")

        self.dataset_test = ClevrDialogCaptionDataset(
           opts.dataPathTest, opts.vocabPath, "test", "Captions Test")

        tb_path = os.path.join(opts.run_dir, "tb_logdir")
        if not os.path.isdir(tb_path):
            os.makedirs(tb_path)

        self.ckpt_path = os.path.join(opts.run_dir, "ckpt_dir")
        if not os.path.isdir(self.ckpt_path):
            os.makedirs(self.ckpt_path)

        self.writer = SummaryWriter(tb_path)
        self.iter_val = 0
        self.bestValAcc = float("-inf")
        self.bestValIter = -1

    def constructNet(self, lenVocabText, lenVocabProg, maxLenProg, ):
        decoder = Decoder(self.opts, lenVocabProg, maxLenProg)
        encoder = CaptionEncoder(self.opts, lenVocabText)
        net = SeqToSeqC(encoder, decoder)
        return net

    def train(self, dataset, dataset_val=None):
        # Obtain needed information
        lenVocabText = dataset.lenVocabText
        lenVocabProg = dataset.lenVocabProg
        maxLenProg = dataset.maxLenProg
        net = self.constructNet(lenVocabText, lenVocabProg, maxLenProg)

        net.cuda()
        net.train()

        # Define the multi-gpu training if needed
        if len(self.opts.gpu_ids) > 1:
            net = nn.DataParallel(net, device_ids=self.opts.gpu_ids)

        # Load checkpoint if resume training
        if self.opts.load_checkpoint_path is not None:
            print("[INFO] Resume trainig from ckpt {} ...".format(
                self.opts.load_checkpoint_path
            ))

            # Load the network parameters
            ckpt = torch.load(self.opts.load_checkpoint_path)
            print("[INFO] Checkpoint successfully loaded ...")
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer paramters
            optim = get_optim(self.opts, net, len(dataset), lr_base=ckpt['lr_base'])
            optim.optimizer.load_state_dict(ckpt['optimizer'])

        else:
            optim = get_optim(self.opts, net, len(dataset))
        _iter = 0
        epoch = 0

        # Define dataloader
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.opts.batch_size,
            shuffle=self.opts.shuffle_data,
            num_workers=self.opts.num_workers,
        )
        _iterCur = 0
        _totalCur = len(dataloader)
        # Training loop
        while _iter < self.opts.num_iters:
            # Learning Rate Decay
            if _iter in self.opts.lr_decay_marks:
                adjust_lr(optim, self.opts.lr_decay_factor)

            time_start = time.time()
            # Iteration
            for caption, captionPrg in dataloader:
                if _iter >= self.opts.num_iters:
                    break
                caption = caption.cuda()
                captionPrg = captionPrg.cuda()
                captionPrgTarget = captionPrg.clone()
                optim.zero_grad()

                predSoftmax, _ = net(caption, captionPrg)

                loss = self.loss_fn(
                    predSoftmax[:, :-1, :].contiguous().view(-1, predSoftmax.size(2)),
                    captionPrgTarget[:, 1:].contiguous().view(-1))
                loss.backward()

                # logging
                self.writer.add_scalar(
                    'train/loss',
                    loss.cpu().data.numpy(),
                    global_step=_iter)

                self.writer.add_scalar(
                    'train/lr',
                    optim._rate,
                    global_step=_iter)
                if _iter % self.opts.display_every == 0:
                    print("\r[CLEVR-Dialog - %s (%d/%4d)][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" % (
                            dataset.name,
                            _iterCur,
                            _totalCur,
                            epoch,
                            _iter,
                            self.opts.num_iters,
                            loss.cpu().data.numpy(),
                            optim._rate,
                        ), end='          ')
                optim.step()
                _iter += 1
                _iterCur += 1

                if _iter % self.opts.validate_every == 0:
                    if dataset_val is not None:
                        valAcc = self.eval(
                            net,
                            dataset_val,
                            valid=True,
                        )
                        if valAcc > self.bestValAcc:
                            self.bestValAcc = valAcc
                            self.bestValIter = _iter

                            print("[INFO] Checkpointing model @ iter {}".format(_iter))
                            state = {
                                'state_dict': net.state_dict(),
                                'optimizer': optim.optimizer.state_dict(),
                                'lr_base': optim.lr_base,
                                'optim': optim.lr_base,
                                'last_iter': _iter,
                                'last_epoch': epoch,
                            }
                            # checkpointing
                            torch.save(
                                state,
                                os.path.join(self.ckpt_path, 'ckpt_iter' + str(_iter) + '.pkl')
                            )
                    else:
                        print("[INFO] No validation dataset available")

            time_end = time.time()
            print('Finished epoch in {}s'.format(int(time_end-time_start)))
            epoch += 1

        print("[INFO] Training done. Best model had val acc. {} @ iter {}...".format(self.bestValAcc, self.bestValIter))

    # Evaluation
    def eval(self, net, dataset, valid=False):
        net = net.eval()
        data_size = len(dataset)
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.opts.batch_size,
            shuffle=False,
            num_workers=self.opts.num_workers,
            pin_memory=False
        )
        allPredictedProgs = []
        numAllProg = 0
        falsePred = 0
        for step, (caption, captionPrg) in enumerate(dataloader):
            print("\rEvaluation: [step %4d/%4d]" % (
                step,
                int(data_size / self.opts.batch_size),
            ), end='          ')
            sys.stdout.flush()#my shit***************************
            caption = caption.cuda()
            captionPrg = captionPrg.cuda()
          
            tokens = net.sample(caption)
            targetProgs = decodeProg(captionPrg, dataset.vocab["idx_prog_to_token"], target=True)
            predProgs = decodeProg(tokens, dataset.vocab["idx_prog_to_token"])
            predProgs = [sublist for sublist in predProgs if sublist]
            allPredictedProgs.extend(list(map(lambda s: "( {} ( {} ) ) \n".format(s[0], ", ".join(s[1:])), predProgs)))
                              
                                
            numAllProg += len(targetProgs)
            for targetProg, predProg in zip(targetProgs, predProgs):
                mainMod = targetProg[0] == predProg[0]
                sameLength = len(targetProg) == len(predProg)
                sameArgs = False
                if sameLength:
                    sameArgs = True
                    for argTarget in targetProg[1:]:
                        if argTarget not in predProg[1:]:
                            sameArgs = False
                            break

                if not (mainMod and sameArgs):
                    falsePred += 1
        val_acc = (1 - (falsePred / numAllProg)) * 100.0
        print("Acc: {}".format(val_acc))
        net = net.train()
        if not valid:
            with open(self.opts.res_path, "w") as f:
                f.writelines(allPredictedProgs)
            print("[INFO] Predicted caption programs logged into {}".format(self.opts.res_path))
        return val_acc

    def run(self, run_mode):
        self.set_seed(self.opts.seed)
        if run_mode == 'train':
            self.train(self.dataset_tr, self.dataset_val)

        elif run_mode == 'test':
            lenVocabText = self.dataset_test.lenVocabText
            lenVocabProg = self.dataset_test.lenVocabProg
            maxLenProg = self.dataset_test.maxLenProg
            net = self.constructNet(lenVocabText, lenVocabProg, maxLenProg)

            print('Loading ckpt {}'.format(self.opts.load_checkpoint_path))
            state_dict = torch.load(self.opts.load_checkpoint_path)['state_dict']
            net.load_state_dict(state_dict)
            net.cuda()
            self.eval(net, self.dataset_test)

        else:
            exit(-1)

    def set_seed(self, seed):
        """Sets the seed for reproducibility.
        Args:
            seed (int): The seed used
        """
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        print('[INFO] Seed set to {}...'.format(seed))


def decodeProg(tokens, prgIdxToToken, target=False):
    
    if (target == True):
        tokensBatch = tokens.tolist()
    else:
        tokensBatch = tokens
    #print("want to see what happens to tokens in decodeProg", tokensBatch)
    progsBatch = []
    for tokens in tokensBatch:
        #print("tokens inside the first for loop in decodeProg", tokens)
        prog = []
        for tok in tokens:
            if tok == 2:  # <END> has index 2
                break
            
            prog.append(prgIdxToToken.get(tok))
          
           
        if target:
            #print("tuye if")
            prog = prog[1:]
        progsBatch.append(prog)
    return progsBatch





In [28]:
##### #__name__ == "__main__":
opts = Options_c().parse()





[INFO] Using 1 CUDA device(s) ...

 ------------------------------Opts------------------------------
mode: train
run_dir: kaggle/working
load_checkpoint_path: None
res_path: kaggle/working/res.txt
gpu_ids: [0]
seed: 42
dataPathTr: /kaggle/input/Small_Tr_Val_Test_Final/cap_tr_half.h5
dataPathVal: /kaggle/input/Small_Tr_Val_Test_Final/cap_val_half.h5
dataPathTest: /kaggle/input/Small_Tr_Val_Test_Final/cap_test_75000.h5
vocabPath: /kaggle/input/caption/vocab_output_caption.json
batch_size: 64
num_workers: 0
num_iters: 5000
display_every: 5
debug_every: 100
validate_every: 1000
shuffle_data: 1
optim: adam
lr: 0.001
betas: [0.9, 0.98]
eps: 1e-09
lr_decay_marks: [50000, 55000]
lr_decay_factor: 0.5
weight_decay: 1e-06
embedDim: 300
hiddenDim: 512
numLayers: 2
dropout: 0.1
multiHead: 8
hiddenSizeHead: 64
FeedForwardSize: 2048
FlatMLPSize: 512
FlatGlimpses: 1
FlatOutSize: 512
layers: 6
bidirectional: 1


In [29]:
exe = Execution(opts)

[INFO] Loading dataset ...


In [30]:
exe.run(opts.mode)

[INFO] Seed set to 42...
Evaluation: [step    1/   1]          ][epoch  0][iter  995/5000] loss: 0.5769, lr: 5.00e-04          Acc: 75.0
[INFO] Checkpointing model @ iter 1000
Evaluation: [step    1/   1]          )][epoch  0][iter 1995/5000] loss: 0.5720, lr: 5.00e-04          Acc: 75.80645161290323
[INFO] Checkpointing model @ iter 2000
[CLEVR-Dialog - Captions Tr (2730/2733)][epoch  0][iter 2730/5000] loss: 0.5663, lr: 5.00e-04          Finished epoch in 805s
Evaluation: [step    1/   1]          )][epoch  1][iter 2995/5000] loss: 0.5704, lr: 1.00e-03          Acc: 72.58064516129032
Evaluation: [step    1/   1]          )][epoch  1][iter 3995/5000] loss: 0.5614, lr: 1.00e-03          Acc: 75.0
Evaluation: [step    1/   1]          )][epoch  1][iter 4995/5000] loss: 0.5499, lr: 1.00e-03          Acc: 75.80645161290323
Finished epoch in 671s
[INFO] Training done. Best model had val acc. 75.80645161290323 @ iter 2000...


In [31]:
print("[INFO] Done ...")

[INFO] Done ...
