### This notebook translates images to InChI strings using a single network for the whole string

In [1]:
%load_ext tensorboard
%load_ext autoreload
%autoreload 2

import torch, torchmetrics, timm, re, pickle, Levenshtein, math
import torch.nn as nn
import torchvision as tv
import pytorch_lightning as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import albumentations as A
from torch.nn.utils.rnn import pack_padded_sequence
from pathlib import Path
from functools import partial
from collections import defaultdict
from fastprogress import progress_bar
from typing import Optional, Union, Tuple
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from albumentations.pytorch import ToTensorV2
from preprocessing import preprocess_image

from inchi_utils import *

# Fix the random seed so that repeated runs of this notebook are reproducible.
manualSeed = 999
#manualSeed = random.randint(1, 10000) # use if you want new results (needs `import random`, which this cell does not import)
print("Random Seed: ", manualSeed)
torch.manual_seed(manualSeed);

# NOTE(review): seed_everything presumably seeds torch as well, which would make
# the torch.manual_seed call above redundant -- confirm against the Lightning docs.
pl.seed_everything(manualSeed)

# This monkey-patch is there to be able to plot tensors: it exposes `ndim` on every
# tensor as the length of its shape.
# NOTE(review): recent torch releases already define Tensor.ndim -- confirm whether this is still needed.
torch.Tensor.ndim = property(lambda x: len(x.shape))

Global seed set to 999


Random Seed:  999


In [2]:
# --- Paths -------------------------------------------------------------------
# Everything this notebook writes (vocab, cached path lists, checkpoints, TensorBoard logs)
# goes under CHKPTDIR.
CHKPTDIR = Path("TONE2Chkpts")
# Root of the competition data; the data module globs DATADIR/train and DATADIR/test for images.
DATADIR = "data/bms-molecular-translation"
# NOTE(review): the `!ls {DATADIR}` output in the next cell lists train_labels.csv inside DATADIR,
# while this path points one directory higher -- confirm the file really exists at this location.
LABELS_CSV_PATH = "data/train_labels.csv"
VOCAB_FILEPATH  = CHKPTDIR/"vocab_dict.pt"        # saved vocabulary (created on first run)
TRAINPATHS_PATH = CHKPTDIR/"train_paths.feather"  # cached list of training image paths
TESTPATHS_PATH  = CHKPTDIR/"test_paths.feather"   # cached list of test image paths
MODEL_SAVEPATH  = CHKPTDIR/"saved_model.ckpt"     # checkpoint written after trainer.fit
CHKPTDIR.mkdir(parents=True, exist_ok=True)

# TensorBoard logger shared by the data module and the Trainer.
tb_logger = pl.loggers.TensorBoardLogger(CHKPTDIR, name="OneInchINet")

# --- Hyper-parameters --------------------------------------------------------
N_WORKERS = 4            # DataLoader worker processes
BATCH_SIZE = 64          # batch size for the train and validation loaders
PRECISION = 16           # passed to pl.Trainer(precision=...)
TRUNCATE_SEQ_TO = None # Overriding max length: when not None it replaces the vocab's max_len and is passed to the tokenizer as pad_to_custom_len
EMB_SIZE = 256           # NOTE(review): not referenced by any code visible in this notebook
INP_SIZE = (128, 128)    # image size requested from preprocess_image; Encoder hard-codes 16*16 output pixels to match
LR = 1e-3                # Adam learning rate used in configure_optimizers
EPOCHS = 1               # passed to pl.Trainer(max_epochs=...)

In [3]:
!ls {DATADIR}

sample_submission.csv  test  train  train_labels.csv


# Lightning Data Module
### LightningDataModule API

To define a DataModule, define 5 methods:
1. prepare_data (how to download(), tokenize, etc…)
2. setup (how to split, etc…)
3. train_dataloader
4. val_dataloader(s)
5. test_dataloader(s)

#### prepare_data
Use this method to do things that might write to disk or that need to be done only from a single process in distributed settings.
1. download
2. tokenize
3. etc…

#### setup
There are also data operations you might want to perform on every GPU. Use setup to do things like:
1. count number of classes
2. build vocabulary
3. perform train/val/test splits
4. apply transforms (defined explicitly in your datamodule or assigned in init)
5. etc…


In [4]:
class ImgtoInChIDataset(Dataset):
    """Map-style dataset yielding ``(image, InChI string)`` pairs.

    :param paths: list of image file paths; the file stem is used as the image id
    :param df: labels frame with ``image_id`` and ``InChI`` columns. When omitted
        (test time) every image id resolves to the string ``"test_placeholder"``.
    :param tsfms: optional albumentations pipeline applied to every image
    """
    def __init__(self, paths:list, df:pd.DataFrame=None, tsfms:A.Compose=None) -> None:
        self.paths = paths
        self.tsfms = tsfms
        if df is None:
            # Test time placeholder: any id looks up to the same dummy label
            self.idtoinchi_dict = defaultdict(lambda : "test_placeholder", {})
        else:
            image_ids = df["image_id"].values.tolist()
            inchis = df["InChI"].values.tolist()
            self.idtoinchi_dict = dict(zip(image_ids, inchis))

    def __len__(self) -> int:
        """Number of images in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx:int) -> Tuple[torch.Tensor, str]:
        """Load image ``idx``, scale it by 1/255 as float32, transform it and return it with its label."""
        path = self.paths[idx]
        image_id = Path(path).stem
        pixels = preprocess_image(path, out_size=INP_SIZE)
        image = np.array(pixels, dtype=np.float32)/255.
        if self.tsfms is not None:
            image = self.tsfms(image=image)["image"]

        return image, self.idtoinchi_dict[image_id]

    
class ImgToInChIDataModuleONE(pl.LightningDataModule):
    """Data module that serves (image, tokenized InChI) batches for training and validation.

    :param tb_logger: TensorBoard logger kept on the instance for optional sample logging
    :param valset_ratio: fraction of the training paths held out for validation
    """
    def __init__(self, tb_logger, valset_ratio=0.1) -> None:
        super().__init__()
        self.tb_logger = tb_logger
        self.valset_ratio = valset_ratio
        self.dims = (1, *INP_SIZE)
        
        # NOTE(review): flipping mirrors the drawn molecule; for chiral molecules the InChI
        # label may no longer match the flipped image -- confirm this augmentation is wanted.
        self.train_tsfms = A.Compose([
            A.Flip(),
#             A.Resize(*INP_SIZE, always_apply=True),
#             A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
#             A.RandomCrop(*INP_SIZE),
#             A.RandomBrightnessContrast(p=0.5),
#             A.Normalize(mean=(0.5), std=(0.229)),
            ToTensorV2(),
        ])
        self.test_tsfms = A.Compose([
#             A.Resize(*INP_SIZE, always_apply=True),
#             A.Normalize(mean=(0.5), std=(0.229)),
            ToTensorV2(),
        ])

    @staticmethod
    def _load_or_index_paths(split:str, cache_path:Path, verbose:bool=True) -> list:
        """Return the image paths of ``split`` ("train" or "test") as a list of strings.

        The list is read from the feather cache when it exists; otherwise the split
        directory under DATADIR is globbed, the sorted result is cached and returned.
        Sorting makes the cached order independent of filesystem enumeration order.
        """
        column = f"{split}_paths"
        if cache_path.exists():
            return pd.read_feather(cache_path)[column].tolist()

        if verbose: print(f"{split.capitalize()} paths file not found. Creating...", end=' ')
        paths = sorted(str(p) for p in (Path(DATADIR)/split).rglob("*.*"))
        pd.DataFrame({column: paths}).to_feather(cache_path)
        return paths
        
    def prepare_data(self, verbose:bool=True) -> None:
        """Load labels, image path lists and the tokenizer, creating the on-disk caches if missing.

        Sets ``df``, ``train_paths``, ``test_paths``, ``tokenizer``, ``vocab_size`` and ``max_len``.
        The notebook calls this method explicitly before building the model because the model
        constructor needs ``vocab_size`` and ``max_len``.

        :param verbose: print progress messages
        """
        # Load labels in DataFrame
        if verbose: print("Loading labels data...", end=' ')
        self.df = pd.read_csv(LABELS_CSV_PATH)
        if verbose: print("DONE!")
        
        # Load image paths
        if verbose: print("Loading paths...", end=' ')
        self.train_paths = self._load_or_index_paths("train", TRAINPATHS_PATH, verbose)
        self.test_paths = self._load_or_index_paths("test", TESTPATHS_PATH, verbose)
        if verbose: print("DONE!")
        
        # Get Vocab and Tokenizer
        if verbose: print("Loading vocab and tokenizer...", end=' ')
        if VOCAB_FILEPATH.exists():
            self.tokenizer = Tokenizer.from_file(VOCAB_FILEPATH)
        else:
            if verbose: print("Vocab file not found. Creating...", end=' ')
            vocab = VocabONE.from_inchi_pandas_column(self.df.InChI)
            vocab.save_vocab(VOCAB_FILEPATH)
            self.tokenizer = Tokenizer(vocab)
        
        self.vocab_size = len(self.tokenizer.vocab)
        if TRUNCATE_SEQ_TO is not None:
            # A configured truncation length overrides the length stored in the vocab
            self.max_len = TRUNCATE_SEQ_TO
            self.tokenizer.vocab.max_len = TRUNCATE_SEQ_TO
        else:
            self.max_len = self.tokenizer.vocab.max_len
        if verbose: print("DONE!")
                
    def setup(self, stage:Optional[str]=None) -> None:
        """Build the datasets for the requested stage ('fit', 'test' or None for both)."""
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            trainpaths, valpaths = train_test_split(self.train_paths, test_size=self.valset_ratio)
            self.trainset = ImgtoInChIDataset(trainpaths, self.df, self.train_tsfms)
            self.valset = ImgtoInChIDataset(valpaths, self.df, self.test_tsfms)
            
#             # Sample batch
#             imgs, inp_seqs, attn_masks, cap_lens = next(iter(self.train_dataloader()))
#             self.tb_logger.experiment.add_images("Sample images", imgs)

        # Assign test dataset for use in dataloader(s)
        # NOTE(review): no test_dataloader is defined in this class, so this dataset is not served yet.
        if stage == 'test' or stage is None:
            self.testset = ImgtoInChIDataset(self.test_paths, tsfms=self.test_tsfms)

    def train_dataloader(self) -> DataLoader:
        """Shuffled loader over the training split."""
        return DataLoader(self.trainset, BATCH_SIZE, shuffle=True, collate_fn=self.collate_fn, num_workers=N_WORKERS)

    def val_dataloader(self) -> DataLoader:
        """Unshuffled loader over the validation split."""
        return DataLoader(self.valset, BATCH_SIZE, shuffle=False, collate_fn=self.collate_fn, num_workers=N_WORKERS)
    
    def collate_fn(self, batch:list) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Stack the images of a batch and tokenize its InChI strings.

        :param batch: list of ``(image tensor, InChI string)`` samples
        :return: ``(imgs, inp_seqs, attn_masks, cap_lens)`` built from the ``inp_seq``,
            ``attn_mask`` and ``cap_len`` entries returned by the tokenizer for each string
        """
        imgs = torch.stack([sample[0] for sample in batch])
        encoded = [self.tokenizer.encode(sample[1], pad_to_custom_len=TRUNCATE_SEQ_TO) for sample in batch]
        
        batch_inp_seqs = torch.tensor([enc["inp_seq"] for enc in encoded])
        batch_attn_masks = torch.tensor([enc["attn_mask"] for enc in encoded])
        batch_cap_lens = torch.tensor([enc["cap_len"] for enc in encoded])
        
        return imgs, batch_inp_seqs, batch_attn_masks, batch_cap_lens

In [5]:
# %%time

# dm = ImgToInChIDataModuleONE(tb_logger)
# dm.prepare_data()
# dm.setup('fit')
# imgs, inp_seqs, attn_masks, cap_lens = next(iter(dm.train_dataloader()))
# imgs.shape, inp_seqs.shape, attn_masks.shape

# Model
### At the time of sentence prediction can we use HMMs or [Beam Search](https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning#overview) to make better decisions?

***Try this***:
Use a separate LSTM layer for each InChI sublayer (e.g. `/c`, `/h`):
```
>>> rnn = nn.LSTM(10, 20, 2)
>>> input = torch.randn(1, 16, 10)
>>> h0 = torch.randn(2, 16, 20)
>>> c0 = torch.randn(2, 16, 20)
```
Change the number of layers (the `2` in the snippet above) to the number of InChI sublayers.

In [34]:
class Encoder(nn.Module):
    """CNN encoder: a timm backbone cut after ``layer2`` followed by a linear map over pixels.

    :param model_name: timm model to instantiate (its ``conv1`` is replaced by a 1-channel conv)
    :param max_len: number of positions produced per image (output size of the final linear layer)
    :param out_channels: accepted for compatibility but ignored; the channel count is fixed to 128
    :param pretrained: forwarded to ``timm.create_model``
    """
    def __init__(self, model_name='resnet18', max_len=275, out_channels=512, pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Single-channel input instead of the backbone's own first convolution
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=2, padding=2, bias=False)
        # Disable the deeper stages, pooling and classifier so the backbone returns a feature map
        for stage in ("layer3", "layer4", "global_pool", "fc"):
            setattr(backbone, stage, nn.Identity())
        self.cnn = backbone

        # Hard-coded for the truncated backbone: 128 channels over 16*16 spatial positions
        # (assumes the configured 128x128 input size -- TODO confirm for other sizes)
        self.out_channels = 128
        n_out_pixels = 16*16
        self.encout = nn.Linear(n_out_pixels, max_len)
        
    def forward(self, x):
        """Encode a batch of images into a ``(batch, max_len, out_channels)`` tensor."""
        feats = self.cnn(x)
        feats = feats.view(x.size(0), self.out_channels, -1)  # (batch, channels, pixels)
        mixed = self.encout(feats)                            # (batch, channels, max_len)
        return mixed.permute(0, 2, 1)                         # (batch, max_len, channels)

class Attention(nn.Module):
    """
    Additive soft-attention over the encoder positions.
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning/blob/master/models.py
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        """
        :param encoder_dim: feature size of encoded images
        :param decoder_dim: size of decoder's RNN
        :param attention_dim: size of the attention network
        """
        super().__init__()
        # Projections of the image features and of the decoder state into a shared space
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)
        # Collapses the shared space to one score per position
        self.full_att = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        # Normalises the scores across positions (dim 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        """
        Compute the attention weights and the resulting weighted encoding.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        :param decoder_hidden: previous decoder output, a tensor of dimension (batch_size, decoder_dim)
        :return: attention weighted encoding (batch_size, encoder_dim), weights (batch_size, num_pixels)
        """
        projected_pixels = self.encoder_att(encoder_out)                 # (batch_size, num_pixels, attention_dim)
        projected_state = self.decoder_att(decoder_hidden).unsqueeze(1)  # (batch_size, 1, attention_dim)
        combined = self.relu(projected_pixels + projected_state)         # state is broadcast over the pixels
        scores = self.full_att(combined).squeeze(2)                      # (batch_size, num_pixels)
        alpha = self.softmax(scores)
        weighted = encoder_out * alpha.unsqueeze(2)                      # (batch_size, num_pixels, encoder_dim)
        return weighted.sum(dim=1), alpha
    
class DecoderWithAttention(nn.Module):
    """
    LSTM-cell decoder that attends over the encoder output at every time-step.
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning/blob/master/models.py
    """

    def __init__(self, attention_dim=256, embed_dim=256, decoder_dim=256, vocab_size=202, encoder_dim=128, dropout=0.2):
        """
        :param attention_dim: size of attention network
        :param embed_dim: embedding size
        :param decoder_dim: size of decoder's RNN
        :param vocab_size: size of vocabulary
        :param encoder_dim: feature size of encoded images
        :param dropout: dropout probability applied to the hidden state before the output layer
        """
        super().__init__()

        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size

        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)  # attention network

        self.embedding = nn.Embedding(vocab_size, embed_dim)  # token embeddings
        self.dropout = nn.Dropout(p=dropout)
        # One decoding step consumes [token embedding ; gated attention context]
        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)
        # Initial hidden / cell state are linear functions of the mean encoder output
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)  # produces the sigmoid gate on the context
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(decoder_dim, vocab_size)  # scores over the vocabulary
        self.init_weights()

    def init_weights(self):
        """
        Re-initialise embedding and output layer: weights uniform in [-0.1, 0.1], output bias zero.
        """
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        nn.init.zeros_(self.fc.bias)
        nn.init.uniform_(self.fc.weight, -0.1, 0.1)

    def init_hidden_state(self, encoder_out):
        """
        Derive the initial LSTM state from the encoder output averaged over positions.
        :param encoder_out: encoded images, a tensor of dimension (batch_size, num_pixels, encoder_dim)
        :return: hidden state, cell state, each (batch_size, decoder_dim)
        """
        pooled = encoder_out.mean(dim=1)
        return self.init_h(pooled), self.init_c(pooled)

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Teacher-forced decoding of a whole batch.
        :param encoder_out: encoded images; flattened here to (batch_size, num_pixels, encoder_dim)
        :param encoded_captions: token ids; dimension 1 is squeezed before embedding, so callers
            pass (batch_size, 1, caption_length)
        :param caption_lengths: caption lengths, a tensor of dimension (batch_size, 1)
        :return: scores for vocabulary, sorted encoded captions, decode lengths, weights, sort indices.
            All per-sample outputs are ordered by decreasing caption length (see sort indices).
        """
        n_samples = encoder_out.size(0)
        feat_dim = encoder_out.size(-1)

        # Flatten any spatial layout into one axis of positions
        pixels = encoder_out.view(n_samples, -1, feat_dim)  # (batch_size, num_pixels, encoder_dim)
        n_pixels = pixels.size(1)

        # Order the batch from longest to shortest caption so that, at step t, the samples
        # still being decoded always form a prefix of the batch.
        caption_lengths, sort_ind = caption_lengths.squeeze(1).sort(dim=0, descending=True)
        pixels = pixels[sort_ind]
        encoded_captions = encoded_captions[sort_ind]

        embeddings = self.embedding(encoded_captions.squeeze(1))  # (batch_size, caption_length, embed_dim)
        h, c = self.init_hidden_state(pixels)                     # (batch_size, decoder_dim)

        # Nothing is decoded from the final position of a caption, hence length - 1 steps
        decode_lengths = (caption_lengths - 1).tolist()
        n_steps = max(decode_lengths)

        # Positions past a sample's decode length stay zero in both outputs
        predictions = torch.zeros(n_samples, n_steps, self.vocab_size, device=pixels.device)
        alphas = torch.zeros(n_samples, n_steps, n_pixels, device=pixels.device)

        for t in range(n_steps):
            # Number of samples whose caption extends beyond step t (a prefix, thanks to the sort)
            active = sum(length > t for length in decode_lengths)
            h_active, c_active = h[:active], c[:active]

            context, alpha = self.attention(pixels[:active], h_active)
            context = self.sigmoid(self.f_beta(h_active)) * context  # gate, (active, encoder_dim)

            step_input = torch.cat([embeddings[:active, t, :], context], dim=1)
            h, c = self.decode_step(step_input, (h_active, c_active))  # (active, decoder_dim)

            predictions[:active, t, :] = self.fc(self.dropout(h))  # (active, vocab_size)
            alphas[:active, t, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_ind
        
class InChINetONE(pl.LightningModule):
    """Image-to-InChI model: CNN encoder followed by an attention LSTM decoder.

    :param vocab_size: number of tokens the decoder predicts over
    :param max_len: maximum token sequence length; the encoder emits ``max_len - 1`` positions
    :param tokenizer: tokenizer kept on the module for decoding predictions
    """
    def __init__(self, vocab_size, max_len, tokenizer):
        super().__init__()
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.encoder_net = Encoder(max_len=max_len-1)
        self.decoder_net = DecoderWithAttention(vocab_size=vocab_size)
        self.loss_fn = nn.CrossEntropyLoss()
        self.softmax = nn.Softmax(dim=-1)
        self.alpha_c = 1.  # weight of the attention regularisation term in the loss
        
    def forward(self, imgs, inp_seqs, cap_lens):
        """Encode ``imgs`` and run the teacher-forced decoder.

        :return: the decoder's tuple ``(scores, sorted captions, decode lengths, alphas, sort_ind)``.
            Per-sample outputs are ordered by decreasing caption length, not in batch order.
        """
        encoder_out = self.encoder_net(imgs)
        return self.decoder_net(encoder_out, inp_seqs, cap_lens)

    def _shared_step(self, batch):
        """Compute the loss of one batch; used by both the training and the validation step."""
        imgs, inp_seqs, _attn_masks, cap_lens = batch
        # Teacher forcing: feed tokens [0, L-1) and predict tokens [1, L)
        tgt_inp = inp_seqs[:, :-1].unsqueeze(1)
        tgt_out = inp_seqs[:, 1:]

        scores, _caps_sorted, _decode_lengths, alphas, sort_ind = self(imgs, tgt_inp, cap_lens.unsqueeze(1))

        # The decoder re-orders the batch by caption length, so the targets must be put in the
        # same order before comparing; otherwise a sample is scored against another sample's target.
        targets = tgt_out[sort_ind][:, :scores.shape[1]]

        # CrossEntropyLoss expects the class dimension second: (batch, vocab, steps)
        loss = self.loss_fn(scores.permute(0, 2, 1), targets)
        # Add doubly stochastic attention regularization
        loss = loss + self.alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
        return loss
    
    def training_step(self, train_batch, batch_idx):
        """Return the training loss for ``train_batch`` and log it as ``train_loss``."""
        loss = self._shared_step(train_batch)
        # Logging to TensorBoard by default
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        return loss
    
    def training_epoch_end(self, outputs):
        """Log a histogram of every parameter at the end of each training epoch."""
        for name,params in self.named_parameters():
            self.logger.experiment.add_histogram(name, params, self.current_epoch)
    
    def validation_step(self, val_batch, batch_idx):
        """Return the validation loss for ``val_batch`` and log it as ``val_loss``."""
        # No explicit torch.no_grad() here: the previous one only wrapped the tuple unpacking.
        # NOTE(review): Lightning is expected to run validation with gradients disabled -- confirm.
        loss = self._shared_step(val_batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Adam with learning rate ``LR`` and a cosine-annealing schedule with ``T_max=100``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=LR)
        # NOTE(review): the scheduler dict sets no 'interval', so Lightning presumably steps it
        # once per epoch; T_max=100 then spans 100 epochs -- confirm this matches EPOCHS.
        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 100),
            'name': 'AnnealingLR'
        }
        return [optimizer], [lr_scheduler]
    

# dm = ImgToInChIDataModuleONE(tb_logger=tb_logger)
# dm.prepare_data(verbose=True)
# dm.setup('fit')
# imgs, inp_seqs, attn_masks, cap_lens = next(iter(dm.train_dataloader()))
# print("inp_seqs =", inp_seqs.shape)

# # encoder_net = Encoder()
# # encoder_out = encoder_net(imgs) 
# encoder_out = torch.rand([256, 275, 128])
# print("Encoder =", encoder_out.shape)

# decoder_net = DecoderWithAttention(vocab_size=dm.vocab_size)
# decoder_out = decoder_net(encoder_out, inp_seqs[:, 1:].unsqueeze(1), cap_lens.unsqueeze(1))
# predictions, encoded_captions, decode_lengths, alphas, sort_ind = decoder_out
# print("@forward decoder_out =", predictions.shape)
# decoder_out = decoder_net.predict(encoder_out, dm.tokenizer)
# print("@predict decoder_out =", decoder_out.shape)

Encoder = torch.Size([256, 275, 128])
@forward decoder_out = torch.Size([256, 152, 202])


In [8]:
# for name, params in encoder_net.cnn.named_parameters():
#     if "weight" in name:
#         print(params.requires_grad)
#         break

# Training and Validation

In [7]:
# %tensorboard --logdir {CHKPTDIR}

# Build the data module and load labels / paths / tokenizer up front:
# the model constructor below needs dm.vocab_size and dm.max_len.
dm = ImgToInChIDataModuleONE(tb_logger=tb_logger)
dm.prepare_data(verbose=True)

model = InChINetONE(dm.vocab_size, dm.max_len, dm.tokenizer)
# Add network graph to tensorboard
# tb_logger.log_graph(model, [imgs[0].unsqueeze(0).to(model.device), inp_seqs[0].unsqueeze(0).to(model.device)])
# Log the learning rate at every optimizer step
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
# checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss')
# NOTE(review): in Lightning 1.x auto_lr_find presumably only takes effect when trainer.tune(model)
# is called, which this cell never does -- confirm, and drop the flag or add the tune call.
trainer = pl.Trainer(gpus=1, auto_lr_find=True, max_epochs=EPOCHS, precision=PRECISION, profiler=None,
                     default_root_dir=CHKPTDIR, logger=tb_logger, callbacks=[lr_monitor])

# Train, then write a single checkpoint of the final state to MODEL_SAVEPATH
trainer.fit(model, dm)
trainer.save_checkpoint(MODEL_SAVEPATH)

Loading labels data... DONE!
Loading paths... DONE!
Loading vocab and tokenizer... DONE!


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                 | Params
-----------------------------------------------------
0 | encoder_net | Encoder              | 746 K 
1 | decoder_net | DecoderWithAttention | 959 K 
2 | loss_fn     | CrossEntropyLoss     | 0     
3 | softmax     | Softmax              | 0     
-----------------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.821     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/queues.py", line 251, in _feed
    send_bytes(obj)
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/connection.py", line 205, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/connection.py", line 416, in _send_bytes
    self._send(header + buf)
Traceback (most recent call last):
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/queues.py", line 251, in _feed
    send_bytes(obj)
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/connection.py", line 205, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/connection.py", line 416, in _send_bytes
    self._send(header + buf)
  File "/home/virk/miniconda3/envs/pt/lib/python3.9/multiprocessing/connection.py", line 373, in _send
    n

In [13]:
# Take one validation batch to try inference on.
imgs, inp_seqs, attn_masks, cap_lens = next(iter(dm.val_dataloader()))
# Ground-truth tokens without the first position
tgt_out = inp_seqs[:, 1:]
# Input <bos> as first token: build a (batch_size, 1) tensor holding the <bos> id for every sample
bs = inp_seqs.size(0)
tgt_inp = torch.tensor(dm.tokenizer.vocab.ctoi(dm.tokenizer.vocab.bos_token), device=imgs.device)
tgt_inp = torch.repeat_interleave(tgt_inp, bs)
tgt_inp = tgt_inp.view(bs, 1)

In [12]:
# Encode the validation images once; the decoding cells below reuse `encoder_out`.
encoder_out = trainer.model.encoder_net(imgs)
encoder_out.shape

torch.Size([64, 276, 128])

In [38]:
# Shape check: the <bos> seed with a trailing singleton axis, as passed to the decoder below
tgt_inp.unsqueeze(-1).shape

torch.Size([64, 1, 1])

In [36]:
# Shape check of the three decoder inputs.
# NOTE(review): the recorded output shows a batch of 256 for encoder_out but 64 for the other two,
# i.e. encoder_out came from stale kernel state -- re-run the encoding cell above before trusting this.
encoder_out.shape, inp_seqs[:, 1:].unsqueeze(1).shape, cap_lens.unsqueeze(1).shape

(torch.Size([256, 275, 128]), torch.Size([64, 1, 276]), torch.Size([64, 1]))

In [39]:
# FIXME: this call raises the IndexError recorded below. The decoder's forward() is teacher-forced:
# it runs (caption_length - 1) steps and reads embeddings[:, t, :] at each step, but tgt_inp holds
# a single <bos> token per sample, so step t=1 is already out of bounds. Inference needs a
# step-by-step decoding loop instead of forward().
output = model.decoder_net(encoder_out, tgt_inp.unsqueeze(-1), torch.tensor(dm.max_len).view(-1, 1))

IndexError: index 1 is out of bounds for dimension 1 with size 1

In [None]:
def greedy_decode(decoder, encoder_out, bos_tokens, n_steps):
    """Greedily decode ``n_steps`` tokens per sample with a DecoderWithAttention.

    The decoder's ``forward`` is teacher-forced (it needs the full caption and its length),
    so inference drives the decoder's sub-modules one step at a time instead, feeding each
    step's argmax token back in as the next input.

    :param decoder: a DecoderWithAttention instance
    :param encoder_out: encoder output of dimension (batch_size, num_pixels, encoder_dim)
    :param bos_tokens: tensor with one start-token id per sample, any shape with batch_size elements
    :param n_steps: number of tokens to generate per sample
    :return: list of ``n_steps`` tensors of vocabulary scores, each of dimension (batch_size, 1, vocab_size)
    """
    was_training = decoder.training
    decoder.eval()  # disable dropout while decoding
    step_scores = []
    with torch.no_grad():
        h, c = decoder.init_hidden_state(encoder_out)
        prev_tokens = bos_tokens.reshape(-1)
        for _ in progress_bar(range(n_steps)):
            # Same computation as one iteration of DecoderWithAttention.forward
            context, _alpha = decoder.attention(encoder_out, h)
            context = decoder.sigmoid(decoder.f_beta(h)) * context
            step_input = torch.cat([decoder.embedding(prev_tokens), context], dim=1)
            h, c = decoder.decode_step(step_input, (h, c))
            scores = decoder.fc(decoder.dropout(h))  # (batch_size, vocab_size)
            step_scores.append(scores.unsqueeze(1))
            prev_tokens = scores.argmax(dim=-1)      # greedy choice becomes the next input
    decoder.train(was_training)  # restore the mode the decoder was in
    return step_scores

outputs = greedy_decode(model.decoder_net, encoder_out, tgt_inp, dm.max_len - 1)

In [None]:

# Predicted token ids, (batch_size, steps). Softmax is monotonic, so the argmax of the raw
# scores is the same as the argmax of the probabilities. A new name is used instead of
# rebinding `outputs`, so this cell can be re-run.
pred_tokens = torch.cat(outputs, dim=1).argmax(dim=-1)
avg_distance = []
for i in progress_bar(range(pred_tokens.size(0))):
    # Compare prediction i with the ground truth of the SAME sample i
    true = dm.tokenizer.decode(inp_seqs[i])
    pred = dm.tokenizer.decode(pred_tokens[i])
    print("True =", true)
    print("Pred =", pred)
    print()
    avg_distance.append(Levenshtein.distance(true, pred))
# Mean Levenshtein distance over the batch
np.mean(avg_distance)