# Imports

## Packages

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import re

import numpy as np
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import BucketIterator


torch.manual_seed(1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import time
import math

## Data

In [None]:
# Copy the dataset archive into the working directory and extract it,
# discarding unzip's output.
# NOTE(review): the source path presumably requires Google Drive to be mounted
# at /content/drive beforehand — confirm, no mount step is visible here.
!cp "/content/drive/MyDrive/Sourcery/data.zip" "./"
!unzip data.zip &> /dev/null

# Preprocessing

## Get Inputs and Outputs

In [None]:
def _load_split(index_path):
    """Read one split index file and load the source of every function it lists.

    Every index line must have exactly three comma-separated fields:
    ``file_path,function_name,has_docstring``. The function source is read from
    ``file_path + ".py"``. When that file cannot be read, its path is printed
    and the entry is left out of both returned lists, so sources and names
    always stay aligned.

    Args:
        index_path: path of the split index file (e.g. ``"./train.txt"``).

    Returns:
        A ``(lines, functions, names)`` tuple: the raw index lines, the source
        text of every readable function, and the matching function names.

    Raises:
        ValueError: if an index line does not split into exactly three fields.
    """
    with open(index_path, "r") as index_file:
        lines = index_file.readlines()

    functions = []
    names = []
    for line in lines:
        # The third field (has_docstring) is not used here.
        file_path, function_name, _has_docstring = line.split(",")
        try:
            with open(file_path + ".py", "r") as source_file:
                source = source_file.read()
        except Exception:
            # Best effort: report the unreadable entry and move on.
            print(file_path)
            continue
        functions.append(source)
        names.append(function_name)
    return lines, functions, names


# TRAIN / VAL / TEST
train_lines, train_functions, train_names = _load_split("./train.txt")
val_lines, val_functions, val_names = _load_split("./val.txt")
test_lines, test_functions, test_names = _load_split("./test.txt")

./data/pandas/test_decodeNumericIntExpeMinus_0
./data/python-social-auth/AUTHORIZATION_URL_0
./data/keras/test_flatten_0
./data/pip/serializeError_1
./data/pandas/test_decodeNumericIntExpe_0
./data/pandas/test_Microsecond_0
./data/django-rest-framework/QUERY_PARAMS_0
./data/CouchPotatoServer/Subscribe_0
./data/CouchPotatoServer/getChild_0
./data/edx-platform/CAN_DELETE_0
./data/python-social-auth/ACCESS_TOKEN_URL_0
./data/CouchPotatoServer/setdefault_0
./data/keras/test_reshape_0
./data/CouchPotatoServer/serializeError_0
./data/edx-platform/teardownClass_2
./data/edx-platform/teardownClass_1
./data/pandas/test_Easter_0
./data/pandas/test_Millisecond_0
./data/CouchPotatoServer/Request_0
./data/CouchPotatoServer/Unsubscribe_0
./data/pandas/test_constructor_DataFrame_0
./data/pandas/test_Minute_0
./data/pandas/test_Nanosecond_0
./data/pandas/test_Second_0
./data/sentry/Feature_0
./data/Sick-Beard/computeBitrate_0
./data/pip/eta_0
./data/keras/test_dense_0
./data/pandas/_slice_0
./data/pan

## Clean inputs

In [None]:
def preprocess_input(input_method):
    """Surround every non-word character of ``input_method`` with spaces.

    Any character matched by ``\\W`` (punctuation, operators, whitespace, ...)
    gets one space inserted before and after it, so that a later
    whitespace split separates it from the neighbouring identifiers.

    Args:
        input_method: source text of a function.

    Returns:
        The text with each non-word character padded by a space on both sides.
    """
    # ``\g<0>`` stands for the whole match, i.e. the non-word character itself.
    return re.sub(r'\W', r' \g<0> ', input_method)

In [None]:
# Space out the punctuation of every function body, split by split.
train_cleaned_inputs = [preprocess_input(source) for source in train_functions]
val_cleaned_inputs = [preprocess_input(source) for source in val_functions]
test_cleaned_inputs = [preprocess_input(source) for source in test_functions]

## Turn Outputs into list

In [None]:
def subTokenize(function_name):
    """Split an identifier into lower-cased sub-tokens wrapped in ``<s>`` / ``</s>``.

    The name is first split on underscores (snake_case). Empty pieces caused by
    leading, trailing or doubled underscores (``_private``, ``__init__``) are
    discarded so they never become sub-tokens. When a single piece remains, the
    name is treated as camelCase / PascalCase and split before each upper-case
    letter.

    Args:
        function_name: identifier (or any whitespace-free word) to split.

    Returns:
        ``["<s>", sub-token, ..., "</s>"]``. A word without any letter run
        matched by the camelCase pattern (e.g. punctuation) yields only the
        two markers.
    """
    # Drop the '' pieces produced by leading / trailing / repeated underscores.
    subtokens = [piece for piece in function_name.split("_") if piece]

    # A single piece means there was no snake_case structure: split it on case
    # changes instead. The pattern is applied to the underscore-free piece so a
    # leading underscore cannot prevent '^[a-z]+' from matching.
    if len(subtokens) == 1:
        subtokens = re.findall('^[a-z]+|[A-Z][^A-Z]*', subtokens[0])

    # Add beginning and end of sequence special tokens
    return ["<s>"] + [subtoken.lower() for subtoken in subtokens] + ["</s>"]

In [None]:
# Break every function name into its <s> ... </s> wrapped sub-token list.
train_names_subtokens = [subTokenize(name) for name in train_names]
val_names_subtokens = [subTokenize(name) for name in val_names]
test_names_subtokens = [subTokenize(name) for name in test_names]

## Build input and output vocab

In [None]:
def build_input_vocab(train_corpus,val_corpus):
    """Build the input vocabulary and token counts from two corpora.

    Functions are split on whitespace. Ids are handed out in order of first
    appearance, scanning ``train_corpus`` completely before ``val_corpus``.
    Ids 0 and 1 are reserved for ``<PAD>`` and
    ``FUNCTION_NAME_REPLACEMENT_TOKEN``; their counts start at zero.

    Args:
        train_corpus: iterable of function source strings.
        val_corpus: iterable of function source strings.

    Returns:
        ``(vocab, word_frequencies, id_frequencies)`` where ``vocab`` maps
        token -> id, ``word_frequencies`` maps token -> occurrence count and
        ``id_frequencies`` maps id -> occurrence count.
    """
    vocab = {
        "<PAD>":0,
        "FUNCTION_NAME_REPLACEMENT_TOKEN":1
    }
    word_frequencies = {token: 0 for token in vocab}
    id_frequencies = {token_id: 0 for token_id in vocab.values()}

    for corpus in (train_corpus, val_corpus):
        for function in corpus:
            for token in function.split():
                token_id = vocab.get(token)
                if token_id is None:
                    # Ids are contiguous from 0, so the current size is the
                    # next free id.
                    token_id = len(vocab)
                    vocab[token] = token_id
                    word_frequencies[token] = 0
                    id_frequencies[token_id] = 0
                word_frequencies[token] += 1
                id_frequencies[token_id] += 1

    return vocab,word_frequencies,id_frequencies

def build_output_vocab(train_names,val_names):
    """Build the output (function-name sub-token) vocabulary and counts.

    Ids are handed out in order of first appearance, scanning ``train_names``
    completely before ``val_names``. Ids 0, 1 and 2 are reserved for
    ``<PAD>``, ``<s>`` and ``</s>``; their counts start at zero and grow with
    every occurrence found in the names.

    Args:
        train_names: iterable of sub-token lists.
        val_names: iterable of sub-token lists.

    Returns:
        ``(vocab, word_frequencies, id_frequencies)`` where ``vocab`` maps
        sub-token -> id, ``word_frequencies`` maps sub-token -> occurrence
        count and ``id_frequencies`` maps id -> occurrence count.
    """
    vocab = {
        "<PAD>":0,
        "<s>":1,
        "</s>":2,
    }
    word_frequencies = {subtoken: 0 for subtoken in vocab}
    id_frequencies = {subtoken_id: 0 for subtoken_id in vocab.values()}

    for names in (train_names, val_names):
        for name in names:
            for subtoken in name:
                subtoken_id = vocab.get(subtoken)
                if subtoken_id is None:
                    # Ids are contiguous from 0, so the current size is the
                    # next free id.
                    subtoken_id = len(vocab)
                    vocab[subtoken] = subtoken_id
                    word_frequencies[subtoken] = 0
                    id_frequencies[subtoken_id] = 0
                word_frequencies[subtoken] += 1
                id_frequencies[subtoken_id] += 1

    return vocab,word_frequencies,id_frequencies



In [None]:
# Input side: vocabulary over the whitespace-split cleaned function bodies,
# plus the reverse (id -> token) lookup.
input_vocab, input_word_frequencies, input_id_frequencies = build_input_vocab(
    train_cleaned_inputs, val_cleaned_inputs
)
input_idx2word = dict(zip(input_vocab.values(), input_vocab.keys()))

# Output side: vocabulary over the function-name sub-tokens, plus its reverse.
output_vocab, output_word_frequencies, output_id_frequencies = build_output_vocab(
    train_names_subtokens, val_names_subtokens
)
output_idx2word = dict(zip(output_vocab.values(), output_vocab.keys()))

## Unify vocabs

### Turn input vocab into subtokens

In [None]:
# Sub-token level vocabulary derived from the input vocabulary. The two special
# entries keep ids 0 and 1; every other input token is split with subTokenize
# and each unseen piece (including the <s> / </s> markers subTokenize adds)
# receives the next id.
unified_vocab = {"<PAD>": 0, "FUNCTION_NAME_REPLACEMENT_TOKEN": 1}
idx = len(unified_vocab)

for key, value in input_vocab.items():
    # Skip the two special entries of the input vocabulary.
    if value in (0, 1):
        continue
    subtokens = subTokenize(key)
    for name in subtokens:
        if name in unified_vocab:
            continue
        unified_vocab[name] = idx
        idx += 1


In [None]:
#Unify input and output vocab
# The two vocabularies were numbered independently, both starting at 0, so
# copying output ids over the unified ones (dict.update) made different tokens
# share an id (e.g. "<s>" and FUNCTION_NAME_REPLACEMENT_TOKEN were both 1).
# Instead: keep every output sub-token at its output id (0 .. len(output_vocab)-1)
# and re-number the remaining unified entries after them, so each token has
# exactly one id and ids stay contiguous.
_merged_vocab = dict(output_vocab)
for _token in unified_vocab:
    if _token not in _merged_vocab:
        _merged_vocab[_token] = len(_merged_vocab)
unified_vocab = _merged_vocab

SOS_TOKEN = unified_vocab["<s>"]
EOS_TOKEN = unified_vocab["</s>"]

## Tokenize

In [None]:
def tokenize(text,vocabulary):
    """Convert cleaned function text into a list of vocabulary ids.

    The text is split on whitespace. ``FUNCTION_NAME_REPLACEMENT_TOKEN`` is
    looked up as-is; every other word is broken into sub-tokens with
    ``subTokenize`` and each sub-token found in ``vocabulary`` contributes its
    id. Sub-tokens missing from ``vocabulary`` are silently skipped.

    Args:
        text: whitespace-separated function source.
        vocabulary: mapping sub-token -> id; must contain
            ``FUNCTION_NAME_REPLACEMENT_TOKEN`` if the text does.

    Returns:
        List of ids in reading order.
    """
    tokenized_text = []
    for word in text.split():
        if word == "FUNCTION_NAME_REPLACEMENT_TOKEN":
            tokenized_text.append(vocabulary["FUNCTION_NAME_REPLACEMENT_TOKEN"])
            continue
        # subTokenize wraps its result in <s> ... </s>; strip both markers.
        for split_word in subTokenize(word)[1:-1]:
            if split_word in vocabulary:
                tokenized_text.append(vocabulary[split_word])
    return tokenized_text


def tokenize_subtokens_list(subtokens_list,vocabulary):
    """Map a list of sub-tokens to their ids, skipping unknown sub-tokens.

    Args:
        subtokens_list: iterable of sub-token strings.
        vocabulary: mapping sub-token -> id.

    Returns:
        Ids of the sub-tokens present in ``vocabulary``, in their original order.
    """
    known = vocabulary.keys()
    return [vocabulary[piece] for piece in subtokens_list if piece in known]

In [None]:
# Function bodies -> id sequences over the unified vocabulary.
train_tokenized_inputs = list(map(lambda source: tokenize(source, unified_vocab), train_cleaned_inputs))
val_tokenized_inputs = list(map(lambda source: tokenize(source, unified_vocab), val_cleaned_inputs))

# Function-name sub-token lists -> id sequences over the same vocabulary.
train_tokenized_outputs = list(map(lambda pieces: tokenize_subtokens_list(pieces, unified_vocab), train_names_subtokens))
val_tokenized_outputs = list(map(lambda pieces: tokenize_subtokens_list(pieces, unified_vocab), val_names_subtokens))

## Cut Max Seq Len

In [None]:
# Longest input sequence (in tokens) that is kept.
MAX_SEQ_LEN = 1000
def cut(tokenized_text):
    """Truncate ``tokenized_text`` to its first ``MAX_SEQ_LEN`` items.

    A sequence shorter than ``MAX_SEQ_LEN`` is returned unchanged (the very
    same object); otherwise a sliced copy is returned.
    """
    needs_cut = len(tokenized_text) >= MAX_SEQ_LEN
    return tokenized_text[:MAX_SEQ_LEN] if needs_cut else tokenized_text


# Cap every tokenized input at MAX_SEQ_LEN tokens.
train_tokenized_inputs = list(map(cut, train_tokenized_inputs))
val_tokenized_inputs = list(map(cut, val_tokenized_inputs))

## Build Inputs and Outputs Pairs

In [None]:
# Pair each tokenized function body with its tokenized name: (input, output).
training_pairs = [
    (input_ids, output_ids)
    for input_ids, output_ids in zip(train_tokenized_inputs, train_tokenized_outputs)
]
validation_pairs = [
    (input_ids, output_ids)
    for input_ids, output_ids in zip(val_tokenized_inputs, val_tokenized_outputs)
]

## Remove outputs with len >= Max Output Len

In [None]:
# Pairs are kept only when the output sequence is strictly shorter than this.
MAX_OUTPUT_LEN = 10

training_pairs = list(filter(lambda pair: len(pair[1]) < MAX_OUTPUT_LEN, training_pairs))
validation_pairs = list(filter(lambda pair: len(pair[1]) < MAX_OUTPUT_LEN, validation_pairs))

## Build Dataloaders

### Dataset Class

In [None]:
class FunctionNamingDataset(Dataset):
    """Dataset over ``(input, output)`` pairs that also exposes their lengths."""

    def __init__(self, data_pairs):
        """Store the pairs and cache how many there are."""
        self.pairs = data_pairs
        self.n_examples = len(data_pairs)

    def __len__(self):
        """Return the number of examples."""
        return self.n_examples

    def __getitem__(self, item):
        """Return ``(input, output, len(input), len(output))`` for index ``item``."""
        source, target = self.pairs[item]
        return (source, target, len(source), len(target))

In [None]:
# Batch sizes for the train / validation iterators.
train_batch_size = 128
valid_batch_size = 32

# Wrap the filtered (input, output) pairs as datasets.
train_dataset = FunctionNamingDataset(training_pairs)
val_dataset = FunctionNamingDataset(validation_pairs)

### DataLoaders

In [None]:
# Length-bucketed iterators over the train and validation datasets.
train_dataloader, val_dataloader = BucketIterator.splits(
    (train_dataset, val_dataset),                       # datasets to draw data from
    batch_sizes=(train_batch_size, valid_batch_size),   # (train, validation)
    device=device,                                      # device to load batches on
    # Each example is (input, output, len(input), len(output)):
    # index 2 sorts by input length.
    sort_key=lambda example: example[2],
    repeat=True,              # repeat the iterator for multiple epochs
    sort=False,               # do not sort the whole dataset with `sort_key`
    shuffle=True,             # shuffle data on each epoch run
    sort_within_batch=True,   # use `sort_key` to sort examples in each batch
)

In [None]:
val_dataloader.create_batches()
for batch in val_dataloader.batches:
    # Only the first batch is inspected; the loop variable `batch` stays bound
    # afterwards and is reused by the padding timing cell.
    print(f'Batch size: {len(batch)}\n')
    print('LABEL\tLENGTH\tTEXT'.ljust(10))
    print(batch)
    break

Batch size: 32

LABEL	LENGTH	TEXT
[([4485, 1, 561, 1321, 2859, 392, 306, 1017, 168, 500, 1017, 34790, 10, 7777, 447, 91, 429, 8, 2859, 392, 306, 449, 1074, 1901, 2859, 78, 392, 447, 318, 306, 1382, 123, 1049, 349, 6104, 126, 443, 447, 407, 306, 357, 58, 429, 3555, 1503, 1321, 561, 263, 35, 135, 1321, 135, 561, 263, 1051, 2535, 149, 127, 449, 168, 500, 168, 500, 1503, 2859, 392, 306, 561, 263, 392, 306, 149], [1, 78, 537, 429, 2], 72, 5), ([4485, 1, 561, 3114, 12, 11127, 58, 74, 1768, 8, 1463, 818, 3121, 2145, 2145, 5087, 349, 2450, 954, 870, 1204, 97, 772, 973, 1382, 643, 2450, 954, 10, 2263, 555, 772, 2084, 10151, 447, 2798, 390, 123, 496, 2762, 58, 5541, 62, 4797, 445, 1221, 643, 3371, 1075, 2450, 954, 167, 4999, 1287, 3173, 2450, 954, 9, 561, 3116, 3173, 167, 2137, 206, 6413, 7536, 1209, 29, 58, 3114, 3173, 12], [1, 1463, 2], 72, 3), ([4485, 1, 561, 453, 26, 1503, 74, 453, 453, 561, 107, 453, 561, 5080, 449, 453, 9, 453, 453, 9, 384, 561, 2192, 472, 189, 384, 561, 4, 384, 824, 9, 45

### Utility function to transform a batch into input, target and len tensors

In [None]:
def _padSequencesAsColumns(sequences, lengths):
    """Stack id sequences as zero-padded columns of one long tensor.

    Returns ``(padded, lengths_tensor)`` where ``padded`` has shape
    ``(max(lengths), len(sequences))`` and column ``i`` holds ``sequences[i]``
    followed by zeros.
    """
    lengths_tensor = torch.LongTensor(lengths)
    # Allocate directly as int64 instead of creating floats and casting.
    padded = torch.zeros(int(lengths_tensor.max()), len(sequences), dtype=torch.long)
    for column, (sequence, length) in enumerate(zip(sequences, lengths_tensor.tolist())):
        padded[:length, column] = torch.LongTensor(sequence)
    return padded, lengths_tensor


def paddedTensorsFromPairBatch(batch):
    """Turn a batch of examples into padded input / target tensors.

    Args:
        batch: non-empty sequence of ``(input_ids, target_ids, input_len,
            target_len)`` tuples; only the first four fields are read.

    Returns:
        ``(inputs_tensor, targets_tensor, inputs_lengths_tensor,
        targets_lengths_tensor)``. The two sequence tensors are int64 with
        shape ``(max_len, batch_size)`` and are padded with 0; the two length
        tensors are int64 with shape ``(batch_size,)``. All tensors are created
        on the CPU.

    Raises:
        IndexError: if ``batch`` is empty or its items have fewer than four fields.
    """
    unzipped = list(zip(*batch))
    inputs, targets, inputs_lengths, targets_lengths = unzipped[0], unzipped[1], unzipped[2], unzipped[3]

    inputs_tensor, inputs_lengths_tensor = _padSequencesAsColumns(inputs, inputs_lengths)
    targets_tensor, targets_lengths_tensor = _padSequencesAsColumns(targets, targets_lengths)

    return (inputs_tensor, targets_tensor,inputs_lengths_tensor,targets_lengths_tensor)

In [None]:
# Time the padding of a single batch. `batch` is the loop variable left bound
# by the validation-batch inspection loop above (first validation batch).
start = time.time()
inputs,outputs,len1,len2 = paddedTensorsFromPairBatch(batch)
# Elapsed wall-clock seconds.
print(time.time() - start)

0.11125564575195312


# Models

## Baseline RNN Encoder-Decoders

### RNN Encoder

In [None]:
class EncoderRNN(nn.Module):
    """Token embedding followed by a GRU run over packed sequences."""

    def __init__(self, vocab_size, embedding_dim, hidden_size, padding_idx=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size)

    def forward(self, inputs, lengths, return_packed=False):
        """Encode a padded batch.

        Args:
            inputs: (seq_length, batch_size) padded token ids.
            lengths: (batch_size) true length of each sequence; they do not
                need to be sorted (``enforce_sorted=False``).
            return_packed: when true, return the GRU outputs as a packed
                sequence instead of a padded tensor.

        Returns:
            ``(outputs, hiddens)``: the GRU outputs (padded tensor, or packed
            sequence if ``return_packed``) and the GRU's final hidden state.
        """
        # (seq_length, batch_size) => (seq_length, batch_size, embedding_dim)
        embedded = self.embedding(inputs)
        packed_inputs = pack_padded_sequence(embedded, lengths=lengths, enforce_sorted=False)
        packed_outputs, hiddens = self.rnn(packed_inputs)
        if return_packed:
            return packed_outputs, hiddens
        padded_outputs, _ = pad_packed_sequence(packed_outputs)
        return padded_outputs, hiddens

### RNN Decoder

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, encoder, embedding_dim, bias=True, tie_embeddings=False,padding_idx=0):
        """ Plain GRU decoder (no attention) that reuses the encoder's embedding matrix.

            The decoder's vocabulary size and hidden size are taken from ``encoder``
            and the decoder embedding weight is the encoder's embedding weight.
            NOTE(review): this presumably requires ``embedding_dim`` to equal the
            encoder's embedding dimension -- confirm with the callers.

            With ``tie_embeddings=True`` the output projection also shares the
            embedding matrix:
            Ref:
                - "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
                   https://arxiv.org/abs/1608.05859
                - "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
                   https://arxiv.org/abs/1611.01462
        """
        super().__init__()

        self.hidden_size = encoder.hidden_size
        self.tie_embeddings = tie_embeddings

        self.vocab_size = encoder.vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(self.vocab_size, embedding_dim, padding_idx=padding_idx)
        # Input embeddings are shared with the encoder.
        self.embedding.weight = encoder.embedding.weight
        self.rnn = nn.GRU(input_size=self.embedding_dim, hidden_size=self.hidden_size)

        if not self.tie_embeddings:
            self.W_s = nn.Linear(self.hidden_size, self.vocab_size, bias=bias)
        else:
            # Project the hidden state down to the embedding size, then score it
            # against the (shared) embedding matrix.
            self.W_proj = nn.Linear(self.hidden_size, self.embedding_dim, bias=bias)
            self.W_s = nn.Linear(self.embedding_dim, self.vocab_size, bias=bias)
            self.W_s.weight = self.embedding.weight

    def forward(self, input_seq, decoder_hidden, encoder_outputs):
        """ Run one decoding step.

            Args:
            - input_seq      : (batch_size) or (batch_size, seq_len=1) token ids
            - decoder_hidden : (t=0) last encoder hidden state
                               (t>0) previous decoder hidden state (num_layers, batch_size, hidden_size)
            - encoder_outputs: not used by this decoder; accepted so that it is
                               call-compatible with the attention decoder

            Returns:
            - output         : (batch_size, vocab_size)
            - decoder_hidden : (num_layers, batch_size, hidden_size)
        """
        # Bring the input to (seq_len=1, batch_size) whichever layout it came in.
        if input_seq.dim() > 1:
            step_input = input_seq.transpose(0, 1)
        else:
            step_input = input_seq.unsqueeze(0)

        # (seq_len=1, batch_size) => (seq_len=1, batch_size, embedding_dim)
        emb = self.embedding(step_input)

        # decoder_output: (seq_len=1, batch_size, hidden_size)
        decoder_output, decoder_hidden = self.rnn(emb, decoder_hidden)

        # => (batch_size, seq_len=1, hidden_size)
        features = decoder_output.transpose(0, 1)

        # With tied embeddings the hidden state is first projected to the
        # embedding size.
        if self.tie_embeddings:
            features = self.W_proj(features)

        # (batch_size, seq_len=1, vocab_size) => (batch_size, vocab_size)
        output = self.W_s(features).squeeze(1)

        return output, decoder_hidden

### Decoder RNN w/ Attention

In [None]:
class Attention(nn.Module):
    """Additive attention: scores every encoder output against the decoder state."""

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()

        # Takes the concatenation of the decoder hidden state and one encoder
        # output, and maps it to decoder_hidden_dim features.
        self.attn_hidden_vector = nn.Linear(encoder_hidden_dim + decoder_hidden_dim, decoder_hidden_dim)

        # Collapses those features to a single score per source position:
        # [source len, batch size, decoder hidden dim] => [source len, batch size, 1]
        self.attn_scoring_fn = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch size, source len].

        Args:
            hidden: [1, batch size, decoder hidden dim]
            encoder_outputs: [source len, batch size, encoder hidden dim]
        """
        src_len = encoder_outputs.shape[0]

        # Pair the decoder state with every source position by tiling it
        # src_len times along the first dimension (avoids an explicit loop).
        tiled_hidden = hidden.repeat(src_len, 1, 1)
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)

        attn_hidden = torch.tanh(self.attn_hidden_vector(paired))

        # [source len, batch size, 1] => [source len, batch size]
        scores = self.attn_scoring_fn(attn_hidden).squeeze(2)

        # Softmax must run over the source positions of each batch element,
        # so move the batch dimension first: [batch size, source len]
        scores = scores.permute(1, 0)

        return F.softmax(scores, dim=1)

class DecoderRNNAttention(nn.Module):
    def __init__(self, encoder, hidden_size,output_vocab_size, embedding_dim,dropout_prob=.3):
        """ GRU decoder with additive attention over the encoder outputs.

            Only ``encoder.hidden_size`` is read from ``encoder``. The decoder
            owns its embedding matrix, and the output layer shares that matrix
            (tied input / output embeddings):
            Ref:
                - "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
                   https://arxiv.org/abs/1608.05859
                - "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
                   https://arxiv.org/abs/1611.01462
        """
        super().__init__()

        self.hidden_size = hidden_size

        self.vocab_size = output_vocab_size
        self.embedding_dim = embedding_dim

        enc_hidden = encoder.hidden_size

        self.attention = Attention(enc_hidden, self.hidden_size)

        self.embedding = nn.Embedding(self.vocab_size, embedding_dim)
        # The GRU consumes the token embedding concatenated with the context vector.
        self.rnn = nn.GRU(input_size=self.embedding_dim + enc_hidden, hidden_size=self.hidden_size)
        # Mixes GRU output, context vector and embedding back to embedding size ...
        self.fc = nn.Linear(enc_hidden + self.hidden_size + embedding_dim, embedding_dim)
        # ... which is then scored against the tied embedding matrix.
        self.output_layer = nn.Linear(self.embedding_dim, self.vocab_size)
        self.output_layer.weight = self.embedding.weight

        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_seq, decoder_hidden, encoder_outputs):
        """ Run one decoding step.

            Args:
            - input_seq      : (batch_size) or (batch_size, seq_len=1) token ids
            - decoder_hidden : (t=0) last encoder hidden state
                               (t>0) previous decoder hidden state (1, batch_size, hidden_size)
            - encoder_outputs: (max_src_len, batch_size, encoder hidden size)

            Returns:
            - output         : (batch_size, vocab_size)
            - decoder_hidden : new decoder hidden state

            The attention weights are computed internally but not returned.
        """
        # Bring the input to (seq_len=1, batch_size) whichever layout it came in.
        if input_seq.dim() > 1:
            step_input = input_seq.transpose(0, 1)
        else:
            step_input = input_seq.unsqueeze(0)

        # (seq_len=1, batch_size) => (seq_len=1, batch_size, embedding_dim)
        embedded = self.dropout(self.embedding(step_input))

        # Attention weights: (batch_size, max_src_len) => (batch_size, 1, max_src_len)
        attn_weights = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)

        # Batch-wise weighted sum of the encoder outputs; bmm needs the batch
        # dimension first, and the result is moved back to time-major.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(attn_weights, batch_first_outputs).permute(1, 0, 2)

        # Feed the embedded previous token together with its context vector.
        rnn_output, decoder_hidden = self.rnn(torch.cat((embedded, context), dim=2), decoder_hidden)

        # Drop the length-1 time dimension and combine all three signals.
        combined = torch.cat(
            (rnn_output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        output = self.output_layer(self.fc(combined))

        return output,decoder_hidden

### Seq2SeqModel w/ Beam Search

In [None]:
def _inflate(tensor, times, dim):
    # repeat_dims = [1] * tensor.dim()
    # repeat_dims[dim] = times
    # return tensor.repeat(*repeat_dims)
    return torch.repeat_interleave(tensor, times, dim)



class Seq2SeqModelTopK(nn.Module):
    def __init__(self, encoder, decoder,k=5,tf_ratio=.5,max_grad_norm=1.,max_output_len=MAX_OUTPUT_LEN):
        """Wrap an encoder / decoder pair and store the decoding settings.

        Args:
            encoder: encoder module.
            decoder: decoder module; must expose ``hidden_size`` and ``vocab_size``.
            k: stored as ``self.k`` (used by the backtracking code as the
                number of candidates per batch element).
            tf_ratio: probability of using teacher forcing for a batch.
            max_grad_norm: stored as ``self.max_grad_norm``.
            max_output_len: stored as ``self.max_output_len``.
        """
        super().__init__()
        # Sub-modules
        self.encoder = encoder
        self.decoder = decoder

        # Decoding / training settings
        self.k = k
        self.teacher_forcing_ratio = tf_ratio
        self.max_grad_norm = max_grad_norm
        self.max_output_len = max_output_len

        # Sizes mirrored from the decoder
        self.hidden_size = decoder.hidden_size
        self.V = decoder.vocab_size

        # Special token ids, captured from the module-level constants
        self.SOS = SOS_TOKEN
        self.EOS = EOS_TOKEN


    def forward(self, src_seqs,tgt_seqs,src_lens,tgt_lens,retain_output_probs=True):
        """Encode the sources and decode greedily, one step per target position.

        For each batch, teacher forcing is switched on with probability
        ``self.teacher_forcing_ratio``; when on, the decoder's next input is the
        current target token, otherwise its own most probable prediction.

        Args:
            src_seqs: (src_len, batch_size) padded source ids.
            tgt_seqs: (tgt_len, batch_size) padded target ids.
            src_lens: (batch_size) source lengths.
            tgt_lens: (batch_size) target lengths.
            retain_output_probs: accepted for interface compatibility; not used here.

        Returns:
            ``(decoder_outputs, metadata)`` where ``decoder_outputs`` is
            (max_tgt_len, batch_size, vocab_size) and ``metadata`` holds
            ``"sequence"`` -- (batch_size, max_tgt_len) predicted ids -- and
            ``"length"`` -- (batch_size) step at which EOS was first fed back as
            decoder input, or ``max_tgt_len`` if it never was.
        """
        # Last batch might not have the same size as we set to the `batch_size`
        batch_size = src_seqs.size(1)
        assert(batch_size == tgt_seqs.size(1))

        # Tensors track gradients on their own; only the device move is needed.
        src_seqs = src_seqs.to(device)
        tgt_seqs = tgt_seqs.to(device)
        src_lens = src_lens.to(device)
        tgt_lens = tgt_lens.to(device)

        # Decoder's first input: one SOS token per batch element.
        input_seq = torch.full((batch_size,), self.SOS, dtype=torch.long, device=device)

        # Decoder's output sequence length = max target sequence length of current batch.
        max_tgt_len = int(tgt_lens.max())

        # Store all decoder's outputs for loss computation
        decoder_outputs = torch.zeros((max_tgt_len, batch_size, self.decoder.vocab_size), device=device)

        # Store actual predicted sequences lengths for metrics computation
        pred_lens = torch.ones(batch_size).to(device) * max_tgt_len
        pred_seq = torch.zeros((max_tgt_len,batch_size))

        # -------------------------------------
        # Forward encoder
        # -------------------------------------
        encoder_outputs, encoder_hidden = self.encoder(src_seqs, src_lens.tolist())

        # -------------------------------------
        # Forward decoder
        # -------------------------------------
        # Initialize decoder's hidden state as encoder's last hidden state.
        decoder_hidden = encoder_hidden

        use_teacher_forcing = np.random.random() < self.teacher_forcing_ratio
        # Run through decoder one time step at a time.
        for t in range(max_tgt_len):
            # decoder returns:
            # - decoder_output   : (batch_size, vocab_size)
            # - decoder_hidden   : (num_layers, batch_size, hidden_size)
            decoder_output, decoder_hidden = self.decoder(input_seq, decoder_hidden,encoder_outputs)

            # Store decoder outputs.
            decoder_outputs[t] = decoder_output

            # topk(1) indices are (batch_size, 1). Squeeze only that last
            # dimension: a bare squeeze() would also drop the batch dimension
            # when batch_size == 1 and break the next decoder step.
            predictions = decoder_output.detach().topk(1)[1].squeeze(1)

            # Next input is the current target if teacher forcing, otherwise
            # the current most probable prediction.
            input_seq = tgt_seqs[t] if use_teacher_forcing else predictions

            pred_seq[t] = predictions

            # Record the length of sequences whose next input just became EOS,
            # but only the first time (length still at its max_tgt_len default).
            finished_sequences = (input_seq.detach() == self.EOS).nonzero(as_tuple=True)[0]
            currently_finished_sequences = finished_sequences[pred_lens[finished_sequences] == max_tgt_len]
            pred_lens[currently_finished_sequences] = t + 1

        metadata = dict()
        metadata["sequence"] = pred_seq.transpose(0,1)
        metadata["length"] = pred_lens
        return decoder_outputs,metadata
            
    def _backtrack(self, nw_output,predecessors, symbols, scores, b,max_tgt_len):
        """Backtracks over batch to generate optimal k-sequences.

        Args:
            nw_output [(batch*k, vocab_size)] * sequence_length: A Tensor of outputs from network
            nw_hidden [(num_layers, batch*k, hidden_size)] * sequence_length: A Tensor of hidden states from network
            predecessors [(batch*k)] * sequence_length: A Tensor of predecessors
            symbols [(batch*k)] * sequence_length: A Tensor of predicted tokens
            scores [(batch*k)] * sequence_length: A Tensor containing sequence scores for every token t = [0, ... , seq_len - 1]
            b: Size of the batch
            hidden_size: Size of the hidden state

        Returns:
            output [(batch, k, vocab_size)] * sequence_length: A list of the output probabilities (p_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_t [(batch, k, hidden_size)] * sequence_length: A list containing the output features (h_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_n(batch, k, hidden_size): A Tensor containing the last hidden state for all top-k sequences.

            score [batch, k]: A list containing the final scores for all top-k sequences

            length [batch, k]: A list specifying the length of each sequence in the top-k candidates

            p (batch, k, sequence_len): A Tensor containing predicted sequence
        """
        start_time= time.time()

        # initialize return variables given different types
        p = list()
        output = list()
        l = torch.ones((b,self.k),dtype=torch.long).to(device) * max_tgt_len  # Placeholder for lengths of top-k sequences

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        s = sorted_score.clone()

        batch_eos_found = [0] * b  # the number of EOS found
        # in the backward loop below for each batch

        t = max_tgt_len - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add self.pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        start_loop = time.time()
        while t >= 0:
            
            current_symbol = symbols[t].index_select(0, t_predecessors)
            current_output = nw_output[t].index_select(0,t_predecessors)
            # Re-order the back pointer of the previous step with the back pointer of
            # the current step
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze()
            tricky_block_start = time.time()
            # This tricky block handles dropped sequences that see EOS earlier.
            # The basic idea is summarized below:
            #
            #   Terms:
            #       Ended sequences = sequences that see EOS early and dropped
            #       Survived sequences = sequences in the last step of the beams
            #
            #       Although the ended sequences are dropped during decoding,
            #   their generated symbols and complete backtracking information are still
            #   in the backtracking variables.
            #   For each batch, everytime we see an EOS in the backtracking process,
            #       1. If there is survived sequences in the return variables, replace
            #       the one with the lowest survived sequence score with the new ended
            #       sequences
            #       2. Otherwise, replace the ended sequence with the lowest sequence
            #       score with the new ended sequence
            #
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            if eos_indices.dim() > 0:
                for i in range(eos_indices.size(0) - 1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] // self.k)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * self.k + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = predecessors[t][idx[0]]                   
                    current_symbol[res_idx, :] = symbols[t][idx[0]]
                    current_output[res_idx, :] = nw_output[t][idx[0]]
                    s[b_idx, res_k_idx] = scores[t][idx[0]].data[0]
                    l[b_idx][res_k_idx] = t + 1
            tricky_block_time = time.time() - tricky_block_start
            # record the back tracked results
            p.append(current_symbol)
            output.append(current_output)
            t -= 1

        loop_time = time.time() - start_loop
        start_reverse = time.time()
        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        _, re_sorted_idx = s.topk(self.k)
        l = torch.gather(l,1,re_sorted_idx)

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)
        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1).squeeze() for step in reversed(p)]
        output = [step.index_select(0, re_sorted_idx).view(b, self.k, -1).squeeze() for step in reversed(output)]
        reverse_time = time.time() - start_reverse

        backtrack_time = time.time() - start_time


        # print(f"Total backtrack time: {backtrack_time}")
        # print(f"Part of loop computation : {loop_time/backtrack_time * 100}%")
        # print(f"Part of tricky block in loop computation : {tricky_block_time/loop_time * 100}%")
        # print(f"Part of reversing computation : {reverse_time/backtrack_time * 100}%")


        return l, p, output

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
        score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)


    def predict(self,src_seqs,src_lens):
        """Decode a batch of source sequences with beam search of width ``self.k``.

        All helper tensors are created on the notebook-level ``device`` so the
        method also runs on CPU-only machines (the previous hard-coded
        ``.cuda()`` calls did not).

        Args:
            src_seqs: source token ids; the batch size is read from dimension 1.
            src_lens: source lengths; passed to the encoder as a Python list.

        Returns:
            A tuple ``(topk_sequence, topk_length, sequence, length, outputs)``.
            ``topk_sequence`` and ``topk_length`` describe the ``k`` hypotheses
            kept per example by ``_backtrack``; ``sequence`` and ``length`` are
            the first-ranked hypothesis of each example; ``outputs`` are the
            raw decoder outputs of every step, reordered by ``_backtrack`` and
            stacked.

        Side effects:
            Sets ``self.pos_index``, which ``_backtrack`` reads.
        """
        # Last batch might not have the same size as we set to the `batch_size`
        batch_size = src_seqs.size(1)
        beam_rows = batch_size * self.k

        src_seqs = src_seqs.to(device)
        src_lens = src_lens.to(device)

        # -------------------------------------
        # Forward encoder
        # -------------------------------------
        encoder_outputs, encoder_hidden = self.encoder(src_seqs, src_lens.data.tolist())

        # ---------------------------------------------
        # Declare variables for beam search decoding
        # ---------------------------------------------
        # Row offset of the first beam of every example in the flattened
        # (batch_size * k) layout: 0, k, 2k, ...
        self.pos_index = (torch.arange(batch_size, dtype=torch.long, device=device) * self.k).view(-1, 1)

        # Initialize decoder's hidden state as encoder's last hidden state,
        # repeated k times along the batch dimension (dimension 1).
        decoder_hidden = _inflate(encoder_hidden, self.k, 1)

        # ... same idea for encoder_outputs
        inflated_encoder_outputs = _inflate(encoder_outputs, self.k, 1)

        # Initialize the scores; for the first step only the first beam of each
        # example is live (score 0), the inflated copies start at -inf to avoid
        # duplicate entries in the top k.
        sequence_scores = torch.full((beam_rows, 1), -float('inf'), device=device)
        sequence_scores[::self.k] = 0.0

        # Every beam starts from the start-of-sequence token.
        input_var = torch.full((beam_rows, 1), self.SOS, dtype=torch.long, device=device)

        # Store decisions for backtracking
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()
        stored_outputs = list()

        # Run through decoder one time step at a time.
        for _ in range(self.max_output_len):
            # decoder returns:
            # - decoder_output   : (batch_size * k, vocab_size)
            # - decoder_hidden   : (num_layers, batch_size * k, hidden_size)
            decoder_output, decoder_hidden = self.decoder(input_var, decoder_hidden,inflated_encoder_outputs)

            log_softmax_output = nn.functional.log_softmax(decoder_output,dim=1)

            # Full sequence score of every candidate = predecessor score + local log-probability.
            sequence_scores = _inflate(sequence_scores, self.V, 1)
            sequence_scores += log_softmax_output.squeeze(1)
            # Per example, pick the k best among its k * V candidates.
            scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

            # candidate index = beam_index * V + token id
            input_var = (candidates % self.V).view(beam_rows, 1)
            sequence_scores = scores.view(beam_rows, 1)

            # Flattened row each surviving candidate was expanded from.
            predecessors = (candidates // self.V + self.pos_index.expand_as(candidates)).view(beam_rows, 1)

            # squeeze(1) rather than squeeze(): the index must stay 1-D even
            # when batch_size * k == 1.
            decoder_hidden = decoder_hidden.index_select(1, predecessors.squeeze(1))

            # Keep the true scores for backtracking, then erase the scores of
            # beams that just emitted EOS so that they aren't expanded further.
            # masked_fill_ is a no-op when no beam emitted EOS.
            stored_scores.append(sequence_scores.clone())
            eos_indices = input_var.data.eq(self.EOS)
            sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            stored_emitted_symbols.append(input_var)
            stored_outputs.append(decoder_output)

        # Do backtracking to return the optimal values
        l, p, outputs = self._backtrack(stored_outputs,stored_predecessors,stored_emitted_symbols,stored_scores, batch_size,self.max_output_len)

        # Build return objects: move the time axis (first after stacking) behind
        # the batch and beam axes.
        topk_length = l
        topk_sequence = torch.stack(p).transpose(0,1).transpose(1,2)
        length = l[:,0]
        sequence = topk_sequence[:,0,:]
        outputs = torch.stack(outputs).transpose(0,1).transpose(1,2)

        return topk_sequence,topk_length,sequence,length,outputs


    def train_mode_(self):
        """Put both the encoder and the decoder into training mode."""
        for module in (self.encoder, self.decoder):
            module.train()
    
    def eval_mode_(self):
        """Put both the encoder and the decoder into evaluation mode."""
        for module in (self.encoder, self.decoder):
            module.eval()

    
    def clip_gradients_(self):
        """Clip gradient norms in place to ``self.max_grad_norm``.

        The encoder and the decoder are clipped independently: each module's
        parameters get their own total-norm computation.
        """
        for module in (self.encoder, self.decoder):
            nn.utils.clip_grad_norm_(module.parameters(), self.max_grad_norm)
        

## Allamanis et al. CNN Encoder + GRU Decoder w/ Attention

### CNN Encoder w/ attention

In [None]:
class EncoderConvAttention(nn.Module):
    """Convolutional encoder: token embedding followed by two 1-D convolutions.

    Both convolutions use ``padding="same"``, so the sequence length of the
    output equals the sequence length of the input.
    """

    def __init__(self,embedding_dim, hidden_size, vocab_size,kernel1=24,kernel2=29):
        """
        Args:
            embedding_dim: width of the token embeddings.
            hidden_size: number of output channels of both convolutions.
            vocab_size: number of rows of the embedding table.
            kernel1: kernel size of the first convolution.
            kernel2: kernel size of the second convolution.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        # Sub-modules are created in the order embedding, conv1, conv2 so that
        # random initialisation under a fixed seed is unchanged.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, hidden_size, kernel_size=kernel1, padding="same")
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel2, padding="same")

    def forward(self,input):
        """Encode a batch of token-id sequences.

        Args:
            input: token ids laid out as (seq_len, batch_size).

        Returns:
            A pair ``(out, embedded)``: the output of the second convolution,
            (batch_size, hidden_size, seq_len), and the embedded input
            rearranged to (batch_size, embedding_dim, seq_len).
        """
        # (seq_len, batch_size, embedding_dim) => (batch_size, embedding_dim, seq_len)
        channels_first = self.embedding(input).permute(1, 2, 0)

        # ReLU sits between the two convolutions only; the second one is linear.
        features = self.conv2(F.relu(self.conv1(channels_first)))

        return features, channels_first


### Decoder for CNN w/ Attention

In [None]:
class AllamanisGRUDecoder(nn.Module):
    """Single-step GRU decoder with convolutional attention over encoder features.

    The token embedding matrix is shared with the encoder and is also reused
    as the weight of the output projection ``W_s``.
    """

    def __init__(self, encoder, embedding_dim, attention_kernel=10, bias=True,padding_idx=0):
        """
        Args:
            encoder: object exposing ``hidden_size``, ``vocab_size`` and an
                ``embedding`` module whose weight is shared with this decoder.
            embedding_dim: width of the token embeddings; it has to match the
                width of ``encoder.embedding`` because the weights are tied.
            attention_kernel: kernel size of the attention convolution.
            bias: whether the output projection ``W_s`` has a bias term.
            padding_idx: padding index given to the decoder's embedding module.
        """
        super(AllamanisGRUDecoder, self).__init__()

        self.hidden_size = encoder.hidden_size

        self.vocab_size = encoder.vocab_size
        self.embedding_dim = embedding_dim

        # Input embedding, tied to the encoder's embedding matrix.
        self.embedding = nn.Embedding(self.vocab_size, embedding_dim, padding_idx=padding_idx)
        self.embedding.weight = encoder.embedding.weight
        # Single-layer GRU (nn.GRU default), so its hidden state is
        # (1, batch_size, hidden_size).
        self.rnn = nn.GRU(input_size=self.embedding_dim,
                            hidden_size=self.hidden_size)

        # Maps (batch_size, hidden_size, seq_len) features to one attention
        # logit per source position.
        self.attention_weights = nn.Conv1d(self.hidden_size,1,kernel_size=attention_kernel,padding="same")

        # Output projection, tied to the same embedding matrix.
        self.W_s = nn.Linear(self.embedding_dim, self.vocab_size, bias=bias)
        self.W_s.weight = self.embedding.weight


    def forward(self,input_seq,decoder_hidden,encoder_outputs,embedded_src_seq):
        """Run one decoding step.

        Every ``squeeze`` names the dimension it removes, so the batch
        dimension survives when ``batch_size == 1`` and the sequence dimension
        survives when ``seq_len == 1``.

        Args:
            input_seq: previous tokens, either (batch_size,) or (batch_size, 1).
            decoder_hidden: GRU state, (1, batch_size, hidden_size).
            encoder_outputs: encoder features, (batch_size, hidden_size, seq_len).
            embedded_src_seq: embedded source tokens,
                (batch_size, embedding_dim, seq_len).

        Returns:
            A pair ``(output, decoder_hidden)``: unnormalised vocabulary scores
            of shape (batch_size, vocab_size) and the updated GRU state.
        """
        # (batch_size, seq_len=1) => (seq_len=1, batch_size)
        if input_seq.dim() > 1:
            input_seq = input_seq.transpose(0,1)
        # (batch_size) => (seq_len=1, batch_size)
        else:
            input_seq = input_seq.unsqueeze(0)

        # (seq_len=1, batch_size) => (seq_len=1, batch_size, embedding_dim)
        emb = self.embedding(input_seq)
        # decoder_hidden: (1, batch_size, hidden_size)
        _, decoder_hidden = self.rnn(emb, decoder_hidden)

        # (1, batch_size, hidden_size) => (batch_size, hidden_size, 1).
        # Only the layer dimension is removed.
        hidden = decoder_hidden.squeeze(0).unsqueeze(-1)

        # Attention features: encoder output scaled channel-wise by the hidden
        # state, (batch_size, hidden_size, seq_len).
        L2 = torch.mul(encoder_outputs,hidden)

        # Normalise each example by the norm taken over its last two dimensions.
        Lfeat = L2/torch.linalg.norm(L2,dim=(1,2)).unsqueeze(-1).unsqueeze(-1)

        # (batch_size, 1, seq_len) => (batch_size, seq_len); softmax over the
        # source positions.
        attention_logits = self.attention_weights(Lfeat).squeeze(1)
        attention_features = nn.functional.softmax(attention_logits,dim=1)

        # (batch_size, seq_len) => (batch_size, seq_len, 1) for the matmul below
        attention_features = attention_features.unsqueeze(-1)

        # Attention-weighted sum of the source embeddings:
        # (batch_size, embedding_dim, 1) => (batch_size, embedding_dim)
        context_vector = torch.matmul(embedded_src_seq,attention_features).squeeze(-1)

        # (batch_size, vocab_size)
        output = self.W_s(context_vector)

        return output,decoder_hidden
    

        

### Allamanis et al. Full model

In [None]:
def _inflate(tensor, times, dim):
    # repeat_dims = [1] * tensor.dim()
    # repeat_dims[dim] = times
    # return tensor.repeat(*repeat_dims)
    return torch.repeat_interleave(tensor, times, dim)



class AllamanisCNNModel(nn.Module):
    """Encoder/decoder wrapper with teacher-forced training and beam-search inference.

    ``forward`` decodes step by step (optionally with teacher forcing) and
    returns the per-step decoder outputs for loss computation; ``predict``
    runs a beam search of width ``k`` and returns the decoded token sequences.
    """

    def __init__(self, encoder, decoder,k=5,tf_ratio=.5,max_grad_norm=1.,max_output_len=MAX_OUTPUT_LEN):
        """
        Args:
            encoder: module called as ``encoder(src_seqs)`` and returning
                ``(encoder_outputs, embedded_src_seq)``.
            decoder: module exposing ``hidden_size`` and ``vocab_size`` and
                called as ``decoder(input, hidden, encoder_outputs, embedded_src_seq)``.
            k: beam width used by ``predict``.
            tf_ratio: probability that a call to ``forward`` uses teacher forcing.
            max_grad_norm: threshold used by ``clip_gradients_``.
            max_output_len: number of decoding steps performed by ``predict``.
        """
        super(AllamanisCNNModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.max_output_len=max_output_len
        self.teacher_forcing_ratio = tf_ratio
        self.max_grad_norm = max_grad_norm

        self.hidden_size = decoder.hidden_size
        self.V = decoder.vocab_size
        self.SOS = SOS_TOKEN
        self.EOS = EOS_TOKEN
        self.k = k


    def forward(self, src_seqs,tgt_seqs,src_lens,tgt_lens,retain_output_probs=True):
        """Decode ``max(tgt_lens)`` steps and collect outputs for the loss.

        Teacher forcing is decided once per call: with probability
        ``self.teacher_forcing_ratio`` every step is fed the target token,
        otherwise every step is fed the decoder's own top-1 prediction.

        Args:
            src_seqs: source token ids; the batch size is read from dimension 1.
            tgt_seqs: target token ids, indexed by time step on dimension 0.
            src_lens: source lengths (not used by this model).
            tgt_lens: target lengths; their maximum sets the number of steps.
            retain_output_probs: accepted for interface compatibility; unused.

        Returns:
            ``(decoder_outputs, metadata)`` where ``decoder_outputs`` is
            (max_tgt_len, batch_size, vocab_size) and ``metadata`` holds
            ``"sequence"`` (top-1 token per step, (batch_size, max_tgt_len))
            and ``"length"`` ((batch_size,) float tensor: ``t + 1`` for the
            first step ``t`` at which the next decoder input is EOS, else
            ``max_tgt_len``).
        """
        # Last batch might not have the same size as we set to the `batch_size`
        batch_size = src_seqs.size(1)
        assert(batch_size == tgt_seqs.size(1))

        src_seqs = src_seqs.to(device)
        tgt_seqs = tgt_seqs.to(device)
        tgt_lens = tgt_lens.to(device)

        # Decoder's first input: the start-of-sequence token for every example.
        input_seq = torch.full((batch_size,), SOS_TOKEN, dtype=torch.long, device=device)

        # Decoder's output sequence length = max target sequence length of current batch.
        max_tgt_len = int(tgt_lens.max())

        # Store all decoder's outputs for loss computation
        decoder_outputs = torch.zeros((max_tgt_len, batch_size, self.V), device=device)

        # Predicted lengths start at the maximum and are lowered when EOS is seen.
        pred_lens = torch.full((batch_size,), float(max_tgt_len), device=device)
        pred_seq = torch.zeros((max_tgt_len,batch_size))

        # -------------------------------------
        # Forward encoder
        # -------------------------------------
        encoder_outputs, embedded_src_seq = self.encoder(src_seqs)

        # -------------------------------------
        # Forward decoder
        # -------------------------------------
        # Initialize decoder's hidden state as zero
        decoder_hidden = torch.zeros(1, batch_size, self.decoder.hidden_size, device=device)

        use_teacher_forcing = bool(np.random.random() < self.teacher_forcing_ratio)
        # Run through decoder one time step at a time.
        for t in range(max_tgt_len):
            # decoder returns:
            # - decoder_output   : (batch_size, vocab_size)
            # - decoder_hidden   : (num_layers, batch_size, hidden_size)
            decoder_output, decoder_hidden = self.decoder(input_seq,decoder_hidden,encoder_outputs,embedded_src_seq)

            # Store decoder outputs.
            decoder_outputs[t] = decoder_output
            # squeeze(1) rather than squeeze(): the batch dimension must
            # survive when batch_size == 1.
            predictions = decoder_output.data.topk(1)[1].squeeze(1)

            # Next input is the current target under teacher forcing,
            # otherwise the most probable prediction.
            input_seq = tgt_seqs[t] if use_teacher_forcing else predictions

            pred_seq[t] = predictions

            # Record the length of sequences whose next input is EOS and whose
            # length has not been recorded yet.
            finished_sequences = (input_seq.data == EOS_TOKEN).nonzero(as_tuple=True)[0]
            currently_finished_sequences = finished_sequences[pred_lens[finished_sequences] == max_tgt_len]
            pred_lens[currently_finished_sequences] = t + 1

        metadata = dict()
        metadata["sequence"] = pred_seq.transpose(0,1)
        metadata["length"] = pred_lens
        return decoder_outputs,metadata

    def _backtrack(self, predecessors, symbols, scores, b,max_tgt_len):
        """Follow the stored back-pointers to rebuild the ``k`` hypotheses per example.

        Args:
            predecessors: list with one (b*k, 1) tensor per step; entry ``i``
                is the flattened row that row ``i`` was expanded from.
            symbols: list with one (b*k, 1) tensor per step holding the token
                emitted by each row.
            scores: list with one (b*k, 1) tensor per step holding the
                cumulative score of each row.
            b: batch size.
            max_tgt_len: number of decoding steps to walk back through.

        Returns:
            ``(l, p)``: ``l`` is a (b, k) long tensor of hypothesis lengths and
            ``p`` is a list of ``max_tgt_len`` tensors of shape (b, k), in
            forward time order, holding the token of each hypothesis at that
            step. Both are ordered by final score, best first.

        Requires ``self.pos_index`` to have been set by ``predict``.
        """
        # Tokens per step, collected in reverse time order.
        p = list()
        # Lengths of the top-k sequences; the maximum unless EOS was emitted.
        l = torch.full((b, self.k), max_tgt_len, dtype=torch.long, device=device)

        # The beams of the last step are not sorted, thus they are sorted here.
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        s = sorted_score.clone()

        # Number of EOS found so far, per example, during the backward walk.
        batch_eos_found = [0] * b

        t = max_tgt_len - 1
        # Initialize the back pointer with the sorted order of the last step
        # beams; self.pos_index converts per-example beam indices to rows of
        # the flattened (b*k) layout.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        while t >= 0:
            current_symbol = symbols[t].index_select(0, t_predecessors)
            # Re-order the back pointer of the previous step with the back
            # pointer of the current step. squeeze(1) keeps the result 1-D
            # even when b * k == 1.
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze(1)

            # Handle sequences that emitted EOS at step t and were therefore
            # dropped from the beam. Their symbols and back-pointers are still
            # stored, so each of them is written into a result slot of its
            # example: slots are taken from the last (lowest-ranked) one
            # upwards, wrapping around after k replacements.
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            for i in range(eos_indices.size(0) - 1, -1, -1):
                # Row of the EOS in the flattened layout and its example index.
                idx = eos_indices[i]
                b_idx = int(idx[0] // self.k)
                # Slot to overwrite, per the replacement strategy noted above.
                res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                batch_eos_found[b_idx] += 1
                res_idx = b_idx * self.k + res_k_idx

                # Replace the old information in the return variables with
                # the ended sequence's information.
                t_predecessors[res_idx] = predecessors[t][idx[0]]
                current_symbol[res_idx, :] = symbols[t][idx[0]]
                s[b_idx, res_k_idx] = scores[t][idx[0]].data[0]
                l[b_idx][res_k_idx] = t + 1

            # Record the back tracked results.
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order.
        _, re_sorted_idx = s.topk(self.k)
        l = torch.gather(l,1,re_sorted_idx)

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)
        # Reverse the sequences (backtracking ran in reverse time order) and
        # re-order at the same time. squeeze(-1) drops only the trailing
        # singleton, so b == 1 or k == 1 still yields a (b, k) tensor.
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1).squeeze(-1) for step in reversed(p)]

        return l, p

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
        """Overwrite the entries of ``score`` selected by ``idx``, in place."""
        score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        """Fill the slices of ``tensor`` along ``dim`` listed in ``idx[:, 0]``, in place.

        A 0-dimensional ``idx`` is ignored.
        """
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)


    def predict(self,src_seqs,src_lens):
        """Decode a batch of source sequences with beam search of width ``self.k``.

        Helper tensors are created on the notebook-level ``device`` so the
        method also runs on CPU-only machines. The search runs under
        ``torch.no_grad()``; only integer tensors are returned.

        Args:
            src_seqs: source token ids; the batch size is read from dimension 1.
            src_lens: source lengths (not used by this model).

        Returns:
            ``(topk_sequence, topk_length, sequence, length)``:
            ``topk_sequence`` is (batch_size, k, max_output_len),
            ``topk_length`` is (batch_size, k), and ``sequence`` / ``length``
            are the first-ranked hypothesis of every example.

        Side effects:
            Sets ``self.pos_index``, which ``_backtrack`` reads.
        """
        # Last batch might not have the same size as we set to the `batch_size`
        batch_size = src_seqs.size(1)
        beam_rows = batch_size * self.k

        src_seqs = src_seqs.to(device)

        with torch.no_grad():
            # -------------------------------------
            # Forward encoder
            # -------------------------------------
            encoder_outputs, embedded_src_seq = self.encoder(src_seqs)

            # ---------------------------------------------
            # Declare variables for beam search decoding
            # ---------------------------------------------
            # Row offset of the first beam of every example in the flattened
            # (batch_size * k) layout: 0, k, 2k, ...
            self.pos_index = (torch.arange(batch_size, dtype=torch.long, device=device) * self.k).view(-1, 1)

            # Decoder's hidden state starts at zero for every beam.
            decoder_hidden = torch.zeros(1, beam_rows, self.decoder.hidden_size, device=device)

            # Repeat the encoder results k times along the batch dimension.
            inflated_encoder_outputs = _inflate(encoder_outputs, self.k, 0)
            inflated_embedded_src_seq = _inflate(embedded_src_seq, self.k, 0)

            # Initialize the scores; for the first step only the first beam of
            # each example is live (score 0), the inflated copies start at
            # -inf to avoid duplicate entries in the top k.
            sequence_scores = torch.full((beam_rows, 1), -float('inf'), device=device)
            sequence_scores[::self.k] = 0.0

            # Every beam starts from the start-of-sequence token.
            input_var = torch.full((beam_rows, 1), self.SOS, dtype=torch.long, device=device)

            # Store decisions for backtracking
            stored_scores = list()
            stored_predecessors = list()
            stored_emitted_symbols = list()

            # Run through decoder one time step at a time.
            for _ in range(self.max_output_len):
                # decoder returns:
                # - decoder_output   : (batch_size * k, vocab_size)
                # - decoder_hidden   : (num_layers, batch_size * k, hidden_size)
                decoder_output, decoder_hidden = self.decoder(input_var, decoder_hidden,inflated_encoder_outputs, inflated_embedded_src_seq)

                log_softmax_output = nn.functional.log_softmax(decoder_output,dim=1)

                # Full sequence score of every candidate = predecessor score +
                # local log-probability.
                sequence_scores = _inflate(sequence_scores, self.V, 1)
                sequence_scores += log_softmax_output.squeeze(1)
                # Per example, pick the k best among its k * V candidates.
                scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

                # candidate index = beam_index * V + token id
                input_var = (candidates % self.V).view(beam_rows, 1)
                sequence_scores = scores.view(beam_rows, 1)

                # Flattened row each surviving candidate was expanded from.
                predecessors = (candidates // self.V + self.pos_index.expand_as(candidates)).view(beam_rows, 1)

                # squeeze(1) rather than squeeze(): the index must stay 1-D
                # even when batch_size * k == 1.
                decoder_hidden = decoder_hidden.index_select(1, predecessors.squeeze(1))

                # Keep the true scores for backtracking, then erase the scores
                # of beams that just emitted EOS so that they aren't expanded.
                # masked_fill_ is a no-op when no beam emitted EOS.
                stored_scores.append(sequence_scores.clone())
                eos_indices = input_var.data.eq(self.EOS)
                sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

                # Cache results for backtracking
                stored_predecessors.append(predecessors)
                stored_emitted_symbols.append(input_var)

            # Do backtracking to return the optimal values
            l, p = self._backtrack(stored_predecessors,stored_emitted_symbols,stored_scores, batch_size,self.max_output_len)

        # Build return objects: (steps, batch, k) => (batch, k, steps)
        topk_length = l
        topk_sequence = torch.stack(p).transpose(0,1).transpose(1,2)
        length = l[:,0]
        sequence = topk_sequence[:,0,:]

        return topk_sequence,topk_length,sequence,length


    def train_mode_(self):
        """Put both the encoder and the decoder into training mode."""
        self.encoder.train()
        self.decoder.train()

    def eval_mode_(self):
        """Put both the encoder and the decoder into evaluation mode."""
        self.encoder.eval()
        self.decoder.eval()


    def clip_gradients_(self):
        """Clip encoder and decoder gradient norms, each separately, to ``self.max_grad_norm``."""
        nn.utils.clip_grad_norm_(self.encoder.parameters(), self.max_grad_norm)
        nn.utils.clip_grad_norm_(self.decoder.parameters(), self.max_grad_norm)
        

# Training

## RNN Encoder-Decoder training

### Compute cross entropy for all but padded tokens

In [None]:
def get_acc_and_f1_values(tgt_seqs,pred_seqs,tgt_lens,pred_lens):
    """Compute batch-averaged exact-match accuracy, F1, precision and recall.

    For every example the first token and the token at position
    ``length - 1`` are dropped from both target and prediction before
    comparing. Precision and recall are computed per example from token
    membership (order is ignored) and averaged over the batch; F1 is the
    harmonic mean of those two averages.

    Args:
        tgt_seqs: target token ids, (batch_size, max_len).
        pred_seqs: predicted token ids, (batch_size, max_len).
        tgt_lens: target lengths, one per example.
        pred_lens: predicted lengths, one per example (cast to int).

    Returns:
        ``(acc, f1, precision, recall)`` as Python floats.
    """
    batch_size = tgt_seqs.size(0)

    # Work on numpy copies of the inputs.
    targets = tgt_seqs.cpu().data.numpy()
    predictions = pred_seqs.cpu().data.numpy()
    target_lengths = tgt_lens.cpu().data.numpy()
    predicted_lengths = pred_lens.cpu().data.numpy().astype(int)

    precision_sum = 0
    recall_sum = 0
    exact_matches = 0
    # Per-example lengths differ, so the comparison is done row by row.
    for row in range(batch_size):
        tgt = targets[row, 1:target_lengths[row] - 1]
        pred = predictions[row, 1:predicted_lengths[row] - 1]

        # Predicted tokens found / not found in the target, and target tokens
        # missing from the prediction.
        tp = float(np.count_nonzero(np.isin(pred, tgt)))
        fp = float(np.count_nonzero(np.isin(pred, tgt, invert=True)))
        fn = float(np.count_nonzero(np.isin(tgt, pred, invert=True)))

        if tp + fp != 0.:
            precision_sum += tp / (tp + fp)
        if tp + fn != 0.:
            recall_sum += tp / (tp + fn)
        # Exact match: nothing spurious and nothing missing.
        if fp == 0. and fn == 0.:
            exact_matches += 1.

    precision = precision_sum / batch_size
    recall = recall_sum / batch_size
    acc = exact_matches / batch_size

    if precision + recall != 0.:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.

    return acc,f1,precision,recall

def sequence_mask(sequence_length, max_len=None):
    """Build a boolean mask that is True on the valid positions of each sequence.

    The mask is created on the device of ``sequence_length``, so the function
    works wherever the lengths live and does not depend on the notebook-level
    ``device``.

    Args:
        sequence_length: 1-D tensor of lengths, one per sequence.
        max_len: number of mask columns; defaults to the largest length.

    Returns:
        Boolean tensor of shape (batch_size, max_len) whose entry ``[i, j]``
        is True exactly when ``j < sequence_length[i]``.
    """
    if max_len is None:
        max_len = sequence_length.max()
    max_len = int(max_len)

    # (1, max_len) positions compared against (batch_size, 1) lengths;
    # broadcasting yields the (batch_size, max_len) mask.
    positions = torch.arange(max_len, device=sequence_length.device)
    return positions.unsqueeze(0) < sequence_length.unsqueeze(1)

def masked_cross_entropy(logits, target, length):
    """Average token-level cross entropy over the non-padded positions.

    Args:
        logits: float tensor of size (batch, max_len, num_classes) holding
            the unnormalized score of each class.
        target: long tensor of size (batch, max_len) holding the index of the
            true class at each step.
        length: long tensor of size (batch,) holding the valid length of each
            sequence; positions at or beyond it do not contribute.

    Returns:
        Scalar tensor: the summed negative log-likelihood of the valid
        positions divided by the number of valid positions.
    """
    logits, target, length = (t.to(device) for t in (logits, target, length))

    # (batch * max_len, num_classes) log-probabilities
    num_classes = logits.size(-1)
    log_probs = F.log_softmax(logits.view(-1, num_classes), dim=1)

    # Negative log-probability of the true class, reshaped to (batch, max_len).
    picked = log_probs.gather(1, target.view(-1, 1))
    token_nll = picked.neg().view(*target.size())

    # (batch, max_len) mask, cast to float so it can weight the losses.
    keep = sequence_mask(sequence_length=length, max_len=target.size(1)).float()

    return (token_nll * keep).sum() / keep.sum()

### Single batch training

In [None]:
def train_batch(batch, model, optimizer):
    """Run one optimisation step on a single batch.

    Args:
        batch: raw batch, converted by ``paddedTensorsFromPairBatch`` into
            ``(src_seqs, tgt_seqs, src_lens, tgt_lens)``.
        model: model exposing ``train_mode_``, ``clip_gradients_`` and a
            forward call returning ``(decoder_outputs, metadata)`` where
            ``metadata`` has the keys ``"length"`` and ``"sequence"``.
        optimizer: optimizer updating the model's parameters.

    Returns:
        ``(loss, acc, f1, precision, recall)`` for this batch; ``loss`` is a
        Python float.
    """
    # Unpack batch data
    src_seqs,tgt_seqs,src_lens,tgt_lens = paddedTensorsFromPairBatch(batch)

    # Training mode (enable dropout)
    model.train_mode_()

    # Zero gradients, since optimizers accumulate gradients across backward calls.
    optimizer.zero_grad()

    # Forward model
    decoder_outputs,metadata = model(src_seqs,tgt_seqs,src_lens,tgt_lens)
    pred_lens = metadata["length"]
    pred_seqs = metadata["sequence"]

    # Loss and metrics both expect batch-first tensors, hence the transposes.
    batch_first_targets = tgt_seqs.transpose(0,1).contiguous()
    loss = masked_cross_entropy(
        decoder_outputs.transpose(0,1).contiguous(),
        batch_first_targets,
        tgt_lens
    )

    acc,f1,precision,recall = get_acc_and_f1_values(batch_first_targets,
                                                    pred_seqs,
                                                    tgt_lens,
                                                    pred_lens)

    # Backward to get gradients w.r.t. the model parameters, clip them, then
    # update the parameters.
    loss.backward()
    model.clip_gradients_()
    optimizer.step()

    return loss.item(),acc,f1,precision,recall

### Train for multiple epochs

In [None]:
def trainEpochs(model, train_dataloader, val_dataloader, optimizer, n_epochs=10,print_every_step=100,save_every_step=50,
                encoder_file="encoder.pt",decoder_file="decoder.pt",
                checkpoint_path="/content/drive/MyDrive/Sourcery/"):
    """Train ``model`` for ``n_epochs`` epochs and validate after each epoch.

    Args:
        model: object exposing ``encoder`` and ``decoder`` modules; it is
            forwarded as-is to ``train_batch`` and ``evaluate_full_dataset``.
        train_dataloader: iterator providing ``create_batches()``, ``batches``
            and ``len()`` (presumably a torchtext ``BucketIterator`` - confirm).
        val_dataloader: validation iterator handed to ``evaluate_full_dataset``.
        optimizer: optimizer forwarded to ``train_batch``.
        n_epochs: number of epochs to run; epochs are numbered 1..n_epochs.
        print_every_step: print running averages every that many training
            batches; ``None`` disables the periodic log.
        save_every_step: save a checkpoint every that many training batches;
            ``None`` saves once at the end of every epoch instead.
        encoder_file: file name of the encoder checkpoint.
        decoder_file: file name of the decoder checkpoint.
        checkpoint_path: prefix (including the trailing separator) that is
            prepended to both checkpoint file names.

    Returns:
        None. Progress and metrics are reported with ``print``.
    """
    def save_checkpoint():
        # Encoder and decoder weights are stored in two separate files.
        torch.save(model.encoder.state_dict(),checkpoint_path + encoder_file)
        torch.save(model.decoder.state_dict(),checkpoint_path + decoder_file)

        print('\n' + '='*100)
        print('Save checkpoint to "{}".'.format(checkpoint_path))
        print('='*100 + '\n')

    global_step = 0

    # Running sums since the last periodic log.
    total_loss = 0
    total_acc = 0
    total_f1 = 0

    # range() excludes its upper bound, so n_epochs + 1 is needed to really
    # run epochs 1..n_epochs.
    for epoch in range(1,n_epochs + 1):
        train_dataloader.create_batches()

        # Running sums over the current epoch.
        epoch_loss = 0
        epoch_acc = 0
        epoch_f1 = 0
        for batch_id,batch_data in enumerate(tqdm(train_dataloader.batches)):
            # Train.
            loss,acc,f1,_,_ = train_batch(batch_data, model, optimizer)

            # Statistics.
            global_step += 1
            total_loss += loss
            total_acc += acc
            total_f1 += f1

            epoch_loss += loss
            epoch_acc += acc
            epoch_f1 += f1

            # Save checkpoint.
            if save_every_step is not None and global_step % save_every_step == 0:
                save_checkpoint()

            # Print statistics averaged over the last print_every_step batches.
            if print_every_step is not None and global_step % print_every_step == 0:
                print('='*100)
                print('Training log:')
                print('- Epoch: {}/{}'.format(epoch, n_epochs))
                print('- Global step: {}'.format(global_step))
                print('- Train loss: {}'.format(total_loss/print_every_step))
                print('- Train accuracy: {}'.format(total_acc/print_every_step))
                print('- Train f1: {}'.format(total_f1/print_every_step))
                print('='*100 + '\n')

                total_loss = 0
                total_acc = 0
                total_f1 = 0

        #### End of epoch, print stats for epoch
        val_top1_acc,val_top1_f1,val_topK_acc,val_topK_f1= evaluate_full_dataset(val_dataloader,model)
        n_train_batches = len(train_dataloader)
        print('\n' + '='*100)
        print('Training log:')
        print('- Epoch: {}/{}'.format(epoch, n_epochs))
        print('- Global step (number of training batch): {}'.format(global_step))
        print('- Train loss: {}'.format(epoch_loss/n_train_batches))
        print('- Train Accuracy : {}'.format(epoch_acc/n_train_batches))
        print('- Train F1 Score : {}'.format(epoch_f1/n_train_batches))
        print('- Val Top-1 Accuracy: {}'.format(val_top1_acc))
        print('- Val Top-1 F1 Score: {}'.format(val_top1_f1))
        print('- Val Top-K Accuracy: {}'.format(val_topK_acc))
        print('- Val Top-K F1 Score: {}'.format(val_topK_f1))
        print('='*100 + '\n')

        # Without a step-based schedule, checkpoint once per epoch.
        if save_every_step is None:
            save_checkpoint()
        

# Evaluation

## RNN Encoder-Decoder Evaluation

In [None]:
def get_topK_metrics(tgt_seqs,topk_sequence,tgt_lens,topk_length):
    """Score top-K predictions against the targets with bag-of-token metrics.

    Args:
        tgt_seqs: tensor indexed as [example, position] holding target tokens.
        topk_sequence: tensor indexed as [example, candidate, position].
        tgt_lens: per-example target lengths; each entry must support ``.item()``.
        topk_length: tensor indexed as [example, candidate] with candidate lengths.

    Returns:
        Tuple ``(top1_acc, top1_f1, topK_acc, topK_f1)`` averaged over the
        examples. "top1" uses candidate 0 only; "topK" takes, per example, the
        best accuracy and the best F1 found among all candidates.
    """
    n_examples = tgt_seqs.size(0)
    n_candidates = topk_length.size(1)

    # Work on numpy copies of the tensors.
    targets = tgt_seqs.cpu().data.numpy()
    candidates = topk_sequence.cpu().data.numpy()
    candidate_lens = topk_length.cpu().data.numpy()

    def score(pred, tgt):
        """Return (accuracy, f1) of one candidate; token order is ignored."""
        pred_in_tgt = np.isin(pred, tgt)
        tp = float(np.count_nonzero(pred_in_tgt))
        fp = float(np.count_nonzero(~pred_in_tgt))
        fn = float(np.count_nonzero(np.isin(tgt, pred, invert=True)))

        precision = tp/(tp + fp) if (tp + fp != 0.) else 0
        recall = tp/(tp + fn) if (tp + fn != 0.) else 0
        # A candidate is "accurate" when it has no extra and no missing token.
        acc = 1. if (fp == 0. and fn == 0.) else 0.
        if precision + recall != 0.:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.
        return acc, f1

    top1_acc = 0
    top1_f1 = 0
    topK_acc = 0
    topK_f1 = 0
    # Lengths differ per example and per candidate, so examples are scored one by one.
    for row in range(n_examples):
        # Drop the first token and the last token counted by the length
        # (presumably the SOS / EOS markers - confirm).
        tgt = targets[row,1:tgt_lens[row].item()-1]
        scores = [
            score(candidates[row,col,1:candidate_lens[row,col]-1], tgt)
            for col in range(n_candidates)
        ]

        # Candidate 0 is the top-1 prediction.
        if scores:
            top1_acc += scores[0][0]
            top1_f1 += scores[0][1]

        # Best-of-K values, never below 0.
        topK_acc += max([0] + [acc for acc, _ in scores])
        topK_f1 += max([0] + [f1 for _, f1 in scores])

    # Average over the examples of the batch.
    top1_acc /= n_examples
    top1_f1 /= n_examples
    topK_acc /= n_examples
    topK_f1 /= n_examples
    return top1_acc,top1_f1,topK_acc,topK_f1

def evaluate(eval_batch, model):
    """Compute top-1 and top-K accuracy / F1 for a single evaluation batch.

    Args:
        eval_batch: raw batch, unpacked by ``paddedTensorsFromPairBatch``.
        model: object providing ``eval_mode_()`` and ``predict(src_seqs, src_lens)``.

    Returns:
        Tuple ``(top1_acc, top1_f1, topK_acc, topK_f1)`` as produced by
        ``get_topK_metrics``.
    """
    # The whole evaluation runs without gradient tracking.
    with torch.no_grad():
        src_seqs,tgt_seqs,src_lens,tgt_lens = paddedTensorsFromPairBatch(eval_batch)

        # Switch to evaluation mode before predicting (meant to disable dropout).
        model.eval_mode_()

        # predict() returns five values; only the top-K sequences and their
        # lengths are used for the metrics.
        topk_sequence,topk_length,_sequence,_length,_outputs = model.predict(src_seqs,src_lens)

        # Targets go from (max_tgt_len,batch_size) to (batch_size,max_tgt_len).
        batch_first_tgt = tgt_seqs.transpose(0,1)
        top1_acc,top1_f1,topK_acc,topK_f1 = get_topK_metrics(batch_first_tgt,topk_sequence,tgt_lens,topk_length)

    return top1_acc,top1_f1,topK_acc,topK_f1

In [None]:
def evaluate_full_dataset(val_dataloader,model):
    """Average the per-batch metrics of ``evaluate`` over a whole dataloader.

    Args:
        val_dataloader: iterator providing ``create_batches()``, ``batches``
            and ``len()``.
        model: model forwarded to ``evaluate`` for every batch.

    Returns:
        Tuple ``(top1_acc, top1_f1, topK_acc, topK_f1)``: each per-batch value
        summed over the batches and divided by ``len(val_dataloader)``.
    """
    val_dataloader.create_batches()
    nb_eval = len(val_dataloader)

    # Running sums in the order (top-1 acc, top-1 F1, top-K acc, top-K F1).
    running = [0, 0, 0, 0]
    for batch in tqdm(val_dataloader.batches):
        top1_acc,top1_f1,topK_acc,topK_f1 = evaluate(batch,model)
        batch_metrics = (top1_acc,top1_f1,topK_acc,topK_f1)
        running = [total + value for total, value in zip(running, batch_metrics)]

    # Every batch weighs the same in the average, whatever its size.
    avg_top1_acc,avg_top1_f1,avg_topK_acc,avg_topK_f1 = [total / nb_eval for total in running]
    return avg_top1_acc,avg_top1_f1,avg_topK_acc,avg_topK_f1

# Run full training

In [None]:
# Hyper-parameters of the RNN encoder-decoder run.
hidden_size = 128
embedding_dim = 128
learning_rate = 0.01
weight_decay = 1e-5

encoder = EncoderRNN(vocab_size=len(unified_vocab),embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
decoder = DecoderRNN(encoder=encoder,embedding_dim=embedding_dim,tie_embeddings=True).to(device)
# NOTE(review): attn_decoder is built here but is not part of the model trained below.
attn_decoder = DecoderRNNAttention(encoder=encoder,hidden_size=hidden_size,output_vocab_size=len(unified_vocab),embedding_dim=embedding_dim,dropout_prob=0).to(device)

model = Seq2SeqModelTopK(encoder,decoder)

# The decoder is built from the encoder with tie_embeddings=True, so both
# modules presumably expose some of the same parameter tensors. Concatenating
# the two parameter lists would then hand the same tensor to Adam twice, which
# PyTorch warns about. Keep only the first occurrence of each parameter.
trainable_params = []
seen_param_ids = set()
for module in (model.encoder, model.decoder):
    for param in module.parameters():
        if param.requires_grad and id(param) not in seen_param_ids:
            seen_param_ids.add(id(param))
            trainable_params.append(param)

optimizer = optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)


trainEpochs(model, train_dataloader,val_dataloader,optimizer, print_every_step=None,save_every_step=None,encoder_file="RNNencoder.pt",decoder_file="RNNdecoder.pt")

  super(Adam, self).__init__(params, defaults)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)




Training log:
- Epoch: 1/10
- Global step (number of training batch): 613
- Train loss: 3.4618142604050006
- Train Accuracy : 0.09279414763458402
- Train F1 Score : 0.33473934629721513
- Val Top-1 Accuracy: 0.11134244409824047
- Val Top-1 F1 Score: 0.3856574641024958
- Val Top-K Accuracy: 0.18192700696480937
- Val Top-K F1 Score: 0.5306549201419564


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 2/10
- Global step (number of training batch): 1226
- Train loss: 2.600475539196568
- Train Accuracy : 0.1453872688961392
- Train F1 Score : 0.43901466795724614
- Val Top-1 Accuracy: 0.13460525109970675
- Val Top-1 F1 Score: 0.43087604334828966
- Val Top-K Accuracy: 0.2141564332844575
- Val Top-K F1 Score: 0.5694660905401642


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 3/10
- Global step (number of training batch): 1839
- Train loss: 2.4331236830737812
- Train Accuracy : 0.16015837411636757
- Train F1 Score : 0.4629167145464437
- Val Top-1 Accuracy: 0.14836590450879766
- Val Top-1 F1 Score: 0.4424508543862211
- Val Top-K Accuracy: 0.22756197305718476
- Val Top-K F1 Score: 0.5760311329542667


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 4/10
- Global step (number of training batch): 2452
- Train loss: 2.347648586573453
- Train Accuracy : 0.17084692767808593
- Train F1 Score : 0.4801033594367567
- Val Top-1 Accuracy: 0.15342913764662758
- Val Top-1 F1 Score: 0.44805894908619287
- Val Top-K Accuracy: 0.23448955278592376
- Val Top-K F1 Score: 0.5830186268534292


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 5/10
- Global step (number of training batch): 3065
- Train loss: 2.292299182535579
- Train Accuracy : 0.17901203099510604
- Train F1 Score : 0.4883960689718715
- Val Top-1 Accuracy: 0.15751008064516128
- Val Top-1 F1 Score: 0.4548534097911896
- Val Top-K Accuracy: 0.23759106946480937
- Val Top-K F1 Score: 0.5887141397779211


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 6/10
- Global step (number of training batch): 3678
- Train loss: 2.2537950285107238
- Train Accuracy : 0.1804309407286569
- Train F1 Score : 0.4936457374587325
- Val Top-1 Accuracy: 0.1557345124633431
- Val Top-1 F1 Score: 0.4629511405418743
- Val Top-K Accuracy: 0.23892560942082114
- Val Top-K F1 Score: 0.5929424627479999


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 7/10
- Global step (number of training batch): 4291
- Train loss: 2.221321210192234
- Train Accuracy : 0.18540137302882
- Train F1 Score : 0.4978420649250689
- Val Top-1 Accuracy: 0.16292842741935484
- Val Top-1 F1 Score: 0.4669479672068935
- Val Top-K Accuracy: 0.24701017228739006
- Val Top-K F1 Score: 0.6000204679769945


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 8/10
- Global step (number of training batch): 4904
- Train loss: 2.199977583636273
- Train Accuracy : 0.18868525693311583
- Train F1 Score : 0.502475947661999
- Val Top-1 Accuracy: 0.1616855296920821
- Val Top-1 F1 Score: 0.46213939670842
- Val Top-K Accuracy: 0.24398884255865105
- Val Top-K F1 Score: 0.5920265306996099


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training log:
- Epoch: 9/10
- Global step (number of training batch): 5517
- Train loss: 2.190959471279412
- Train Accuracy : 0.18962836460032625
- Train F1 Score : 0.5071613851810551
- Val Top-1 Accuracy: 0.16239575696480937
- Val Top-1 F1 Score: 0.4604734123442624
- Val Top-K Accuracy: 0.24665219483137832
- Val Top-K F1 Score: 0.5953665793267887


Save checkpoint to "/content/drive/MyDrive/Sourcery/".



# Load model

In [None]:
hidden_size = 128
embedding_dim = 128
learning_rate = 0.01
weight_decay = 1e-5

# Rebuild the modules with the same constructor arguments as the training
# cell; the state dicts can only be loaded into an identical architecture.
encoder = EncoderRNN(vocab_size=len(unified_vocab),embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
decoder = DecoderRNN(encoder=encoder,embedding_dim=embedding_dim,tie_embeddings=True).to(device)

# These are the file names the training run writes its checkpoints to.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
encoder.load_state_dict(torch.load("/content/drive/MyDrive/Sourcery/RNNencoder.pt",map_location=device))
decoder.load_state_dict(torch.load("/content/drive/MyDrive/Sourcery/RNNdecoder.pt",map_location=device))

model = Seq2SeqModelTopK(encoder,decoder)

TypeError: ignored

In [None]:
#Single evaluation comparison
# Print the decoded reference name next to the decoded top-1 prediction for
# every example of the first validation batch.
val_dataloader.create_batches()
model = Seq2SeqModelTopK(encoder,decoder)
comparison_idx2word = {idx:w for w,idx in unified_vocab.items()}
for batch in val_dataloader.batches:
    cmp_src_seqs,cmp_tgt_seqs,cmp_src_lens,cmp_tgt_lens = paddedTensorsFromPairBatch(batch)
    # Same setup as evaluate(): evaluation mode, no gradient tracking.
    with torch.no_grad():
        model.eval_mode_()
        cmp_topk_sequence,cmp_topk_length,_,_,_ = model.predict(cmp_src_seqs,cmp_src_lens)

    #Transpose targets from (max_tgt_len,batch_size) => (batch_size,max_tgt_len)
    tgt_array = cmp_tgt_seqs.transpose(0,1).cpu().data.numpy()
    pred_array = cmp_topk_sequence.cpu().data.numpy()
    pred_lens = cmp_topk_length.cpu().data.numpy()

    for i in range(tgt_array.shape[0]):
        # Same slicing as get_topK_metrics: drop the first token and the last
        # token counted by the length; candidate 0 is the top-1 prediction.
        tgt_tokens = tgt_array[i,1:cmp_tgt_lens[i].item()-1].tolist()
        pred_tokens = pred_array[i,0,1:pred_lens[i,0]-1].tolist()
        decoded_tgt_seq = [comparison_idx2word[token] for token in tgt_tokens]
        decoded_pred_seq = [comparison_idx2word[token] for token in pred_tokens]
        print(decoded_tgt_seq)
        print(decoded_pred_seq)
    # Only the first batch is inspected.
    break


# Make inferences for the prediction tool

In [None]:
def decode_seq(seq,idx2word_dict):
    """Turn a sequence of token ids into a single ``_``-joined string.

    Args:
        seq: iterable of tokens, compared against the module-level
            ``SOS_TOKEN`` / ``EOS_TOKEN`` and used as keys of ``idx2word_dict``.
        idx2word_dict: mapping from token to its word (string).

    Returns:
        The words of every token located before the first ``EOS_TOKEN``,
        skipping ``SOS_TOKEN`` entries, joined with ``"_"``.
    """
    tokens = list(seq)
    # Everything from the first end-of-sequence marker onwards is discarded.
    if EOS_TOKEN in tokens:
        tokens = tokens[:tokens.index(EOS_TOKEN)]
    return "_".join(idx2word_dict[token] for token in tokens if token != SOS_TOKEN)

hidden_size = 128
embedding_dim = 128
learning_rate = 0.01
weight_decay = 1e-5

# Inverse of unified_vocab (word -> index): maps predicted indices back to words.
idx2word_dict = {idx:w for w,idx in unified_vocab.items()}

encoder = EncoderRNN(vocab_size=len(unified_vocab),embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)
decoder = DecoderRNN(encoder=encoder,embedding_dim=embedding_dim,tie_embeddings=True).to(device)

# map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
encoder.load_state_dict(torch.load("/content/drive/MyDrive/Sourcery/RNNencoder.pt",map_location=device))
decoder.load_state_dict(torch.load("/content/drive/MyDrive/Sourcery/RNNdecoder.pt",map_location=device))

model = Seq2SeqModelTopK(encoder,decoder)

#Single evaluation comparison
val_dataloader.create_batches()
batch = next(iter(val_dataloader.batches))

src_seqs,tgt_seqs,src_lens,tgt_lens = paddedTensorsFromPairBatch(batch)
# Mirror evaluate(): a freshly built model has not been switched to
# evaluation mode yet, and no gradients are needed for inference.
with torch.no_grad():
    model.eval_mode_()
    topk_sequence,topk_length,sequence,length,outputs = model.predict(src_seqs,src_lens)
    #Turn outputs into probability distribution
    probability_distribution = torch.nn.functional.softmax(outputs,dim=3)

#Transpose tgt_seqs from (max_tgt_len,batch_size) => (batch_size,max_tgt_len)
tgt_seqs = tgt_seqs.transpose(0,1)

In [None]:
# Build the JSON payload: the decoded label of one example together with all
# of its top-K predictions.
batch_size = topk_sequence.size(0)
num_sequences_per_prediction = topk_sequence.size(1)

def _to_list(tensor):
    """Convert a tensor to (nested) Python lists so it can be JSON-serialised."""
    return tensor.cpu().data.numpy().tolist()

def _prediction_entry(example_idx, rank):
    """Describe the candidate at position ``rank`` for example ``example_idx``."""
    encoded_seq = _to_list(topk_sequence[example_idx][rank])
    return {
        "prediction": decode_seq(encoded_seq,idx2word_dict),
        "encoded_prediction": encoded_seq,
        "softmax_output": _to_list(probability_distribution[example_idx][rank]),
    }

json_data = []
# Only the first example of the batch is exported.
for i in range(batch_size)[:1]:
    json_data.append({
        "label": decode_seq(_to_list(tgt_seqs[i]),idx2word_dict),
        "predictions": [_prediction_entry(i, k) for k in range(num_sequences_per_prediction)],
    })

In [None]:
# Show the reference name, then each decoded prediction stored for it.
idx = 0
selected_example = json_data[idx]
print(selected_example["label"])
for pred in selected_example["predictions"]:
    print(pred["prediction"])

test_ttf_instance
test_compute_instance
test_quiesce_instance
test_suspend
test_finish_instance
test_suspend_instance


In [None]:
import json

# Write the exported example(s) to disk as JSON.
with open("single_pred2.json","w") as json_file:
    json.dump(json_data,json_file)