In [1]:
# !pip install rouge

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
print(torchtext.__version__) # 0.3.1
from torchtext import data
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time
import pickle
import json
import re

from rouge import Rouge
from inltk.inltk import tokenize

0.6.0


In [2]:
# ! git clone https://github.com/akashe/Python-Code-Generation

In [3]:
import sys
sys.path.append("./Code/")

In [4]:
from data_processing import getTokenizer,getData

In [5]:
# !wget -c "http://www.phontron.com/download/conala-corpus-v1.1.zip"
# from inltk.inltk import setup
# setup('hi')

In [6]:
# !head -20 conala-corpus/conala-test.json
# with open('./conala-corpus/conala-test.json') as f:
#     for _ in range(20): # first 10 lines
#         print(f.readline())

In [7]:
questions, answers = [],[]

In [8]:
# # loading train json
# f = open("./conala-corpus/conala-train.json","r")
# train_file = json.load(f)
# f.close()

In [9]:
# for num,i in enumerate(train_file):
#     if i['intent'] is not None:
#         questions.append(i['intent'])
#         answers.append(i['snippet'])

In [10]:
# # loading test json
# f = open("./conala-corpus/conala-test.json","r")
# test_file = json.load(f)
# f.close()

In [11]:
# for num,i in enumerate(test_file):
#     if i['intent'] is not None:
#         questions.append(i['intent'])
#         answers.append(i['snippet'])

In [12]:
# questions[50:70],answers[50:70]

In [8]:
# questions_, answers_ = getData("./Python-Code-Generation/data/english_python_data_pruned.txt")
questions_, answers_ = getData('./Data/Data.txt')

In [9]:
# Setting max word len
max_word_len = 301

In [10]:
# removing examples with len more than max_word_len
# Only checking in original data because I checked in CONALA data and answers their dont exceed 301.
pruned_questions = []
pruned_answers = []
for j,i in zip(questions_,answers_):
    tokens = getTokenizer(i)
    if not len(tokens) > max_word_len:
        pruned_answers.append(i)
        pruned_questions.append(j) 

print(len(pruned_answers))
answers_ = pruned_answers
questions_ = pruned_questions

Error in tokenization
Error in tokenization
Error in tokenization
Error in tokenization
Error in tokenization
1058


In [11]:
questions = questions + questions_
answers = answers + answers_

In [12]:
questions[100],answers[100]

('संख्या पूरी तरह से विभाजित होने पर तोड़ने के लिए एक पायथन प्रोग्राम लिखें',
 'i = 1\nwhile True:\n\tif i%3 == 0:\n\t\tbreak\n\tprint(i)\n\ti+= 1\n')

In [13]:
print(len(questions),len(answers))

1058 1058


In [14]:
SEED = 1327

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [14]:
# spacy_en = spacy.load('en_core_web_sm')
# tok_hin = tokenize()

In [15]:
def tokenize_hi(text):
    """
    Tokenizes Hindi text from a string into a list of strings
    """
    return [tok for tok in tokenize(text, 'hi')]

In [16]:
SRC = Field(tokenize = tokenize_hi, 
            init_token = '', 
            eos_token = '', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = getTokenizer, 
            init_token = '', 
            eos_token = '', 
            lower = False, 
            batch_first = True)

In [17]:
fields = [('src', SRC), ('trg', TRG)]

Examples = [data.Example.fromlist([i,j], fields) for i,j in zip(questions,answers)]
Dataset = data.Dataset(Examples, fields)

Error in tokenization
Error in tokenization
Error in tokenization
Error in tokenization
Error in tokenization


In [18]:
train_data,valid_data = Dataset.split(split_ratio=[0.85,0.15])

In [19]:
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)

In [20]:
print(len(SRC.vocab))
print(len(TRG.vocab))

1491
1730


In [21]:
# Dumps dicts
with open("./SRC_stio_local","wb") as f:
  pickle.dump(SRC.vocab.stoi,f)
with open("./TRG_itos_local","wb") as f:
  pickle.dump(TRG.vocab.itos,f)

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [23]:
BATCH_SIZE = 32

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
     batch_size = BATCH_SIZE,
     sort_key = lambda x: len(x.trg),
     device = device)

In [24]:
class PositionalEncodingComponent(nn.Module):
    '''
    Class to encode positional information to tokens.
    

    '''
    def __init__(self,hid_dim,device,dropout=0.2,max_len=5000):
        super().__init__()

        assert hid_dim%2==0 # If not, it will result error in allocation to positional_encodings[:,1::2] later

        self.dropout = nn.Dropout(dropout)

        self.positional_encodings = torch.zeros(max_len,hid_dim)

        pos = torch.arange(0,max_len).unsqueeze(1) # pos : [max_len,1]
        div_term  = torch.exp(-torch.arange(0,hid_dim,2)*math.log(10000.0)/hid_dim) # Calculating value of 1/(10000^(2i/hid_dim)) in log space and then exponentiating it
        # div_term: [hid_dim//2]

        self.positional_encodings[:,0::2] = torch.sin(pos*div_term) # pos*div_term [max_len,hid_dim//2]
        self.positional_encodings[:,1::2] = torch.cos(pos*div_term) 

        self.positional_encodings = self.positional_encodings.unsqueeze(0) # To account for batch_size in inputs

        self.device = device

    def forward(self,x):
        x = x + self.positional_encodings[:,:x.size(1)].detach().to(self.device)
        return self.dropout(x)

In [25]:
class FeedForwardComponent(nn.Module):
    '''
    Class for pointwise feed forward connections
    '''
    def __init__(self,hid_dim,pf_dim,dropout):
        super().__init__()

        self.dropout = nn.Dropout(dropout)

        self.fc1 = nn.Linear(hid_dim,pf_dim)
        self.fc2 = nn.Linear(pf_dim,hid_dim)

    def forward(self,x):

        # x : [batch_size,seq_len,hid_dim]
        x = self.dropout(torch.relu(self.fc1(x)))

        # x : [batch_size,seq_len,pf_dim]
        x = self.fc2(x)

        # x : [batch_size,seq_len,hid_dim]
        return x

In [26]:
class MultiHeadedAttentionComponent(nn.Module):
    '''
    Multiheaded attention Component. This implementation also supports mask. 
    The reason for mask that in Decoder, we don't want attention mechanism to get
    important information from future tokens.
    '''
    def __init__(self,hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0 # Since we split hid_dims into n_heads

        self.hid_dim = hid_dim
        self.n_heads = n_heads # no of heads in 'multiheaded' attention
        self.head_dim = hid_dim//n_heads # dims of each head

        # Transformation from source vector to query vector
        self.fc_q = nn.Linear(hid_dim,hid_dim)

        # Transformation from source vector to key vector
        self.fc_k = nn.Linear(hid_dim,hid_dim)

        # Transformation from source vector to value vector
        self.fc_v = nn.Linear(hid_dim,hid_dim)

        self.fc_o = nn.Linear(hid_dim,hid_dim)

        self.dropout = nn.Dropout(dropout)

        # Used in self attention for smoother gradients
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self,query,key,value,mask=None):

        #query : [batch_size, query_len, hid_dim]
        #key : [batch_size, key_len, hid_dim]
        #value : [batch_size, value_len, hid_dim]

        batch_size = query.shape[0]

        # Transforming quey,key,values
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        #Q : [batch_size, query_len, hid_dim]
        #K : [batch_size, key_len, hid_dim]
        #V : [batch_size, value_len,hid_dim]

        # Changing shapes to acocmadate n_heads information
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        #Q : [batch_size, n_heads, query_len, head_dim]
        #K : [batch_size, n_heads, key_len, head_dim]
        #V : [batch_size, n_heads, value_len, head_dim]

        # Calculating alpha
        score = torch.matmul(Q,K.permute(0,1,3,2))/self.scale
        # score : [batch_size, n_heads, query_len, key_len]

        if mask is not None:
            score = score.masked_fill(mask==0,-1e10)

        alpha = torch.softmax(score,dim=-1)
        # alpha : [batch_size, n_heads, query_len, key_len]

        # Get the final self-attention  vector
        x = torch.matmul(self.dropout(alpha),V)
        # x : [batch_size, n_heads, query_len, head_dim]

        # Reshaping self attention vector to concatenate
        x = x.permute(0,2,1,3).contiguous()
        # x : [batch_size, query_len, n_heads, head_dim]

        x = x.view(batch_size,-1,self.hid_dim)
        # x: [batch_size, query_len, hid_dim]

        # Transforming concatenated outputs 
        x = self.fc_o(x)
        #x : [batch_size, query_len, hid_dim] 

        return x, alpha

In [27]:
class EncoderLayer(nn.Module):  
    '''
    Operations of a single layer in an Encoder. An Encoder employs multiple such layers. Each layer contains:
    1) multihead attention, folllowed by
    2) LayerNorm of addition of multihead attention output and input to the layer, followed by
    3) FeedForward connections, followed by
    4) LayerNorm of addition of FeedForward outputs and output of previous layerNorm.
    '''
    def __init__(self, hid_dim,n_heads,pf_dim,dropout,device):
        super().__init__()
        
        self.self_attn_layer_norm = nn. LayerNorm(hid_dim) #Layer norm after self-attention
        self.ff_layer_norm = nn.LayerNorm(hid_dim) # Layer norm after FeedForward component

        self.self_attention = MultiHeadedAttentionComponent(hid_dim,n_heads,dropout,device)
        self.feed_forward = FeedForwardComponent(hid_dim,pf_dim,dropout)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self,src,src_mask):
        
        # src : [batch_size, src_len, hid_dim]
        # src_mask : [batch_size, 1, 1, src_len]

        # get self-attention
        _src, _ = self.self_attention(src,src,src,src_mask)

        # LayerNorm after dropout
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        # src : [batch_size, src_len, hid_dim]

        # FeedForward
        _src = self.feed_forward(src)

        # layerNorm after dropout
        src = self.ff_layer_norm(src + self.dropout(_src))
        # src: [batch_size, src_len, hid_dim]

        return src


In [28]:
class DecoderLayer(nn.Module):
    '''
    Operations of a single layer in an Decoder. An Decoder employs multiple such layers. Each layer contains:
    1) masked decoder self attention, followed by
    2) LayerNorm of addition of previous attention output and input to the layer,, followed by
    3) encoder self attention, followed by
    4) LayerNorm of addition of result of encoder self attention and its input, followed by
    5) FeedForward connections, followed by
    6) LayerNorm of addition of Feedforward results and its input.
    '''
    def __init__(self,hid_dim,n_heads,pf_dim,dropout,device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)

        # decoder self attention
        self.self_attention = MultiHeadedAttentionComponent(hid_dim,n_heads,dropout,device)

        # encoder attention
        self.encoder_attention = MultiHeadedAttentionComponent(hid_dim,n_heads,dropout,device)

        # FeedForward
        self.feed_forward = FeedForwardComponent(hid_dim,pf_dim,dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self,trg, enc_src,trg_mask,src_mask):

        #trg : [batch_size, trg_len, hid_dim]
        #enc_src : [batch_size, src_len, hid_dim]
        #trg_mask : [batch_size, 1, trg_len, trg_len]
        #src_mask : [batch_size, 1, 1, src_len]

        '''
        Decoder self-attention
        trg_mask is to force decoder to look only into past tokens and not get information from future tokens.
        Since we apply mask before doing softmax, the final self attention vector gets no information from future tokens.
        '''
        _trg, _ = self.self_attention(trg,trg,trg,trg_mask)

        # LayerNorm and dropout with resdiual connection
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        # trg : [batch_size, trg_len, hid_dim]

        '''
        Encoder attention:
        Query: trg
        key: enc_src
        Value : enc_src
        Why? 
        the idea here is to extract information from encoder outputs. So we use decoder self-attention as a query to find important values from enc_src
        and that is why we use src_mask, to avoid getting information from enc_src positions where it is equal to pad-id
        After we get necessary infromation from encoder outputs we add them back to decoder self-attention.
        '''
        _trg, encoder_attn_alpha = self.encoder_attention(trg,enc_src,enc_src,src_mask)

            # LayerNorm , residual connection and dropout
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        # trg : [ batch_size, trg_len, hid_dim]

        # Feed Forward
        _trg = self.feed_forward(trg)

        # LayerNorm, residual connection and dropout
        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        return trg, encoder_attn_alpha

In [29]:
class Encoder(nn.Module):
    '''
    An encoder, creates token embeddings and position embeddings and passes them through multiple encoder layers
    '''
    def __init__(self,input_dim,hid_dim,n_layers,n_heads,pf_dim,dropout,device,max_length = 5000):
        super().__init__()
        self.device = device

        self.tok_embedding = nn.Embedding(input_dim,hid_dim)
        self.pos_embedding = PositionalEncodingComponent(hid_dim,device,dropout,max_length)

        # encoder layers
        self.layers = nn.ModuleList([EncoderLayer(hid_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self,src,src_mask):

        # src : [batch_size, src_len]
        # src_mask : [batch_size,1,1,src_len]

        batch_size = src.shape[0]
        src_len = src.shape[1]

        tok_embeddings = self.tok_embedding(src)*self.scale

        # token plus position embeddings
        src  = self.pos_embedding(tok_embeddings)

        for layer in self.layers:
            src = layer(src,src_mask)
        # src : [batch_size, src_len, hid_dim]

        return src

In [30]:
class Decoder(nn.Module):
    '''
    An decoder, creates token embeddings and position embeddings and passes them through multiple decoder layers
    '''
    def __init__(self,output_dim,hid_dim,n_layers,n_heads,pf_dim,dropout,device,max_length= 5000):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(output_dim,hid_dim)
        self.pos_embedding = PositionalEncodingComponent(hid_dim,device,dropout,max_length)

        # decoder layers
        self.layers = nn.ModuleList([DecoderLayer(hid_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])

        # convert decoder outputs to real outputs
        self.fc_out = nn.Linear(hid_dim,output_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src,trg_mask,src_mask):
        
        #trg : [batch_size, trg_len]
        #enc_src : [batch_size, src_len, hid_dim]
        #trg_mask : [batch_size, 1, trg_len, trg_len]
        #src_mask : [batch_size, 1, 1, src_len]

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        tok_embeddings = self.tok_embedding(trg)*self.scale

        # token plus pos embeddings
        trg = self.pos_embedding(tok_embeddings)
        # trg : [batch_size, trg_len, hid_dim]

        # Pass trg thorugh decoder layers
        for layer in self.layers:
            trg, encoder_attention = layer(trg,enc_src,trg_mask,src_mask)
        
        # trg : [batch_size,trg_len,hid_dim]
        # encoder_attention :  [batch_size, n_head,trg_len, src_len]

        # Convert to outputs
        output = self.fc_out(trg)
        # output : [batch_size, trg_len, output_dim]
        
        return output, encoder_attention


In [31]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self,src):
        # src : [batch_size, src_len]

        # Masking pad values
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # src_mask : [batch_size,1,1,src_len]

        return src_mask

    def make_trg_mask(self,trg):
        # trg : [batch_size, trg_len]

        # Masking pad values
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        # trg_pad_mask : [batch_size,1,1, trg_len]

        # Masking future values
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len,trg_len),device= self.device)).bool()
        # trg_sub_mask : [trg_len, trg_len]

        # combine both masks
        trg_mask = trg_pad_mask & trg_sub_mask
        # trg_mask = [batch_size,1,trg_len,trg_len]

        return trg_mask

    def forward(self,src,trg):

        # src : [batch_size, src_len]
        # trg : [batch_size, trg_len]

        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        # src_mask : [ batch_size, 1,1,src_len]
        # trg_mask : [batch_size, 1, trg_len, trg_len]

        enc_src = self.encoder(src,src_mask)
        #enc_src : [batch_size, src_len, hid_dim]

        output, encoder_decoder_attention = self.decoder(trg,enc_src,trg_mask,src_mask)
        # output : [batch_size, trg_len, output_dim]
        # encoder_decoder_attention : [batch_size, n_heads, trg_len, src_len]

        return output, encoder_decoder_attention

In [32]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [33]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(1491, 256)
    (pos_embedding): PositionalEncodingComponent(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadedAttentionComponent(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForwardComponent(
          (dropout): Dropout(p=0.1, inplace=False)
          (fc1): Linear(in_features=256, out_features=512, bias=True)
          (fc2): Linear(in_features=512

In [34]:
model.decoder.tok_embedding

Embedding(1730, 256)

In [35]:
# load python embedding weights
pretrained_embeddings = torch.load('./Data/python_embedding_weigts.pt')
# pretrained_embeddings = torch.load('./best_train_loss.pt')
pretrained_embeddings

Embedding(15018, 256)

In [44]:
# Loading pretrained embeddings

with open("./Data/TRG_stio","rb") as f:
    stoi_weights = pickle.load(f)

    with torch.no_grad():
        indexes = []
        for i,j in enumerate(TRG.vocab.stoi):
            if j in stoi_weights:
                model.decoder.tok_embedding.weight[TRG.vocab.stoi[j]] = pretrained_embeddings.weight[stoi_weights[j]]
            else:
                model.decoder.tok_embedding.weight[TRG.vocab.stoi[j]] = pretrained_embeddings.weight[stoi_weights['']]

In [45]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,222,850 trainable parameters


In [46]:
LEARNING_RATE = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [47]:
def get_rouge_score(output,trg,vocab=TRG.vocab.itos):
    rouge = Rouge()
    rouge_score = 0
    argmax_outputs = output.argmax(2)
    assert argmax_outputs.shape == trg.shape
    with torch.no_grad():
        for i,row in enumerate(argmax_outputs):
            output_sentence = []
            trg_sentence = []
            for j,column in enumerate(row):
                output_sentence.append(vocab[argmax_outputs[i][j]])
                trg_sentence.append(vocab[trg[i][j]])
            output_sentence = "".join(output_sentence)
            trg_sentence = "".join(trg_sentence)

            rouge_score += rouge.get_scores(output_sentence, trg_sentence)[0]["rouge-l"]["f"]

    return rouge_score/len(output)

In [48]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [49]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0

    epoch_rouge = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]

            rouge_score = get_rouge_score(output, trg[:,1:])

            epoch_rouge += rouge_score
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()

        
    return epoch_loss / len(iterator) , epoch_rouge / len(iterator)

In [50]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [51]:
N_EPOCHS = 300
CLIP = 1

best_valid_loss = float('inf')
best_train_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss, rouge_score = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'conala_plus_original_data.pt')
    
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'best_train_loss.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Val. Rouge: {rouge_score:7.3f}')

Epoch: 01 | Time: 0m 7s
	Train Loss: 5.462 | Train PPL: 235.506
	 Val. Loss: 4.870 |  Val. PPL: 130.303 | Val. Rouge:   0.000
Epoch: 02 | Time: 0m 6s
	Train Loss: 4.693 | Train PPL: 109.136
	 Val. Loss: 4.150 |  Val. PPL:  63.433 | Val. Rouge:   0.001
Epoch: 03 | Time: 0m 6s
	Train Loss: 4.123 | Train PPL:  61.756
	 Val. Loss: 3.729 |  Val. PPL:  41.643 | Val. Rouge:   0.001
Epoch: 04 | Time: 0m 6s
	Train Loss: 3.764 | Train PPL:  43.115
	 Val. Loss: 3.492 |  Val. PPL:  32.847 | Val. Rouge:   0.002
Epoch: 05 | Time: 0m 6s
	Train Loss: 3.568 | Train PPL:  35.446
	 Val. Loss: 3.317 |  Val. PPL:  27.568 | Val. Rouge:   0.003
Epoch: 06 | Time: 0m 6s
	Train Loss: 3.394 | Train PPL:  29.788
	 Val. Loss: 3.176 |  Val. PPL:  23.941 | Val. Rouge:   0.003
Epoch: 07 | Time: 0m 6s
	Train Loss: 3.271 | Train PPL:  26.329
	 Val. Loss: 3.060 |  Val. PPL:  21.333 | Val. Rouge:   0.005
Epoch: 08 | Time: 0m 6s
	Train Loss: 3.142 | Train PPL:  23.141
	 Val. Loss: 2.986 |  Val. PPL:  19.800 | Val. Rouge: 

In [52]:
import pickle

with open("SRC_stio_local","rb") as f:
  stoi = pickle.load(f)
with open("TRG_itos_local","rb") as f:
  itos = pickle.load(f)

# Load model
# trained_model = 'conala_plus_original_data.pt'
trained_model = './best_train_loss.pt'
model.load_state_dict(torch.load(trained_model));
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(1491, 256)
    (pos_embedding): PositionalEncodingComponent(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadedAttentionComponent(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForwardComponent(
          (dropout): Dropout(p=0.1, inplace=False)
          (fc1): Linear(in_features=256, out_features=512, bias=True)
          (fc2): Linear(in_features=512

In [53]:
# import spacy
# spacy_en = spacy.load('en_core_web_sm')

def encode_inputs(input,vocab):
  tokenized_input_ = [tok for tok in tokenize(input, 'hi')]
  tokenized_input = [''] + tokenized_input_ +['']

  numericalized_input = [vocab[i] for i in tokenized_input]

  tensor_input = torch.LongTensor([numericalized_input])
  
  return tensor_input,tokenized_input_

def decode_outputs(output,vocab):
  # output: [1,1,hid_dim]
  predicted_token = output.argmax(-1)
  return vocab[predicted_token.item()], predicted_token

In [54]:
def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['']+[t.lower() for t in sentence]+[''], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [55]:
def variables_names_in_print(matchobj):
  statement = matchobj.group(1)
  statement = statement.replace(" ","")
  return "{"+statement+"}"

def print_decoder_output(decoder_outputs):
  decoder_outputs = [i for i in decoder_outputs if i is not ''] # removing redundant empty token created by tokenizer while identation during tokenization
  combined_output = " ".join(decoder_outputs)
  pruned_output = re.sub(r'\n |\n  |\n   ',r'\n',combined_output) # removing empty lines
  pruned_output = re.sub(r'{(.*?)}',variables_names_in_print,pruned_output) # setting printing variable names inside print(f'{}') statements
  print(pruned_output)

In [58]:
print(" Enter q or quit to exit.")

answer_max_len = 500

while(True):

  input_ = input("Enter text:")

  if input_=='q' or input_=='quit':
    break

  src,tokenized_input_ = encode_inputs(input_,stoi)
  src = src.to(device)
  # src_mask = torch.ones([1,1,1,src.shape[-1]]).to(device)
  src_mask = model.make_src_mask(src)

  with torch.no_grad():
    enc_src = model.encoder(src,src_mask)
  
  trg = ''
  trg_indexes = [stoi[trg]]
  # trg_mask = torch.ones([1,1,1,1]).to(device)

  decoder_outputs = []
  for i in range(answer_max_len):
    trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
    trg_mask = model.make_trg_mask(trg_tensor)
    
    with torch.no_grad():
      decoder_output,encoder_decoder_attention = model.decoder(trg_tensor,enc_src,trg_mask,src_mask)

    pred_token = decoder_output.argmax(2)[:,-1].item()

    if pred_token == TRG.vocab.stoi[TRG.eos_token]:
      break
    decoder_outputs.append(itos[pred_token])
    trg_indexes.append(pred_token)


  print_decoder_output(decoder_outputs)
  # display_attention(tokenized_input_,decoder_outputs,encoder_decoder_attention,DEC_HEADS)

 Enter q or quit to exit.
def bmi ( height : " M e t e r s " , weight : " K g s " ) : 
	 bmi = weight / ( height ** 2 ) 
	 print ( " Y o u r   B M I   i s :   {0}   a n d   y o u   a r e   " . format ( bmi ) , end = ' ' ' ) 
	 if ( bmi < 16 ) : 
		 print ( " s e v e r e l y   u n d e r w e i g h t . " ) 
	 elif ( bmi >= 16 and bmi < 18.5 ) : 
		 print ( " u n d e r w e i g h t . " ) 
	 elif ( bmi >= 18.5 and bmi < 25 ) : 
		 print ( " h e a l t h y . " ) 
	 elif ( bmi < 30 ) : 
		 print ( " o v e r w e i g h t . " ) 
	 elif ( bmi >= 25 and bmi >= 30 ) : 
		 print ( " s e r e r e l y   o v e r w e i g h t . " )


In [56]:
# check_question = "Write a Python function to remove leading zeros from an IP address"
# for i,j in enumerate(questions):
#   if j == check_question:
#     print(i)
questions[54]

'एक फ़ंक्शन लिखें जो height (m) और weight (kg) लेता है, BMI की गणना करता है और टिप्पणियों को प्रिंट करता है'

In [57]:
answers[54]

'def bmi(height: "Meters", weight: "Kgs"):\n\tbmi = weight/(height**2)\n\tprint("Your BMI is: {0} and you are ".format(bmi), end=\'\')\n\tif ( bmi < 16):\n\t\tprint("severely underweight.")\n\telif ( bmi >= 16 and bmi < 18.5):\n\t\tprint("underweight.")\n\telif ( bmi >= 18.5 and bmi < 25):\n\t\tprint("healthy.")\n\telif ( bmi >= 25 and bmi < 30):\n\t\tprint("overweight.")\n\telif ( bmi >=30):\n\t\tprint("severely overweight.")\n'