In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F


In [2]:
# Integer ids of the special tokens, going by their names: padding,
# start-of-sequence and end-of-sequence.
PAD_token, SOS_token, EOS_token = 0, 1, 2

# Mini-batch size.
BATCH_SIZE = 50

In [3]:


class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded token ids.

    Args:
        hidden_size: Width of both the embedding vectors and the GRU state.
        vocab_size: Number of rows in the embedding table.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout applied between GRU layers. Forced to 0 when
            ``n_layers == 1`` because ``nn.GRU`` only applies dropout between
            layers and warns otherwise.
    """

    def __init__(self, hidden_size, vocab_size, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            bidirectional=True,
        )

    def forward(self, input_seq, input_length, hidden=None):
        """Encode a padded, time-major batch of token ids.

        Args:
            input_seq: LongTensor of token ids, shape ``(max_len, batch)``
                (time-major, since ``batch_first`` is left at its default).
            input_length: Valid length of every sequence in the batch, as a
                list or CPU tensor. Any order is accepted.
            hidden: Optional initial GRU state of shape
                ``(2 * n_layers, batch, hidden_size)``.

        Returns:
            Tuple ``(outputs, hidden)``. ``outputs`` has shape
            ``(max(input_length), batch, hidden_size)`` and is the sum of the
            forward and backward GRU outputs (zero at padded positions).
            ``hidden`` is the final GRU state, shape
            ``(2 * n_layers, batch, hidden_size)``.
        """
        embedded = self.embedding(input_seq)
        # Packing lets the GRU skip padded time steps. The original code passed
        # the undefined name `input_lengths` here; use the real parameter.
        # enforce_sorted=False removes the requirement that the batch is sorted
        # by decreasing length.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, input_length, enforce_sorted=False
        )
        outputs, hidden = self.gru(packed, hidden)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # The bidirectional GRU concatenates both directions on the feature
        # axis; fold them back to `hidden_size` by summing the two halves.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        return outputs, hidden

    

In [4]:
class Attn(nn.Module):
    """Additive attention over encoder time steps.

    The score of time step ``t`` is ``v(tanh(w1(enc_t) + w2(dec)))`` and the
    scores are normalised with a softmax over the time axis.

    Args:
        hidden_size: Feature width of both the encoder outputs and the decoder
            state.
        sequence_length: Kept for backward compatibility and stored on the
            module. The projections act on the feature axis only, so the
            module works for any number of time steps.
    """

    def __init__(self, hidden_size, sequence_length):
        super(Attn, self).__init__()
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        # The original `w1` expected `sequence_length * hidden_size` input
        # features, which never matches an encoder output whose last dimension
        # is `hidden_size`; project each time step independently instead.
        self.w1 = nn.Linear(hidden_size, hidden_size)
        self.w2 = nn.Linear(hidden_size, hidden_size)
        # Collapses the combined features to one scalar score per time step.
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def dot_score(self, hidden, encoder_output):
        """Return the dot product of `hidden` and `encoder_output` over dim 2."""
        return torch.sum(hidden * encoder_output, dim=2)

    def forward(self, encoder_output, decoder_state):
        """Compute attention weights for every encoder time step.

        Args:
            encoder_output: Tensor of shape ``(seq_len, batch, hidden_size)``.
            decoder_state: Either ``(batch, hidden_size)`` or a stacked RNN
                state ``(layers, batch, hidden_size)``; for the stacked form
                only the last entry is used as the query.

        Returns:
            Tensor of shape ``(seq_len, batch, 1)`` whose values sum to 1 along
            dim 0. Padded time steps are not masked.
        """
        if decoder_state.dim() == 3:
            # A stacked state cannot broadcast against `seq_len`; use the last
            # layer/direction as the query vector.
            decoder_state = decoder_state[-1]
        query = self.w2(decoder_state).unsqueeze(0)  # (1, batch, hidden)
        energy = torch.tanh(self.w1(encoder_output) + query)  # F.tanh is deprecated
        scores = self.v(energy)  # (seq_len, batch, 1)
        # Normalise explicitly over the time axis instead of relying on the
        # deprecated implicit softmax dimension.
        attention_weights = F.softmax(scores, dim=0)
        return attention_weights

    




In [5]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder fed with the previous token and a context vector.

    Args:
        hidden_size: Width of the embedding, the context vector and the GRU
            state.
        output_size: Number of output classes produced per step.
        vocab_size: Number of rows in the embedding table.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout applied to the embedding and, when ``n_layers > 1``,
            between GRU layers.
    """

    def __init__(self, hidden_size, output_size, vocab_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout   # vocab_size == output_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        # The GRU input is the token embedding concatenated with the context
        # vector, hence 2 * hidden_size input features.
        self.gru = nn.GRU(
            2 * hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
        )

        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, context, last_hidden):
        """Run one decoding step.

        Args:
            input_step: LongTensor of token ids, shape ``(batch,)`` or
                ``(1, batch)``.
            context: Context vector, shape ``(batch, hidden_size)`` or
                ``(1, batch, hidden_size)``.
            last_hidden: Previous GRU state, shape
                ``(n_layers, batch, hidden_size)``.

        Returns:
            Tuple ``(output, hidden)``: ``output`` holds softmax probabilities
            of shape ``(batch, output_size)``; ``hidden`` is the new GRU state.
        """
        # The GRU is time-major, so give both inputs a leading time axis of 1.
        if input_step.dim() == 1:
            input_step = input_step.unsqueeze(0)
        if context.dim() == 2:
            context = context.unsqueeze(0)

        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # torch.cat takes a sequence of tensors plus a dim; the original call
        # passed `context` as the dim argument and raised a TypeError.
        x = torch.cat((embedded, context), dim=2)

        rnn_output, hidden = self.gru(x, last_hidden)
        rnn_output = rnn_output.squeeze(0)
        output = self.out(rnn_output)
        output = F.softmax(output, dim=1)
        return output, hidden
        
        
        


        


In [6]:
class MainModel(nn.Module):
    """Encoder / attention / decoder pipeline with greedy decoding.

    Args:
        hidden_size: Shared hidden width of all sub-modules.
        vocab_size: Vocabulary size, used for both the input embeddings and
            the decoder output layer.
        sequence_length: Forwarded to ``Attn``.
        n_layers: Number of GRU layers in the encoder and the decoder.
        dropout: Dropout rate forwarded to the encoder and the decoder.
    """

    def __init__(self, hidden_size, vocab_size, sequence_length, n_layers=1, dropout=0):
        # The original called super(DecoderRNN, self), which raises a
        # TypeError because MainModel is not a DecoderRNN subclass.
        super(MainModel, self).__init__()

        self.Encode = EncoderRNN(hidden_size, vocab_size, n_layers, dropout)
        self.Attention = Attn(hidden_size, sequence_length)
        self.Decode = DecoderRNN(hidden_size, vocab_size, vocab_size, n_layers, dropout)

    def _context(self, encoder_output, state):
        """Return the attention-weighted sum of encoder outputs, shape (1, batch, hidden)."""
        attn_weights = self.Attention(encoder_output, state)
        # Sum over the time axis only; a bare torch.sum() would collapse the
        # whole tensor into a single scalar.
        return torch.sum(attn_weights * encoder_output, dim=0, keepdim=True)

    def forward(self, input_seq, input_length, hidden=None):
        """Encode `input_seq` and greedily decode one token per encoder step.

        Args:
            input_seq: LongTensor of token ids, shape ``(max_len, batch)``.
            input_length: Valid length of each sequence in the batch.
            hidden: Optional initial encoder state.

        Returns:
            A list with one decoder output tensor of shape
            ``(batch, vocab_size)`` per decoding step.
        """
        encoder_output, encoder_hidden = self.Encode(input_seq, input_length, hidden)
        n_steps, batch_size = encoder_output.size(0), encoder_output.size(1)

        context = self._context(encoder_output, encoder_hidden)

        # Token ids must be integer typed, and the batch size comes from the
        # data rather than a hard-coded 50.
        input_step = torch.full(
            (1, batch_size), SOS_token, dtype=torch.long, device=input_seq.device
        )
        # Seed the decoder with the encoder's final state. The encoder is
        # bidirectional, so keep only as many entries as the decoder has layers.
        last_hidden = encoder_hidden[:self.Decode.n_layers]

        output = []
        # `input_length` holds per-sequence lengths, so it cannot be passed to
        # range(); decode for as many steps as the encoder produced.
        for _ in range(n_steps):
            output_decoder, last_hidden = self.Decode(input_step, context, last_hidden)
            output.append(output_decoder)

            # Greedy choice: feed the most probable token back in as (1, batch).
            _, top_index = output_decoder.topk(1)
            input_step = top_index.view(1, -1)

            context = self._context(encoder_output, last_hidden)

        return output

        

            
            
            




        

        
        

In [8]:
# Use cross-entropy loss.
# The model returns a tensor produced by a softmax operation; the index with the maximum value is the predicted answer.
# That index is the key into a dictionary that maps numbers to words (e.g. 1 -> "hi", 2 -> "bye").
# Build both the index2string and the string2index dictionaries.
# MainModel arguments: hidden_size, vocab_size, sequence_length, n_layers=1, dropout=0

In [None]:
# Loss criterion instance.
# NOTE(review): nn.CrossEntropyLoss applies log-softmax internally and expects
# raw logits, while DecoderRNN.forward returns F.softmax probabilities. Confirm
# how the two are meant to be combined before training with this criterion.
criterion = nn.CrossEntropyLoss()
