In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.datasets import Multi30k
from torchtext.datasets import WMT14
from torchtext.datasets import IWSLT 
from torchtext.data import Field, BucketIterator,Iterator

import spacy
import numpy as np
import random
import math
import time

In [2]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
spacy_en = spacy.load('en')



In [4]:
spacy_de = spacy.load('de_core_news_sm-2.3.0/de_core_news_sm/de_core_news_sm-2.3.0')

In [5]:
spacy_fr = spacy.load('fr_core_news_sm-2.3.0/fr_core_news_sm/fr_core_news_sm-2.3.0')

In [6]:
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

In [7]:
SRC = Field(tokenize=tokenize_en,init_token='<sos>',eos_token=
           '<eos>',lower=True,batch_first=True,include_lengths=True)
TRG = Field(tokenize=tokenize_de,init_token='<sos>',eos_token=
           '<eos>',lower=True,batch_first=True,include_lengths=True)

In [8]:
train_data , valid_data , test_data = Multi30k.splits(exts=('.de','.en'),fields = (SRC,TRG))

In [9]:
#t_data , v_data , te_data = IWSLT.splits(exts=('.de','.en'),fields = (SRC,TRG))

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device("cpu")

In [11]:
torch.cuda.is_available()

True

In [12]:
device

device(type='cuda')

In [13]:
SRC.build_vocab(train_data,min_freq = 2)
TRG.build_vocab(train_data,min_freq = 2)

In [14]:
#BATCH_SIZE = 128

#train_iterator , valid_iterator , test_iterator = BucketIterator.splits(
#    (train_data,valid_data,test_data),batch_size=BATCH_SIZE,device=device,shuffle=False)

In [26]:
BATCH_SIZE = 128

train_iterator , valid_iterator , test_iterator = Iterator.splits(
    (train_data,valid_data,test_data),batch_size=BATCH_SIZE,device=device,shuffle=False)

# Model Description

In [14]:
class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim,enc_hid_dim,dec_hid_dim,drop):
        super().__init__()
        self.embedding = nn.Embedding(input_dim,emb_dim)
        self.rnn = nn.GRU(emb_dim,enc_hid_dim,bidirectional=True,batch_first=True)
        self.fc = nn.Linear(enc_hid_dim*2,dec_hid_dim)
        self.dropout = nn.Dropout(drop)
        self.activ = nn.Tanh()
    def forward(self,src,src_len):
        #print("Input Shape",src.shape)
        embedded = self.dropout(self.embedding(src))
        packed_src = nn.utils.rnn.pack_padded_sequence(embedded,src_len,
                                                       batch_first=True,
                                                       enforce_sorted=False)
        packed_outputs , hidden = self.rnn(packed_src)
        outputs,_ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        #print("Encoder Shape in Encoder Output ",outputs.shape)
        outputs = outputs.permute(1,0,2)
        # outputs ---> batch x seq x enc_hid_dim
        # hidden ----> 2 x batch x enc_hid_dim
        # hidden[-2,:,:] batch * enc_hid_dim
        #hidden = self.activ(self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))
        # Adding Dropout version
        hidden = self.dropout(self.activ(self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))))
        return outputs,hidden

# This is an implementation of Bahdanau (ICLR 2014 ) Attention architecture

In [15]:
class Attention(nn.Module):
    def __init__(self,enc_hid_dim,dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim*2+dec_hid_dim,dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim,1,bias=False)
        self.actv = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
    def forward(self,hidden,encoder_outputs,mask):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        #print("Hidden Shape Pre ",hidden.shape)
        #print("Encoder Shape in Attention Layer",encoder_outputs.shape)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        #print("Hidden Shape Post ",hidden.shape)
        concat_input = torch.cat((hidden, encoder_outputs), dim = 2)
        #print("Concatenated in Attention Layer",concat_input.shape)
        #print("Linear Layer Shape ",self.attn.weight.shape)
        # torch.cat((hidden, encoder_outputs) -->  batch x seq x (enc_hid_dim*2)+dec_hid_dim
        #energy = self.actv(self.attn(concat_input))
        # Dropout Added 
        energy = self.dropout(self.actv(self.attn(concat_input)))
        # attention --->  batch * seq * 1 
        #print("Energy ",energy.shape)
        attention = self.v(energy).squeeze(2)
        #print("Attention ",attention.shape)
        # attention --> batch x seq 
        attention = attention.masked_fill(mask==1,1e-10)
        return F.softmax(attention, dim=1)

In [16]:
class Decoder(nn.Module):
    def __init__(self,out_dim,attn,dec_hid_dim,emb_dim,enc_hid_dim,drop):
        super().__init__()
        self.attn = attn
        self.out_dim = out_dim
        self.embed = nn.Embedding(out_dim,emb_dim)
        self.fc = nn.Linear(dec_hid_dim+enc_hid_dim*2+emb_dim,out_dim)
        self.rnn = nn.GRU(emb_dim+enc_hid_dim*2,dec_hid_dim,batch_first=True,bidirectional=False)
        self.drop = nn.Dropout(drop)
    def forward(self,inputs,hidden,encoder_states,mask):
        # inputs --->  batch * 1
        #print("Decoding Input Shape",inputs.shape)
        embedded = self.drop(self.embed(inputs))
        # embedded --> batch * 1 * emb_dim 
        att_weights = self.attn(hidden,encoder_states,mask)
        att_weights = att_weights.unsqueeze(2)
        #print("Att weights ",att_weights.shape)
        # att_weights --> batch x seq x 1
        # encoder_states --> batch x seq x 2*enc_hid_dim
        #print("Pre Encoder States ",encoder_states.shape)
        encoder_states = encoder_states.permute(0,2,1)
        #print("Post Encoder States ",encoder_states.shape)
        context_vector = torch.bmm(encoder_states,att_weights)
        #print("Pre context vector ",context_vector.shape)
        #context_vector -->  batch x 2*enc_hid_dim * 1
        context_vector = context_vector.permute(0,2,1)
        #print("Post context vector ",context_vector.shape)
        #print("Pre Embedded vector ",embedded.shape)
        embedded = embedded.unsqueeze(1)
        #print("Post Embedded vector ",embedded.shape)
        #context_vector -->  batch x  1 x 2* enc_hid_dim
        concat_input = torch.cat((context_vector,embedded),dim=2)
        #print("Concat Input Shape in Decoder ",concat_input.shape)
        hidden = hidden.unsqueeze(0)
        out , hidden  = self.rnn(concat_input,hidden)
        #out -->  batch * 1 * 2*dec_hid_dim
        out = out.squeeze(1)
        #out -->  batch x 2*dec_hid_dim
        embedded = embedded.squeeze(1)
        # embedded --> batch x emb_dim
        context_vector = context_vector.squeeze(1)
        #context_vector -->  batch x 2* enc_hid_dim
        prediction = self.fc(torch.cat((out,embedded,context_vector),dim=1))
        return prediction,att_weights,hidden

In [17]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,attn,out_dim,input_dim,dec_hid_dim,
                 enc_hid_dim,enc_emb_dim,dec_emb_dim,drop,pad_tok,device):
        super().__init__()
        self.encoder = encoder
        self.attn  = attn
        self.decoder = decoder
        self.out_dim = out_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.drop = drop
        self.pad_tok = pad_tok
        self.device = device
        #self.teacher_force_ratio = teacher_force_ratio
    
    def create_mask(self,src):
        mask = (src == self.pad_tok).to(torch.int8)
        return mask
           
    def forward(self,src,src_len,trg,teacher_force_ratio=True):
        batch_sz = src.size(0)
        src_seqlen = src.size(1)
        mask = self.create_mask(src)
        outputs , hidden = self.encoder(src,src_len)
        trg_len = trg.size(1)
        attention_weights = torch.zeros((trg_len,batch_sz,src_seqlen)).to(self.device)
        # For storing attention per target token
        total_outputs = torch.zeros((trg_len,batch_sz,self.out_dim)).to(self.device)
        inputs =  trg[:,0]
        #print("Initial Input Shape",inputs.shape)
        for idx in range(1,trg_len,1):
            #print("Decoding token ",idx)
            prediction, attention_wt , hidden = self.decoder(
                inputs,hidden,outputs,mask.to(self.device))
            #print("Prediction Shape",prediction.shape)
            #print("attention weight Shape",attention_wt.shape)
            #print("Hidden Shape",hidden.shape)
            attention_weights[idx,:,:] = attention_wt.squeeze(2)
            total_outputs[idx,:,:] = prediction
            hidden = hidden.squeeze(0)
            inputs = prediction.argmax(dim=1)
            #total_outputs[idx,:,:] = inputs
            #print("Input Shape while decoding",inputs.shape)
            if teacher_force_ratio:
                flag = torch.rand(1).item() >= 0.65 
                if flag:
                    inputs =  prediction.argmax(dim=1)
                else:
                    inputs = trg[:,idx]
            #inputs =  prediction.argmax(dim=1)
        return total_outputs

In [18]:
def model_param_init(model):
    for name,param in model.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data,mean=0,std=0.01)
        else:
            nn.init.constant_(param.data,0)
            

In [19]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
DROPOUT = 0.5
TEACHER_FORCE = 1
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

In [20]:
encoder = Encoder(INPUT_DIM,ENC_EMB_DIM,ENC_HID_DIM,DEC_HID_DIM,DROPOUT)
attn  = Attention(ENC_HID_DIM,DEC_HID_DIM)
decoder = Decoder(OUTPUT_DIM,attn,DEC_HID_DIM,DEC_EMB_DIM,
                  ENC_HID_DIM,DROPOUT)
model = Seq2Seq(encoder,decoder,attn,OUTPUT_DIM,INPUT_DIM,DEC_HID_DIM,ENC_HID_DIM,ENC_EMB_DIM,
                DEC_EMB_DIM,DROPOUT,SRC_PAD_IDX,device)

model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7873, 256)
    (rnn): GRU(256, 512, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (activ): Tanh()
  )
  (attn): Attention(
    (attn): Linear(in_features=1536, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
    (actv): Tanh()
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attn): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
      (actv): Tanh()
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (embed): Embedding(5972, 256)
    (fc): Linear(in_features=1792, out_features=5972, bias=True)
    (rnn): GRU(1280, 512, batch_first=True)
    (drop): Dropout(p=0.5, inplace=False)
  )
)

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,685,396 trainable parameters


In [22]:
optimizer = torch.optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [23]:
def train(model,iterator,optimizer,criterion,clip):
    epoch_loss = 0.0
    model.train()
    for i,batch in enumerate(iterator):
        src , src_len = batch.src
        trg, trg_len = batch.trg
        #print(src)
        #print(src_len)
        #print(trg)
        #print(src,src_len)
        #print(trg,trg_len)
        outputs = model(src,src_len,trg)
        #outputs = model(src,src_len,trg)
        outputs = outputs[1:].view(-1,OUTPUT_DIM)
        #print("Pred output shape ",outputs.shape)
        trg = trg.permute(1,0)
        trg = trg[1:]
        trg = trg.reshape(-1)
        #print("Gold output shape ",trg.shape)
        optimizer.zero_grad()
        loss = criterion(outputs,trg)
        epoch_loss = epoch_loss + loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
        optimizer.step()
    
    return (epoch_loss)/len(iterator)
    

In [24]:
def evaluate(model,iterator,criterion):
    epoch_loss = 0.0
    model.eval()
    with torch.no_grad():
        for i,batch in enumerate(iterator):
            src , src_len = batch.src
            trg, trg_len = batch.trg
            #print("SRC Shape ",src.shape)
            outputs = model(src,src_len,trg,False)
            outputs = outputs[1:].reshape(-1,OUTPUT_DIM)
            #print("Outputs Shape ",outputs.shape)
            trg = trg.permute(1,0)
            trg = trg[1:]
            trg = trg.reshape(-1)
            #print("target Shape ",trg.shape)
            #print("Outputs Shape ",outputs.shape)
            loss = criterion(outputs,trg)
            epoch_loss = epoch_loss + loss.item()

    return (epoch_loss)/len(iterator)

In [None]:
num_epochs = 10
clip = 1
best_loss = float('inf')

for epoch in range(num_epochs):
    print(f' Epoch-----> {epoch}')
    train_loss = train(model,train_iterator,optimizer,criterion,clip)
    valid_loss = evaluate(model,valid_iterator,criterion)
    if valid_loss <= best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), 'translation_model.pt')
        
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')  

 Epoch-----> 0
	Train Loss: 3.997 | Train PPL:  54.423
	 Val. Loss: 3.602 |  Val. PPL:  36.678
 Epoch-----> 1
	Train Loss: 2.835 | Train PPL:  17.031
	 Val. Loss: 3.372 |  Val. PPL:  29.129
 Epoch-----> 2
	Train Loss: 2.392 | Train PPL:  10.940
	 Val. Loss: 3.313 |  Val. PPL:  27.455
 Epoch-----> 3
	Train Loss: 2.135 | Train PPL:   8.457
	 Val. Loss: 3.305 |  Val. PPL:  27.247
 Epoch-----> 4
	Train Loss: 1.910 | Train PPL:   6.754
	 Val. Loss: 3.352 |  Val. PPL:  28.573
 Epoch-----> 5
	Train Loss: 1.774 | Train PPL:   5.893
	 Val. Loss: 3.421 |  Val. PPL:  30.613
 Epoch-----> 6
	Train Loss: 1.651 | Train PPL:   5.214
	 Val. Loss: 3.531 |  Val. PPL:  34.156
 Epoch-----> 7
	Train Loss: 1.567 | Train PPL:   4.791
	 Val. Loss: 3.529 |  Val. PPL:  34.074
 Epoch-----> 8
	Train Loss: 1.506 | Train PPL:   4.509
	 Val. Loss: 3.555 |  Val. PPL:  34.997
 Epoch-----> 9


In [27]:
model.load_state_dict(torch.load('translation_model.pt'))

test_loss = evaluate(model,test_iterator,criterion)

print(f'\t| Test Loss: {test_loss:.4f} |  Test PPL: {math.exp(test_loss):4.3f} |')

	| Test Loss: 3.3549 |  Test PPL: 28.643 |


In [79]:
def translate(sentence,src_field,trg_field,model,max_len=50):
    model.eval()
    tokens = [tok.lower() for tok in tokenize_de(sentence)]
    print(f'Post tokenization: {tokens}')
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    numeralized_token = [SRC.vocab.stoi[tok] for tok in tokens]
    inputs = torch.LongTensor(numeralized_token).reshape(1,len(numeralized_token))
    input_len = torch.LongTensor([len(numeralized_token)])    
    out , hidden = encoder(inputs.to(device),input_len.to(device))
    mask = torch.LongTensor([0 for _ in range(out.size(1))]).resize(1,len(numeralized_token))
    inputs = torch.LongTensor([src_field.vocab.stoi[src_field.init_token]])
    
    attentions = []
    predictions = []
    for i in range(50):
        prediction, attention_wt , hidden = decoder(inputs.to(device),
                                                    hidden.to(device),
                                                    out.to(device),
                                                    mask.to(device))     
        attentions.append(attention_wt.squeeze(2))
        hidden = hidden.squeeze(0)
        inputs = prediction.argmax(dim=1)
        predictions.append(inputs)
        if inputs.item() == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    return predictions,attentions
    

In [106]:
example_sent =  "einer Stadt ."

In [107]:
prediction,attentions = translate(example_sent,SRC,TRG,model)

Post tokenization: ['einer', 'stadt', '.']


In [108]:
translated_sent = [TRG.vocab.itos[tok.item()] for tok in prediction]

In [109]:
' '.join(translated_sent)

'a city city . <eos>'