In [84]:
import pdb
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [85]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer, Transformer
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy
import math

In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [87]:
#Reading files

with open('trainen.txt', encoding='utf8') as f:
    eng_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('trainta.txt', encoding='utf8') as f:
    tamil_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('deven.txt', encoding='utf8') as f:
    eng_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('devta.txt', encoding='utf8') as f:
    tamil_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
embedding_glove = GloVe(name='6B', dim=100)

spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

stop_words = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']
ENG = Field(tokenize = tokenize_en, init_token='sos', eos_token = 'eos', lower=True, stop_words=stop_words)
processed_eng_train = list(map(lambda x: ENG.preprocess(x), eng_train))
processed_eng_test = list(map(lambda x: ENG.preprocess(x), eng_test))

ENG.build_vocab(processed_eng_train, vectors=embedding_glove)

In [88]:
def preprocess(processed_eng):
    
    #function to return the numericalized version of the tokenized sentences
    X = []
    for tokenized_sentence in processed_eng:
        int_sequence = [2]  #first element is the SOS token 
        for token in tokenized_sentence:
            int_sequence.append(ENG.vocab.stoi[token])
        int_sequence.append(3) #last element is the EOS token
        X.append(int_sequence)
    
    return X

# X_train and X_test are lists of lists with the integer sequences for a given sentence
X_train = preprocess(processed_eng_train)
X_test = preprocess(processed_eng_test)

In [89]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for Tamil sentences
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)
Y_train = TAM.texts_to_sequences(tamil_train)
Y_test = TAM.texts_to_sequences(tamil_test)

#adding EOS token
_ = [y.append(0) for y in Y_train]
_ = [y.append(0) for y in Y_test]

In [90]:
source_vocab_size = len(ENG.vocab)
target_vocab_size = len(TAM.word_index)+2
print(source_vocab_size)
print(target_vocab_size)

9723
18670


In [55]:
class PositionEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=60):
        super(PositionEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pos_enc = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # even columns use sin
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        # odd columns use cos
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        print('pos_enc.shape', pos_enc.shape)
        #pos_enc = pos_enc.unsqueeze(0).transpose(0, 1)
        
        # to prevent the optimiser from changing the position encodings
        self.register_buffer('pos_enc', pos_enc)

    def forward(self, x):
        x = x + self.pos_enc[:x.size(0), :]
        return self.dropout(x)

In [74]:
class Trans(nn.Module):
    
    def __init__(self, target_vocab_size, embed_size, num_heads, num_layers, dropout, ENG):
        
        super(Trans, self).__init__()
        self.pos_encoder = PositionEncoding(embed_size, dropout)
        self.encoder = nn.Embedding.from_pretrained(ENG.vocab.vectors)
        self.transformer = nn.Transformer(embed_size, num_heads, num_layers, num_layers, dropout=dropout)
        self.decoder = nn.Embedding(target_vocab_size, embed_size)
        self.fc = nn.Linear(embed_size, target_vocab_size)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, src, trg):
        
        # src and trg will be 1-D tensors with size Ts and Tt
        src = self.encoder(src)     # Ts x 100
        src = self.pos_encoder(src) # Ts x 100
        trg = self.decoder(trg)     # Tt x 100
        trg = self.pos_encoder(trg) # Tt x 100
        output = self.transformer(src.unsqueeze(1), trg.unsqueeze(1)) # Ts x 1 x 100 and Tt x 1 X 100 
        output = self.fc(output.squeeze(1))  # output will now be Tt x target_vocab_size
        output = self.logsoftmax(output)
        
        return output

In [75]:
embed_size = 100
num_heads = 4
num_layers = 2
dropout = 0.2
trans_model = Trans(target_vocab_size, embed_size, num_heads, num_layers, dropout, ENG)

pos_enc.shape torch.Size([60, 100])


In [76]:
trans_model.to(device)

Trans(
  (pos_encoder): PositionEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): Embedding(9736, 100)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=100, out_features=100, bias=True)
          )
          (linear1): Linear(in_features=100, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=2048, out_features=100, bias=True)
          (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=100, out_features=100, bias

In [82]:
def loss_per_pair(source_sentence, target_sentence, trans, trans_optimiser, loss_fn):
    
    Tt = target_sentence.size(-1)
    trans_optimiser.zero_grad()
    
    loss_val = 0
    
    output = trans(source_sentence, target_sentence)
         
    for i in range(Tt):
        target_word = torch.cuda.LongTensor([target_sentence[i].item()])
        loss_val += loss_fn(output[i].unsqueeze(0), target_word)

    loss_val.backward()
    
    nn.utils.clip_grad_norm_(trans.parameters(), 0.5, 1)

    trans_optimiser.step()
    
    return loss_val.item()/Tt

In [64]:
def train_model(sources, targets, trans):
    
    loss_fn = nn.NLLLoss()
    trans_optimiser = optim.SGD(trans.parameters(), lr=0.001, momentum=0.9)
    
    max_epochs = 100
    old_loss = np.inf
    indices = [i for i in range(len(sources))]
    
    for epoch in range(max_epochs):
        
        # shuffling the input data manually at the start of every new epoch
        np.random.shuffle(indices)
        sources = list(np.array(sources)[indices])
        targets = list(np.array(targets)[indices])
        
        running_loss = 0.0
        num_sentences = len(sources)
        
        for i in range(num_sentences):
            source_sentence = torch.cuda.LongTensor(sources[i])
            target_sentence = torch.cuda.LongTensor(targets[i])

            loss = loss_per_pair(source_sentence, target_sentence, trans, trans_optimiser, loss_fn)
            running_loss += loss

            if i%int(num_sentences*0.1)==0:
                print("Epoch", epoch+1, ":", (i/int(num_sentences*0.1))*10,'% done')
                print("Current loss:", running_loss)
        
        
        if abs(running_loss-old_loss)/running_loss < 1e-3:
            print('Converged')
            break
    
        old_loss = running_loss

    print("Finished Training")

In [83]:
train_model(X_train, Y_train, trans_model)

Epoch 1 : 0.0 % done
Current loss: 10.227967262268066
Epoch 1 : 10.0 % done
Current loss: 26127.432748616837
Epoch 1 : 20.0 % done
Current loss: 52138.69940703216
Epoch 1 : 30.0 % done
Current loss: 78144.8166929593
Epoch 1 : 40.0 % done
Current loss: 104068.20179332324
Epoch 1 : 50.0 % done
Current loss: 129948.20768857018
Epoch 1 : 60.0 % done
Current loss: 155776.95081889405
Epoch 1 : 70.0 % done
Current loss: 181539.76147778556
Epoch 1 : 80.0 % done
Current loss: 207259.32553306952
Epoch 1 : 90.0 % done
Current loss: 232898.9758581852
Epoch 1 : 100.0 % done
Current loss: 258483.4850621908
Epoch 2 : 0.0 % done
Current loss: 9.78580093383789
Epoch 2 : 10.0 % done
Current loss: 25544.086800405923
Epoch 2 : 20.0 % done
Current loss: 51007.276134223735
Epoch 2 : 30.0 % done
Current loss: 76397.7731893967
Epoch 2 : 40.0 % done
Current loss: 101766.95441605119
Epoch 2 : 50.0 % done
Current loss: 127083.27817015359
Epoch 2 : 60.0 % done
Current loss: 152328.2571887134
Epoch 2 : 70.0 % done

KeyboardInterrupt: 