In [24]:
!pip install torchtext==0.6



In [25]:
import pdb
import os
# Force synchronous CUDA kernel launches so that a device-side error surfaces at
# the Python call that triggered it. This is a debugging aid and slows GPU runs.
# It is set here, before the cell that imports torch, so it is already in place
# when CUDA is initialised.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [26]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer, Transformer
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy
import math
import pickle

from google.colab import drive
# Mount Google Drive inside the Colab VM; the data files opened further down
# are read from paths under '/content/drive/My Drive/'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # echo the selected device as the cell output

device(type='cpu')

In [7]:
# Reading files

# Single place to change when the data lives somewhere else.
DATA_DIR = '/content/drive/My Drive/Assignment4_2'


def read_lines(path):
    """Return every line of the UTF-8 text file at `path`, with trailing whitespace removed."""
    with open(path, encoding='utf8') as f:
        return [line.rstrip() for line in f]


# Parallel corpora: line i of the English file pairs with line i of the Tamil file.
eng_train = read_lines(os.path.join(DATA_DIR, 'trainen.txt'))
tamil_train = read_lines(os.path.join(DATA_DIR, 'trainta.txt'))
eng_test = read_lines(os.path.join(DATA_DIR, 'deven.txt'))
tamil_test = read_lines(os.path.join(DATA_DIR, 'devta.txt'))

# NOTE(security): pickle.load executes arbitrary code from the file it reads;
# only load a glove.sav that you produced yourself or otherwise trust.
# The `with` block closes the file handle, which the bare open() call did not.
with open(os.path.join(DATA_DIR, 'glove.sav'), 'rb') as f:
    embedding_glove = pickle.load(f)

# with open('trainen.txt', encoding='utf8') as f:
#     eng_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('trainta.txt', encoding='utf8') as f:
#     tamil_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('deven.txt', encoding='utf8') as f:
#     eng_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('devta.txt', encoding='utf8') as f:
#     tamil_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
# embedding_glove = GloVe(name='6B', dim=100)

spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    """Split `text` with the spaCy English tokenizer and return the token strings as a list."""
    token_texts = []
    for tok in spacy_en.tokenizer(text):
        token_texts.append(tok.text)
    return token_texts

# Punctuation-like tokens handed to the field as its stop-word list.
stop_words = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']
ENG = Field(
    tokenize=tokenize_en,
    init_token='sos',
    eos_token='eos',
    lower=True,
    stop_words=stop_words,
)

# Run every sentence through the field's preprocessing, which is configured
# above with the spaCy tokenizer, lower-casing and the stop-word list.
processed_eng_train = [ENG.preprocess(sentence) for sentence in eng_train]
processed_eng_test = [ENG.preprocess(sentence) for sentence in eng_test]

# The vocabulary is built from the training split only, with the loaded
# embedding vectors attached to it.
ENG.build_vocab(processed_eng_train, vectors=embedding_glove)

In [8]:
def preprocess(processed_eng, field=None):
    """Numericalise tokenised sentences and wrap each one in SOS ... EOS.

    Args:
        processed_eng: iterable of tokenised sentences (each a list of token strings).
        field: object exposing `init_token`, `eos_token` and `vocab.stoi`.
            Defaults to the module-level `ENG` field, so existing one-argument
            calls keep working.

    Returns:
        A list with one list of integer ids per sentence. Each list starts with
        the id of the field's init token and ends with the id of its eos token.
    """
    field = ENG if field is None else field
    stoi = field.vocab.stoi

    # Look the special-token ids up instead of hard-coding 2 and 3, so the
    # function stays correct if the field's special tokens are reconfigured.
    sos_index = stoi[field.init_token]
    eos_index = stoi[field.eos_token]

    return [
        [sos_index] + [stoi[token] for token in tokenized_sentence] + [eos_index]
        for tokenized_sentence in processed_eng
    ]

# Numericalise both splits; each result is a list holding one integer
# sequence (a list of ids) per sentence.
X_train, X_test = (
    preprocess(split) for split in (processed_eng_train, processed_eng_test)
)

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for Tamil sentences: fit a Keras tokenizer on the training split
# and map both splits to integer sequences.
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)
Y_train = TAM.texts_to_sequences(tamil_train)
Y_test = TAM.texts_to_sequences(tamil_test)

# Special tokens take the two ids directly after the tokenizer's word indices.
SOS = len(TAM.word_index)+1
EOS = len(TAM.word_index)+2

# Append EOS to every target sentence in place. Plain loops are used instead of
# a throwaway list comprehension bound to `_`, because `_` is the name IPython
# uses for the previous cell output and assigning to it shadows that.
for sequence in Y_train:
    sequence.append(EOS)
for sequence in Y_test:
    sequence.append(EOS)

# Right-pad (padding='post') with id 0 to a fixed length of 30.
Y_train = pad_sequences(Y_train, padding='post', value=0, maxlen=30)
Y_test = pad_sequences(Y_test, padding='post', value=0, maxlen=30)

Using TensorFlow backend.


In [10]:
# Left-pad (padding='pre') the English sequences to length 55 using value 1,
# taken from the dictionary ENG.vocab.stoi.
X_train, X_test = (
    pad_sequences(sequences, padding='pre', value=1, maxlen=55)
    for sequences in (X_train, X_test)
)

In [17]:
# Sanity check: length of the first padded Tamil test sequence.
len(Y_test[0])

30

In [18]:
# Number of entries in the English vocabulary built on the ENG field.
source_vocab_size = len(ENG.vocab)
# The +3 makes room for the padding id 0 plus the SOS and EOS ids, which sit
# at len(TAM.word_index)+1 and len(TAM.word_index)+2 respectively.
target_vocab_size = len(TAM.word_index)+3
print(source_vocab_size)
print(target_vocab_size)

9724
18671


In [19]:
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

class DatasetClass(Dataset):
    """Pairs a collection of source items with the target items at the same index."""

    def __init__(self, source, target):
        # Both collections are stored as given and indexed in parallel.
        self.source, self.target = source, target

    def __len__(self):
        # The length is taken from the source collection alone.
        return len(self.source)

    def __getitem__(self, idx):
        # Return the (source, target) pair stored at position `idx`.
        source_item = self.source[idx]
        target_item = self.target[idx]
        return source_item, target_item

In [20]:
def train_test_loader(source_train, target_train, source_test, target_test, num_workers=0, batch_size=32):
    """Build the training and held-out DataLoaders.

    Args:
        source_train, target_train: parallel indexable collections for training.
        source_test, target_test: parallel indexable collections for evaluation.
        num_workers: worker processes used by each DataLoader.
        batch_size: number of (source, target) pairs per batch.

    Returns:
        (trainloader, testloader). The training loader reshuffles every epoch;
        the test loader keeps the data order.
    """
    train_data = DatasetClass(source_train, target_train)
    test_data = DatasetClass(source_test, target_test)

    trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # Held-out data is read in a fixed order so evaluation is reproducible and
    # predictions can be lined up with the reference sentences by position.
    testloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return trainloader, testloader

In [21]:
class PositionEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first tensor.

    Args:
        d_model: embedding width of the inputs (odd widths are supported).
        dropout: dropout probability applied after the encodings are added.
        max_len: largest sequence length the precomputed table covers (default 55).
    """

    def __init__(self, d_model, dropout=0.1, max_len=55):
        super(PositionEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term    # max_len x ceil(d_model / 2)

        pos_enc = torch.zeros(max_len, d_model)
        # even columns use sin
        pos_enc[:, 0::2] = torch.sin(angles)
        # odd columns use cos; for an odd d_model there is one fewer odd column
        # than even columns, so the last frequency is dropped to keep shapes equal
        pos_enc[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pos_enc = pos_enc.unsqueeze(1)    # max_len x 1 x d_model
        print('pos_enc.shape', pos_enc.shape)

        # to prevent the optimiser from changing the position encodings
        self.register_buffer('pos_enc', pos_enc)

    def forward(self, x):
        """Add the encodings for the first x.size(0) positions to `x`, then apply dropout.

        `x` is indexed as (sequence, batch, d_model); the singleton middle axis
        of the table broadcasts over the batch.
        """
        x = x + self.pos_enc[:x.size(0), :]
        return self.dropout(x)

In [None]:
class Trans(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target log-probabilities.

    Args:
        target_vocab_size: number of ids in the target vocabulary.
        embed_size: embedding width; it is also passed to nn.Transformer as the
            model width, so it must equal the width of ENG.vocab.vectors.
        num_heads: attention heads per layer.
        num_layers: layer count used for both the encoder and the decoder stack.
        dropout: dropout probability for the position encoding and the transformer.
        ENG: source field whose `vocab.vectors` holds the pretrained embeddings.
    """

    def __init__(self, target_vocab_size, embed_size, num_heads, num_layers, dropout, ENG):

        super(Trans, self).__init__()
        self.pos_encoder = PositionEncoding(embed_size, dropout)
        # Source embeddings are initialised from the vocabulary's pretrained
        # vectors; Embedding.from_pretrained keeps them frozen by default.
        self.encoder = nn.Embedding.from_pretrained(ENG.vocab.vectors)
        self.transformer = nn.Transformer(embed_size, num_heads, num_layers, num_layers, dropout=dropout)
        self.decoder = nn.Embedding(target_vocab_size, embed_size)
        self.fc = nn.Linear(embed_size, target_vocab_size)
        # Normalise over the vocabulary (last) axis. dim=1 would normalise
        # across the batch axis of the Tt x batch_size x vocab output.
        self.logsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, src, trg):
        """Return log-probabilities of shape Tt x batch_size x target_vocab_size.

        Args:
            src: integer tensor of source ids, batch_size x Ts.
            trg: integer tensor of decoder-input ids, batch_size x Tt. The
                causal mask lets position t attend to its own input, so callers
                should pass the target shifted right by one position.
        """
        src = self.encoder(src)     # batch_size x Ts x embed_size
        src = self.pos_encoder(src.transpose(0, 1))  # Ts x batch_size x embed_size
        trg = self.decoder(trg)     # batch_size x Tt x embed_size
        trg = self.pos_encoder(trg.transpose(0, 1))  # Tt x batch_size x embed_size

        # Only the decoder is causally masked (True = may not attend). The
        # encoder gets no mask so every source position sees the whole sentence.
        # The mask is built on the inputs' own device instead of a global one.
        tgt_len = trg.shape[0]
        trg_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=trg.device)) == 0

        output = self.transformer(src, trg, tgt_mask=trg_mask)
        output = self.fc(output)  # output will now be Tt x batch_size x target_vocab_size
        output = self.logsoftmax(output)

        return output

In [33]:
# Scratch check of the softmax axis on a random 3 x 5 x 10 tensor: dim=2
# normalises over the last axis, so each length-10 vector sums to 1.
output = torch.rand(3, 5, 10)
out = nn.Softmax(dim=2)(output)

In [34]:
# Show the 3 x 10 slice at index 1 of the middle axis.
out[:, 1, :]

tensor([[0.0767, 0.1109, 0.0709, 0.1384, 0.1029, 0.0781, 0.0671, 0.1345, 0.1013,
         0.1192],
        [0.0728, 0.0674, 0.0872, 0.1636, 0.0696, 0.0767, 0.1741, 0.1151, 0.0737,
         0.0997],
        [0.0752, 0.0696, 0.0916, 0.1046, 0.1260, 0.1461, 0.0797, 0.0940, 0.1198,
         0.0934]])

In [None]:
# Hyper-parameters of the translation model.
embed_size = 100    # must equal the width of the pretrained vectors in ENG.vocab
num_heads = 4       # attention heads per layer
num_layers = 2      # used for both the encoder and the decoder stack
dropout = 0.2

trans_model = Trans(
    target_vocab_size=target_vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
    ENG=ENG,
)

pos_enc.shape torch.Size([55, 1, 100])


In [None]:
# Move the model to the selected device; the returned module is echoed as the
# cell output.
trans_model.to(device)

Trans(
  (pos_encoder): PositionEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): Embedding(9723, 100)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=100, out_features=100, bias=True)
          )
          (linear1): Linear(in_features=100, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=2048, out_features=100, bias=True)
          (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=100, out_features=100, bias

In [None]:
def train_model(trainloader, trans, lr=0.001, mom=0.9, sos_index=None):
    """Train `trans` with SGD on the (source, target) batches yielded by `trainloader`.

    Args:
        trainloader: iterable of [source, target] batches, each an integer
            tensor of shape batch_size x Ts / batch_size x Tt.
        trans: module called as trans(source, decoder_input) that returns
            log-probabilities of shape Tt x batch_size x target_vocab_size.
        lr: SGD learning rate.
        mom: SGD momentum.
        sos_index: id placed at the start of the decoder input. Defaults to the
            module-level `SOS`.

    Training runs for at most 100 epochs and stops early once the summed epoch
    loss changes by no more than 0.1% relative to the previous epoch.
    """
    if sos_index is None:
        sos_index = SOS

    loss_fn = nn.NLLLoss()
    trans_optimiser = optim.SGD(trans.parameters(), lr=lr, momentum=mom)

    max_epochs = 100
    old_loss = np.inf

    # Batches are moved to wherever the model's parameters live.
    model_device = next(trans.parameters()).device
    trans.train()  # make sure dropout is active while training

    for epoch in range(max_epochs):

        running_loss = 0.0

        for source_batch, target_batch in trainloader:

            source_batch = source_batch.long().to(model_device)   # batch_size x Ts
            target_batch = target_batch.long().to(model_device)   # batch_size x Tt

            # Teacher forcing: the decoder sees SOS followed by the target
            # shifted right by one, and is trained to predict the unshifted
            # target. Feeding the target itself would let position t read the
            # very token it has to predict.
            sos_column = target_batch.new_full((target_batch.shape[0], 1), sos_index)
            decoder_input = torch.cat([sos_column, target_batch[:, :-1]], dim=1)

            # Clear gradients left over from the previous batch; without this
            # they accumulate across steps.
            trans_optimiser.zero_grad()

            output = trans(source_batch, decoder_input)  # Tt x batch_size x target_vocab_size

            # Flatten to (Tt * batch_size) x n_classes and (Tt * batch_size).
            # The mean over all tokens times batch_size equals the original
            # sum of per-sentence mean losses.
            flat_logprobs = output.reshape(-1, output.shape[-1])
            flat_targets = target_batch.transpose(0, 1).reshape(-1)
            loss_val = loss_fn(flat_logprobs, flat_targets) * target_batch.shape[0]

            loss_val.backward()

            nn.utils.clip_grad_norm_(trans.parameters(), 0.5)
            trans_optimiser.step()

            running_loss += loss_val.item()

        print('Epoch', epoch+1, ': Loss =', running_loss)

        # Relative-change test written without a division, so a zero epoch
        # loss cannot raise ZeroDivisionError.
        if abs(running_loss-old_loss) <= 1e-3 * abs(running_loss):
            print('Converged')
            break

        old_loss = running_loss

    print('Finished Training')

In [None]:
# Wrap the padded English/Tamil arrays in DataLoaders, using the function's
# defaults (batch_size=32, num_workers=0).
trainloader, testloader = train_test_loader(X_train, Y_train, X_test, Y_test)

In [None]:
# Train with the default SGD settings (lr=0.001, momentum 0.9). The saved
# output below shows this run ended with a KeyboardInterrupt.
train_model(trainloader, trans_model)

KeyboardInterrupt: 