In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [3]:
#neccessary imports 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from torchtext.data.metrics import bleu_score
from sklearn.model_selection import train_test_split
import unicodedata
import re
import spacy
from torch import Tensor
!pip install einops --quiet
from einops import rearrange
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)
import math
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

!python -m spacy download en_core_web_sm --quiet
!python -m spacy download de_core_news_sm --quiet

2023-02-14 22:15:58.564164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-02-14 22:15:58.564276: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2023-02-14 22:16:19.018838: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open sha

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/deu.txt',delimiter='\t',header=None)
df.columns = ['English','German','Source']


In [5]:
df.dropna(inplace=True)

In [6]:
#cleaning the text
#turning unicode string to plain ASCII

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
#clean text by converting to lower case, removing non -letter characters
def clean_text(text):
    text = unicodeToAscii(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub("[.!?]", '', text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    return text



#applying the clean_text method to df
df["English"] = df["English"].apply(clean_text)
df["German"] = df["German"].apply(clean_text)


In [7]:
start_token = 1
end_token =2
oov_token=3

class Vocabulary:
    def __init__(self, language):
        self.language = language
        self.stoi = {}#string2index
        self.stoc = {} # string2count
        self.itos = {0:"<PAD>",start_token:"<START>",end_token:"<END>"}#index2sting #additional- add ,oov_token:"<OOV>"
        for k,v in self.itos.items():
            self.stoi[v]=k
        self.num_words =31
        if self.language == "English":
            self.tokenizer = spacy.load("en_core_web_sm")
        elif self.language == "German":
            self.tokenizer = spacy.load("de_core_news_sm")
        else:
            print("Invalid language")

    def tokenize_sent(self,sentence):
        return [token.text.lower() for token in self.tokenizer.tokenizer(sentence)]
    
    def add_word(self,word):
        if word not in self.stoi:
            self.stoi[word]=self.num_words
            self.stoc[word]=1
            self.itos[self.num_words]=word
            self.num_words+=1
        else:
            self.stoc[word]+=1

    def build_vocab(self,sentence):
        for word in self.tokenize_sent(sentence):
            self.add_word(word)

    def process_sentence(self,sentence):
        
        return [self.stoi[token] for token in self.tokenize_sent(sentence)]


In [8]:
#building vocab for english and german
eng_vocab = Vocabulary("English")
ger_vocab = Vocabulary("German")

for english_sentence,german_sentence in zip(df["English"].values.tolist(),df["German"].values.tolist()):
    eng_vocab.build_vocab(english_sentence)
    ger_vocab.build_vocab(german_sentence)

In [9]:
print("Length of English vocab - {}".format(len(eng_vocab.stoi)))
print("Length of German vocab - {}".format(len(ger_vocab.stoi)))


Length of English vocab - 15607
Length of German vocab - 34126


In [10]:
class TranslationDataset(Dataset):
    def __init__(self,dataframe,eng_vocab,ger_vocab):
        super(TranslationDataset,self).__init__()
        self.dataframe = dataframe
        self.eng_vocab = eng_vocab
        self.ger_vocab = ger_vocab
        self.english = self.dataframe['English'].values.tolist()
        self.german = self.dataframe['German'].values.tolist()
    
    def __len__(self):
        return len(self.dataframe)

    def process_sent(self,sent,vocab):
        #starting each sentence with start_token and ending with end_token
        processed_sent = [vocab.stoi["<START>"]]
        processed_sent.extend(vocab.process_sentence(sent))
        processed_sent.append(vocab.stoi["<END>"])
        return processed_sent

    def __getitem__(self, index):
        processed_eng_sent = self.process_sent(self.english[index],self.eng_vocab)
        processed_ger_sent = self.process_sent(self.german[index],self.ger_vocab)

        
        item = {'input': torch.tensor(processed_eng_sent), 'output':torch.tensor(processed_ger_sent)}
        return item

In [11]:
train_dataset = TranslationDataset(df,eng_vocab,ger_vocab)


In [12]:
#sanity check of train_dataset
for i in range(len(train_dataset)):
  train_dataset.__getitem__(i)
print("No Errors found")

No Errors found


In [13]:
#collate_function
class Collater(object):
    def __init__(self, pad_index):
        self.pad_index = pad_index

    def __call__(self, batch):

        input = [item['input'] for item in batch]
        output = [item['output'] for item in batch]
        input = pad_sequence(input, batch_first=False, padding_value=self.pad_index)
        output = pad_sequence(output, batch_first=False, padding_value=self.pad_index)
        item = {'input':input, 'output':output}
        return item



In [18]:
#creating dataloaders
batch_size=2
pad_idx = eng_vocab.stoi["<PAD>"]

train_loader = DataLoader(train_dataset,batch_size, num_workers=1, shuffle=False,pin_memory=True, collate_fn=Collater(pad_idx))

In [22]:
for i, batch in enumerate(train_loader):
    
    print("Input batch details ",batch['input'],batch['input'].shape)

    print("_______________________________")
    print("Output batch details", batch['output'],batch['output'].shape)
    print("_______________________________")

    if i==10:
      break

Input batch details  tensor([[ 1,  1],
        [31, 32],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Output batch details tensor([[ 1,  1],
        [31, 32],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Input batch details  tensor([[ 1,  1],
        [32, 33],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Output batch details tensor([[ 1,  1],
        [33, 35],
        [34,  2],
        [ 2,  0]]) torch.Size([4, 2])
_______________________________
Input batch details  tensor([[ 1,  1],
        [33, 34],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Output batch details tensor([[ 1,  1],
        [35, 36],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Input batch details  tensor([[ 1,  1],
        [34, 35],
        [ 2,  2]]) torch.Size([3, 2])
_______________________________
Output batch details tensor([[ 1,  1],
        [37, 38],
        [ 2,  2]]) torch.Size([3, 2]

In [20]:
INPUT_DIM = len(eng_vocab.stoi)
OUTPUT_DIM = len(ger_vocab.stoi)
dim_model = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.english_embedding = nn.Embedding(INPUT_DIM, dim_model)
        self.german_embedding = nn.Embedding(OUTPUT_DIM, dim_model)
        self.transformer = nn.Transformer(d_model=dim_model, 
            num_encoder_layers=2, num_decoder_layers=2, 
            dropout=0.5, dim_feedforward=2048)
        self.fc1 = nn.Linear(dim_model, OUTPUT_DIM)
    
    def forward(self, inputs, targets):
        x = self.english_embedding(inputs)
        y = self.german_embedding(targets)
        tgt_mask = torch.triu(torch.ones(targets.size(0), targets.size(0)), diagonal=1).bool().to(device)
        out = self.transformer(x, y, tgt_mask=tgt_mask)
        out = self.fc1(out.permute(1, 0, 2)) # (batch, sequence, feature)
        return out.permute(1, 0, 2).reshape(-1, OUTPUT_DIM) # (sequence, batch, feature)

model = Net().to(device)

In [21]:
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)
step=0
writer = SummaryWriter(f'runs/loss_plot')
#Init checkpoint
checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}
#Training
for epoch in range(num_epochs):
    model.train()
    for idx, batch in enumerate(train_loader):   
            input_seq = batch['input'].to(device)
            output_seq = batch['output'].to(device)
            pred = model(input_seq.to(device), output_seq[:-1,].to(device))
            loss = criterion(pred, output_seq[1:,].view(-1))
            loss.backward()
            optimizer.step()
            if step%100==0:
              print("Epoch- {}, Step-{}, Loss- {}".format(epoch,step, loss))
              torch.save(checkpoint,"checkpoint.pth.tar")
            writer.add_scalar("Train loss",loss, global_step=step)
            step+=1



Epoch- 0, Step-0, Loss- 10.543071746826172
Epoch- 0, Step-100, Loss- 10.763671875
Epoch- 0, Step-200, Loss- 9.161202430725098
Epoch- 0, Step-300, Loss- 8.945785522460938
Epoch- 0, Step-400, Loss- 12.551543235778809
Epoch- 0, Step-500, Loss- 9.107523918151855
Epoch- 0, Step-600, Loss- 7.778702259063721
Epoch- 0, Step-700, Loss- 12.402889251708984
Epoch- 0, Step-800, Loss- 10.04754638671875
Epoch- 0, Step-900, Loss- 16.693405151367188
Epoch- 0, Step-1000, Loss- 16.928977966308594
Epoch- 0, Step-1100, Loss- 15.601746559143066
Epoch- 0, Step-1200, Loss- 9.093459129333496
Epoch- 0, Step-1300, Loss- 28.988901138305664
Epoch- 0, Step-1400, Loss- 24.852602005004883
Epoch- 0, Step-1500, Loss- 25.00566864013672
Epoch- 0, Step-1600, Loss- 28.026077270507812
Epoch- 0, Step-1700, Loss- 16.028865814208984
Epoch- 0, Step-1800, Loss- 23.67810821533203
Epoch- 0, Step-1900, Loss- 22.488731384277344
Epoch- 0, Step-2000, Loss- 25.10645294189453
Epoch- 0, Step-2100, Loss- 31.892921447753906
Epoch- 0, Step-

RuntimeError: ignored

# Prediction Stage.

In [None]:
from torchtext.data.metrics import bleu_score
def to_sentence(tokens,PAD_IDX=0):
    """ Convert list of word-index to a sentence """
    return ' '.join([ger_vocab.itos[x] for x in tokens.squeeze() if x != PAD_IDX])

In [None]:
data = [['Follow me.','Folge mir.'],['I got it.','Ich habe es verstanden.']]
test_df = pd.DataFrame(data,columns=['English','German'])

In [None]:
test_dataset = TranslationDataset(test_df,eng_vocab,ger_vocab)
batch_size=1
pad_idx = eng_vocab.stoi["<PAD>"]

test_loader = DataLoader(test_dataset,batch_size, num_workers=1, shuffle=False,pin_memory=True, collate_fn=Collater(pad_idx))

In [None]:
def test_model():
    losses = 0.
    scores = 0.
    model.eval()
    for i, (batch) in enumerate(test_loader):
        input_seq = batch['input'].to(device)
        output_seq = batch['output'].to(device)
        target = output_seq[:1]
        while len(target) < 50 and target[-1] != eng_vocab['<END>']:
            pred = model(input_seq, target)
            my_targets = torch.cat((
                my_targets, 
                pred[-1,].argmax().unsqueeze(dim=0).unsqueeze(dim=0).to('cpu')
            ))

        target_sentence = to_sentence(output_seq[1:-1])
        pred_sentence = to_sentence(output_seq[1:-1])
        score = bleu_score([pred_sentence.split()], [[target_sentence.split()]])
        scores += score
        print('Bleu score: {}'.format(score))
        print('Original-{},Predicted-{}'.format(target_sentence,pred_sentence))
    

In [None]:
test_model()