In [None]:
import re
import gc
import time
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [None]:
! pip install torchtext==0.6.0



In [None]:
import torchtext
torchtext.__version__

'0.6.0'

In [None]:
from torchtext.data.metrics import bleu_score
import torch
import torch.nn as nn

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

cuda


http://peterbloem.nl/blog/transformers

In [None]:
! pip install spacy
! python -m spacy download de_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


# **EMBEDDINGS**


I am using spaCy's English and German language models (the code below uses them for tokenization).

In [None]:
import spacy
import de_core_news_sm
spacy.prefer_gpu()

True

In [None]:
class NMT(nn.Module):
    """Transformer-based translation model.

    Token and learned position embeddings are summed for the source and the
    target sequence, passed through ``nn.Transformer`` and projected onto the
    target vocabulary.

    Args:
        embedding_size: Width of every embedding (the transformer's d_model).
        heads: Number of attention heads.
        encoders_layers: Number of encoder layers.
        decoders_layers: Number of decoder layers.
        source_vocab_size: Number of entries in the source vocabulary.
        target_vocab_size: Number of entries in the target vocabulary.
        pad_index: Index of the padding token in the *source* vocabulary;
            used to build the encoder key-padding mask.
        max_len: Number of rows in the position-embedding tables, i.e. the
            longest sequence the model can process.
        dim_feedforward: Hidden width of the transformer feed-forward blocks.
        dropout: Dropout probability used inside the transformer and on the
            summed embeddings.
    """

    def __init__(self,embedding_size,heads,encoders_layers,decoders_layers,source_vocab_size,target_vocab_size,pad_index,max_len,dim_feedforward=2048,dropout=0.1):
        super(NMT,self).__init__()
        self.pad_index = pad_index

        # Token embedding and learned position embedding for the source language.
        self.source_embedding = nn.Embedding(source_vocab_size,embedding_size)
        self.source_position_embedding = nn.Embedding(max_len,embedding_size)

        # Token embedding and learned position embedding for the target language.
        self.target_embedding = nn.Embedding(target_vocab_size,embedding_size)
        self.target_position_embedding = nn.Embedding(max_len,embedding_size)

        # See the torch.nn.Transformer documentation for the argument meanings.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=heads,
            num_encoder_layers=encoders_layers,
            num_decoder_layers=decoders_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        # Final projection from the model width to target-vocabulary logits.
        self.fullyconnected = nn.Linear(embedding_size,target_vocab_size)
        # Dropout applied to the summed token + position embeddings.
        self.dropout = nn.Dropout(dropout)

    def get_source_mask(self,source_text):
        """Return a boolean mask that is True where the source holds padding.

        Args:
            source_text: Index tensor of shape (source_len, batch_size).

        Returns:
            Boolean tensor of shape (batch_size, source_len) located on the
            same device as ``source_text``.
        """
        return source_text.transpose(0,1) == self.pad_index

    def forward(self,source,target):
        """Compute target-vocabulary logits for every target position.

        Args:
            source: Index tensor of shape (source_len, batch_size).
            target: Index tensor of shape (target_len, batch_size).

        Returns:
            Tensor of shape (target_len, batch_size, target_vocab_size).

        Raises:
            ValueError: If either sequence is longer than the position
                embedding tables allow.
        """
        source_len,source_batch = source.shape
        target_len,target_batch = target.shape

        # Fail early with a readable message instead of an embedding index
        # error raised from inside the position lookup.
        max_positions = self.source_position_embedding.num_embeddings
        if source_len > max_positions or target_len > max_positions:
            raise ValueError(
                f"sequence length (source={source_len}, target={target_len}) "
                f"exceeds max_len={max_positions}"
            )

        # Position indices 0..len-1, broadcast over the batch dimension and
        # created directly on the device of the corresponding input tensor.
        source_positions = torch.arange(source_len,device=source.device).unsqueeze(1).expand(source_len,source_batch)
        target_positions = torch.arange(target_len,device=target.device).unsqueeze(1).expand(target_len,target_batch)

        # Final embedding = token embedding + position embedding, then dropout.
        source_embedding = self.dropout(self.source_embedding(source) + self.source_position_embedding(source_positions))
        target_embedding = self.dropout(self.target_embedding(target) + self.target_position_embedding(target_positions))

        # The encoder ignores padded source positions; the causal mask keeps
        # each decoder position from attending to later target positions.
        source_mask = self.get_source_mask(source)
        target_mask = self.transformer.generate_square_subsequent_mask(target_len).to(target.device)

        out = self.transformer(source_embedding,target_embedding,src_key_padding_mask = source_mask,tgt_mask = target_mask)
        out = self.fullyconnected(out)
        return out

# **DATASET**

In [None]:
import zipfile
import urllib.request as request
import os
import tarfile

# Location of the MS COCO parallel caption archive.
url = "https://www.cl.uni-heidelberg.de/statnlpgroup/decoco/ms_coco_parallel.tar.bz2"
archive_name = "ms_coco_parallel.tar.bz2"

# Download only when the archive is not already in the working directory.
archive_present = os.path.exists(archive_name)
if not archive_present:
    request.urlretrieve(url, archive_name)

# Unpack into the current working directory.
# NOTE(review): extractall() trusts the member paths stored in the archive.
with tarfile.open(archive_name, "r") as archive:
    archive.extractall()

! ls

checkpoint.pth.tar  ms_coco_parallel  ms_coco_parallel.tar.bz2	sample_data


# **TOKENIZER**

In [None]:
# Lazily-loaded German spaCy pipeline shared by all calls to deutsch_tokenizer.
_spacy_deutsch_model = None


def deutsch_tokenizer(text):
    """Split German ``text`` into a list of token strings.

    The spaCy pipeline is loaded on the first call and reused afterwards;
    loading it on every call (once per sentence) is very slow.

    Args:
        text: The sentence to tokenize.

    Returns:
        list[str]: The text of each token, in order.
    """
    global _spacy_deutsch_model
    if _spacy_deutsch_model is None:
        _spacy_deutsch_model = de_core_news_sm.load()
    return [token.text for token in _spacy_deutsch_model.tokenizer(text)]
# Lazily-loaded English spaCy pipeline shared by all calls to english_tokenizer.
_spacy_english_model = None


def english_tokenizer(text):
    """Split English ``text`` into a list of token strings.

    The spaCy pipeline is loaded on the first call and reused afterwards;
    loading it on every call (once per sentence) is very slow.

    Args:
        text: The sentence to tokenize.

    Returns:
        list[str]: The text of each token, in order.
    """
    global _spacy_english_model
    if _spacy_english_model is None:
        _spacy_english_model = spacy.load(name = "en")
    return [token.text for token in _spacy_english_model.tokenizer(text)]

# **SOURCE AND TARGET**


***You can change the source and target languages***

In [None]:
# Source side: the English sentences live in the dataset files ending in ".en".
source_language = "english"
source_extention = ".en"
source = torchtext.data.Field(
    tokenize=english_tokenizer,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Target side: the German sentences live in the dataset files ending in ".de".
target_language = "deutsch"
target_extention = ".de"
target = torchtext.data.Field(
    tokenize=deutsch_tokenizer,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)


In [None]:
# source_language,source_extention = "deutsch",".de"
# #the downloaded files of the dataset has english text in .en extention
# source = torchtext.data.Field(tokenize = deutsch_tokenizer,lower = True, init_token="<sos>",eos_token="<eos>")

# target_language,target_extention = "english",".en"
# #the downloaded files of the dataset has deutsch text in .de extention
# target = torchtext.data.Field(tokenize = english_tokenizer,lower = True, init_token="<sos>",eos_token="<eos>")


In [None]:
import torchtext.datasets
# Parallel corpus used for training: the two files that share the "dev" path
# prefix and end in the source / target language extensions.
train = torchtext.datasets.TranslationDataset(
    path='/content/ms_coco_parallel/dev',
    exts=(source_extention, target_extention),
    fields=(source, target),
)
# validation = torchtext.datasets.TranslationDataset(path='/content/ms_coco_parallel/devtest', exts=(source_extention,target_extention), fields=(source, target))
# test = torchtext.datasets.TranslationDataset(path='/content/ms_coco_parallel/test', exts=(source_extention,target_extention), fields=(source, target))

In [None]:
# Build one vocabulary per language from the training corpus.
# max_size caps the vocabulary size; it is not a sequence-length limit.
for _field in (source, target):
    _field.build_vocab(train, max_size=5000)

In [None]:
# spaCy pipeline matching the source language; translate_sentence uses it to
# tokenize raw input strings.
spacy_source = spacy.load(name = "en") if source_language == "english" else de_core_news_sm.load()

In [None]:
# Iterator over the training set in batches of 32 examples.
# TranslationDataset examples expose their two sides as `src` and `trg`, and
# interleave_keys lives in torchtext.data; the previous key referenced an
# undefined `data` name and non-existent `source`/`target` attributes, so it
# would fail as soon as the iterator actually used it.
train_loader = torchtext.data.BucketIterator(
    dataset=train,
    batch_size=32,
    sort_key=lambda x: torchtext.data.interleave_keys(len(x.src), len(x.trg)),
)

***HYPERPARAMETERS***

In [None]:
# Vocabulary sizes determine the embedding tables and the output layer width.
source_vocab_size, target_vocab_size = len(source.vocab), len(target.vocab)

# Indices of the padding token in each vocabulary.
source_pad_index, target_pad_index = source.vocab.stoi["<pad>"], target.vocab.stoi["<pad>"]

# Transformer architecture.
embedding_size = 512   # d_model
heads = 8              # n_heads
encoders_layers = 3    # num_encoder_layers
decoders_layers = 3    # num_decoder_layers

# Number of positions in the positional-embedding tables; also used as the
# maximum number of decoding steps when translating.
max_len = 100

# Optimizer step size.
learning_rate = 0.0003

In [None]:
# Translation model, placed on the selected device. The source padding index
# is handed to the model for its source padding mask.
model = NMT(
    embedding_size=embedding_size,
    heads=heads,
    encoders_layers=encoders_layers,
    decoders_layers=decoders_layers,
    source_vocab_size=source_vocab_size,
    target_vocab_size=target_vocab_size,
    pad_index=source_pad_index,
    max_len=max_len,
)
model = model.to(device)

# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Cross-entropy that skips padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=target_pad_index)

In [None]:
import os
# Checkpoint settings used by the training loop.
save_model = True
filename = "checkpoint.pth.tar"

# Resume from an earlier run when a checkpoint file is present. The previous
# version tested a misspelled filename and never read the file, so
# `checkpoint` was undefined and resuming could not work.
if os.path.exists(filename):
    # map_location lets a checkpoint written on GPU be restored on CPU too.
    checkpoint = torch.load(filename, map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [None]:
model

NMT(
  (source_embedding): Embedding(756, 512)
  (source_position_embedding): Embedding(100, 512)
  (target_embedding): Embedding(722, 512)
  (target_position_embedding): Embedding(100, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttent

In [None]:
def translate_sentence(model, sentence, source,target, max_len):
    """Greedily translate one sentence with ``model``.

    Args:
        model: Module called as ``model(source_tensor, target_tensor)`` with
            index tensors of shape (length, 1), returning logits of shape
            (target_length, 1, target_vocab_size).
        sentence: Either a raw string (tokenized with the notebook-level
            ``spacy_source`` pipeline) or an iterable of token strings.
        source: Source-language field providing ``init_token``, ``eos_token``
            and ``vocab.stoi``.
        target: Target-language field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        max_len: Maximum number of tokens to generate.

    Returns:
        list[str]: The generated target tokens without the leading
        start-of-sentence token. The end-of-sentence token is included as the
        last element only when the model produced it within ``max_len`` steps.
    """
    # Build the tensors on whatever device the model lives on rather than
    # relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Step 1: lower-cased tokens wrapped in the start / end markers.
    if isinstance(sentence, str):
        words = [token.text.lower() for token in spacy_source(sentence)]
    else:
        words = [token.lower() for token in sentence]
    tokens = [source.init_token] + words + [source.eos_token]

    # Step 2: map tokens to vocabulary indices, shaped (source_len, 1).
    text_to_indices = [source.vocab.stoi[token] for token in tokens]
    source_tensor = torch.tensor(text_to_indices, dtype=torch.long, device=model_device).unsqueeze(1)

    # Loop-invariant lookups for the target-side special tokens.
    sos_index = target.vocab.stoi[target.init_token]
    eos_index = target.vocab.stoi[target.eos_token]

    # Decoding starts from the start-of-sentence token and feeds everything
    # generated so far back into the decoder at each step.
    outputs = [sos_index]
    with torch.no_grad():
        for _ in range(max_len):
            target_tensor = torch.tensor(outputs, dtype=torch.long, device=model_device).unsqueeze(1)
            output = model(source_tensor, target_tensor)
            # Most likely token at the last decoded position.
            prediction = output.argmax(2)[-1,:].item()
            outputs.append(prediction)
            # Stop once the end of the sentence has been produced.
            if prediction == eos_index:
                break

    translated_sentence = [target.vocab.itos[idx] for idx in outputs]
    # Drop the leading start-of-sentence token.
    return translated_sentence[1:]



In [None]:
# model.transformer.encoder

In [None]:
# from torch.utils.tensorboard import SummaryWriter
# writer = SummaryWriter("runs/loss_plot")

In [None]:
# sentence = "ich bin ein mann"
# Sample source-language sentence translated as a qualitative progress check
# (typo "It it" corrected).
sentence = "It is raining cats and dogs"

In [None]:
# Global batch counter across epochs; drives the periodic loss print.
step = 0
nEpochs = 50

for epoch in range(nEpochs):
    print(f"[{epoch}/{nEpochs}]")

    # Snapshot model and optimizer state at the start of every epoch.
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        torch.save(checkpoint, filename)

    # Qualitative check: translate the sample sentence with dropout disabled.
    model.eval()
    translation = translate_sentence(model,sentence,source,target,max_len)
    print(f"Translated sentence{translation}")

    losses = []
    model.train()
    # The loop variable is named `batch` so it does not rebind a global
    # called `data`.
    for batch in train_loader:
        input_text = batch.src.to(device)
        target_text = batch.trg.to(device)

        # Teacher forcing: the decoder sees the target without its last token
        # and is trained to predict the target shifted one step ahead.
        output = model(input_text, target_text[:-1, :])
        output = output.reshape(-1, output.shape[2])
        target_text = target_text[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target_text)
        loss.backward()
        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        batch_loss = loss.detach().item()
        # Tensors are (sequence_length, batch_size), so the batch size is
        # size(1); weighting by size(0) used the sequence length instead.
        losses.append(batch_loss * input_text.size(1))
        if step % 100 == 0:
            print("Training Loss", batch_loss)
        # writer.add_scalar("Training loss",loss.detach().item(),global_step=step)
        step += 1

    # Example-weighted average of the per-batch losses for this epoch.
    mean_loss = sum(losses) / len(train_loader.dataset)
    print(f"Mean training loss {mean_loss}")


[0/50]
Translated sentence['a', 'man', 'with', 'a', 'tennis', 'of', 'a', 'man', 'with', 'a', 'man', '.', '<eos>']
Training Loss 44.45225214958191
[1/50]
Translated sentence['a', 'man', 'is', 'a', 'white', 'shirt', 'and', 'a', 'frisbee', '.', '<eos>']
[2/50]
Translated sentence['a', 'man', 'is', 'only', 'a', 'tennis', 'ball', '<eos>']
[3/50]
Translated sentence['a', 'man', 'is', 'only', 'a', 'picture', 'in', 'a', 'frisbee', '.', '<eos>']
[4/50]
Translated sentence['a', 'man', 'is', 'surfing', 'on', 'a', 'red', 'surf', '.', '<eos>']
[5/50]
Translated sentence['a', 'man', 'is', 'surfing', 'on', 'a', 'snow', 'covered', 'slope', '.', '<eos>']
[6/50]
Translated sentence['a', 'man', 'is', 'surfing', 'on', 'a', 'snow', 'covered', 'to', 'a', 'snow', '.', '<eos>']
[7/50]
Translated sentence['a', 'man', 'is', 'only', 'a', 'picture', 'in', 'a', 'frisbee', '.', '<eos>']
[8/50]
Translated sentence['a', 'man', 'is', 'surfing', 'on', 'a', 'red', 'surf', 'board', '.', '<eos>']
[9/50]
Translated sentenc

In [None]:
# Held-out parallel corpus: the two files that share the "test" path prefix.
test = torchtext.datasets.TranslationDataset(
    path='/content/ms_coco_parallel/test',
    exts=(source_extention, target_extention),
    fields=(source, target),
)

# Batched iterator over the test set. The sort key reads the `src` / `trg`
# attributes that TranslationDataset examples expose and calls
# torchtext.data.interleave_keys; the previous key referenced an undefined
# `data` name and non-existent `source`/`target` attributes.
test_loader = torchtext.data.BucketIterator(
    dataset=test,
    batch_size=32,
    sort_key=lambda x: torchtext.data.interleave_keys(len(x.src), len(x.trg)),
)

In [None]:
# Reference translations (each wrapped in a list, one reference per example)
# and model predictions, in the layout expected by bleu_score.
actual_texts = []
predicted_texts = []

model.eval()
for test_text in test:
    source_test = test_text.src
    target_test = test_text.trg
    prediction = translate_sentence(model, source_test, source,target,max_len)
    # Strip the end-of-sentence token only when it is actually there; when
    # decoding stops at max_len the last token is a real word and must stay.
    if prediction and prediction[-1] == target.eos_token:
        prediction = prediction[:-1]

    actual_texts.append([target_test])
    predicted_texts.append(prediction)
# print(bleu_score(predicted_texts, actual_texts))

In [None]:
predicted_texts

In [None]:
# Print the source tokens and the model's translation for the first 10 test
# examples.
model.eval()
for count, test_text in enumerate(test):
    if count == 10:
        break
    source_test = test_text.src
    target_test = test_text.trg
    prediction = translate_sentence(model, source_test, source,target,max_len)
    # Strip the end-of-sentence token only when it is actually there; when
    # decoding stops at max_len the last token is a real word and must stay.
    if prediction and prediction[-1] == target.eos_token:
        prediction = prediction[:-1]
    print(source_test,prediction)

['ein', 'mann', 'führt', 'auf', 'einen', 'skateboard', 'einen', 'trick', 'vor', '.'] ['a', 'man', 'riding', 'a', 'motorcycle', 'down', 'a', 'street', 'next', 'to', 'people', '.']
['ein', 'hund', 'auf', 'einer', 'wiese', 'mit', 'einem', 'frisbee', 'im', 'maul', '.'] ['a', 'dog', 'is', 'barking', 'on', 'the', 'sheep', 'grazing', 'in', 'the', 'field', '.']
['eine', 'tasche', 'steht', 'neben', 'anderem', 'gepäck', 'auf', 'dem', 'boden', '.'] ['a', 'giraffe', 'walks', 'on', 'grass', 'looking', 'for', 'something', 'to', 'eat', '.']
['eine', 'gruppe', 'von', 'leuten', 'steht', 'neben', 'einer', 'maschine', '.'] ['a', 'group', 'of', 'people', 'holding', 'umbrellas', 'near', 'a', 'wet', 'street', '.']
['ein', 'eleganter', 'tisch', 'mit', 'vielen', 'blauen', 'vasen', '.'] ['a', 'wooden', 'cutting', 'board', 'with', 'lots', 'of', 'carrots', 'on', 'top', 'of', 'it', '.']
['eine', 'einzelne', 'giraffe', 'steht', 'zwischen', 'trockenen', 'büschen', '.'] ['a', 'little', 'kid', 'is', 'standing', 'in',