In [1]:
import spacy
import torch
import torch.nn as nn
from torchtext.data import Field, BucketIterator, TabularDataset
import pandas as pd
import string
from torchtext.data.metrics import bleu_score
from torch.utils.tensorboard import SummaryWriter
import random

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def extract_data(data_dir='DataSet2', train_csv='train.csv', validation_csv='validation.csv'):
    """Convert the parallel English/Hindi text files into two-column CSV files.

    Reads ``train.en``/``train.hi`` and ``dev.en``/``dev.hi`` from ``data_dir``
    (UTF-8, one sentence per line) and writes each split as a CSV with the
    columns ``English`` and ``Hindi``. The CSV detour exists because
    ``TabularDataset`` builds its examples from tabular files.

    Args:
        data_dir: Directory holding the four parallel text files.
        train_csv: Output path for the training pairs.
        validation_csv: Output path for the validation pairs.

    Raises:
        ValueError: If the English and Hindi files of a split do not contain
            the same number of lines.
    """
    def _read_lines(file_name):
        # The context manager closes the handle even if reading fails.
        with open(f'{data_dir}/{file_name}', encoding='utf8') as handle:
            return handle.read().strip().split('\n')

    def _load_pairs(stem):
        english_lines = _read_lines(f'{stem}.en')
        hindi_lines = _read_lines(f'{stem}.hi')
        if len(english_lines) != len(hindi_lines):
            raise ValueError(
                f"{stem}.en has {len(english_lines)} lines but "
                f"{stem}.hi has {len(hindi_lines)}; the files must be parallel"
            )
        return pd.DataFrame(
            {'English': english_lines, 'Hindi': hindi_lines},
            columns=['English', 'Hindi'],
        )

    # Read every input before writing anything, so a missing or malformed
    # file does not leave a half-written set of CSVs behind.
    training_df = _load_pairs('train')
    validation_df = _load_pairs('dev')
    training_df.to_csv(train_csv, index=False)
    validation_df.to_csv(validation_csv, index=False)

In [3]:
# Generate train.csv and validation.csv from the raw DataSet2 text files so
# the later TabularDataset cell has something to load.
extract_data()

In [4]:
# Load the English spaCy pipeline once; only its tokenizer is used by tokenize_eng.
# NOTE(review): the "en" shortcut name is presumably only resolvable on older
# spaCy releases — confirm against the installed version.
tok_eng = spacy.load("en")
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None tells ``str.translate`` to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def tokenize_eng(text):
    """Split English ``text`` with the ``tok_eng`` tokenizer and lower-case each token."""
    lowered_tokens = []
    for token in tok_eng.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens
def tokenize_hindi(text):
    """Tokenize a Hindi sentence by splitting on single spaces.

    ASCII pipes are first rewritten to full stops. Empty strings produced by
    leading, trailing or repeated spaces are discarded so they never reach the
    vocabulary as tokens; an empty or blank sentence yields an empty list.

    Other normalisations (replacing "/" or "♪" with ".", stripping ASCII
    punctuation via ``remove_punctuation``) were tried and are left disabled.

    Args:
        text: Raw Hindi sentence.

    Returns:
        List of non-empty token strings.
    """
    text = text.replace("|", ".")
    # ``split(' ')`` keeps an empty string between consecutive spaces (and
    # returns [''] for empty input); filter those out.
    return [token for token in text.strip().split(' ') if token]

In [5]:
# Field definitions: both sides are wrapped in <sos>/<eos>. English is
# tokenized with spaCy (tokenize_eng), Hindi with the space-based tokenize_hindi.
english = Field(sequential=True,use_vocab=True,tokenize=tokenize_eng,lower=True, init_token="<sos>", eos_token="<eos>")
hindi = Field(sequential=True,use_vocab=True,tokenize=tokenize_hindi, init_token="<sos>", eos_token="<eos>")

# Map CSV column name -> (attribute name on each example, Field that processes it).
fields = {"English": ("eng", english), "Hindi": ("hin", hindi)}
train_data, test_data = TabularDataset.splits(
    path="", train="train.csv", test="validation.csv", format="csv", fields=fields
)

# Vocabularies are built from the training split only, capped at 30000
# entries and limited to tokens that occur at least twice.
english.build_vocab(train_data, max_size=30000, min_freq=2,vectors="glove.6B.200d")
# NOTE(review): glove.6B is presumably an English-only embedding set, so these
# vectors are unlikely to cover Hindi tokens — confirm whether they are needed.
hindi.build_vocab(train_data, max_size=30000, min_freq=2,vectors="glove.6B.200d")

# Fall back to the CPU when CUDA is unavailable instead of hard-coding
# "cuda:0", which fails on machines without a GPU.
iterator_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=100,
    device=iterator_device,
    sort_within_batch=True,
    # Sort by English length so each batch holds similarly sized sentences.
    sort_key=lambda x: len(x.eng),
)

In [6]:
# Sanity check: show the vocabulary indices assigned to the Hindi <eos> and <sos> tokens.
print(hindi.vocab.stoi["<eos>"])
print(hindi.vocab.stoi["<sos>"])

3
2


In [7]:
class Transformer(nn.Module):
    """Sequence-to-sequence translation model built around ``nn.Transformer``.

    Source and target tokens are embedded with learned word embeddings plus
    learned position embeddings, passed through ``nn.Transformer`` and
    projected onto the target vocabulary.

    Inputs are sequence-first: ``src`` is ``(src_len, N)`` and ``trg`` is
    ``(trg_len, N)``. Neither length may exceed ``max_len``, because the
    position embedding tables only hold ``max_len`` entries.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        """Create the embeddings, the transformer core and the output projection.

        Args:
            embedding_size: Model width, used for the embeddings and as ``d_model``.
            src_vocab_size: Number of entries in the source vocabulary.
            trg_vocab_size: Number of entries in the target vocabulary.
            src_pad_idx: Index of the padding token in the source vocabulary.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            forward_expansion: Width of the feed-forward sub-layers
                (passed as ``dim_feedforward``).
            dropout: Dropout probability for the embeddings and the transformer.
            max_len: Size of the position embedding tables.
            device: Device on which masks and position indices are created.
        """
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean ``(N, src_len)`` mask that is True at padding positions."""
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: ``(src_len, N)`` tensor of source token indices.
            trg: ``(trg_len, N)`` tensor of target token indices (decoder input).

        Returns:
            ``(trg_len, N, trg_vocab_size)`` tensor of unnormalised scores.
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Build the position indices directly on the target device rather than
        # creating them on the CPU and copying them over on every call.
        src_positions = (
            torch.arange(0, src_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: a target position may only attend to itself and earlier ones.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            # Also hide the padded source positions from the decoder's
            # cross-attention; otherwise the amount of padding in a batch
            # changes the translation of a sentence.
            memory_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out


In [8]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, english, hindi, device, max_length=50):
    """Greedily translate one English sentence into Hindi tokens.

    Args:
        model: Callable taking ``(src, trg)`` index tensors of shape
            ``(len, 1)`` and returning scores of shape
            ``(trg_len, 1, trg_vocab_size)``.
        sentence: Either a raw English string or an already tokenized list of
            strings.
        english: Source field; provides ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        hindi: Target field; provides ``vocab.stoi`` and ``vocab.itos``.
        device: Device the index tensors are moved to.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated Hindi tokens without the leading ``<sos>``. The final
        token is ``<eos>`` unless generation stopped at ``max_length``.
    """
    # Raw strings go through tokenize_eng, the tokenizer the ``english`` Field
    # is built with, so no spaCy model has to be loaded on every call.
    # Pre-tokenized input is only lower-cased, matching the vocabulary.
    if isinstance(sentence, str):
        tokens = tokenize_eng(sentence)
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the start/end markers and map each English token
    # to its vocabulary index.
    tokens = [english.init_token, *tokens, english.eos_token]
    text_to_indices = [english.vocab.stoi[token] for token in tokens]

    # Shape (src_len, 1): a batch containing just this sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = hindi.vocab.stoi["<eos>"]
    outputs = [hindi.vocab.stoi["<sos>"]]
    with torch.no_grad():
        for _ in range(max_length):
            trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
            output = model(sentence_tensor, trg_tensor)

            # Greedy decoding: keep the highest-scoring token at the last position.
            best_guess = output.argmax(2)[-1, :].item()
            outputs.append(best_guess)

            if best_guess == eos_index:
                break

    translated_sentence = [hindi.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, english, hindi, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Each example's ``eng`` tokens are translated with ``translate_sentence``
    and compared against its ``hin`` tokens as the single reference.

    Args:
        data: Iterable of examples exposing ``eng`` and ``hin`` token lists.
        model: Translation model handed to ``translate_sentence``.
        english: Source field.
        hindi: Target field.
        device: Device used for translation.

    Returns:
        The value returned by ``bleu_score`` for all predictions.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["eng"]
        trg = vars(example)["hin"]

        prediction = translate_sentence(model, src, english, hindi, device)
        # Drop the final token only when it really is the end marker: a
        # translation cut off at max_length ends in a real word, and removing
        # it would penalise the score.
        if prediction and prediction[-1] == hindi.eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of reference translations per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [None]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 100

# Model hyperparameters
src_vocab_size = len(english.vocab)
trg_vocab_size = len(hindi.vocab)
embedding_size = 200
num_heads = 4
num_encoder_layers = 2
num_decoder_layers = 2
dropout = 0.0001
max_len = 100
forward_expansion = 2048
# The source side is English, so the source padding index has to come from
# the English vocabulary (it was previously read from the Hindi one).
src_pad_idx = english.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# Rebuild the iterators so the batches land on the device selected above.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.eng),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Cut the learning rate by 10x when the mean epoch loss has not improved
# for 10 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padding positions in the target must not contribute to the loss.
pad_idx = hindi.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Fixed probe sentence, translated at the start of every epoch to eyeball progress.
sentence = "I want to tell you something"

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, english, hindi, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.eng.to(device)
        target = batch.hin.to(device)

        # Forward prop: the decoder sees every target token except the last
        # and is trained to predict the sequence shifted by one.
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshaping.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        if(batch_idx%1000 == 0):
            print(loss_value)
            if(epoch%10==9):
                # Score in eval mode so dropout does not perturb the
                # translations, then switch back for the rest of the epoch.
                model.eval()
                score = bleu(test_data, model, english, hindi, device)
                model.train()
                print(f"Bleu score {score*100:.2f}")

        # plot to tensorboard (log a plain float, not the graph-attached tensor)
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

# Flush pending events and release the log file.
writer.close()



[Epoch 0 / 100]
Translated example sentence: 
 ['माफी,', 'बाथरूम.', 'आई', 'अंदर,', 'सलामत', 'लेटे', 'जेसी,', 'कोरटेज!', 'चिल्लाते', 'प्र.', 'जिम,.', 'बेन', 'उन्हें', 'थोड़े', 'कहते', 'तमाशा', '[सारा', 'छतें', 'होंगी?', 'हैं', 'पचास', 'करुँगा.', 'टॉल्जन', 'चॉपस्टिक्स', 'पीते.', 'विटामिन', 'रहूँगा', 'योनी', 'दुश्मन,', 'पट्टा', 'बारी..', 'कर्नल.', 'अस्त-व्यस्त', 'पचास', 'आचरण', 'Wildlings', 'सम्पर्क', 'आंकड़े.', 'मिलेगा।', 'पैकेज', '"ये', 'मारोगे,', 'डेन्वर', 'जाऊँगा."', 'आचरण', 'हानी', 'हानी', 'कृपया,', 'छाल)', 'खुले']
10.003636360168457
[Epoch 1 / 100]
Translated example sentence: 
 ['मैं', 'तुम्हें', 'कुछ', 'चाहते', 'हैं', '<eos>']
5.797691345214844
[Epoch 2 / 100]
Translated example sentence: 
 ['मैं', 'तुम्हें', 'कुछ', 'बता', 'चाहते', 'हैं', '<eos>']
3.568361759185791
[Epoch 3 / 100]
Translated example sentence: 
 ['मैं', 'तुम्हें', 'कुछ', 'बताना', 'चाहते', 'हैं', '<eos>']
2.494175672531128
[Epoch 4 / 100]
Translated example sentence: 
 ['मैं', 'तुम्हें', 'कुछ', 'कहना', 'चाहता', 'हूँ'