In [1]:
import spacy
import torch
import torch.nn as nn
from torchtext.data import Field, BucketIterator, TabularDataset
import pandas as pd
import string
from torchtext.data.metrics import bleu_score
from torch.utils.tensorboard import SummaryWriter
import random

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def extract_data(data_dir='DataSet2', train_csv='train.csv', validation_csv='validation.csv'):
    """Convert the parallel text files into the CSV files read by TabularDataset.

    Reads ``train.en``/``train.hi`` and ``dev.en``/``dev.hi`` from ``data_dir``
    (one sentence per line, UTF-8) and writes two CSV files with the columns
    ``English`` and ``Hindi``.

    Args:
        data_dir: Directory that holds the four parallel text files.
        train_csv: Output path for the training pairs.
        validation_csv: Output path for the validation pairs.

    Raises:
        ValueError: If the English and Hindi file of a split do not contain
            the same number of lines.
    """
    def _read_lines(file_name):
        # The context manager closes the handle even if reading fails.
        with open(f'{data_dir}/{file_name}', encoding='utf8') as handle:
            return handle.read().strip().split('\n')

    def _paired_frame(split_name):
        eng_lines = _read_lines(f'{split_name}.en')
        hin_lines = _read_lines(f'{split_name}.hi')
        if len(eng_lines) != len(hin_lines):
            raise ValueError(
                f"{split_name}: {len(eng_lines)} English lines but "
                f"{len(hin_lines)} Hindi lines; the files must be parallel"
            )
        return pd.DataFrame({'English': eng_lines, 'Hindi': hin_lines}, columns=['English', 'Hindi'])

    # Read and validate both splits before writing anything, so a bad input
    # never leaves a half-updated pair of CSV files behind.
    training_df = _paired_frame('train')
    validation_df = _paired_frame('dev')
    training_df.to_csv(train_csv, index=False)
    validation_df.to_csv(validation_csv, index=False)

In [3]:
# extract_data()  # uncomment and run once to (re)generate train.csv and validation.csv

In [4]:
# English spaCy pipeline; only its tokenizer is used, by tokenize_eng.
# NOTE(review): the "en" shortcut name is presumably only accepted by spaCy 2.x — confirm against the installed version.
tok_eng = spacy.load("en")
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only the ASCII punctuation set is removed; any other character is kept.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)
def tokenize_eng(text):
    """Tokenise English ``text`` into lower-cased token strings.

    Punctuation is stripped with ``remove_punctuation`` first, then the
    remaining text is split by the spaCy tokenizer held in ``tok_eng``.
    """
    cleaned = remove_punctuation(text)
    lowered_tokens = []
    for token in tok_eng.tokenizer(cleaned):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens
def tokenize_hindi(text):
    """Tokenise Hindi ``text`` on whitespace after stripping ASCII punctuation.

    ``str.split()`` without a separator is used on purpose: deleting a
    punctuation mark that stood between two spaces leaves consecutive spaces,
    and ``split(' ')`` would turn those into empty-string tokens (and an empty
    sentence into ``['']``). Splitting on runs of whitespace yields only real
    tokens and an empty list for empty input.
    """
    return remove_punctuation(text).split()

In [5]:
# Each Field defines how one CSV column is tokenised, wrapped in <sos>/<eos> and mapped to vocabulary indices.
english = Field(sequential=True,use_vocab=True,tokenize=tokenize_eng,lower=True, init_token="<sos>", eos_token="<eos>")
hindi = Field(sequential=True,use_vocab=True,tokenize=tokenize_hindi, init_token="<sos>", eos_token="<eos>")
# CSV column name -> (attribute name on each example, Field that processes it).
fields = {"English": ("eng", english), "Hindi": ("hin", hindi)}
train_data, test_data = TabularDataset.splits(
    path="", train="train.csv", test="validation.csv", format="csv", fields=fields
)
# Vocabularies are built from the training split only.
english.build_vocab(train_data, max_size=30000, min_freq=1,vectors="glove.6B.200d")
# NOTE(review): glove.6B is an English embedding set, so presumably almost no Hindi
# token receives a pretrained vector here — confirm, and consider Hindi vectors instead.
hindi.build_vocab(train_data, max_size=30000, min_freq=1,vectors="glove.6B.200d")
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=100,
    # Fall back to the CPU when CUDA is unavailable instead of failing on a hard-coded GPU.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    sort_within_batch=True,
    # Bucket by source length so that batches need little padding.
    sort_key=lambda x: len(x.eng),
)

In [6]:
# Show the vocabulary indices of the Hindi end- and start-of-sequence markers.
for special_token in ("<eos>", "<sos>"):
    print(hindi.vocab.stoi[special_token])

3
2


In [7]:
class Encoder(nn.Module):
    """Single-layer bidirectional LSTM encoder over a frozen embedding table.

    Args:
        input_dim: Number of rows of the embedding table (source vocabulary size).
        embed_dim: Width of each embedding vector.
        hidden_dim: Hidden size of the LSTM per direction.
    """

    def __init__(self,input_dim, embed_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.input_dim = input_dim

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, 1, bidirectional=True)

        # The embedding weights are excluded from gradient updates.
        self.embedding.weight.requires_grad = False

        # Project the concatenated forward/backward states back to hidden_dim.
        self.fc_hidden = nn.Linear(2 * hidden_dim, hidden_dim)
        self.fc_cell = nn.Linear(2 * hidden_dim, hidden_dim)

    def forward(self, x):
        """Encode a batch of token indices.

        Returns:
            A tuple ``(outputs, hidden, cell)``: the per-step LSTM outputs and
            the final hidden and cell states after merging both directions
            through ``fc_hidden`` / ``fc_cell``.
        """
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.rnn(embedded)

        # Row 0 is the forward direction, row 1 the backward direction.
        forward_h, backward_h = hidden[0:1], hidden[1:2]
        forward_c, backward_c = cell[0:1], cell[1:2]

        merged_hidden = self.fc_hidden(torch.cat((forward_h, backward_h), dim=2))
        merged_cell = self.fc_cell(torch.cat((forward_c, backward_c), dim=2))

        return outputs, merged_hidden, merged_cell

In [8]:
class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over the encoder states.

    Args:
        input_dim: Number of rows of the embedding table (target vocabulary size).
        embed_dim: Width of each embedding vector.
        hidden_dim: Hidden size of the decoder LSTM; encoder states are expected
            to be ``2 * hidden_dim`` wide.
        output_size: Number of logits produced per step.
        freeze_embedding: When True the embedding weights are excluded from
            training. Defaults to False so the target-language embeddings can
            still be learned when the vectors copied into them carry little or
            no information; pass True to keep the table fixed.
    """

    def __init__(self,input_dim,embed_dim,hidden_dim,output_size,freeze_embedding=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.input_dim = input_dim
        self.output_size = output_size

        self.embedding = nn.Embedding(input_dim, embed_dim)
        # The LSTM consumes the attention context (2 * hidden_dim) joined with the token embedding.
        self.rnn = nn.LSTM(hidden_dim * 2 + embed_dim, hidden_dim,1)

        if freeze_embedding:
            self.embedding.weight.requires_grad = False

        # Scores one encoder position from [decoder hidden ; encoder state].
        self.energy = nn.Linear(hidden_dim * 3, 1)
        self.fc = nn.Linear(hidden_dim, output_size)
        # Normalise attention scores across the sequence dimension (dim 0).
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        """Run one decoding step.

        Args:
            x: Previous target token indices, shape (N,).
            encoder_states: Encoder outputs, shape (seq_length, N, hidden_dim * 2).
            hidden: Current decoder hidden state, shape (1, N, hidden_dim).
            cell: Current decoder cell state, shape (1, N, hidden_dim).

        Returns:
            A tuple ``(predictions, hidden, cell)`` where ``predictions`` has
            shape (N, output_size).
        """
        x = x.unsqueeze(0)
        embedding = self.embedding(x)
        sequence_length = encoder_states.shape[0]
        # Repeat the hidden state so it can be paired with every encoder position.
        h_reshaped = hidden.repeat(sequence_length, 1, 1)
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # energy: (seq_length, N, 1)

        attention = self.softmax(energy)
        # attention: (seq_length, N, 1)

        # attention: (seq_length, N, 1), snk
        # encoder_states: (seq_length, N, hidden_size*2), snl
        # we want context_vector: (1, N, hidden_size*2), i.e knl
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input: (1, N, hidden_size*2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs).squeeze(0)
        # predictions: (N, output_size)

        return predictions, hidden, cell

In [9]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module returning ``(encoder_states, hidden, cell)`` for a source batch.
        decoder: Module exposing ``output_size`` and returning
            ``(logits, hidden, cell)`` for one decoding step.
    """

    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Produce logits for every target position.

        Args:
            source: Source token indices, shape (src_len, N).
            target: Target token indices, shape (trg_len, N).
            teacher_force_ratio: Probability of feeding the ground-truth token
                (instead of the model's own prediction) into the next step.

        Returns:
            Tensor of shape (trg_len, N, decoder.output_size); position 0 is
            left as zeros because decoding starts from ``target[0]``.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Size the buffer from the decoder itself so it always matches the logits
        # the decoder emits, and allocate it on the input's device rather than
        # relying on notebook-level globals.
        target_vocab_size = self.decoder.output_size
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        encoder_states , hidden, cell = self.encoder(source)
        # The first decoder input is the start-of-sequence row of the target.
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [10]:
# Training hyperparameters
num_epochs = 30
learning_rate = 0.001
batch_size = 100

# Model hyperparameters
load_model = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(english.vocab)
input_size_decoder = len(hindi.vocab)
# NOTE(review): the targets fed to the loss are Hindi indices, so this should arguably be
# len(hindi.vocab); changing it requires Seq2Seq.forward to size its output buffer consistently.
output_size = len(english.vocab)
# Must equal the width of the pretrained vectors copied into the embeddings below.
encoder_embedding_size = 200
decoder_embedding_size = 200
hidden_size = 600

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
# Initialise both embedding tables from the vectors attached to each vocabulary.
model.encoder.embedding.weight.data.copy_(english.vocab.vectors)
model.decoder.embedding.weight.data.copy_(hindi.vocab.vectors)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The loss is computed against Hindi target indices, so the padding index to
# ignore must come from the Hindi vocabulary.
pad_idx = hindi.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [11]:
# Global TensorBoard step counter, advanced once per logged training batch.
step = 0
# TensorBoard event files for the loss curve are written under runs/loss_plot.
writer = SummaryWriter("runs/loss_plot")

In [12]:
def translate_sentence(model, sentence, english, hindi, device, max_length=50):
    """Greedily translate one English sentence into a list of Hindi tokens.

    Args:
        model: Object exposing ``encoder`` and ``decoder`` callables.
        sentence: Either a raw string (tokenised with spaCy) or an iterable of
            token strings.
        english: Source field providing ``init_token``, ``eos_token`` and ``vocab.stoi``.
        hindi: Target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: Device on which the input tensors are created.
        max_length: Maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. The list ends with
        ``<eos>`` only if the model emitted it within ``max_length`` steps.
    """
    if isinstance(sentence, str):
        # Loading a spaCy pipeline is expensive, so it is done lazily, only for
        # raw-string input, and cached on the function for later calls.
        spacy_en = getattr(translate_sentence, "_spacy_en", None)
        if spacy_en is None:
            spacy_en = spacy.load("en")
            translate_sentence._spacy_en = spacy_en
        tokens = [token.text.lower() for token in spacy_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [english.init_token] + tokens + [english.eos_token]
    text_to_indices = [english.vocab.stoi[token] for token in tokens]
    # Shape (src_len, 1): a batch containing this single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    start_index = hindi.vocab.stoi["<sos>"]
    end_index = hindi.vocab.stoi["<eos>"]

    outputs = [start_index]

    with torch.no_grad():
        # Build encoder states and the initial decoder hidden/cell state.
        outputs_encoder, hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            prev_pred = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(prev_pred, outputs_encoder, hidden, cell)
            pred = output.argmax(1).item()
            outputs.append(pred)

            # Model predicts it's the end of the sentence
            if pred == end_index:
                break

    translated_sentence = [hindi.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, english, hindi, device):
    """Compute the corpus BLEU score of the model's translations over ``data``.

    Args:
        data: Iterable of examples exposing ``eng`` (source tokens) and
            ``hin`` (reference tokens).
        model: Model passed through to ``translate_sentence``.
        english: Source field.
        hindi: Target field; its ``eos_token`` marks the end of a prediction.
        device: Device passed through to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for all predictions against their
        single references.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["eng"]
        trg = vars(example)["hin"]

        prediction = translate_sentence(model, src, english, hindi,device)
        # Only strip the trailing token when it really is <eos>; a prediction cut
        # off at the length limit ends with a genuine word that must be kept.
        if prediction and prediction[-1] == hindi.eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialise ``state`` with ``torch.save``.

    Args:
        state: Object to persist (e.g. a dict of state dicts).
        filename: Destination accepted by ``torch.save``.
    """
    announcement = "=> Saving checkpoint"
    print(announcement)
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from a checkpoint mapping.

    Args:
        checkpoint: Mapping with the keys ``"state_dict"`` and ``"optimizer"``.
        model: Object whose ``load_state_dict`` receives ``checkpoint["state_dict"]``.
        optimizer: Object whose ``load_state_dict`` receives ``checkpoint["optimizer"]``.
    """
    print("=> Loading checkpoint")
    # The model is restored first, then the optimizer.
    restore_plan = ((model, "state_dict"), (optimizer, "optimizer"))
    for receiver, key in restore_plan:
        receiver.load_state_dict(checkpoint[key])

In [None]:
#if load_model:
#    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)


# Fixed probe sentence, translated at the start of every epoch as a qualitative progress check.
sentence = "I want to tell you something"

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    #checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    #save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence,english, hindi, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    epoch_loss = 0.0
    num_batches = 0

    for batch_idx, batch in enumerate(train_iterator):

        # Lightweight progress bar: one '#' per ten batches.
        if batch_idx % 10 == 0:
            print('#',end="")

        # Get input and targets and move them to the training device
        inp_data = batch.eng.to(device)
        target = batch.hin.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
        # expects (N, classes) logits and (N,) targets, so both are flattened over
        # the time and batch dimensions. Position 0 (the start token) is dropped
        # because the model never predicts it.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Plain Python float: safe to accumulate and log without keeping the graph alive.
        loss_value = loss.item()
        epoch_loss += loss_value

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1
        num_batches += 1

    print('')
    # max(..., 1) avoids a ZeroDivisionError when the iterator yields no batches.
    print('EPOCH LOSS ',epoch_loss/max(num_batches, 1))
    print('Calculating BLEU Score...')
    # Evaluate in inference mode; the next epoch switches back to train mode.
    model.eval()
    score = bleu(test_data, model, english, hindi, device)
    print(f"Bleu score {score*100:.2f}")
    print(' ----------------------------------------------------------------------------------------------- ')

[Epoch 0 / 30]
Translated example sentence: 
 ['मैं', 'मैं', 'आप', 'है', 'कि', 'कर', '<eos>']
#####################################################################################
EPOCH LOSS  5.100112375074526
Calculating BLEU Score...
