## About
Machine Translation in PyTorch.

English to Hinglish.

Dataset - https://www.kaggle.com/datasets/mrutyunjaybiswal/hinge-english-to-hinglish-machine-translation

In [2]:
#mandatory imports
import unicodedata
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import os
import pandas as pd
import random
import string
import re
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence

In [3]:
train_df = pd.read_csv('/home/suraj/ClickUp/Jan-Feb/data/synthetic-dataset/train.csv')
valid_df = pd.read_csv('/home/suraj/ClickUp/Jan-Feb/data/synthetic-dataset/valid.csv')

In [4]:
train_df.head()

Unnamed: 0,English,Hindi,Hinglish,Average rating,Disagreement
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत...","module , ek program hoti hai , jismen ya to so...",7,6
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...,aur hamne aume samood ke pas unke bhaee saleh ...,6,4
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही...","aur jab unhen yad dilaya jata hai , to ve yad ...",10,0
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n,tumhen २०११ ka ted prize mil gaya hai\n,9,1
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...,unhonne bad science karne ke lie ye delhi univ...,7,0


In [5]:
valid_df.head()

Unnamed: 0,English,Hindi,Hinglish
0,Are you sure you want to permanently delete th...,क्या आप इन फ़ाइलों को स्थायी रूप से हटाना चाहत...,kya aap in files ko sthayi roop se permanently...
1,Three public meetings were held in Bombay City...,उस अवसर पर बंबई में तीन सभाएं की गयीं।\n,us avasar par bombay meetings were held ki gay...
2,Nominee of the insurance has to be a near rela...,बीमा का नामित व्यक्ति अभिदाता का निकट संबंधी ह...,insurance ka namit vyakti abhidata ka nikat sn...
3,Thus there was an inadequate appreciation of t...,इस प्रकार इस महत्वपूर्ण क्षेत्र में तेजी से का...,is prkar is vital sector inadequate appreciati...
4,"The Arabs laughed at him , and the alchemist l...",अरब सैनिक उसकी बात सुनकर हंस पड़े । <s> उनके स...,arab sainik uski bat sunakar hns pare arabs s...


In [6]:
#dropping all columns except English and Hinglish in the dataset
train_df = train_df.drop(['Hindi','Average rating','Disagreement'],axis=1)
valid_df = valid_df.drop(['Hindi'],axis=1)
print(train_df.columns)
print(valid_df.columns)

Index(['English', 'Hinglish'], dtype='object')
Index(['English', 'Hinglish'], dtype='object')


In [7]:
train_df

Unnamed: 0,English,Hinglish
0,Program module is a file that contains instruc...,"module , ek program hoti hai , jismen ya to so..."
1,And to Thamud We sent their brother Sali 'h. H...,aur hamne aume samood ke pas unke bhaee saleh ...
2,"and, when reminded, do not remember\n","aur jab unhen yad dilaya jata hai , to ve yad ..."
3,you won the TED Prize 2011.\n,tumhen २०११ ka ted prize mil gaya hai\n
4,He gone to Kerodemal College of Delhi Universi...,unhonne bad science karne ke lie ye delhi univ...
...,...,...
2761,Polar ice caps may melt further and increase t...,large size men polar ki barph pighalne se ocea...
2762,"It ' s what turns lead into gold , and makes t...","yahi chakr lead into gold bana deta hai , aur ..."
2763,The President said the North Eastern Hill Univ...,president ne kaha ki north parvtiy university ...
2764,The violin bow might well have grown out of th...,bahut snbhav hai ki vaylin ka gaj bhi ek chhar...


In [8]:
#preprocessing the dataframe
train_df.dropna(inplace=True)
valid_df.dropna(inplace=True)

In [9]:
#cleaning the text
#turning unicode string to plain ASCII

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
#clean text by converting to lower case, removing non -letter characters
def clean_text(text):
    text = unicodeToAscii(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub("[.!?]", '', text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    return text

In [10]:
#applying the clean_text method to df
train_df["English"] = train_df["English"].apply(clean_text)
train_df["Hinglish"] = train_df["Hinglish"].apply(clean_text)

valid_df["English"] = valid_df["English"].apply(clean_text)
valid_df["Hinglish"] = valid_df["Hinglish"].apply(clean_text)


In [11]:
train_df.head()

Unnamed: 0,English,Hinglish
0,program module is a file that contains instruc...,module ek program hoti hai jismen ya to source...
1,and to thamud we sent their brother sali h he ...,aur hamne aume samood ke pas unke bhaee saleh ...
2,and when reminded do not remember,aur jab unhen yad dilaya jata hai to ve yad na...
3,you won the ted prize,tumhen ka ted prize mil gaya hai
4,he gone to kerodemal college of delhi universi...,unhonne bad science karne ke lie ye delhi univ...


In [12]:
valid_df.head()

Unnamed: 0,English,Hinglish
0,are you sure you want to permanently delete th...,kya aap in files ko sthayi roop se permanently...
1,three public meetings were held in bombay city...,us avasar par bombay meetings were held ki gayin
2,nominee of the insurance has to be a near rela...,insurance ka namit vyakti abhidata ka nikat sn...
3,thus there was an inadequate appreciation of t...,is prkar is vital sector inadequate appreciati...
4,the arabs laughed at him and the alchemist lau...,arab sainik uski bat sunakar hns pare arabs s ...


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [14]:
#As per https://www.kaggle.com/code/zeyadkhalid/machine-translation-transformers
# We have to define word indexing
# string_to_index  - string to its associated index i.e stoi
# index_to_string - index to its string i.e itos


start_token = 1
end_token =2

class Vocabulary:
    def __init__(self, language):
        self.language = language
        self.stoi = {}#string2index
        self.stoc = {} # string2count
        self.itos = {0:"<PAD>",start_token:"<START>",end_token:"<END>"}#index2sting
        for k,v in self.itos.items():
            self.stoi[v]=k
        self.num_words =3
    
    def add_word(self,word):
        if word not in self.stoi:
            self.stoi[word]=self.num_words
            self.stoc[word]=1
            self.itos[self.num_words]=word
            self.num_words+=1
        else:
            self.stoc[word]+=1

    def process_sentence(self,sentence):
        for word in sentence.split(' '):
            self.add_word(word)

            

In [15]:
english_vocab = Vocabulary('English')
hinglish_vocab = Vocabulary('Hinglish')

In [16]:
#helper functions
# while training, we shall need pair of input and output tensors whose ending is marked by end_token
start_token=1
end_token=2

#Step 1
#Helper function for creating indexes from sentence
def index_from_sentence(language,sentence):
    return [language.stoi[word] for word in sentence.split(' ')]
# [Hi There]  - [H, I, T, H, E, R, E] - [4,5,8,3,6,4]

def tensor_from_sentence(language,sentence):
    indices = index_from_sentence(language,sentence)
    indices.append(end_token)
    return torch.tensor(indices, dtype=torch.long).view(1,-1)

# step2- create tensor dataset by padding
def tensor_from_dataset(pair, input_language, output_language, max_input_len):
    input_tensor = tensor_from_sentence(input_language,pair[0])
    output_tensor = tensor_from_sentence(output_language, pair[1])
    with torch.no_grad():
        #padding
        pad_input = nn.ConstantPad1d((0,max_input_len-input_tensor.shape[1]),0)
        pad_output = nn.ConstantPad1d((0,max_input_len-output_tensor.shape[1]),0)

        #applying padding
        input_tensor_padded = pad_input(input_tensor)
        output_tensor_padded = pad_output(output_tensor)
    pair_tensor = pad_sequence([input_tensor_padded,output_tensor_padded],batch_first=False, padding_value=0)

    return pair_tensor
    

In [17]:
# sample_hinglish_tensor = tensor_from_sentence(hinglish_vocab,"kya aap in files ko sthayi")
# print(sample_hinglish_tensor)

In [18]:
class TranslationDataset(Dataset):

    def __init__(self,dataframe,english_vocab,hinglish_vocab,transform=None):
        self.dataframe = dataframe
        self.english = self.dataframe['English'].values.tolist()
        self.hinglish = self.dataframe['Hinglish'].values.tolist()
        self.input_lang = english_vocab
        self.output_lang = hinglish_vocab
        #building vocabulary
        for english_sent in self.english:
            self.input_lang.process_sentence(english_sent)
        for hinglish_sent in self.hinglish:
            self.output_lang.process_sentence(hinglish_sent)
        # creating tensors
        self.hinglish_tensors =[tensor_from_sentence(hinglish_vocab,sentence) for sentence in self.hinglish]
        self.english_tensors = [tensor_from_sentence(english_vocab,sentence) for sentence in self.english]
        

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self,index):
        hinglish_sample = self.hinglish_tensors[index][0]
        english_sample = self.english_tensors[index][0]
        sample = {'input':english_sample,'output':hinglish_sample}

        return sample

In [19]:
train_dataset = TranslationDataset(train_df,english_vocab,hinglish_vocab)

In [20]:
train_dataset.__getitem__(1)

{'input': tensor([24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
         42, 43, 44, 45, 33, 46, 39, 47, 48, 49, 17, 15, 50, 24, 51, 52, 39, 53,
         14, 54, 55, 56, 45, 25, 57, 39, 24, 58, 59, 60, 45, 14, 61, 62, 35, 63,
          5, 64, 65, 25, 66, 23,  2]),
 'output': tensor([23, 24, 25, 26, 15, 27, 28, 29, 30, 31, 32, 33, 34, 10, 35, 36, 37, 38,
         39, 40, 41, 37, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
         56, 44, 57, 38, 58, 59, 23, 55, 60, 61, 10, 62, 63, 44, 64, 65, 66, 67,
         68, 17, 69, 46, 70, 71, 72, 73, 74, 15, 75, 23, 76, 77, 23, 64, 78, 79,
          7,  2])}

In [21]:
val_dataset = TranslationDataset(valid_df,english_vocab,hinglish_vocab)

In [22]:
item = train_dataset.__getitem__(5)
print(item['input'], item['input'].shape, item['output'],item['output'].shape)

tensor([84, 27, 85, 86, 25, 87, 11, 17, 15, 88, 89, 90, 91, 92, 15, 93, 17, 29,
        94, 23,  2]) torch.Size([21]) tensor([ 66,  24,  81,  34, 110, 111, 112,  44, 113, 114, 115,  38, 116,  82,
        117,   7,  44, 118, 119, 120,  85, 121,   2]) torch.Size([23])


In [23]:
print(len(english_vocab.stoi),len(hinglish_vocab.stoi))

6684 10166


In [24]:
#collate_function
class Collater(object):
    def __init__(self, pad_index):
        self.pad_index = pad_index

    def __call__(self, batch):

        input = [item['input'] for item in batch]
        output = [item['output'] for item in batch]
        input = pad_sequence(input, batch_first=False, padding_value=self.pad_index)
        output = pad_sequence(output, batch_first=False, padding_value=self.pad_index)

        item = {'input':input, 'output':output}
        return item


In [25]:
pad_idx = english_vocab.stoi["<PAD>"]
train_loader = DataLoader(train_dataset,batch_size=4,shuffle=True, num_workers=4, pin_memory=True,collate_fn=Collater(pad_idx))#, collate_fn=Collater(0))

In [26]:
for i,item in enumerate(train_loader):
    print(item['input'].shape, item['output'].shape)
    break

torch.Size([35, 4]) torch.Size([53, 4])


In [27]:
#model architecture  #partially referenced from Aladdin pearson seq2seq model
# 1. Encoder 
# It generates a single output vector that summarises the input sequence meaning
# steps
# a. a word is fed to a network that generates an output and hidden state,
# b. The hidden state is fed to thenext word and process continues for updating weights.
# c. Last output also known as context vector is the representative of input sequence
class Encoder(nn.Module):
    def __init__(self, input_size,embedding_size,hidden_size,num_layers,prob):
        super().__init__()
        self.hidden_size=hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(prob)
        #generating embeddings
        self.embedding = nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,num_layers,prob)

    def forward(self,inputs):#input - longtensor of indexes , shape - seq_len,N
        embedding = self.dropout(self.embedding(inputs))
        #embedding shape, seq_len,N,embedding_size - each N mapped to an embedding size 
        outputs,(hidden_state,cell_state) = self.lstm(embedding)
        #context vector lies in hidden state and cell state
        return hidden_state,cell_state




In [28]:
#2 . Decoder 
class Decoder(nn.Module):
    """
    The context representative vector shall be used as initial hidden state with following steps
    1. At each step,an input token and hidden state is fed to the network. start_token = initiallyy
    The firt hidden is context vector og encoder
    2. The first output should be first sentence of the output 
    3. Output ends with end_token or at max_len termination
    """
    def __init__(self,input_size,embedding_size, hidden_size,output_size,num_layers,prob):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(prob)
        self.embedding = nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=prob)
        self.linear = nn.Linear(hidden_size,output_size) # out_size - lenght of language vocab

    def forward(self,inputs,hidden_state, cell_state):
        inputs = inputs.unsqueeze(0)
        embedding = self.dropout(self.embedding(input))
        outputs, (hidden_state,cell_state) = self.lstm(embedding,(hidden_state,cell_state))
        output = self.linear(outputs) #1,N, length_ of vovcab
        output = output.squeeze(0)

        return output,hidden_state,cell_state
    

In [38]:
#3. Linear Class which takes sequence and outputs sequence
class LinearModel(nn.Module):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self,source, target,teacher_force_ratio=0.4): #source - english , target -correct hinglish, 
        self.source = source
        self.target = target
        batch_size = self.source.shape[1]
        target_len = self.target.shape[0]
        
        
        target_vocab_size = len(hinglish_vocab.stoi)
        outputs = torch.zeros(target_len,batch_size,target_vocab_size).to(device)
        hidden_state,cell_state = self.encoder(self.source)

        #grabbing start token
        x = self.target[0]
        for t in range(1,target_len):
            output,hidden_state,cell_state = self.decoder(x,hidden_state,cell_state)
            outputs[t] = output
            best_guess = output.argmax(1)

            x = target[t] if random.random()< teacher_force_ratio else best_guess #feeding the guess or the original target
        return outputs
        

In [39]:
#hyperparameters
num_epochs=10
lr=1e-2
batch_size=32

load_model = False
input_size_encoder =  len(english_vocab.stoi)
input_size_decoder = len(hinglish_vocab.stoi)
output_size = len(hinglish_vocab.stoi)
encoder_embedding_size=512
decoder_embedding_size=512
hidden_size=2048
num_layers=4

encoder_dropout=0.5
decoder_dropout=0.5

#tensorboard 
writer = SummaryWriter(f'runs/loss_plot')
step=0



In [52]:
#initialising the encoder and decoder nets
encoder = Encoder(input_size_encoder,encoder_embedding_size,hidden_size,num_layers,encoder_dropout).to(device)
decoder = Decoder(input_size_decoder,decoder_embedding_size,hidden_size,output_size,num_layers, decoder_dropout).to(device)

In [53]:
model = LinearModel(encoder,decoder).to(device)

In [54]:
model

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(6684, 512)
    (rnn): LSTM(512, 2048, num_layers=4, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10166, 512)
    (rnn): LSTM(512, 2048, num_layers=4, dropout=0.5)
    (fc): Linear(in_features=2048, out_features=10166, bias=True)
  )
)

In [55]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [56]:
# training
for epoch in range(num_epochs):
    checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}
    torch.save(checkpoint,"checkpoint.pth.tar")
    model.train()
    for idx, batch in enumerate(train_loader):
        #input = torch.LongTensor(input)
        #output = torch.LongTensor(output)
        inputs = batch['input'].to(device)
        outputs = batch['output'].to(device)
        #input = input.permute(0,2,1)
        #output = output.permute(0,2,1)
        print(inputs.shape)
        print(outputs.shape)
        
        out = model(inputs,outputs)
        out = out[1:].reshape(-1,out.shape[2])
        output = output[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(out,outputs)
        loss.backward()
        optimizer.step()

        writer.add_scalar("Train loss",loss, global_step=step)
        step+=1
    print("Epoch- {}, Loss - {}".format(epoch,loss.item()))


torch.Size([32, 4])
torch.Size([36, 4])


ValueError: Expected input batch_size (140) to match target batch_size (36).

## Reference
To Do
1. Write Bleu score methods.
2. Predictions.