In [54]:
!pip3 install indic-nlp-library

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [55]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F
from indicnlp.tokenize import indic_tokenize

In [56]:
data = pd.read_csv("/kaggle/input/sample-data-01/Sentence pairs in English-Hindi - 2025-02-11.tsv",sep="\t",header=None, names = ["SrcSentID","SrcSent","DstSentID","DstSent"])
data.head()

Unnamed: 0,SrcSentID,SrcSent,DstSentID,DstSent
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [57]:
data.shape

(13182, 4)

In [58]:
data.drop(labels=[data.columns[0],data.columns[2]],axis=1,inplace=True)

In [59]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [60]:
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

In [61]:
data["SrcSent"]=data["SrcSent"].apply(lambda x: src_sent_tokenizer.tokenize(x))
# sub-word tokenization ALgorithm used - BPE Byte pair encoding and the one not used is WPT - word piece tokenizer

In [62]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की हो गई है।
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की है।
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,"[▁That, ▁won, ', t, ▁happen, .]",वैसा नहीं होगा।
4,"[▁I, ▁miss, ▁you, .]",मुझें तुम्हारी याद आ रही है।


In [63]:
data["DstSent"]=data["DstSent"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [64]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[▁That, ▁won, ', t, ▁happen, .]","[वैसा, नहीं, होगा, ।]"
4,"[▁I, ▁miss, ▁you, .]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [65]:
data["SrcSent"]=data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [66]:
vs=src_sent_tokenizer.get_vocab()
len(vs)

32100

In [67]:
hindi_vocab=set()
for tokenized_hindi_sent in data["DstSent"]:
    hindi_vocab.update(tokenized_hindi_sent)
len(hindi_vocab)

7069

In [68]:
vd = dict()
for idx,token in enumerate(hindi_vocab):
    vd[token] = idx + 3
vd["<SOS>"]=1
vd["<PAD>"]=0
vd["<EOS>"]=2

In [69]:
hindi_idx2vocab = dict(zip(vd.values(),vd.keys()))

Types of token used in every model <br>
"SOS" - Start of sentence (1) <br>
"PAD" - Padding (0)<br>
"EOS" - End of sentence (2)<br>
Gradient descent algorithm in the case of neural network is called backpropogation algorithm to reduce the loss of data <br>

In [70]:
def covert_hindi_token_to_ids(hindi_sent):
    return [vd[token] for token in hindi_sent]

In [71]:
data["DstSent"]=data["DstSent"].apply(lambda x:covert_hindi_token_to_ids(x))

In [72]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 3029, 3372, 658..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 6580, 2855]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[2693, 2139, 664, 1467, 1592, 1729, 4434, 2519..."
3,"[466, 751, 31, 17, 1837, 5]","[4972, 1277, 482, 2855]"
4,"[27, 3041, 25, 5]","[5804, 6958, 4083, 597, 34, 6580, 2855]"


In [73]:
def insert_sos_token_id(hindi_token_sent_ids):
    return [1]+hindi_token_sent_ids

In [74]:
data["DstSentInput"]=data["DstSent"].apply(lambda x:insert_sos_token_id(x))
data.head()

Unnamed: 0,SrcSent,DstSent,DstSentInput
0,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 3029, 3372, 658...","[1, 4702, 5336, 6609, 5180, 5410, 3029, 3372, ..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 6580, 2855]","[1, 4702, 5336, 6609, 5180, 5410, 6580, 2855]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[2693, 2139, 664, 1467, 1592, 1729, 4434, 2519...","[1, 2693, 2139, 664, 1467, 1592, 1729, 4434, 2..."
3,"[466, 751, 31, 17, 1837, 5]","[4972, 1277, 482, 2855]","[1, 4972, 1277, 482, 2855]"
4,"[27, 3041, 25, 5]","[5804, 6958, 4083, 597, 34, 6580, 2855]","[1, 5804, 6958, 4083, 597, 34, 6580, 2855]"


In [75]:
def insert_eos_token_id(hindi_token_sent_ids):
    return hindi_token_sent_ids+[2]

In [76]:
data["DstSentLabel"]=data["DstSent"].apply(lambda x:insert_eos_token_id(x))
data.head()

Unnamed: 0,SrcSent,DstSent,DstSentInput,DstSentLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 3029, 3372, 658...","[1, 4702, 5336, 6609, 5180, 5410, 3029, 3372, ...","[4702, 5336, 6609, 5180, 5410, 3029, 3372, 658..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[4702, 5336, 6609, 5180, 5410, 6580, 2855]","[1, 4702, 5336, 6609, 5180, 5410, 6580, 2855]","[4702, 5336, 6609, 5180, 5410, 6580, 2855, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[2693, 2139, 664, 1467, 1592, 1729, 4434, 2519...","[1, 2693, 2139, 664, 1467, 1592, 1729, 4434, 2...","[2693, 2139, 664, 1467, 1592, 1729, 4434, 2519..."
3,"[466, 751, 31, 17, 1837, 5]","[4972, 1277, 482, 2855]","[1, 4972, 1277, 482, 2855]","[4972, 1277, 482, 2855, 2]"
4,"[27, 3041, 25, 5]","[5804, 6958, 4083, 597, 34, 6580, 2855]","[1, 5804, 6958, 4083, 597, 34, 6580, 2855]","[5804, 6958, 4083, 597, 34, 6580, 2855, 2]"


In [77]:
data.columns

Index(['SrcSent', 'DstSent', 'DstSentInput', 'DstSentLabel'], dtype='object')

In [78]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [79]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hindi_sent_ids) for tokenized_hindi_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hindi_sent_ids) for tokenized_hindi_sent_ids in Y_label]

In [80]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first = True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first = True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first = True)

In [81]:
ns=X_padded.shape[1]
ns

68

In [82]:
nd=Y_padded_label.shape[1]
nd

68

In [83]:
class Encoder(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,word_embedding_dim):
        super(Encoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)

    def forward(self,X_padded_mini_batch):

        first_embedding_layer_out = self.first_embedding_layer(X_padded_mini_batch)
        encoder_output, (final_encoder_output,final_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_output, (final_encoder_output,final_cell_state)

In [84]:
class Decoder(torch.nn.Module):

    def __init__(self,dst_lang_vocab_size,word_embedding_dim):
        super(Decoder,self).__init__()

        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)
        self.prediction_layer = torch.nn.Linear(in_features=word_embedding_dim,out_features=dst_lang_vocab_size)
        #self.prediction_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,Y_padded_input_mini_batch,final_encoder_output,final_cell_state):
        
        first_embedding_layer_out = self.first_embedding_layer(Y_padded_input_mini_batch)
        decoder_lstm_layer_out, (final_decoder_lstm_layer_out, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out,
                                                                                                         (final_encoder_output,
                                                                                                          final_cell_state))
        prediction = self.prediction_layer(decoder_lstm_layer_out)
        
        return prediction, (final_decoder_lstm_layer_out, final_cell_state)

In [85]:
class Seq2SeqEncDec(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,dst_lang_vocab_size,word_embedding_dim):
        super(Seq2SeqEncDec,self).__init__()

        self.encoder = Encoder(src_lang_vocab_size,word_embedding_dim)
        self.decoder = Decoder(dst_lang_vocab_size,word_embedding_dim)

    def forward(self,X_padded_mini_batch,Y_padded_input_mini_batch):

        encoder_output, (final_encoder_output,final_cell_state) = self.encoder(X_padded_mini_batch)
        y_hat_mini_batch = self.decoder(Y_padded_input_mini_batch,
                                        final_encoder_output,final_cell_state)

        return y_hat_mini_batch

In [86]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [87]:
X_padded_train = X_padded[0:13000]
Y_padded_input_train = Y_padded_input[0:13000]
Y_padded_label_train = Y_padded_label[0:13000]

X_padded_test = X_padded[13000:]
Y_padded_input_test = Y_padded_input[13000:]
Y_padded_label_test = Y_padded_label[13000:]

In [88]:
network = Seq2SeqEncDec(len(vs),len(vd),128).to(device)

In [89]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(network.parameters())
num_epochs = 300
mb_size = 65

for epoch in range(num_epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_mb = Y_padded_input_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_padded_label_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_label_mb.reshape(Y_label_mb.shape[0]*Y_label_mb.shape[1],)
        
        X_train_mb, Y_input_mb, Y_label_mb = X_train_mb.to(device), Y_input_mb.to(device), Y_label_mb.to(device)

        y_hat_train_mb = network(X_train_mb,Y_input_mb)
        y_hat_train_mb = y_hat_train_mb[0]
        y_hat_train_mb = y_hat_train_mb.reshape(y_hat_train_mb.shape[0]*y_hat_train_mb.shape[1],
                                                y_hat_train_mb.shape[2])

        loss_fn_value = loss_fn(y_hat_train_mb,Y_label_mb)

        loss_fn_value.backward()
        #torch.nn.utils.clip_grad_norm_(network.parameters(),max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        print("Epoch # {}, Time Step # {}, Loss Value = {}".format(epoch,i,loss_fn_value))

Epoch # 0, Time Step # 0, Loss Value = 8.873336791992188
Epoch # 0, Time Step # 1, Loss Value = 8.844696998596191
Epoch # 0, Time Step # 2, Loss Value = 8.812073707580566
Epoch # 0, Time Step # 3, Loss Value = 8.792744636535645
Epoch # 0, Time Step # 4, Loss Value = 8.76591968536377
Epoch # 0, Time Step # 5, Loss Value = 8.743067741394043
Epoch # 0, Time Step # 6, Loss Value = 8.725313186645508
Epoch # 0, Time Step # 7, Loss Value = 8.667067527770996
Epoch # 0, Time Step # 8, Loss Value = 8.639947891235352
Epoch # 0, Time Step # 9, Loss Value = 8.5847749710083
Epoch # 0, Time Step # 10, Loss Value = 8.533122062683105
Epoch # 0, Time Step # 11, Loss Value = 8.469985961914062
Epoch # 0, Time Step # 12, Loss Value = 8.331007957458496
Epoch # 0, Time Step # 13, Loss Value = 8.350089073181152
Epoch # 0, Time Step # 14, Loss Value = 8.181182861328125
Epoch # 0, Time Step # 15, Loss Value = 8.090423583984375
Epoch # 0, Time Step # 16, Loss Value = 7.936682224273682
Epoch # 0, Time Step # 17, 

In [90]:
torch.save(network.state_dict(),"model.pth")

In [91]:
def generate_translation(eng_sentence):

    tokenized_eng_sentence = src_sent_tokenizer.tokenize(eng_sentence)
    token_ids = src_sent_tokenizer.convert_tokens_to_ids(tokenized_eng_sentence)
    token_ids_tensor = torch.tensor(token_ids)
    token_ids_tensor = torch.unsqueeze(token_ids_tensor,0)

    if torch.cuda.is_available():
        device = torch.device("cuda")
        token_ids_tensor = token_ids_tensor.to(device)

    encoder_outputs,(final_encoder_output,final_candidate_cell_state) = network.encoder(token_ids_tensor)
    decoder_first_time_step_input = torch.tensor([[1]])

    if torch.cuda.is_available():
        encoder_outputs = encoder_outputs.to(device)
        final_encoder_output = final_encoder_output.to(device)
        final_candidate_cell_state = final_candidate_cell_state.to(device)
        decoder_first_time_step_input = decoder_first_time_step_input.to(device)

    decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(decoder_first_time_step_input,
                                                                          final_encoder_output,
                                                                          final_candidate_cell_state)

    generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
    generated_token_id = torch.unsqueeze(generated_token_id,1)

    hindi_translated_sentence = str()
    hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]

    if torch.cuda.is_available():
        generated_token_id = generated_token_id.to(device)
        hidden_decoder_output = hidden_decoder_output.to(device)
        hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)
        
    for i in range(nd-1):
        
        decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(generated_token_id,
                                                                                                hidden_decoder_output,
                                                                                                hidden_decoder_cell_state)
        generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
        generated_token_id = torch.unsqueeze(generated_token_id,1)

        if torch.cuda.is_available():
            generated_token_id = generated_token_id.to(device)
            hidden_decoder_output = hidden_decoder_output.to(device)
            hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)

        if generated_token_id.item() == vd["<EOS>"]:
            break

        hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]

    return hindi_translated_sentence

In [92]:
print(generate_translation("Muiriel"))

 अभी में लानत है ।
