In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
import os
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

In [3]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer=AutoTokenizer.from_pretrained("google-T5/T5-base")

In [5]:
data=pd.read_csv("/home/thasin/class-projects/LLM-project/Sentence pairs in English-Hindi - 2025-02-13.tsv",sep="\t",header=None,names=["SrcSentenceID","SrcSentence","DstSentenceID","DstSentence"])

In [6]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [7]:
data["DstSentence"]=data["DstSentence"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [8]:
data["DstSentence"]

0              [म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]
1                      [म्यूरियल, अब, बीस, साल, की, है, ।]
2        [मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश...
3                                    [वैसा, नहीं, होगा, ।]
4                    [मुझें, तुम्हारी, याद, आ, रही, है, ।]
                               ...                        
13177            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13178            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13179            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13180            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13181              [मुझे, यह, साइकिल, अब, भी, पसंद, है, ।]
Name: DstSentence, Length: 13182, dtype: object

In [9]:
Nd=max(list(data["DstSentence"].apply(len)))

In [10]:
Nd

67

In [11]:
data["SrcSentence"]=data["SrcSentence"].apply(lambda x: tokenizer.tokenize(x))

In [12]:
data["SrcSentence"]

0                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
1                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
2        [▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...
3                          [▁That, ▁won, ', t, ▁happen, .]
4                                     [▁I, ▁miss, ▁you, .]
                               ...                        
13177    [▁Do, ▁you, ▁have, ▁some, ▁curry, ▁and, ▁some,...
13178    [▁Do, ▁you, ▁have, ▁curry, ▁and, ▁some, ▁rice, ?]
13179    [▁Do, ▁you, ▁have, ▁any, ▁curry, ▁with, ▁rice, ?]
13180          [▁Do, ▁you, ▁have, ▁curry, ▁with, ▁rice, ?]
13181              [▁I, ▁still, ▁love, ▁this, ▁bicycle, .]
Name: SrcSentence, Length: 13182, dtype: object

In [13]:
Ns=max(list(data["SrcSentence"].apply(len)))

In [14]:
Ns

68

In [15]:
Vs = tokenizer.get_vocab()

In [16]:
Vs

{'clin': 11005,
 'dose': 12051,
 '▁72': 9455,
 '▁purpose': 1730,
 '▁Bundes': 6387,
 '▁bombard': 26877,
 '▁perspective': 3503,
 '▁season': 774,
 '▁Kraft': 10756,
 'Certaine': 31781,
 '▁Trump': 2523,
 '▁categoria': 18146,
 '▁Kill': 14450,
 '▁chambres': 18357,
 '▁cafe': 6913,
 '▁efect': 14393,
 '▁10': 335,
 '▁soybean': 27161,
 'damals': 18866,
 'tention': 9174,
 '▁aktiv': 10935,
 '▁Graham': 15146,
 'LER': 25896,
 '▁Appliance': 24879,
 '▁orasului': 28069,
 '▁carti': 13689,
 'portrayed': 27486,
 '2019': 8584,
 '▁cookie': 7364,
 '▁Jay': 9373,
 '▁regard': 3553,
 'Fällen': 19730,
 '▁Information': 2784,
 '▁Arch': 9318,
 'implantation': 30918,
 'SMEs': 28664,
 'different': 25880,
 '▁servers': 8379,
 '▁excepţi': 30029,
 'modell': 18520,
 'hängt': 21172,
 'zon': 8892,
 '189': 25312,
 '▁Eric': 6964,
 '▁teammates': 24558,
 '▁Auction': 23040,
 '▁vermute': 31525,
 '▁Conversation': 28941,
 '▁români': 14109,
 'fibro': 20602,
 'BI': 5972,
 '▁survive': 7905,
 'incorporating': 18218,
 '▁Aroma': 24293,
 '▁c

In [17]:
Vd = set()
for tokenized_hindi_sentence in data['DstSentence']:
    Vd.update(tokenized_hindi_sentence)
    
hindi_vocab = dict()

for idx, token in enumerate(Vd):
    hindi_vocab[token] = idx + 3

hindi_vocab["<PAD>"] = 0
hindi_vocab["<SOS>"] = 1
hindi_vocab["<EOS>"] = 2

Vd = hindi_vocab

In [18]:
print(Vd)

{'साइकिल': 3, 'करवाया': 4, 'सकें': 5, 'मद्य': 6, 'बोलिंग': 7, 'लिफ़्ट': 8, 'बनायी': 9, 'टिकती': 10, 'चोदिये': 11, 'खा': 12, 'बाईं': 13, 'रफ़तार': 14, 'तारे': 15, 'गोलार्धों': 16, 'परख': 17, 'दीवारों': 18, 'लगती': 19, 'मुसलिम': 20, 'स्केटिंग': 21, 'सीढ़ी': 22, 'डराने': 23, 'भीग': 24, 'कुचल': 25, 'लीं': 26, 'दुर्रानी': 27, 'हाथ': 28, 'ग्रामवासी': 29, 'सेलीन': 30, 'दौड़ते': 31, 'दुखद': 32, 'सुलझा': 33, 'लुइस': 34, 'तथ्यों': 35, 'लापता': 36, 'सर्दियाँ': 37, 'झेला': 38, 'ज़ाहिर': 39, 'रोमानिया': 40, 'बदलिए': 41, 'सख्त': 42, 'टिलापिया': 43, 'जेनकिन्स': 44, 'सटिया': 45, 'आभारी': 46, 'जैनेट': 47, 'दोनो': 48, 'इसमें': 49, 'एंड': 50, '१२': 51, 'दूँ': 52, 'खान': 53, 'बादल': 54, 'पढ़ो': 55, 'हरे': 56, 'आधार': 57, 'खोल': 58, 'पहुँचे': 59, 'पतलून': 60, 'उदाहरण': 61, 'मलाय': 62, 'चूमने': 63, 'प्रकार': 64, 'बिताना': 65, 'मंगल': 66, 'फ़र्नीचर': 67, 'एक': 68, 'काँग': 69, 'नींद': 70, 'विवेक': 71, 'लायक': 72, 'पड़ने': 73, 'डर्जन': 74, 'ऊपर': 75, 'सह': 76, 'अक्तूबर': 77, 'द्वीप': 78, 'आंसू': 79, 'दबाकर': 8

In [19]:
len(Vd)

7072

In [20]:
#jis bhasa me translate krna hai us bhasa me teeno Token chahiye SOS EOS PAD
#jis se translate kr rha hai usme jarurat nhi hai

In [20]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",485968,"[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",2060319,"[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,1294,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",485564,"[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,1302,"[▁That, ▁won, ', t, ▁happen, .]",2060320,"[वैसा, नहीं, होगा, ।]"
4,1308,"[▁I, ▁miss, ▁you, .]",2060321,"[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [21]:
#converting English word to numerical
data['SrcSentence'] = data['SrcSentence'].apply(tokenizer.convert_tokens_to_ids)

#then after that english numeric value will ues as a input in NN afeter converting One hot encoding 

In [22]:
#converting Hindi Sentence to numbers 
def convert_hindi_tokens_to_ids(tokenized_hindi_sentence):
    return [Vd[token] for token in tokenized_hindi_sentence]

In [23]:
data['DstSentence'] = data['DstSentence'].apply(convert_hindi_tokens_to_ids)

In [24]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[6195, 5516, 1660, 3884, 1238, 971, 2680, 6535..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[6195, 5516, 1660, 3884, 1238, 6535, 6442]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[632, 3305, 4552, 4501, 4875, 3625, 1426, 4285..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[1760, 2558, 2071, 6442]"
4,1308,"[27, 3041, 25, 5]",2060321,"[828, 6400, 7043, 5826, 1405, 6535, 6442]"


In [36]:
def insert_sos_token_id(hindi_sentence_token_ids_list):
    return [1] + hindi_sentence_token_ids_list

In [37]:
def insert_eos_token_id(hindi_sentence_token_ids_list):
    return hindi_sentence_token_ids_list + [2]

In [38]:
data["DstSentenceInput"] = data["DstSentence"].apply(insert_sos_token_id)
data["DstSentenceLabel"] = data["DstSentence"].apply(insert_eos_token_id)

In [39]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence,DstSentenceInput,DstSentenceLabel
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[6195, 5516, 1660, 3884, 1238, 971, 2680, 6535...","[1, 6195, 5516, 1660, 3884, 1238, 971, 2680, 6...","[6195, 5516, 1660, 3884, 1238, 971, 2680, 6535..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[6195, 5516, 1660, 3884, 1238, 6535, 6442]","[1, 6195, 5516, 1660, 3884, 1238, 6535, 6442]","[6195, 5516, 1660, 3884, 1238, 6535, 6442, 2]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[632, 3305, 4552, 4501, 4875, 3625, 1426, 4285...","[1, 632, 3305, 4552, 4501, 4875, 3625, 1426, 4...","[632, 3305, 4552, 4501, 4875, 3625, 1426, 4285..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[1760, 2558, 2071, 6442]","[1, 1760, 2558, 2071, 6442]","[1760, 2558, 2071, 6442, 2]"
4,1308,"[27, 3041, 25, 5]",2060321,"[828, 6400, 7043, 5826, 1405, 6535, 6442]","[1, 828, 6400, 7043, 5826, 1405, 6535, 6442]","[828, 6400, 7043, 5826, 1405, 6535, 6442, 2]"


In [40]:
data.drop(labels=["SrcSentenceID","DstSentenceID","DstSentence"],axis=1,inplace=True)

In [41]:
data.head()

Unnamed: 0,SrcSentence,DstSentenceInput,DstSentenceLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 6195, 5516, 1660, 3884, 1238, 971, 2680, 6...","[6195, 5516, 1660, 3884, 1238, 971, 2680, 6535..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 6195, 5516, 1660, 3884, 1238, 6535, 6442]","[6195, 5516, 1660, 3884, 1238, 6535, 6442, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[1, 632, 3305, 4552, 4501, 4875, 3625, 1426, 4...","[632, 3305, 4552, 4501, 4875, 3625, 1426, 4285..."
3,"[466, 751, 31, 17, 1837, 5]","[1, 1760, 2558, 2071, 6442]","[1760, 2558, 2071, 6442, 2]"
4,"[27, 3041, 25, 5]","[1, 828, 6400, 7043, 5826, 1405, 6535, 6442]","[828, 6400, 7043, 5826, 1405, 6535, 6442, 2]"


In [42]:
X = list(data["SrcSentence"])
Y_input = list(data["DstSentenceInput"])
Y_label = list(data["DstSentenceLabel"])

X_tensor = [torch.tensor(eng_tokenized_ids) for eng_tokenized_ids in X]
Y_input_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_input]
Y_label_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_label]

X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_input_padded = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_label_padded = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [43]:
X_padded.shape

torch.Size([13182, 68])

In [44]:
X_padded[2]

tensor([ 2855,    16,    48,   296, 26963,     7,   140,     5,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [45]:
Ns = X_padded.shape[1]
Nd = Y_label_padded.shape[1]

In [46]:
Ns, Nd

(68, 68)

In [50]:
class Attention(torch.nn.Module):
    def __init__(self):
        super(Attention,self).__init__()
        self.attention_probabilities = torch.nn.Softmax(dim=1)

    def forward(self,encoder_outputs,decoder_lstm_layer_outputs):
        
        decoder_lstm_layer_outputs = torch.transpose(decoder_lstm_layer_outputs,dim0=1,dim1=2)
        alignment_scores = torch.bmm(encoder_outputs,decoder_lstm_layer_outputs)
        attention_weights = self.attention_probabilities(alignment_scores)
        attention_weights = torch.transpose(attention_weights,dim0=1,dim1=2)
        context_vectors = torch.bmm(attention_weights,encoder_outputs)

        return context_vectors

In [47]:
#1 Encoder
#2 Decoder
# 3 attention layer
#then combine all of these

# 1:
#embedding layer does not have bias that will give output in encoder that is first_embedding_layer_output (32100,32)
#the first_embedding_layer_output will go in LSTM layer as a input and give three output 1.encoder output(15,68,32) it is a matrix, 2.final_encoder_ouput[c[0]] vector, 3. final_candidate_cell_output(h1[0]) vector
#encoder box will give two outputs 1.inal_encoder_ouput, 2. final_candidate_cell_output
class Encoder(torch.nn.Module):
    def __init__(self,src_lang_vocab_size,topic_vector_dim):
        super(Encoder,self).__init__()
        self.first_emebdding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,
                                                        embedding_dim=topic_vector_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=topic_vector_dim,hidden_size=topic_vector_dim,
                                               batch_first=True)
        
    def forward(self,X_padded_mini_batch):

        first_embedding_layer_out = self.first_emebdding_layer(X_padded_mini_batch)
        encoder_outputs,(final_encoder_output,final_candidate_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_outputs,(final_encoder_output,final_candidate_cell_state)

In [48]:
class Decoder(torch.nn.Module):
    def __init__(self,dst_lang_vocab_size,topic_vector_dim):
        super(Decoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                        embedding_dim=topic_vector_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=topic_vector_dim,hidden_size=topic_vector_dim,
                                               batch_first=True)
        self.attention_layer = Attention()
        self.output_layer = torch.nn.Linear(in_features=topic_vector_dim*2,out_features=dst_lang_vocab_size)
        self.output_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,encoder_outputs,initial_hidden_state,initial_candidate_cell_state,
                Y_padded_mini_batch):

        first_embedding_layer_out = self.first_embedding_layer(Y_padded_mini_batch)
        decoder_lstm_layer_outputs,final_cell_hidden_states = self.second_lstm_layer(first_embedding_layer_out,
                                                                                    (initial_hidden_state,
                                                                                    initial_candidate_cell_state))
        context_vectors = self.attention_layer(encoder_outputs,decoder_lstm_layer_outputs)
        concatenated_lstm_layer_output = torch.concatenate(tensors=(decoder_lstm_layer_outputs,context_vectors),dim=2)
        affine_transformed_output = self.output_layer(concatenated_lstm_layer_output)
        decoder_outputs = self.output_layer_activation(affine_transformed_output)

        return decoder_outputs, final_cell_hidden_states

In [49]:
class Seq2SeqEncDecWithAttn(torch.nn.Module):
    def __init__(self,src_lang_vocab_size,dst_lang_vocab_size,topic_vector_dim):
        super(Seq2SeqEncDecWithAttn,self).__init__()
        self.encoder = Encoder(src_lang_vocab_size,topic_vector_dim)
        self.decoder = Decoder(dst_lang_vocab_size,topic_vector_dim)

    def forward(self,X_padded_mini_batch,Y_padded_mini_batch_input):

        encoder_outputs,(final_encoder_output,final_candidate_cell_state) = self.encoder(X_padded_mini_batch)
        Y_hat_mini_batch, final_cell_hidden_states = self.decoder(encoder_outputs,final_encoder_output,
                                                                  final_candidate_cell_state,Y_padded_mini_batch_input)
        
        return Y_hat_mini_batch

In [51]:
X_padded_train = X_padded[0:13000]
Y_input_padded_train = Y_input_padded[0:13000]
Y_label_padded_train = Y_label_padded[0:13000]

X_padded_test = X_padded[13000:]
Y_input_padded_test = Y_input_padded[13000:]
Y_label_padded_test = Y_label_padded[13000:]

In [52]:
nw = Seq2SeqEncDecWithAttn(src_lang_vocab_size=len(Vs),dst_lang_vocab_size=len(Vd),topic_vector_dim=32)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(params=nw.parameters())
epochs = 1
mb_size = 26

for epoch in range(epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_train_mb = Y_input_padded_train[i*mb_size:(i+1)*mb_size]
        Y_label_train_mb = Y_label_padded_train[i*mb_size:(i+1)*mb_size]

        Y_label_train_mb = Y_label_train_mb.reshape(Y_label_train_mb.shape[0]*Y_label_train_mb.shape[1],)

        Y_pred_train_mb = nw(X_train_mb,Y_input_train_mb)
        Y_pred_train_mb = Y_pred_train_mb.reshape(Y_pred_train_mb.shape[0]*Y_pred_train_mb.shape[1],
                                                  Y_pred_train_mb.shape[2])
        

        loss_fn_value = loss_fn(Y_pred_train_mb,Y_label_train_mb)

        loss_fn_value.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 25 == 0:
            print("Epoch #{}, Mini Batch #{}, CCE Loss = {}".format(epoch,i,loss_fn_value))

Epoch #0, Mini Batch #0, CCE Loss = 8.863905906677246
Epoch #0, Mini Batch #25, CCE Loss = 7.972998142242432
Epoch #0, Mini Batch #50, CCE Loss = 7.980232238769531
Epoch #0, Mini Batch #75, CCE Loss = 7.97843599319458
Epoch #0, Mini Batch #100, CCE Loss = 7.994259834289551
Epoch #0, Mini Batch #125, CCE Loss = 7.984238147735596
Epoch #0, Mini Batch #150, CCE Loss = 7.957523345947266
Epoch #0, Mini Batch #175, CCE Loss = 7.974461555480957
Epoch #0, Mini Batch #200, CCE Loss = 7.947882652282715
Epoch #0, Mini Batch #225, CCE Loss = 7.947875022888184
Epoch #0, Mini Batch #250, CCE Loss = 7.956212520599365
Epoch #0, Mini Batch #275, CCE Loss = 7.956339359283447
Epoch #0, Mini Batch #300, CCE Loss = 7.937109470367432
Epoch #0, Mini Batch #325, CCE Loss = 7.991406440734863
Epoch #0, Mini Batch #350, CCE Loss = 7.956906318664551
Epoch #0, Mini Batch #375, CCE Loss = 7.9438958168029785
Epoch #0, Mini Batch #400, CCE Loss = 7.960297107696533
Epoch #0, Mini Batch #425, CCE Loss = 7.9608631134033

In [53]:
Vd_idx2vocab = dict(zip(Vd.values(),Vd.keys()))

In [54]:
def generate_translation(eng_sentence):

    tokenized_eng_sentence = tokenizer.tokenize(eng_sentence)
    print(tokenized_eng_sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_eng_sentence)
    token_ids_tensor = torch.tensor(token_ids)
    token_ids_tensor = torch.unsqueeze(token_ids_tensor,0)
    padded_token_ids = torch.nn.utils.rnn.pad_sequence(token_ids_tensor)

    encoder_outputs,(final_encoder_output,final_candidate_cell_state) = nw.encoder(padded_token_ids)
    decoder_first_time_step_input = torch.tensor([hindi_vocab["<SOS>"]]*mb_size)
    #decoder_first_time_step_input = torch.unsqueeze(decoder_first_time_step_input,1)
    final_encoder_output = torch.squeeze(final_encoder_output,0)
    final_candidate_cell_state = torch.squeeze(final_candidate_cell_state,0)
    decoder_first_time_step_output, hidden_cell_states = nw.decoder(encoder_outputs,
                                                                          final_encoder_output,
                                                                          final_candidate_cell_state,
                                                                          decoder_first_time_step_input)
    
    generated_token_id = torch.argmax(decoder_first_time_step_output,1)
    generated_token_id = torch.unsqueeze(generated_token_id,1)

    print(Vd_idx2vocab[generated_token_id])

    for i in range(Nd-1):

        generated_softmax_probabilities,hidden_cell_states = nw.decoder(encoder_outputs,
                                                                        hidden_cell_states[0],hidden_cell_states[1],
                                                                        generated_token_id)
        generated_token_id = torch.argmax(generated_softmax_probabilities,1)

        if generated_token_id == Vd["<EOS"]:
            break

        print(Vd_idx2vocab[generated_token_id])

In [None]:
generate_translation("The")
