In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
import os
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

In [3]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer=AutoTokenizer.from_pretrained("google-T5/T5-base")

In [5]:
data=pd.read_csv("/home/thasin/class-projects/LLM-project/Sentence pairs in English-Hindi - 2025-02-13.tsv",sep="\t",header=None,names=["SrcSentenceID","SrcSentence","DstSentenceID","DstSentence"])

In [None]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [7]:
data["DstSentence"]=data["DstSentence"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [8]:
data["DstSentence"]

0              [म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]
1                      [म्यूरियल, अब, बीस, साल, की, है, ।]
2        [मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश...
3                                    [वैसा, नहीं, होगा, ।]
4                    [मुझें, तुम्हारी, याद, आ, रही, है, ।]
                               ...                        
13177            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13178            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13179            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13180            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13181              [मुझे, यह, साइकिल, अब, भी, पसंद, है, ।]
Name: DstSentence, Length: 13182, dtype: object

In [9]:
Nd=max(list(data["DstSentence"].apply(len)))

In [10]:
Nd

67

In [11]:
data["SrcSentence"]=data["SrcSentence"].apply(lambda x: tokenizer.tokenize(x))

In [12]:
data["SrcSentence"]

0                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
1                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
2        [▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...
3                          [▁That, ▁won, ', t, ▁happen, .]
4                                     [▁I, ▁miss, ▁you, .]
                               ...                        
13177    [▁Do, ▁you, ▁have, ▁some, ▁curry, ▁and, ▁some,...
13178    [▁Do, ▁you, ▁have, ▁curry, ▁and, ▁some, ▁rice, ?]
13179    [▁Do, ▁you, ▁have, ▁any, ▁curry, ▁with, ▁rice, ?]
13180          [▁Do, ▁you, ▁have, ▁curry, ▁with, ▁rice, ?]
13181              [▁I, ▁still, ▁love, ▁this, ▁bicycle, .]
Name: SrcSentence, Length: 13182, dtype: object

In [13]:
Ns=max(list(data["SrcSentence"].apply(len)))

In [14]:
Ns

68

In [15]:
Vs = tokenizer.get_vocab()

In [16]:
Vs

{'▁influential': 16569,
 'MH': 20131,
 '▁iarna': 21781,
 '▁shipment': 19843,
 '▁ergeben': 25624,
 '▁Abdul': 28508,
 '▁fishermen': 30285,
 'empt': 9045,
 'iunii': 9785,
 'freiheit': 28995,
 'trays': 28501,
 'ibly': 15596,
 '▁Don': 1008,
 'attributed': 20923,
 '▁dispoziţi': 27774,
 '▁impress': 18514,
 '▁meist': 12711,
 'load': 7134,
 'stabil': 19614,
 '▁opioid': 23139,
 '▁tweet': 10657,
 '▁pollution': 10441,
 '▁Mel': 5049,
 'abri': 17639,
 '▁tourists': 11618,
 '▁Adri': 26062,
 '▁Trim': 21917,
 '▁Deutschland': 4069,
 '▁Bur': 4152,
 '▁stops': 10796,
 '▁accommodations': 18044,
 '2005': 22594,
 'quarter': 19973,
 'CAM': 21907,
 '▁avatar': 27374,
 '▁villages': 12293,
 '▁stumble': 29630,
 'GET': 20750,
 'with': 4065,
 '▁PowerPoint': 23587,
 '▁maintenance': 2453,
 '▁drept': 6072,
 '▁hack': 8093,
 'Atelier': 23612,
 '▁prest': 11504,
 '▁Privacy': 17865,
 '▁rasch': 30778,
 '▁navigate': 7939,
 '▁feat': 20906,
 '▁gloves': 16802,
 '▁privé': 21611,
 'geschlagen': 24883,
 '▁joueur': 25366,
 '▁renovatio

In [17]:
Vd = set()
for tokenized_hindi_sentence in data['DstSentence']:
    Vd.update(tokenized_hindi_sentence)
    
hindi_vocab = dict()

for idx, token in enumerate(Vd):
    hindi_vocab[token] = idx + 3

hindi_vocab["<PAD>"] = 0
hindi_vocab["<SOS>"] = 1
hindi_vocab["<EOS>"] = 2

Vd = hindi_vocab

In [18]:
print(Vd)

{'लगभग': 3, 'बताइए': 4, 'चिढ़': 5, 'डोमिनिकाना': 6, 'अन्नागार': 7, 'इसीलिए': 8, 'व्यायाम': 9, 'उभयलैंगिकता': 10, 'एन्क्रिप्टेड': 11, 'हस्ताक्षर': 12, 'मुशकिल': 13, 'कहूं': 14, 'देखने': 15, 'उगेगा': 16, 'अठारह': 17, 'बताने': 18, 'दूं': 19, 'आत्मनिर्णय': 20, 'ज़बरदस्ती': 21, 'पढ़ने': 22, 'मोड़ो': 23, 'कला': 24, 'लोरी': 25, 'ट्रेड': 26, 'जिसने': 27, 'बताऊँ': 28, 'विदेषी': 29, 'सुरक्षित': 30, 'होनी': 31, 'मातापिता': 32, 'बुनना': 33, 'जीतूँगा': 34, 'घना': 35, 'ब्रिटिशकाल': 36, 'नौवे': 37, 'झूठे': 38, 'जाऊंगी': 39, 'मानेगा': 40, 'झूठा': 41, 'अपनेआप': 42, 'पाए': 43, 'सामान्य': 44, 'बगैर': 45, 'गोलियों': 46, 'किसने': 47, 'जाती': 48, 'मुझें': 49, 'पकड़े': 50, 'दीमक': 51, 'चौंका': 52, 'अनहोनी': 53, 'आदतें': 54, 'परिवर्तित': 55, 'सूख': 56, 'ब्रॅड': 57, 'दिली': 58, 'बिल्ले': 59, 'कैथोलिक': 60, 'महँगा': 61, 'दिलाई': 62, 'कुल': 63, 'मंत्री': 64, 'अर्जित': 65, 'चिट्ठी': 66, 'तीव्र': 67, 'प्रेमिका': 68, 'चाहता': 69, 'हादसा': 70, 'लूंगी': 71, 'वस्\u200dतु': 72, 'कहता': 73, 'सँभाला': 74, 'ज़ख्मी': 75, '

In [19]:
len(Vd)

7072

In [20]:
#jis bhasa me translate krna hai us bhasa me teeno Token chahiye SOS EOS PAD
#jis se translate kr rha hai usme jarurat nhi hai

In [21]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",485968,"[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",2060319,"[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,1294,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",485564,"[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,1302,"[▁That, ▁won, ', t, ▁happen, .]",2060320,"[वैसा, नहीं, होगा, ।]"
4,1308,"[▁I, ▁miss, ▁you, .]",2060321,"[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [22]:
#converting English word to numerical
data['SrcSentence'] = data['SrcSentence'].apply(tokenizer.convert_tokens_to_ids)

#then after that english numeric value will ues as a input in NN afeter converting One hot encoding 

In [23]:
#converting Hindi Sentence to numbers 
def convert_hindi_tokens_to_ids(tokenized_hindi_sentence):
    return [Vd[token] for token in tokenized_hindi_sentence]

In [24]:
data['DstSentence'] = data['DstSentence'].apply(convert_hindi_tokens_to_ids)

In [25]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[4458, 795, 4528, 6081, 6661, 1976, 532, 2516,..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[4458, 795, 4528, 6081, 6661, 2516, 5011]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[7043, 1000, 992, 3620, 3192, 418, 3410, 2751,..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[4640, 2432, 5618, 5011]"
4,1308,"[27, 3041, 25, 5]",2060321,"[49, 296, 1727, 6785, 6728, 2516, 5011]"


In [26]:
X = list(data['SrcSentence'])
Y = list(data['DstSentence'])

X_tensor = [torch.tensor(eng_tokenized_ids) for eng_tokenized_ids in X]
Y_tensor = [torch.tensor(hindi_tokenized_ids) for hindi_tokenized_ids in Y]

X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded = torch.nn.utils.rnn.pad_sequence(Y_tensor,batch_first=True)

In [27]:
X_padded.shape

torch.Size([13182, 68])

In [28]:
X_padded[2]

tensor([ 2855,    16,    48,   296, 26963,     7,   140,     5,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [29]:
Y_padded.shape

torch.Size([13182, 67])

In [30]:
Y_padded[2]

tensor([7043, 1000,  992, 3620, 3192,  418, 3410, 2751, 2831, 5011,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0])

In [31]:
#1 Encoder
#2 Decoder
# 3 attention layer
#then combine all of these

# 1:
#embedding layer does not have bias that will give output in encoder that is first_embedding_layer_output (32100,32)
#the first_embedding_layer_output will go in LSTM layer as a input and give three output 1.encoder output(15,68,32) it is a matrix, 2.final_encoder_ouput[c[0]] vector, 3. final_candidate_cell_output(h1[0]) vector
#encoder box will give two outputs 1.inal_encoder_ouput, 2. final_candidate_cell_output
class Encoder(torch.nn.Module):
    def __init__(self, src_lang_vocab_dim, topic_vector_dim):
        super(Encoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_dim, embedding_dim=topic_vector_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=topic_vector_dim,hidden_size=topic_vector_dim,batch_first=True)
        
    def forward(self,x_padded_mini_batch):
        
        first_embedding_layer_out = self.first_embedding_layer(x_padded_mini_batch)
        encoder_output,(final_encoder_output, final_candidate_cell_output) = self.second_lstm_layer(first_embedding_layer_out)
        
        return encoder_output,(final_encoder_output,final_candidate_cell_output)

In [32]:
test = Encoder(len(X_padded),32)
print(test)

Encoder(
  (first_embedding_layer): Embedding(13182, 32)
  (second_lstm_layer): LSTM(32, 32, batch_first=True)
)


In [40]:
X_padded_mini_batch = torch.randint(0, len(X_padded), (26, 68))

In [41]:
X_padded_mini_batch.shape

torch.Size([26, 68])

In [42]:
encoder_output,(final_encoder_output,final_candidate_cell_output) = test(X_padded_mini_batch)

In [43]:
encoder_output

tensor([[[-0.1417,  0.0641, -0.0472,  ...,  0.1099,  0.0281,  0.0398],
         [-0.0230, -0.0231, -0.2075,  ...,  0.2611,  0.0601, -0.0614],
         [-0.0794,  0.0023, -0.2114,  ...,  0.1558, -0.1048, -0.0556],
         ...,
         [ 0.0563, -0.1724, -0.0814,  ...,  0.1037,  0.0545, -0.2440],
         [ 0.1435, -0.3471,  0.0294,  ..., -0.2530,  0.0534, -0.0886],
         [ 0.2708,  0.0522, -0.0136,  ..., -0.0510,  0.1283, -0.1725]],

        [[ 0.0591, -0.2636, -0.0757,  ...,  0.1504,  0.0566,  0.0083],
         [-0.1801, -0.0107, -0.0734,  ...,  0.2207, -0.0345,  0.0762],
         [ 0.0567, -0.1379, -0.1557,  ...,  0.1570, -0.0096, -0.0383],
         ...,
         [-0.0248, -0.0410,  0.0487,  ...,  0.1287, -0.1191, -0.1959],
         [ 0.0792, -0.1118, -0.2716,  ...,  0.0662, -0.1451, -0.1242],
         [ 0.1002, -0.1121, -0.2296,  ...,  0.1214, -0.1968, -0.1214]],

        [[ 0.0479, -0.0258, -0.0854,  ...,  0.1819,  0.1001, -0.0731],
         [ 0.1177, -0.1407,  0.0273,  ..., -0

In [44]:
encoder_output.shape

torch.Size([26, 68, 32])

In [45]:
final_encoder_output.shape

torch.Size([1, 26, 32])

In [46]:
final_candidate_cell_output.shape  # (num_layers, batch_size, hidden_dim)

torch.Size([1, 26, 32])

In [48]:
Y_padded.shape

torch.Size([13182, 67])

In [None]:
#Decoder
#Inputs in decoder---> encoder_output, 2.h[0] hidden state, 3.[C[0]] candidate cell state, 3. Y_padded_mini_batch
#then lstm gives you output decoder_lstm_layer_outputs,(final_cell_hidden_states-->hd[0],cd[0])
#ouput of decoder is-----> 1. Decoder ouput-hindi sententeces ka prediction (26,67,7072)

class Decoder(torch.nn.Module):
   