In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindi-english-dataset/Sentence pairs in English-Hindi - 2025-02-11.tsv


In [2]:
!pip3 install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Installing collected packages: morfessor, sphinx-argparse, indic-nlp-library
Successfully installed indic-nlp-library-0.92 morfessor-2.0.6 sphinx-argparse-0.5.2


In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
from indicnlp.tokenize import indic_tokenize

In [4]:
data = pd.read_csv("/kaggle/input/hindi-english-dataset/Sentence pairs in English-Hindi - 2025-02-11.tsv", sep="\t", header=None, names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [5]:
data.shape

(13182, 4)

In [6]:
data.drop(labels=[data.columns[0],data.columns[2]],axis=1,inplace=True)

In [7]:
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
data["SrcSent"] = data["SrcSent"].apply(lambda x: src_sent_tokenizer.tokenize(x))

In [9]:
data["DstSent"] = data["DstSent"].apply(lambda x: indic_tokenize.trivial_tokenize(x, lang="hi"))

In [10]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [11]:
Vs = src_sent_tokenizer.get_vocab()

In [12]:
len(Vs)

32100

In [13]:
hindi_vocab = set()
for tokenized_hindi_sent in data["DstSent"]:
    hindi_vocab.update(tokenized_hindi_sent)

In [14]:
vd = dict()
for idx, token in enumerate(hindi_vocab):
    vd[token] = idx + 3
vd["<PAD>"] = 0
vd["<SOS>"] = 1
vd["<EOS>"] = 2

In [15]:
def convert_hindi_tokens_to_ids(hindi_sent):
    return [vd[token] for token in hindi_sent]

In [16]:
data["DstSent"] = data["DstSent"].apply(lambda x: convert_hindi_tokens_to_ids(x))

In [17]:
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids

In [18]:
data["DstSentInput"] = data["DstSent"].apply(lambda x: insert_sos_token_id(x))

In [19]:
def insert_eos_token_id(hindi_sent_token_ids):
    return hindi_sent_token_ids + [2]

In [20]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x: insert_eos_token_id(x))

In [21]:
data.drop(labels=[data.columns[1]], axis=1, inplace=True)

In [22]:
x = list(data["SrcSent"])
y_input = list(data["DstSentInput"])
y_label = list(data["DstSentLabel"])


In [23]:
x_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in x]
y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in y_input]
y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in y_label]

In [24]:
x_padded = torch.nn.utils.rnn.pad_sequence(x_tensor, batch_first = True)
y_padded_input = torch.nn.utils.rnn.pad_sequence(y_input_tensor, batch_first = True)
y_padded_label = torch.nn.utils.rnn.pad_sequence(y_label_tensor, batch_first = True)

In [25]:
ns = x_padded.shape[1]
nd = y_padded_label.shape[1]

In [26]:
class Encoder(torch.nn.Module):
    def __init__(self, src_lang_vocab_size, word_embedding_dim):
        super(Encoder, self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings = src_lang_vocab_size,
                                                        embedding_dim = word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size = word_embedding_dim,
                                               hidden_size = word_embedding_dim,
                                               batch_first = True)
    def forward(self, x_padded_mini_batch):
            first_embedding_layer_out = self.first_embedding_layer(x_padded_mini_batch)
            encoder_output, (final_encoder_output, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out)
            return encoder_output, (final_encoder_output, final_cell_state)

In [27]:
class Decoder(torch.nn.Module):
    def __init__(self, dst_lang_vocab_size, word_embedding_dim):
        super(Decoder, self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings = dst_lang_vocab_size,
                                                        embedding_dim = word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size = word_embedding_dim,
                                               hidden_size = word_embedding_dim,
                                               batch_first = True)
        self.prediction_layer = torch.nn.Linear(in_features = word_embedding_dim, out_features = dst_lang_vocab_size)
        self.prediction_layer_activation = torch.nn.Softmax(dim = 2)
    def forward(self, y_padded_input_mini_batch, final_encoder_output, final_cell_state):
        first_embedding_layer_out = self.first_embedding_layer(y_padded_input_mini_batch)
        decoder_lstm_layer_out, (final_decoder_lstm_layer_out, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out,
                                                                                                   (final_encoder_output,
                                                                                                   final_cell_state))
        prediction = self.prediction_layer_activation(self.prediction_layer(decoder_lstm_layer_out))
        return prediction, (final_decoder_lstm_layer_out, final_cell_state)

In [28]:
class Seq2SeqEncDec(torch.nn.Module):
    def __init__(self, src_lang_vocab_size, dst_lang_vocab_size, word_embedding_dim):
        super(Seq2SeqEncDec, self).__init__()
        self.encoder = Encoder(src_lang_vocab_size, word_embedding_dim)
        self.decoder = Decoder(dst_lang_vocab_size, word_embedding_dim)
    def forward(self, x_padded_mini_batch, y_padded_input_mini_batch):
        encoder_output, (final_encoder_output, final_cell_state) = self.encoder(x_padded_mini_batch)
        y_hat_mini_batch = self.decoder(y_padded_input_mini_batch, final_encoder_output, final_cell_state)
        return y_hat_mini_batch

In [29]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [30]:
x_padded_train = x_padded[0:13000]
y_padded_input_train = y_padded_input[0:13000]
y_padded_label_train = y_padded_label[0:13000]
x_padded_test = x_padded[13000:]
y_padded_input_test = y_padded_input[13000:]
y_padded_label_test = y_padded_label[13000:]

In [31]:
len(Vs)

32100

In [32]:
len(vd)

7072

In [33]:
network = Seq2SeqEncDec(len(Vs), len(vd), 128).to(device)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index = 0)
optimizer = torch.optim.Adam(network.parameters())
num_epochs = 25
mb_size = 65

for epoch in range(num_epochs):
    for i in range(x_padded_train.shape[0]//mb_size):
        x_train_mb = x_padded_train[i*mb_size:(i+1)*mb_size]
        y_input_mb = y_padded_input_train[i*mb_size:(i+1)*mb_size]
        y_label_mb = y_padded_label_train[i*mb_size:(i+1)*mb_size]
        y_label_mb = y_label_mb.reshape(y_label_mb.shape[0]*y_label_mb.shape[1],)

        x_train_mb, y_input_mb, y_label_mb = x_train_mb.to(device), y_input_mb.to(device), y_label_mb.to(device)

        y_hat_train_mb = network(x_train_mb, y_input_mb)
        y_hat_train_mb = y_hat_train_mb[0]
        y_hat_train_mb = y_hat_train_mb.reshape(y_hat_train_mb.shape[0]*y_hat_train_mb.shape[1],
                                                y_hat_train_mb.shape[2])
        
        loss_fn_value = loss_fn(y_hat_train_mb, y_label_mb)

        loss_fn_value.backward()

        torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm = 1.0)
        optimizer.step()
        optimizer.zero_grad()
        print("Epoch # {}, Time Step # {}, Loss Value = {}".format(epoch, i, loss_fn_value))

Epoch # 0, Time Step # 0, Loss Value = 8.863899230957031
Epoch # 0, Time Step # 1, Loss Value = 8.86389446258545
Epoch # 0, Time Step # 2, Loss Value = 8.8638916015625
Epoch # 0, Time Step # 3, Loss Value = 8.86388874053955
Epoch # 0, Time Step # 4, Loss Value = 8.863885879516602
Epoch # 0, Time Step # 5, Loss Value = 8.863880157470703
Epoch # 0, Time Step # 6, Loss Value = 8.863877296447754
Epoch # 0, Time Step # 7, Loss Value = 8.863869667053223
Epoch # 0, Time Step # 8, Loss Value = 8.86386775970459
Epoch # 0, Time Step # 9, Loss Value = 8.863856315612793
Epoch # 0, Time Step # 10, Loss Value = 8.863845825195312
Epoch # 0, Time Step # 11, Loss Value = 8.863834381103516
Epoch # 0, Time Step # 12, Loss Value = 8.86380386352539
Epoch # 0, Time Step # 13, Loss Value = 8.863801956176758
Epoch # 0, Time Step # 14, Loss Value = 8.863764762878418
Epoch # 0, Time Step # 15, Loss Value = 8.86374282836914
Epoch # 0, Time Step # 16, Loss Value = 8.863690376281738
Epoch # 0, Time Step # 17, Loss