In [191]:
import pandas as pd
import numpy as np
import torch

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use GPU
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")  
    print("Using CPU")


Using GPU: Tesla P100-PCIE-16GB


In [194]:
import os
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

In [195]:
from transformers import AutoTokenizer

In [196]:
tokenizer=AutoTokenizer.from_pretrained("google-T5/T5-base")

In [197]:
data=pd.read_csv("/kaggle/input/sentence-pair-english-to-hindi/Sentence pairs in English-Hindi - 2025-02-13.tsv",sep="\t",header=None,names=["SrcSentenceID","SrcSentence","DstSentenceID","DstSentence"])

In [198]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [199]:
data.shape

(13182, 4)

In [200]:
data["DstSentence"]=data["DstSentence"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [201]:
data["DstSentence"]

0              [म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]
1                      [म्यूरियल, अब, बीस, साल, की, है, ।]
2        [मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश...
3                                    [वैसा, नहीं, होगा, ।]
4                    [मुझें, तुम्हारी, याद, आ, रही, है, ।]
                               ...                        
13177            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13178            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13179            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13180            [क्या, आपके, पास, सब्ज़ी, -, चावल, है, ?]
13181              [मुझे, यह, साइकिल, अब, भी, पसंद, है, ।]
Name: DstSentence, Length: 13182, dtype: object

In [202]:
Nd=max(list(data["DstSentence"].apply(len)))

In [203]:
Nd

67

In [204]:
data["SrcSentence"]=data["SrcSentence"].apply(lambda x: tokenizer.tokenize(x))

In [205]:
data["SrcSentence"]

0                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
1                        [▁Mu, i, riel, ▁is, ▁20, ▁now, .]
2        [▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...
3                          [▁That, ▁won, ', t, ▁happen, .]
4                                     [▁I, ▁miss, ▁you, .]
                               ...                        
13177    [▁Do, ▁you, ▁have, ▁some, ▁curry, ▁and, ▁some,...
13178    [▁Do, ▁you, ▁have, ▁curry, ▁and, ▁some, ▁rice, ?]
13179    [▁Do, ▁you, ▁have, ▁any, ▁curry, ▁with, ▁rice, ?]
13180          [▁Do, ▁you, ▁have, ▁curry, ▁with, ▁rice, ?]
13181              [▁I, ▁still, ▁love, ▁this, ▁bicycle, .]
Name: SrcSentence, Length: 13182, dtype: object

In [206]:
seq_len=max(list(data["SrcSentence"].apply(len)))

In [207]:
seq_len

68

In [208]:
Vs = tokenizer.get_vocab()

In [209]:
len(Vs)

32100

In [210]:
Vd = set()
for tokenized_hindi_sentence in data['DstSentence']:
    Vd.update(tokenized_hindi_sentence)
    
hindi_vocab = dict()

for idx, token in enumerate(Vd):
    hindi_vocab[token] = idx + 4

hindi_vocab["<PAD>"] = 0
hindi_vocab["<SOS>"] = 1
hindi_vocab["<EOS>"] = 2
hindi_vocab["<UNK>"] = 3

Vd = hindi_vocab

In [211]:
eng_vocab = dict()
for idx, token in enumerate(Vs):
    eng_vocab[token] = idx + 4  # reserve 0–3 for special tokens

eng_vocab["<PAD>"] = 0
eng_vocab["<SOS>"] = 1
eng_vocab["<EOS>"] = 2
eng_vocab["<UNK>"] = 3


In [212]:
len(Vd)

7073

In [213]:
len(eng_vocab)

32104

In [214]:
#jis bhasa me translate krna hai us bhasa me teeno Token chahiye SOS EOS PAD
#jis se translate kr rha hai usme jarurat nhi hai

In [215]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",485968,"[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",2060319,"[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,1294,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",485564,"[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,1302,"[▁That, ▁won, ', t, ▁happen, .]",2060320,"[वैसा, नहीं, होगा, ।]"
4,1308,"[▁I, ▁miss, ▁you, .]",2060321,"[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [216]:
#converting English word to numerical
data['SrcSentence'] = data['SrcSentence'].apply(tokenizer.convert_tokens_to_ids)

#then after that english numeric value will ues as a input in NN afeter converting One hot encoding 

In [217]:
#converting Hindi Sentence to numbers 
def convert_hindi_tokens_to_ids(tokenized_hindi_sentence):
    return [Vd[token] for token in tokenized_hindi_sentence]

In [218]:
data['DstSentence'] = data['DstSentence'].apply(convert_hindi_tokens_to_ids)

In [219]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[4446, 1212, 811, 6742, 1997, 6567, 2573, 4117..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[4446, 1212, 811, 6742, 1997, 4117, 4344]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[5968, 6994, 4152, 5941, 1115, 3143, 3963, 421..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[2230, 4800, 6686, 4344]"
4,1308,"[27, 3041, 25, 5]",2060321,"[4737, 3554, 6787, 3712, 501, 4117, 4344]"


In [220]:
def insert_sos_token_id(hindi_sentence_token_ids_list):
    return [1] + hindi_sentence_token_ids_list

In [221]:
def insert_eos_token_id(hindi_sentence_token_ids_list):
    return hindi_sentence_token_ids_list + [2]

In [222]:
data["DstSentenceInput"] = data["DstSentence"].apply(insert_sos_token_id)
data["DstSentenceLabel"] = data["DstSentence"].apply(insert_eos_token_id)

In [223]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence,DstSentenceInput,DstSentenceLabel
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[4446, 1212, 811, 6742, 1997, 6567, 2573, 4117...","[1, 4446, 1212, 811, 6742, 1997, 6567, 2573, 4...","[4446, 1212, 811, 6742, 1997, 6567, 2573, 4117..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[4446, 1212, 811, 6742, 1997, 4117, 4344]","[1, 4446, 1212, 811, 6742, 1997, 4117, 4344]","[4446, 1212, 811, 6742, 1997, 4117, 4344, 2]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[5968, 6994, 4152, 5941, 1115, 3143, 3963, 421...","[1, 5968, 6994, 4152, 5941, 1115, 3143, 3963, ...","[5968, 6994, 4152, 5941, 1115, 3143, 3963, 421..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[2230, 4800, 6686, 4344]","[1, 2230, 4800, 6686, 4344]","[2230, 4800, 6686, 4344, 2]"
4,1308,"[27, 3041, 25, 5]",2060321,"[4737, 3554, 6787, 3712, 501, 4117, 4344]","[1, 4737, 3554, 6787, 3712, 501, 4117, 4344]","[4737, 3554, 6787, 3712, 501, 4117, 4344, 2]"


In [224]:
data.drop(labels=["SrcSentenceID","DstSentenceID","DstSentence"],axis=1,inplace=True)

In [225]:
data.head()

Unnamed: 0,SrcSentence,DstSentenceInput,DstSentenceLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 4446, 1212, 811, 6742, 1997, 6567, 2573, 4...","[4446, 1212, 811, 6742, 1997, 6567, 2573, 4117..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 4446, 1212, 811, 6742, 1997, 4117, 4344]","[4446, 1212, 811, 6742, 1997, 4117, 4344, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[1, 5968, 6994, 4152, 5941, 1115, 3143, 3963, ...","[5968, 6994, 4152, 5941, 1115, 3143, 3963, 421..."
3,"[466, 751, 31, 17, 1837, 5]","[1, 2230, 4800, 6686, 4344]","[2230, 4800, 6686, 4344, 2]"
4,"[27, 3041, 25, 5]","[1, 4737, 3554, 6787, 3712, 501, 4117, 4344]","[4737, 3554, 6787, 3712, 501, 4117, 4344, 2]"


In [226]:
X = list(data["SrcSentence"])
Y_input = list(data["DstSentenceInput"])
Y_label = list(data["DstSentenceLabel"])

X_tensor = [torch.tensor(eng_tokenized_ids) for eng_tokenized_ids in X]
Y_input_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_input]
Y_label_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_label]

X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_input_padded = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_label_padded = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [227]:
X_padded.shape

torch.Size([13182, 68])

In [228]:
X_padded[2]

tensor([ 2855,    16,    48,   296, 26963,     7,   140,     5,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [229]:
seq_len = X_padded.shape[1]
dest_len = Y_label_padded.shape[1]
src_vocab_size = len(Vs)

In [230]:
seq_len, dest_len, src_vocab_size

(68, 68, 32100)

In [231]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=500):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


In [232]:
d_model = 256
src_embedding = torch.nn.Embedding(src_vocab_size, d_model)
x = src_embedding(X_padded)  # (batch_size, seq_len, d_model)


In [233]:
import math
pos_encoder = PositionalEncoding(d_model, max_len=X_padded.shape[1])
x = pos_encoder(x)

In [234]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size 
        self.heads = heads # 8 in Transformer
        self.head_dim = embed_size // heads 
        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"
        # === Project Embeddings into three vectors: Query, Key and Value ===
        
        self.values = torch.nn.Linear(embed_size, embed_size)
        self.keys = torch.nn.Linear(embed_size, embed_size)
        self.queries = torch.nn.Linear(embed_size, embed_size)
        self.fc_out = torch.nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        # Values, Keys and Queries have size: (batch_size, sequence_len, embedding_size)
        batch_size = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        # === Pass through Linear Layer ===
        values = self.values(values)  # (batch_size, value_len, embed_size)
        keys = self.keys(keys)  # (batch_size, key_len, embed_size)
        queries = self.queries(query)  # (batch_size, query_len, embed_size)

        values = values.reshape(batch_size, value_len, self.heads, self.head_dim)
        keys = keys.reshape(batch_size, key_len, self.heads, self.head_dim)
        queries = queries.reshape(batch_size, query_len, self.heads, self.head_dim)


        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (batch_size, query_len, heads, heads_dim),
        # keys shape: (batch_size, key_len, heads, heads_dim)
        # energy: (batch_size, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3) 
        # attention shape: (batch_size, heads, query_len, key_len)
        # values shape: (batch_size, value_len, heads, heads_dim)
        # out after matrix multiply: (batch_size, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            batch_size, query_len, self.heads * self.head_dim
        )
        # Linear layer doesn't modify the shape, final shape will be
        # (batch_size, query_len, embed_size)
        out = self.fc_out(out)
        return out

In [235]:
embed_size = 256
heads = 8
self_attn = MultiHeadAttention(embed_size, 8)
attn_output = self_attn(x,x,x,mask=None)  # Still (batch_size, seq_len, d_model)


In [236]:
attn_output.shape

torch.Size([13182, 68, 256])

In [237]:
class TransformerLayer(torch.nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion=4):
        super(TransformerLayer, self).__init__()
        self.attention = MultiHeadAttention(embed_size, heads) 
        self.norm1 = torch.nn.LayerNorm(embed_size)
        self.norm2 = torch.nn.LayerNorm(embed_size)
        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(embed_size, forward_expansion * embed_size),
            torch.nn.ReLU(),
            torch.nn.Linear(forward_expansion * embed_size, embed_size),
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # Values, Keys and Queries have size: (batch_size, query_len, embedding_size)
        attention = self.attention(value, key, query, mask) # attention shape: (batch_size, query_len, embedding_size)
        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query)) # x shape: (batch_size, query_len, embedding_size)
        forward = self.feed_forward(x) # forward shape: (batch_size, query_len, embedding_size)
        out = self.dropout(self.norm2(forward + x)) # out shape: (batch_size, query_len, embedding_size)
        return out

In [259]:
class Encoder(torch.nn.Module):
    def __init__(self, src_vocab_size, embed_size, num_layers, heads,
        device, forward_expansion, dropout, max_length): 
        super(Encoder, self).__init__()
        self.embed_size = embed_size # size of the input embedding
        self.device = device # either "cuda" or "cpu"
        # Lookup table with an embedding for each word in the vocabulary
        self.word_embedding = torch.nn.Embedding(src_vocab_size, embed_size) 
        # Lookup table with a positional embedding for each word in the sequence
        self.position_embedding =torch.nn.Embedding(max_length, embed_size)
        self.layers = torch.nn.ModuleList(
            [
                TransformerLayer(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, mask):
       
        batch_size, seq_length = x.shape
        x = x.to(self.device)
        # positions is an arange from (0,seq_len), e.g: torch.tensor([[0,1,2,...,N], [0,1,2,...,N], ..., [0,1,2,...,N]])
        positions = torch.arange(0, seq_length).expand(batch_size, seq_length).to(self.device)
        x = x.long()
        
        out = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
        # In the Encoder the query, key, value are all the same, in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)
        # output shape: torch.Size([batch_size, sequence_length, embedding_size])
        return out

In [260]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderLayer, self).__init__()
        self.norm = torch.nn.LayerNorm(embed_size)
        self.attention = MultiHeadAttention(embed_size, heads=heads)
        self.transformer_block = TransformerLayer(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out

In [261]:
class Decoder(torch.nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion,
        dropout, device, max_length):
        
        super(Decoder, self).__init__()
        self.device = device
        #=== For each token in target vocab there is a token embedding ===
        
        self.word_embedding = torch.nn.Embedding(trg_vocab_size, embed_size) 
        self.position_embedding = torch.nn.Embedding(max_length, embed_size)
        self.layers = torch.nn.ModuleList(
            [
                DecoderLayer(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = torch.nn.Linear(embed_size, trg_vocab_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """
        :param x: target sequence. Shape: (batch_size, target_sequence_len)
        :param enc_out: encoder output. Shape: (batch_size, src_sequence_length, embedding_size)
        :param src_mask: source mask.
        :param trg_mask: target mask.
        """
        batch_size, seq_length = x.shape # x shape: (batch_size, target_sequence_len)
        # positions is an arange from (0,seq_len), e.g: torch.tensor([[0,1,2,...,N], [0,1,2,...,N], ..., [0,1,2,...,N]])
        x = x.to(self.device)
        positions = torch.arange(0, seq_length).expand(batch_size, seq_length).to(self.device) # positions shape: (batch_size, target_sequence_len)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
    
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)
        return out

In [262]:
class Transformer(torch.nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=512,num_layers=6, forward_expansion=4, heads=8, dropout=0, device=device, max_length=100):

        super(Transformer, self).__init__()
        # === Encoder ===
        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length)
        # === Decoder ===
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)
        
    def make_trg_mask(self, trg):
        # trg: (N, trg_len)
        N, trg_len = trg.shape

        # Padding mask: (N, 1, 1, trg_len)
        pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2).to(self.device)

        # Causal mask: (1, 1, trg_len, trg_len)
        causal_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        causal_mask = causal_mask.unsqueeze(0).unsqueeze(1)

        # Combine: only attend to previous tokens and non-pad
        trg_mask = pad_mask & causal_mask  # shape: (N, 1, trg_len, trg_len)
        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src) # src_mask shape: 
        trg_mask = self.make_trg_mask(trg) # trg_mask shape: 
        enc_src = self.encoder(src, src_mask) # enc_src shape:
        out = self.decoder(trg, enc_src, src_mask, trg_mask) # out shape: 
        return out

In [263]:
src = X_padded           # (batch_size, src_seq_len)
trg = Y_input_padded     # (batch_size, trg_seq_len)
labels = Y_label_padded  # (batch_size, trg_seq_len)


In [264]:
X_padded.shape, Y_input_padded.shape, Y_label_padded.shape


(torch.Size([13182, 68]), torch.Size([13182, 68]), torch.Size([13182, 68]))

In [265]:
X_padded_train = X_padded[0:13000]
Y_input_padded_train = Y_input_padded[0:13000]
Y_label_padded_train = Y_label_padded[0:13000]

X_padded_test = X_padded[13000:]
Y_input_padded_test = Y_input_padded[13000:]
Y_label_padded_test = Y_label_padded[13000:]

In [266]:
# Move training data to device
X_padded_train = X_padded_train.to(device)
Y_input_padded_train = Y_input_padded_train.to(device)
Y_label_padded_train = Y_label_padded_train.to(device)

# Move test data to device
X_padded_test = X_padded_test.to(device)
Y_input_padded_test = Y_input_padded_test.to(device)
Y_label_padded_test = Y_label_padded_test.to(device)


In [267]:
import torch.optim as optim
# Hyperparameters
learning_rate = 3e-4
batch_size = 64
num_epochs = 35
clip = 1
trg_pad_idx = 0
src_pad_idx = 0
trg_vocab_size = len(Vd)
# Loss and optimizer
pad_idx = trg_pad_idx
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)
src_vocab_size = len(eng_vocab)
# Instantiate the model
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    embed_size=256,
    num_layers=6,
    forward_expansion=6,
    heads=8,
    dropout=0.3,
    device=device,
    max_length=100
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [268]:
from torch.utils.data import DataLoader, TensorDataset
# Create TensorDataset for training
train_dataset = TensorDataset(X_padded_train, Y_input_padded_train, Y_label_padded_train)

# Create DataLoader for training
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

# Create TensorDataset for testing
test_dataset = TensorDataset(X_padded_test, Y_input_padded_test, Y_label_padded_test)

# Create DataLoader for testing
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

# Optionally, printing some examples from the train loader to verify
for batch_idx, (X_batch, Y_input_batch, Y_label_batch) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("X_batch shape:", X_batch.shape)
    print("Y_input_batch shape:", Y_input_batch.shape)
    print("Y_label_batch shape:", Y_label_batch.shape)
    break  # Just look at the first batch


Batch 1:
X_batch shape: torch.Size([64, 68])
Y_input_batch shape: torch.Size([64, 68])
Y_label_batch shape: torch.Size([64, 68])


In [269]:
divice  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:

model.train()

for epoch in range(num_epochs):
    epoch_loss = 0
    for idx, (src, trg_input, trg_label) in enumerate(train_loader):
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_label = trg_label.to(device)

        optimizer.zero_grad()
        output = model(src, trg_input)

        # Reshape for loss calculation: 
        # Output shape: (batch_size, trg_len, vocab_size) → (batch_size * trg_len, vocab_size)
        # Target shape: (batch_size, trg_len) → (batch_size * trg_len)
        output = model(src, trg[:, :-1])  # shift trg for teacher forcing

        
        output = output.reshape(-1, output.shape[2])
        
        trg_label = trg_label.reshape(-1)

        loss = criterion(output, trg_label)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()

        epoch_loss += loss.item()
        if idx % 500 == 0:
            print(f"Epochs {epoch+1}/ {num_epochs} and loss: {loss.item():.4f}")
    
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")


RuntimeError: shape '[13182, 68, 8, 32]' is invalid for input of size 1114112

In [None]:
import re
def tokenize(sentence):
    return re.findall(r"\b\w+\b", sentence.lower())

#inference loop
def translate(sentence,model,eng_vocab, hin_vocab, inv_hindi_vocab, max_len=50):
    
    model.eval()
    
    tokens = tokenize(sentence)
    print(tokens)
    #tokens = [tok for tok in tokens if tok in eng_vocab]
    #input_tensor = torch.tensor([tok eng_vocab["<UNK>"] for tok in tokens]).to(device)
    print("<UNK>" in eng_vocab)   # should print True
    print(eng_vocab.get("<UNK>", "Not Found"))  # should print 3 or the correct index
    unk_count = sum(1 for tok in tokens if tok not in eng_vocab)
    print(f"Total UNKs in input: {unk_count} out of {len(tokens)}")
    input_tensor = torch.tensor([eng_vocab.get(tok, eng_vocab["<UNK>"]) for tok in tokens]).unsqueeze(0).to(device)
 
    #generate souce masking
    src_mask = model.make_src_mask(input_tensor)

#passing through encoder
    with torch.no_grad():
        enc_src = model.encoder(input_tensor,src_mask)

#start with SOS token
    trg_indices = [hin_vocab["<SOS>"]]

    for _ in range(max_len):
        trg_tensor = torch.tensor(trg_indices).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)


        with torch.no_grad():
            output = model.decoder(trg_tensor, enc_src, src_mask, trg_mask)

        next_token = output.argmax(-1)[:,-1].item() #picking most probable next token

        trg_indices.append(next_token)

        if next_token == hin_vocab["<EOS>"]:
            break
    translated_sent = [inv_hindi_vocab.get(idx, "<UNK>") for idx in trg_indices[1:]]
        
    return " ".join(translated_sent).replace("<EOS>","") #not including <SOS>
        

In [None]:
inv_hindi_vocab = {index: word for word, index in Vd.items()}


In [None]:
eng_input = "is"
hindi_output = translate(eng_input, model, eng_vocab, Vd, inv_hindi_vocab)
print("Translated:", hindi_output)