In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import json
import re

In [2]:
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^ a-z.?!,¿]", "")
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text


def tf_lower_and_split_punct_1(text):
    text = tf.strings.lower(text)
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text,'[^ \u0900-\u097F,I?!]','')
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

In [3]:
path = "/kaggle/input/english-hindi-google-trans/english_hindi.csv"

In [4]:
df = pd.read_csv(path,index_col=0)

In [5]:
print(len(df))
df.head()

64131


Unnamed: 0,English,Hindi
0,To give adequate medical cover and to carry ou...,यहीं के पशुओं के स्वास्थ्य-सुधार तथा पशु-पालन ...
1,and she is not dazzled by anything.”,और किसी से प्रभावित नहीं होती।”
2,The Ter temple now containing a Trivikrama ima...,तेर के मंदिर में अब तक एक त्रिविक्रम की मूर्ति...
3,Which is(Lodurva Jain Temple)attract the big n...,के लिए जाना जाता है जो वर्ष भर तीर्थयात्राओं क...
4,monotheism,अद्वैतवाद


In [6]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,English,Hindi


In [7]:
df.head()

Unnamed: 0,English,Hindi
0,To give adequate medical cover and to carry ou...,यहीं के पशुओं के स्वास्थ्य-सुधार तथा पशु-पालन ...
1,and she is not dazzled by anything.”,और किसी से प्रभावित नहीं होती।”
2,The Ter temple now containing a Trivikrama ima...,तेर के मंदिर में अब तक एक त्रिविक्रम की मूर्ति...
3,Which is(Lodurva Jain Temple)attract the big n...,के लिए जाना जाता है जो वर्ष भर तीर्थयात्राओं क...
4,monotheism,अद्वैतवाद


In [8]:
# df.drop(df.index[indexes],axis = 0,inplace = True)
indexes = df[df['Hindi'].apply(lambda x:True if (len(x)<=2) or (re.compile('^[\w\s\.,0-9]+$',re.ASCII).findall(x)) else False)].index
df.drop(df.index[indexes],axis =0,inplace = True)


In [9]:
df.dropna(inplace = True)
print(df.isnull().any())

English    False
Hindi      False
dtype: bool


In [10]:
text_pairs =list(zip(df['English'],df['Hindi']))
random.seed(42)
random.shuffle(text_pairs)
print(text_pairs[:5])

[('According to section 79 central parliament has two important houses other than president Rajasabha of the state and Loksabha,of the people.', 'भारत के संविधान की धारा 79 के अनुसार केन्द्रीय संसद की परिषद् में राष्ट्रपति तथा दो सदन है जिन्हें राज्यों की परिषद् राज्यसभा तथा लोगों का सदन लोकसभा के नाम से जाना जाता है।'), ('taluqadars', 'तालुकदार'), ('Nepal(a democratic republic) is a South Asian Himalayan Country.', 'नेपाल (आधिकारिक रूप में संघीय लोकतान्त्रिक गणराज्य नेपाल) ( ) एक दक्षिण एशियाई भूपरिवेष्ठित हिमालयी राष्ट्र है।'), ('witnessing ahswathama', 'साक्षी अश्वत्थामा'), ('solved everything in 20 minutes.', 'सारे सवाल 20 मिनट में हल कर दिए.')]


In [11]:
num_val_samples = int(0.1 * len(text_pairs))
num_train_samples = len(text_pairs) -  num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]
# test_pairs = text_pairs[num_train_samples+num_val_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
# print(f'{len(test_pairs)} test pairs')

63999 total pairs
57600 training pairs
6399 validation pairs


#### approx vocab size

In [12]:
list1 = list(df['English'])
list1 = [word.strip().strip('!').strip('.').strip(',').strip('?').lower() for sentence in list1 for word in sentence.split(' ') if len(word)]

print('approx english vocab:',len(set(list1)))

list2 = list(df['Hindi'])
list2 = [word.strip('!').strip('.').strip(',').strip('?') for sentence in list2 for word in sentence.split(' ')]
print('approx hindi vocab:',len(set(list2)))

approx english vocab: 55726
approx hindi vocab: 62726


In [13]:
list1 = list(set(list1))
print(len(list1))

55726


In [14]:
for i in range(10,20):
    print(list1[i*10:i*10+10])

['preponderant', 'sanity', 'injury', 'bogota', 'balasmand', 'maval', '10th:', 'articles,lectures,roles,letters', 'centers', 'samaj,in']
['abominably', 'headquarters-tension', 'vedik', 'unilaterally', 'torso', 'ball“”', 'head-dress', 'thinkcycle', 'lemon', 'fungus']
['1583', 'interloper', 'sparring', 'panchtantra', 'mandapa-temples', 'mastercard', 'monarchic', 'sanctuaries', 'horoscopes', 'lakes']
['south-western', 'sustainable“”', 'disposable', 'warned', 'helper', 'spear-throwing', 'kahin', 'loudly', 'peeling', 'baloney.”']
['crows', 'dasaradh', 'under-21', 'pistol', 'farmers,16%helpand3%daily', 'assemblyperhaps', 'harvey', 'kumar', 'akregator', 'issar']
['temple,but', 'profitable', 'jodhpur“”', 'etched', 'revenue', 'lull', 'gadgil', 'alleges', 'uni-code', 'religious,ancient']
['co-artist', 'faiths', 'thirty-five', '“”putrakameshthi“”', 'jasper', 'technicalities', 'calling', 'dna', 'diverse', 'topics']
['maurice', 'disapproved', '“”title', 'runs.to', 'high-domed', 'rubber', 'divediacha

In [15]:
vocab_size = 15000
sequence_length = 50
batch_size = 64

In [16]:
# vectorization
eng_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length,
    standardize=tf_lower_and_split_punct
)

hin_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length+1,
    standardize=tf_lower_and_split_punct_1
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_hin_texts = [pair[1] for pair in train_pairs]

eng_vectorization.adapt(train_eng_texts)
hin_vectorization.adapt(train_hin_texts)

In [17]:
#save the vectorization layers

eng_vocab = eng_vectorization.get_vocabulary()
    
with open('eng_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(eng_vocab, f)
    
hin_vocab = hin_vectorization.get_vocabulary()
    
with open('hin_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(hin_vocab, f)
    

In [18]:
print(len(eng_vocab))
print(eng_vocab[:20])
print(hin_vocab[:20])

15000
['', '[UNK]', 'the', '[SOS]', '[EOS]', ',', '.', 'of', 'and', 'in', 'to', 'a', 'is', 'that', 'for', 'was', 'it', 'as', 'by', 'on']
['', '[UNK]', '[SOS]', '[EOS]', 'के', 'में', 'है', 'की', ',', 'और', 'से', 'का', 'को', 'हैं', 'पर', 'एक', 'कि', 'भी', 'यह', 'लिए']


In [19]:
list1 = [x for x in eng_vocab if len(x)==10]
len(list1)

4614

In [20]:
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    hin = hin_vectorization(spa)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": hin[:, :-1],
        },
        hin[:, 1:],
    )
    
def make_dataset(pairs):
    eng_texts, hin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hin_texts = list(hin_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, hin_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
# test_ds = make_dataset(test_pairs)

In [38]:
for inp,tar in train_ds.take(1):
    print(inp['encoder_inputs'][:2])
    print(inp['decoder_inputs'][:2])
    print(tar[:2])

tf.Tensor(
[[    3     1     5    27 13326    92   181   118    12   126    12   513
     10  1806    68   431     6     4     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]
 [    3    48    45   462  1166     5   195   154     5    68     1     5
   2689  1193     5    19   845     7   149     1     9  1363     7     1
      5    13     1  3026   125     1     1     6     4     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]], shape=(2, 50), dtype=int64)
tf.Tensor(
[[    2     1     8 12845    10     8   147  1157     6    16   502  1828
      4   306  4655    35     3     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]
 [    2   293  

In [39]:
vocab_size = 15000
units_1 = 128

In [40]:
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim

        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim = embed_dim
        )
        self.dense_proj = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(dense_dim,activation = 'relu'),
                tf.keras.layers.Dense(embed_dim)
            ]
        )
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self,inputs,mask = None):
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
        else:
            padding_mask = None
        attention_output = self.mha(
            query = inputs,
            value = inputs,
            key = inputs,
            attention_mask = padding_mask
        )
        proj_input = self.layernorm_1(attention_output+inputs)
        proj_output = self.dense_proj(proj_input)
        output = self.layernorm_2(proj_input + proj_output)
        return output
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim':self.embed_dim,
            'dense_dim':self.dense_dim,
            'num_heads': self.num_heads,
        })
        return config

In [41]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,sequence_length,vocab_size,embed_dim,**kwargs):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        
        self.token_embeddings = tf.keras.layers.Embedding(
            input_dim = vocab_size,
            output_dim  = embed_dim
        )

        self.position_embeddings = tf.keras.layers.Embedding(
            input_dim = sequence_length,
            output_dim = embed_dim
        )

    def call(self,inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start = 0,limit = length,delta =1)
        embeded_tokens = self.token_embeddings(inputs)
        embeded_position = self.position_embeddings(positions)
        return embeded_tokens + embeded_position
    
    def compute_mask(self,inputs,mask =None):
        if mask is not None:
            return tf.not_equal(inputs,0)
        else:
            return None
    def get_config(self):
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'sequence_length': self.sequence_length,
            'embed_dim': self.embed_dim,
        })
        return config

In [42]:
class TransformerDecoder(tf.keras.layers.Layer):
    def __init__(self,embed_dim,latent_dim,num_heads,**kwargs):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads = num_heads,key_dim=embed_dim)

        self.dense_proj = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(latent_dim,activation = 'relu'),
                tf.keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self,inputs,encoder_outputs,mask=None):
        casual_mask = self.get_casual_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
            padding_mask = tf.minimum(padding_mask,casual_mask)
        else:
            padding_mask = None

        attention_output1 = self.mha1(
            query = inputs,
            value = inputs,
            key = inputs,
            attention_mask = casual_mask
        )

        out_1 = self.layernorm_1(inputs + attention_output1)

        attention_output2 = self.mha2(
            query = out_1,
            value = encoder_outputs,
            key = encoder_outputs,
            attention_mask = padding_mask,
        )

        out_2 = self.layernorm_2(out_1 + attention_output2)
        proj_output = self.dense_proj(out_2)
        output = self.layernorm_3(proj_output + out_2)

        return output
    
    def get_casual_attention_mask(self,inputs):
        input_shape = tf.shape(inputs)
        batch_size,sequence_length = input_shape[0],input_shape[1]
        i = tf.range(sequence_length)[:,None]
        j = tf.range(sequence_length)
        mask = tf.cast(i>=j,tf.int32)
        mask = tf.reshape(mask,(1,input_shape[1],input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size,-1),
                tf.convert_to_tensor([1,1]),
            ],
            axis =0,
        )
        return tf.tile(mask,mult)
    

    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'latent_dim': self.latent_dim,
            'num_heads': self.num_heads
        })
        return config


In [43]:
tf.range(0,20,delta = 1)

<tf.Tensor: shape=(20,), dtype=int32, numpy=
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int32)>

In [57]:
# define emmbedding dimensions, latent dimensions, and number of heads
embed_dim = 100
latent_dim = 256
num_heads = 2

#Encoder
encoder_inputs = tf.keras.Input(shape = (None,), dtype = "int64", name = "encoder_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)

encoder = tf.keras.Model(encoder_inputs, encoder_outputs, name = "encoder")

#Decoder
decoder_inputs = tf.keras.Input(shape = (None,), dtype = "int64", name = "decoder_inputs")
encoder_seq_inputs = tf.keras.Input(shape = (None, embed_dim), name = "encoder_seq_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoder_seq_inputs)

x = tf.keras.layers.Dropout(0.5)(x)

decoder_outputs = tf.keras.layers.Dense(vocab_size, activation = tf.nn.log_softmax)(x)

decoder = tf.keras.Model([decoder_inputs, encoder_seq_inputs], decoder_outputs, name = "decoder")

# Define the final model
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name = "transformer"
)


In [58]:
logits = transformer((inp['encoder_inputs'],inp['decoder_inputs']))

In [59]:
logits.shape

TensorShape([64, 50, 15000])

In [60]:
epochs = 1
losses = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
transformer.summary()

transformer.compile(
    "adam", loss = losses, metrics = ["accuracy"]
)
# transformer.fit(train_ds, epochs = epochs, validation_data = val_ds)


In [61]:
epochs =20
transformer.fit(train_ds.repeat(),
                epochs = epochs,
                steps_per_epoch=500, 
                validation_data = val_ds.repeat(),
               validation_steps = 20,
               callbacks =tf.keras.callbacks.EarlyStopping(patience=3)
               )

Epoch 1/20


W0000 00:00:1727521650.895296      97 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m499/500[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.6753 - loss: 3.6152

W0000 00:00:1727521667.001189      98 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 24ms/step - accuracy: 0.6754 - loss: 3.6107 - val_accuracy: 0.7340 - val_loss: 1.8496
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7287 - loss: 1.9137 - val_accuracy: 0.7420 - val_loss: 1.7230
Epoch 3/20
[1m499/500[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.7359 - loss: 1.7826

W0000 00:00:1727521688.921075      95 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.7359 - loss: 1.7825 - val_accuracy: 0.7405 - val_loss: 1.6857
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7390 - loss: 1.7250 - val_accuracy: 0.7512 - val_loss: 1.5775
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.7471 - loss: 1.6230 - val_accuracy: 0.7577 - val_loss: 1.5062
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7545 - loss: 1.5543 - val_accuracy: 0.7734 - val_loss: 1.4098
Epoch 7/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7599 - loss: 1.4954 - val_accuracy: 0.7631 - val_loss: 1.4437
Epoch 8/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.7675 - loss: 1.4280 - val_accuracy: 0.7612 - val_loss: 1.4418
Epoch 9/20
[1m500/500[0m 

<keras.src.callbacks.history.History at 0x7c8a818f3a90>

In [62]:
transformer.save_weights('english_hindi_model.weights.h5')

In [63]:
transformer.evaluate(val_ds)

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7674 - loss: 1.3974


[1.3781664371490479, 0.7703515887260437]