In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
import tensorflow_datasets as tfds

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

#Creating Embedding and positional embedding

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/cfilt/iitb-english-hindi/" + splits["train"])[:100_000]
df = np.array(df)

##Tokenizing Inputs

In [5]:
hi_text = []
en_text = []

max_len = 0
for i in range(10000):
  hi_text.append(df[i][0]["hi"])
  en_text.append(df[i][0]["en"])

In [6]:
assert len(hi_text) == len(en_text), "length words in input language does not match length of words in output language"

hi_token= tokenizer(hi_text,padding=True,return_tensors='tf')["input_ids"]
en_token= tokenizer(en_text,padding=True,return_tensors='tf')["input_ids"]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


##Creating Embedding of tokens

In [7]:
vocab_size = tokenizer.vocab_size

embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size + 1,output_dim=128)
hi_embedding = embedding_layer(hi_token)
en_embedding = embedding_layer(en_token)

In [8]:
arr= [4,5,6,7]
arr[::3]

[4, 7]

##Positional encoding

In [9]:
# pos_embed_arr = []
# for pos in range(58):
#       for i in range(128):
#         if i % 2 == 0:
#           pe = np.sin(pos/10000**(2*i/128))
#           pos_embed_arr.append(pe)
#         elif i % 2 != 0:
#           pe = np.cos(pos/10000**(2*i/128))
#           pos_embed_arr.append(pe)

# pos_embed_arr = np.array(pos_embed_arr)
# pos_embed_arr = pos_embed_arr.reshape(58,128)
# pos_embed_arr

In [10]:
d_model_encoder = en_embedding.shape[2]
d_model_decoder = hi_embedding.shape[2]
max_token_encoder = en_embedding.shape[1]
max_token_decoder = hi_embedding.shape[1]

In [11]:
class PositionalEncoder(tf.keras.layers.Layer):
  def __init__(self,max_token,d_model,dtype = np.float32,**kwargs):
    super().__init__(dtype = dtype,**kwargs)
    self.d_model = d_model
    self.max_token = max_token
    assert d_model % 2 == 0, "d_model should be even"

    self.pos_embed_arr = []
    for pos in range(self.max_token):
      for i in range(d_model):
        if i % 2 == 0:
          pe = np.sin(pos/10000**(2*i/self.d_model))
          self.pos_embed_arr.append(pe)
        elif i % 2 != 0:
          pe = np.cos(pos/10000**(2*i/self.d_model))
          self.pos_embed_arr.append(pe)

    self.pos_embed_arr = np.array(self.pos_embed_arr)
    self.pos_embed_arr = self.pos_embed_arr.reshape(self.max_token,self.d_model)

  def call(self,embedding_vector):
      return embedding_vector + self.pos_embed_arr

In [12]:
pos_embedding_encoder = PositionalEncoder(max_token_encoder,d_model_encoder)
final_en_embedding = pos_embedding_encoder.call(en_embedding)
final_en_embedding

<tf.Tensor: shape=(10000, 58, 128), dtype=float32, numpy=
array([[[-2.8859451e-04,  1.0006448e+00,  2.3370769e-02, ...,
          9.6733981e-01, -3.0443097e-02,  9.5610255e-01],
        [ 7.9861242e-01,  6.1021829e-01,  7.2178924e-01, ...,
          9.8500586e-01, -1.3819026e-02,  9.8001307e-01],
        [ 8.9874357e-01, -1.7840259e-01,  9.7138232e-01, ...,
          9.6279162e-01, -1.0045442e-02,  9.8164839e-01],
        ...,
        [-9.7420371e-01, -8.4716350e-01, -3.8739714e-01, ...,
          1.0097629e+00,  3.5256356e-02,  1.0242946e+00],
        [-4.9599954e-01, -1.7091677e-01, -9.0891445e-01, ...,
          1.0097629e+00,  3.5256367e-02,  1.0242946e+00],
        [ 4.6171623e-01,  6.4570260e-01, -9.4001794e-01, ...,
          1.0097629e+00,  3.5256382e-02,  1.0242946e+00]],

       [[-2.8859451e-04,  1.0006448e+00,  2.3370769e-02, ...,
          9.6733981e-01, -3.0443097e-02,  9.5610255e-01],
        [ 7.9299575e-01,  6.1716086e-01,  6.7374843e-01, ...,
          9.6360183e-01, 

#Add & Norm Layer

In [21]:
class Add_Norm(tf.keras.layers.Layer):
  def __init__(self,**kwargs):
    super().__init__(dtype=np.float32,**kwargs)
    self.norm_layer = tf.keras.layers.LayerNormalization(axis=-1,epsilon=0.001)

  def call(self,sublayer_output,residual_input):
    return self.norm_layer(sublayer_output + residual_input)

#Multi-Head Attention / Scaled Dot Product Attention

In [16]:
class ScaledDotProduct_Attention(tf.keras.layers.Layer):
  def __init__(self,d_k:int,**kwargs):
    super().__init__(dtype=np.float32,**kwargs)
    self.d_k = d_k

  def call(self,q,k,v):
    dot_product = tf.matmul(q,k,transpose_b=True) #score
    scaled_dot_product = dot_product / tf.sqrt(tf.cast(self.d_k,dtype=tf.float32))
    attention_weight = tf.nn.softmax(scaled_dot_product,axis = -1)
    output = tf.matmul(attention_weight,v)
    return output

In [44]:
class MultiHead_Attention(tf.keras.layers.Layer):
  def __init__(self,d_model:int,heads:int,**kwargs):
    super().__init__(dtype=np.float32,**kwargs)
    self.d_model = d_model
    self.heads = heads
    assert d_model % heads == 0, "d_model must be perfectly divisible by heads"
    self.d_k = d_model // heads
    self.attention_fuction = ScaledDotProduct_Attention(self.d_k)
    self.wq = [tf.keras.layers.Dense(self.d_k,use_bias=False) for _ in range(self.heads)]
    self.wk = [tf.keras.layers.Dense(self.d_k,use_bias=False) for _ in range(self.heads)]
    self.wv = [tf.keras.layers.Dense(self.d_k,use_bias=False) for _ in range(self.heads)]
    self.final_linear_transform = tf.keras.layers.Dense(self.d_model,use_bias=False)
    self.add_norm_layer = Add_Norm()

  def call(self,initial_embedding:tf.Tensor):
    concat_arr = []
    for head in range(self.heads):
      q = self.wq[head](initial_embedding)
      k = self.wk[head](initial_embedding)
      v = self.wv[head](initial_embedding)

      output = self.attention_fuction.call(q,k,v)
      concat_arr.append(output)
    concat_output = tf.concat(concat_arr,axis=-1)
    mha_output = self.final_linear_transform(concat_output)
    residual_output = self.add_norm_layer.call(mha_output,initial_embedding)
    return residual_output

#Feed Forward Network

In [41]:
class FFN(tf.keras.layers.Layer):
  def __init__(self,d_model:int,**kwargs):
    super().__init__(dtype=np.float32,**kwargs)
    self.d_model = d_model
    self.add_norm_layer = Add_Norm()
    self.network = tf.keras.Sequential([
        tf.keras.layers.Dense(self.d_model * 4,activation='relu'),
        tf.keras.layers.Dense(self.d_model)
    ])

  def call(self,mha_output):
    nn_output = self.network(mha_output)
    ffn_output = self.add_norm_layer.call(nn_output,mha_output)
    return ffn_output

#**Encoder**

In [42]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, d_model:int, heads:int,**kwargs):
    super().__init__(dtype=np.float32,**kwargs)
    self.d_model = d_model
    self.heads = heads
    self.mha = MultiHead_Attention(d_model=self.d_model,heads = self.heads)
    self.ffn = FFN(d_model=self.d_model)

  def call(self,input_embedding):
    first_sublayer_out = self.mha.call(initial_embedding = input_embedding)
    ffn_output = self.ffn.call(mha_output = first_sublayer_out)
    encoder_output = ffn_output
    return encoder_output

In [45]:
encoder = Encoder(d_model=d_model_encoder,heads=4)
encoder.call(final_en_embedding)

<tf.Tensor: shape=(10000, 58, 128), dtype=float32, numpy=
array([[[ 0.6185941 ,  1.1694176 , -0.87058103, ...,  1.2707363 ,
         -0.40354115,  0.21266264],
        [ 1.3813236 ,  0.7497443 , -0.39847443, ...,  1.276266  ,
         -0.32571954,  0.23179089],
        [ 1.5614914 , -0.03332208, -0.30900928, ...,  1.2909454 ,
         -0.2619142 ,  0.09310463],
        ...,
        [ 0.01898558, -0.58300585, -1.534879  , ...,  1.365679  ,
         -0.02882093,  0.25455335],
        [ 0.49355948,  0.06056736, -2.040436  , ...,  1.3279662 ,
         -0.11493537,  0.12971139],
        [ 1.3415722 ,  0.80096745, -2.1103952 , ...,  1.2957757 ,
         -0.18106335,  0.0367102 ]],

       [[ 0.61724734,  1.1742388 , -0.86926395, ...,  1.2721894 ,
         -0.40412644,  0.21419272],
        [ 1.398043  ,  0.76609766, -0.4258126 , ...,  1.2420088 ,
         -0.25539163,  0.27239773],
        [ 1.552138  , -0.03990797, -0.26151785, ...,  1.2905993 ,
         -0.2011206 ,  0.18497282],
        .