## **Tensorflow prerequisites**

In [5]:
pip install bert-for-tf2

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
     -------------------------------------- 41.2/41.2 kB 657.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py): started
  Building wheel for bert-for-tf2 (setup.py): finished with status 'done'
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30557 sha256=9c6702cac0445ee3e3c8ef21e7a33993aaf87af97682e10ea660958a5bf16b44
  Stored in directory

In [6]:
pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp37-cp37m-win_amd64.whl (5.0 MB)
     ---------------------------------------- 5.0/5.0 MB 2.1 MB/s eta 0:00:00
Collecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.1-cp37-cp37m-win_amd64.whl (455.9 MB)
     -------------------------------------- 455.9/455.9 MB 2.2 MB/s eta 0:00:00
Collecting tensorflow-hub>=0.8.0
  Downloading tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
     -------------------------------------- 100.6/100.6 kB 6.0 MB/s eta 0:00:00
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     ---------------------------------------- 65.5/65.5 kB 3.5 MB/s eta 0:00:00
Collecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
     ---------------------------------------- 5.9/5.9 MB 5.5 MB/s eta 0:00:00
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-win_amd64.whl (896 kB)
     ----------------------

In [2]:
!pip install tensorflow-datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.8.2-py3-none-any.whl (5.3 MB)
     ---------------------------------------- 5.3/5.3 MB 5.7 MB/s eta 0:00:00
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-1.12.0-py3-none-any.whl (52 kB)
     ---------------------------------------- 52.3/52.3 kB ? eta 0:00:00
Collecting etils[enp,epath]>=0.9.0
  Downloading etils-0.9.0-py3-none-any.whl (140 kB)
     -------------------------------------- 140.1/140.1 kB 8.1 MB/s eta 0:00:00
Collecting dill
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dm-tree
  Downloading dm_tree-0.1.8-cp37-cp37m-win_amd64.whl (102 kB)
     -------------------------------------- 102.1/102.1 kB 5.7 MB/s eta 0:00:00
Collecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.59.0-py2.

## **MuRIL model**

#### ***Prerequisites***

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance

In [3]:
def get_model(model_url, max_seq_length):
    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
    )

    muril_layer = hub.KerasLayer(model_url, trainable=True)
    outputs = muril_layer(inputs)

    assert 'sequence_output' in outputs
    assert 'pooled_output' in outputs
    assert 'encoder_outputs' in outputs
    assert 'default' in outputs
    return tf.keras.Model(inputs=inputs,outputs=outputs["pooled_output"]), muril_layer
     

max_seq_length = 128
muril_model, muril_layer = get_model(
model_url="https://tfhub.dev/google/MuRIL/1", max_seq_length=max_seq_length)
     

vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)


def create_input(input_strings, tokenizer, max_seq_length):
    input_ids_all, input_mask_all, input_type_ids_all = [], [], []
    for input_string in input_strings:
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequence_length = min(len(input_ids), max_seq_length)
    
        if len(input_ids) >= max_seq_length:
            input_ids = input_ids[:max_seq_length]
        else:
            input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

        input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

        input_ids_all.append(input_ids)
        input_mask_all.append(input_mask)
        input_type_ids_all.append([0] * max_seq_length)
  
    return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)
     

def encode(input_text):
    input_ids, input_mask, input_type_ids = create_input(input_text, 
                                                       tokenizer, 
                                                       max_seq_length)
    inputs = dict(
        input_word_ids=input_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
    )
    return muril_model(inputs)

#### ***Examples***

In [7]:
sentences = ["खेल", "पेड़","पत्ते","नीचे","बैठना"]
embeddings = encode(sentences)

     
dst_1 = distance.euclidean(np.array(embeddings[0]), 
                           np.array(embeddings[1]))
print("Distance between {} & {} is {}".format(sentences[0],
                                                sentences[1],
                                                dst_1))

dst_2 = distance.euclidean(np.array(embeddings[1]), 
                           np.array(embeddings[2]))
print("Distance between {} & {} is {}".format(sentences[1],
                                                sentences[2],
                                                dst_1))

dst_2 = distance.euclidean(np.array(embeddings[2]), 
                           np.array(embeddings[3]))
print("Distance between {} & {} is {}".format(sentences[2],
                                                sentences[3],
                                                dst_1))

Distance between खेल & पेड़ is 0.012261751107871532
Distance between पेड़ & पत्ते is 0.012261751107871532
Distance between पत्ते & नीचे is 0.012261751107871532


In [4]:
sentences=["पेड़", "ताश", "खेल"]
embeddings = encode(sentences)
     

dst_1 = distance.euclidean(np.array(embeddings[0]), 
                           np.array(embeddings[1]))
print("Distance between {} & {} is {}".format(sentences[0],
                                                sentences[1],
                                                dst_1))

dst_2 = distance.euclidean(np.array(embeddings[1]), 
                           np.array(embeddings[2]))
print("Distance between {} & {} is {}".format(sentences[1],
                                                sentences[2],
                                                dst_2))

Distance between पेड़ & ताश is 0.0185878686606884
Distance between ताश & खेल is 0.01976960524916649


In [5]:
sentences=["पेड़", "शिखा", "खेल"]
embeddings = encode(sentences)


dst_1 = distance.euclidean(np.array(embeddings[0]), 
                           np.array(embeddings[1]))
print("Distance between {} & {} is {}".format(sentences[0],
                                                sentences[1],
                                                dst_1))

dst_2 = distance.euclidean(np.array(embeddings[1]), 
                           np.array(embeddings[2]))
print("Distance between {} & {} is {}".format(sentences[1],
                                                sentences[2],
                                                dst_2))

Distance between पेड़ & शिखा is 0.013731339015066624
Distance between शिखा & खेल is 0.015045817010104656


In [6]:
sentences=["पेड़", "चादर", "खेल"]
embeddings = encode(sentences)
     

dst_1 = distance.euclidean(np.array(embeddings[0]), 
                           np.array(embeddings[1]))
print("Distance between {} & {} is {}".format(sentences[0],
                                                sentences[1],
                                                dst_1))

dst_2 = distance.euclidean(np.array(embeddings[1]), 
                           np.array(embeddings[2]))
print("Distance between {} & {} is {}".format(sentences[1],
                                                sentences[2],
                                                dst_2))

Distance between पेड़ & चादर is 0.010924091562628746
Distance between चादर & खेल is 0.01224229484796524
