In [5]:
pip install bert-for-tf2

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
     -------------------------------------- 41.2/41.2 kB 657.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py): started
  Building wheel for bert-for-tf2 (setup.py): finished with status 'done'
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30557 sha256=9c6702cac0445ee3e3c8ef21e7a33993aaf87af97682e10ea660958a5bf16b44
  Stored in directory

In [6]:
pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp37-cp37m-win_amd64.whl (5.0 MB)
     ---------------------------------------- 5.0/5.0 MB 2.1 MB/s eta 0:00:00
Collecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.1-cp37-cp37m-win_amd64.whl (455.9 MB)
     -------------------------------------- 455.9/455.9 MB 2.2 MB/s eta 0:00:00
Collecting tensorflow-hub>=0.8.0
  Downloading tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
     -------------------------------------- 100.6/100.6 kB 6.0 MB/s eta 0:00:00
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     ---------------------------------------- 65.5/65.5 kB 3.5 MB/s eta 0:00:00
Collecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
     ---------------------------------------- 5.9/5.9 MB 5.5 MB/s eta 0:00:00
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-win_amd64.whl (896 kB)
     ----------------------

In [2]:
!pip install tensorflow-datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.8.2-py3-none-any.whl (5.3 MB)
     ---------------------------------------- 5.3/5.3 MB 5.7 MB/s eta 0:00:00
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-1.12.0-py3-none-any.whl (52 kB)
     ---------------------------------------- 52.3/52.3 kB ? eta 0:00:00
Collecting etils[enp,epath]>=0.9.0
  Downloading etils-0.9.0-py3-none-any.whl (140 kB)
     -------------------------------------- 140.1/140.1 kB 8.1 MB/s eta 0:00:00
Collecting dill
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dm-tree
  Downloading dm_tree-0.1.8-cp37-cp37m-win_amd64.whl (102 kB)
     -------------------------------------- 102.1/102.1 kB 5.7 MB/s eta 0:00:00
Collecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.59.0-py2.

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance

In [15]:
def get_model(model_url, max_seq_length):
    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
    )

    muril_layer = hub.KerasLayer(model_url, trainable=True)
    outputs = muril_layer(inputs)

    assert 'sequence_output' in outputs
    assert 'pooled_output' in outputs
    assert 'encoder_outputs' in outputs
    assert 'default' in outputs
    return tf.keras.Model(inputs=inputs,outputs=outputs["pooled_output"]), muril_layer
     

max_seq_length = 128
muril_model, muril_layer = get_model(
model_url="https://tfhub.dev/google/MuRIL/1", max_seq_length=max_seq_length)
     

vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)


def create_input(input_strings, tokenizer, max_seq_length):
    input_ids_all, input_mask_all, input_type_ids_all = [], [], []
    for input_string in input_strings:
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequence_length = min(len(input_ids), max_seq_length)
    
    if len(input_ids) >= max_seq_length:
        input_ids = input_ids[:max_seq_length]
    else:
        input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

    input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids)
    input_mask_all.append(input_mask)
    input_type_ids_all.append([0] * max_seq_length)
  
    return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)
     

def encode(input_text):
    input_ids, input_mask, input_type_ids = create_input(input_text, 
                                                       tokenizer, 
                                                       max_seq_length)
    inputs = dict(
        input_word_ids=input_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
    )
    return muril_model(inputs


# dst_2 = distance.euclidean(np.array(embeddings[1]), 
#                            np.array(embeddings[2]))
# print("Distance between {} & {} is {}".format(sentences[1],
#                                                 sentences[2],
#                                                 dst_2))
# # Distance between दोस्त & मित्र is 0.009007866494357586     
# # Distance between मित्र & शत्रु is 0.011569375172257423

# # dst_2 > dst_1
# dst_2 = distance.euclidean(np.array(embeddings[0]), 
#                            np.array(embeddings[2]))
# print("Distance between {} & {} is {}".format(sentences[0],
#                                                 sentences[2],
#                                                 dst_2))
# # True

SyntaxError: unexpected EOF while parsing (1639206727.py, line 75)

In [16]:
sentences = ["खेल", "पेड़","पत्ते","नीचे","बैठना"]
     

embeddings = encode(sentences)
print(embeddings)
print(type(embeddings[0][0]), type(embeddings[0][1]))

     

dst_1 = distance.euclidean(np.array(embeddings[0][0]), 
                           np.array(embeddings[0][1]))
print("Distance between {} & {} is {}".format(sentences[0],
                                                sentences[1],
                                                dst_1))

tf.Tensor(
[[ 9.05657280e-03  1.82127524e-02 -7.74277886e-03  1.49575174e-02
  -7.94080738e-03 -8.13474692e-03 -2.09902646e-03 -4.27046558e-03
  -3.39623727e-03 -9.03426856e-03 -9.59986355e-03  4.95837396e-03
   1.44740962e-03  5.81350061e-04 -4.12893109e-03  1.72125213e-02
   1.48790155e-03 -9.81140882e-03  5.08353813e-04 -5.16026001e-03
   5.23602217e-03  5.25832700e-04  6.77105086e-03 -2.50974912e-02
   5.67265972e-03  1.11501443e-03 -7.79086258e-03 -5.89352613e-03
   2.04008044e-04 -2.28850581e-02  1.28805675e-02  1.46551281e-02
   5.90788247e-03 -1.67335905e-02  2.22222637e-02  6.58667879e-03
   8.08604248e-03 -1.49213299e-02  6.31368300e-03 -6.61029667e-03
  -2.14631017e-02 -1.60514228e-02  2.60370458e-03  2.13128608e-02
  -1.96953490e-02 -1.00091763e-03 -2.86470447e-03 -1.25710964e-02
  -4.99460148e-03 -8.87874397e-04 -1.04923574e-02 -3.04827606e-03
   1.55180562e-02  1.23135662e-02 -5.43640135e-03 -1.63651742e-02
   1.31260219e-03 -1.05677033e-02 -7.98007194e-03  2.26366892e-02

In [1]:
!pip freeze

aiohttp==3.8.4
aiosignal==1.3.1
alabaster==0.7.13
anyio @ file:///C:/ci/anyio_1644463705902/work/dist
argon2-cffi @ file:///C:/ci/argon2-cffi_1613038019788/work
async-timeout==4.0.2
asynctest==0.13.0
attrs @ file:///C:/b/abs_09s3y775ra/croot/attrs_1668696195628/work
Babel @ file:///C:/b/abs_a2shv_3tqi/croot/babel_1671782804377/work
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
beautifulsoup4 @ file:///C:/ci/beautifulsoup4_1650292996413/work
bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work
blis==0.7.9
Bottleneck==1.3.7
brotlipy==0.7.0
catalogue==2.0.8
certifi @ file:///C:/b/abs_85o_6fm0se/croot/certifi_1671487778835/work/certifi
cffi @ file:///C:/b/abs_49n3v2hyhr/croot/cffi_1670423218144/work
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
click==8.1.3
colorama @ file:///C:/b/abs_a9ozq0l032/croot/colorama_1672387194846/work
confection==0.0.4
cryptography @ file:///C:/b/abs_8ecplyc3n2/croot/cryptography_1677533105000