This notebook explores the capabilities of a recent state-of-the-art multilingual model: Language-agnostic BERT Sentence Embedding (LaBSE).

In [1]:
! pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/af/c1/015648a2186b25c6de79d15bec40d3d946fcf1dd5067d1c1b28009506486/bert-for-tf2-0.14.6.tar.gz (40kB)
[K     |████████                        | 10kB 14.2MB/s eta 0:00:01[K     |████████████████                | 20kB 1.7MB/s eta 0:00:01[K     |████████████████████████▏       | 30kB 2.2MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 1.8MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import bert

In [3]:
# Record the TensorFlow version this notebook was executed with.
tf.__version__

'2.3.0'

In [4]:
# Fixed number of token positions per sentence; used both for the model's
# input shape and for padding/truncating the tokenized text.
MAX_SEQ_LENGTH = 64
# TF Hub handle of the LaBSE model that is loaded as a Keras layer.
LABSE_model_URL = "https://tfhub.dev/google/LaBSE/1"

In [5]:
def getModel(model_url, max_seq_length):
  """Build a Keras model that turns BERT-style inputs into unit-length sentence embeddings.

  Args:
    model_url: TF Hub handle of the LaBSE model to load.
    max_seq_length: length of each of the three int32 input sequences
      (word ids, mask, segment ids).

  Returns:
    A tuple `(model, hub_layer)`: the Keras model whose output is the
    L2-normalised pooled output of the hub layer, and the hub layer itself
    (returned so callers can reach the objects stored on it).
  """
  # trainable=True keeps the hub weights updatable, so the same model can be
  # fine-tuned on a downstream task.
  encoder = hub.KerasLayer(handle=model_url, trainable=True, name='labse')

  # Three int32 inputs of identical shape, in the order the hub layer is called with.
  input_names = ('input_word_ids', 'input_mask', 'input_segment_ids')
  model_inputs = [
      tf.keras.Input(shape=(max_seq_length, ), dtype=tf.int32, name=input_name)
      for input_name in input_names
  ]

  # The hub layer yields two tensors; only the first (the pooled, one-vector-per-
  # sentence output) is used here.
  sentence_vector, _ = encoder(model_inputs)

  # Scale every embedding to unit L2 norm so that a plain dot product between
  # two embeddings equals their cosine similarity.
  normalizer = tf.keras.layers.Lambda(
      lambda vec: tf.nn.l2_normalize(vec, axis=1), name='l2_normalized_pooling')
  unit_vector = normalizer(sentence_vector)

  return tf.keras.Model(inputs=model_inputs, outputs=unit_vector), encoder

In [6]:
# Keep both the full Keras model (used for encoding) and the raw hub layer
# (used to read the tokenizer assets).
labse_model, labse_layer = getModel(
    model_url=LABSE_model_URL, max_seq_length=MAX_SEQ_LENGTH)

In [7]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
labse_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_segment_ids (InputLayer)  [(None, 64)]         0                                            
__________________________________________________________________________________________________
labse (KerasLayer)              [(None, 768), (None, 470926849   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [10]:
# The object loaded by the hub layer exposes a vocabulary file and a casing
# flag; read both from it and hand them to the BERT tokenizer.
labse_assets = labse_layer.resolved_object
vocab_file = labse_assets.vocab_file.asset_path.numpy()
do_lower_case = labse_assets.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=do_lower_case)

In [11]:
# Sanity check: tokenize an English sentence with the tokenizer built above.
tokenizer.tokenize("Thanks, I like this application")

['Thanks', ',', 'I', 'like', 'this', 'application']

In [12]:
# Same check on a Hindi sentence; the output shows one word split into
# several pieces, the later ones prefixed with '##'.
tokenizer.tokenize("धन्यवाद, मुझे यह एप्लीकेशन पसंद है")


['धन्यवाद', ',', 'मुझे', 'यह', 'एप', '##्ली', '##के', '##शन', 'पसंद', 'है']

In [13]:
def create_input(input_strings, tokenizer, max_seq_length):
  """Convert raw strings into fixed-length BERT-style input arrays.

  Each string is tokenized, wrapped as "[CLS]" tokens "[SEP]", converted to
  ids and padded with 0 up to `max_seq_length`. Over-long inputs are
  truncated at the token level *before* the special tokens are added, so a
  truncated sequence still ends with "[SEP]" (slicing after adding the
  special tokens would silently drop it).

  Args:
    input_strings: iterable of strings, one per sentence.
    tokenizer: object providing `tokenize(text)` and
      `convert_tokens_to_ids(tokens)`.
    max_seq_length: number of positions in every output row.

  Returns:
    A tuple of three numpy arrays with one row per input string:
    token ids, attention mask (1 for real tokens, 0 for padding) and
    segment ids (all 0).
  """
  # Two positions are reserved for "[CLS]" and "[SEP]".
  token_budget = max(max_seq_length - 2, 0)

  input_ids_all, input_mask_all, segment_ids_all = [], [], []
  for input_string in input_strings:
    # Tokenize input, keeping only as many word pieces as fit next to the
    # two special tokens.
    word_pieces = tokenizer.tokenize(input_string)[:token_budget]
    input_tokens = ["[CLS]"] + word_pieces + ["[SEP]"]

    # The final slice only matters for max_seq_length < 2, where even the
    # two special tokens do not fit.
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)[:max_seq_length]
    sequence_length = len(input_ids)

    # Padding: ids and mask are both filled with 0 beyond the real tokens.
    padding = [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids + padding)
    input_mask_all.append([1] * sequence_length + padding)
    segment_ids_all.append([0] * max_seq_length)

  return np.array(input_ids_all), np.array(input_mask_all), np.array(segment_ids_all)


In [20]:
def encode(input_text):
  """Embed a batch of sentences with the LaBSE model.

  Args:
    input_text: list of strings; each one is tokenized and padded/truncated
      to MAX_SEQ_LENGTH by `create_input`.

  Returns:
    The output of `labse_model` for the batch (one L2-normalised embedding
    per input string).
  """
  model_inputs = create_input(input_text, tokenizer, MAX_SEQ_LENGTH)
  # The model takes [word ids, mask, segment ids] in that order, which is the
  # order create_input returns them in.
  return labse_model(list(model_inputs))

In [40]:
# The same two sentences in three forms, aligned by index:
# English, Hindi in Devanagari script, and Hindi written in Latin script ("Hinglish").
eng_sentence = ["Thanks, I like this chair", "I didn't like it"]
hi_sentence = ["धन्यवाद, मुझे यह कुर्सी पसंद है", "मुझे यह पसंद नहीं आया"]
hinglish_sentence = ["dhanyawaad, mujhe yeh kursi pasand hai", "Mujhe yeh pasand nahi aayi"]

In [41]:
# Embed each of the three sentence lists (English, Hindi, Hinglish) in turn.
en_embeddings, hi_embeddings, hinglish_embeddings = (
    encode(sentences)
    for sentences in (eng_sentence, hi_sentence, hinglish_sentence))

In [42]:
# Shape of the English embedding batch: one row per input sentence.
en_embeddings.shape

TensorShape([2, 768])

In [43]:
# English-Hindi similarity: entry [i, j] is the dot product of English
# sentence i with Hindi sentence j. The model L2-normalises its output, so
# each entry is a cosine similarity.
en_hi_similarity = np.matmul(en_embeddings, np.transpose(hi_embeddings))
print (en_hi_similarity)

[[0.9189365  0.39174503]
 [0.4012831  0.9497982 ]]


In [44]:
# English-Hinglish similarity: entry [i, j] is the dot product of English
# sentence i with romanized-Hindi sentence j (cosine similarity, since the
# model L2-normalises its output).
en_hinglish_similarity = np.matmul(en_embeddings, np.transpose(hinglish_embeddings))
print (en_hinglish_similarity)

[[0.40627664 0.12593725]
 [0.03425898 0.16832921]]


In [45]:
# Three sentence lists, one per language, aligned by index.
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]

english_embeddings, italian_embeddings, japanese_embeddings = (
    encode(sentences)
    for sentences in (english_sentences, italian_sentences, japanese_sentences))

# Print one similarity matrix per language pair, in this order. Rows follow
# the first language of the pair, columns the second.
language_pairs = (
    (english_embeddings, italian_embeddings),    # English-Italian similarity
    (english_embeddings, japanese_embeddings),   # English-Japanese similarity
    (italian_embeddings, japanese_embeddings),   # Italian-Japanese similarity
)
for row_embeddings, col_embeddings in language_pairs:
  print (np.matmul(row_embeddings, np.transpose(col_embeddings)))

[[0.63192904 0.3061977  0.44297487]
 [0.11652687 0.8596667  0.35940546]
 [0.1480399  0.32447964 0.95426506]]
[[0.93567216 0.54030645 0.46792305]
 [0.31804204 0.7622249  0.3608588 ]
 [0.36750704 0.42791563 0.81714547]]
[[0.5343719  0.25018615 0.19974725]
 [0.30140817 0.71333206 0.4064753 ]
 [0.3850308  0.47767898 0.86742973]]


#### References

1. https://tfhub.dev/google/LaBSE/1
2. [Language agnostic BERT Sentence Embedding](https://arxiv.org/pdf/2007.01852.pdf)