# Cross-Lingual Similarity and Semantic Search Engine with Multilingual Universal Sentence Encoder


In [2]:
!pip install tensorflow_text
!pip install simpleneighbors[annoy]
!pip install tqdm



Collecting annoy>=1.16.0; extra == "annoy" (from simpleneighbors[annoy])
[?25l  Downloading https://files.pythonhosted.org/packages/a1/5b/1c22129f608b3f438713b91cd880dc681d747a860afe3e8e0af86e921942/annoy-1.17.0.tar.gz (646kB)
[K     |████████████████████████████████| 655kB 639kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25lerror
[31m  ERROR: Complete output from command /snap/jupyter/6/bin/python -u -c 'import setuptools, tokenize;__file__='"'"'/tmp/pip-install-sog47k_n/annoy/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-e431oyqc --python-tag cp37:[0m
[31m  ERROR: running bdist_wheel
  running build
  running build_py
  creating build
  creating build/lib.linux-x86_64-3.7
  creating build/lib.linux-x86_64-3.7/annoy
  copying annoy/__init__.py -> build/lib.

In [2]:
# Loading the Pre-trained model
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' 

model = hub.load(module_url)

def embed_text(input):
    return model(input)



# Creating a Multilingual Semantic-Similarity Search Engine


## Download Data to Index - news sentences in multiples languages

In [3]:
corpus_metadata = [
    ('ar', 'ar-en.txt.zip', 'News-Commentary.ar-en.ar', 'Arabic'),
    ('zh', 'en-zh.txt.zip', 'News-Commentary.en-zh.zh', 'Chinese'),
    ('en', 'en-es.txt.zip', 'News-Commentary.en-es.en', 'English'),
    ('ru', 'en-ru.txt.zip', 'News-Commentary.en-ru.ru', 'Russian'),
    ('es', 'en-es.txt.zip', 'News-Commentary.en-es.es', 'Spanish'),
]

language_to_sentences = {}
language_to_news_path = {}
for language_code, zip_file, news_file, language_name in corpus_metadata:
    zip_path = tf.keras.utils.get_file(
      fname=zip_file,
      origin='http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/' + zip_file,
      extract=True)
    news_path = os.path.join(os.path.dirname(zip_path), news_file)
    language_to_sentences[language_code] = pd.read_csv(news_path, sep='\t', header=None)[0][:1000]
    language_to_news_path[language_code] = news_path

    print('{:,} {} sentences'.format(len(language_to_sentences[language_code]), language_name))

Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/ar-en.txt.zip
1,000 Arabic sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-zh.txt.zip
1,000 Chinese sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-es.txt.zip
1,000 English sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-ru.txt.zip
1,000 Russian sentences
1,000 Spanish sentences


## Using the pre-trained model to encode sentences

In [None]:
batch_size = 2048
language_to_embeddings = {}
for language_code, zip_file, news_file, language_name in corpus_metadata:
    print('\nComputing {} embeddings'.format(language_name))
    with tqdm(total=len(language_to_sentences[language_code])) as pbar:
        for batch in pd.read_csv(language_to_news_path[language_code], sep='\t',header=None, chunksize=batch_size):
            language_to_embeddings.setdefault(language_code, []).extend(embed_text(batch[0]))
            pbar.update(len(batch))

  0%|          | 0/1000 [00:00<?, ?it/s]


Computing Arabic embeddings


10240it [00:50, 197.02it/s]             

## Building an index of semantic vectors using SimpleNeighbors

In [None]:
%%time

num_index_trees = 40
language_name_to_index = {}
embedding_dimensions = len(list(language_to_embeddings.values())[0][0])
for language_code, zip_file, news_file, language_name in corpus_metadata:
    print('\nAdding {} embeddings to index'.format(language_name))
    index = SimpleNeighbors(embedding_dimensions, metric='dot')

    for i in trange(len(language_to_sentences[language_code])):
        index.add_one(language_to_sentences[language_code][i], language_to_embeddings[language_code][i])

    print('Building {} index with {} trees...'.format(language_name, num_index_trees))
    index.build(n=num_index_trees)
    language_name_to_index[language_name] = index

In [None]:
%%time

num_index_trees = 60
print('Computing mixed-language index')
combined_index = SimpleNeighbors(embedding_dimensions, metric='dot')
for language_code, zip_file, news_file, language_name in corpus_metadata:
    print('Adding {} embeddings to mixed-language index'.format(language_name))
    for i in trange(len(language_to_sentences[language_code])):
        annotated_sentence = '({}) {}'.format(language_name, language_to_sentences[language_code][i])
        combined_index.add_one(annotated_sentence, language_to_embeddings[language_code][i])

print('Building mixed-language index with {} trees...'.format(num_index_trees))
combined_index.build(n=num_index_trees)

### Testing Semantic-search cross-lingual capabilities

In [None]:
sample_query = 'Global warming' 
index_language = 'English'  #["Arabic", "Chinese", "English", "French", "German", "Russian", "Spanish"]
num_results = 10  

query_embedding = embed_text(sample_query)[0]
search_results = language_name_to_index[index_language].nearest(query_embedding, n=num_results)

print('{} sentences similar to: "{}"\n'.format(index_language, sample_query))
search_results

### Mixed-corpus capabilities

In [None]:
sample_query = 'Global warming'  
num_results = 40  

query_embedding = embed_text(sample_query)[0]
search_results = language_name_to_index[index_language].nearest(query_embedding, n=num_results)

print('{} sentences similar to: "{}"\n'.format(index_language, sample_query))
search_results