In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple
import numpy as np

### Set up the LASER environment variable as shown below. This notebook lives inside the notebooks folder, so LASER_PATH is one directory above it.

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict
import re

# Show full cell contents when displaying DataFrames. `None` disables
# truncation; the previously used -1 was deprecated in pandas 1.0 and is
# rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# This notebook sits one directory below the LASER checkout, so the LASER root
# is the parent directory. Export it through the LASER environment variable.
LASER_PATH = ".."
os.environ["LASER"] = LASER_PATH

#### Set up DATA_PATH, CACHE_PATH, and MODEL_PATH as shown below
The code below is inspired by this blog post: https://medium.com/the-artificial-impostor/multilingual-similarity-search-using-pretrained-bidirectional-lstm-encoder-e34fac5958b0

In [3]:
# Put LASER's source directories on the import path (same order as before).
sys.path.extend([LASER_PATH + '/source', LASER_PATH + '/source/lib'])

# Pretrained models, raw Tatoeba data, and a local cache for intermediate files.
MODEL_PATH = Path("../models")
DATA_PATH = Path("../data/tatoeba/v1/")
CACHE_PATH = Path("cache/")
# Create the cache directory on first run; a no-op if it already exists.
CACHE_PATH.mkdir(exist_ok=True)

from indexing import IndexLoad, IndexTextOpen, IndexTextQuery ,IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [4]:
# Matches any run of whitespace. A raw string is used so that `\s` reaches the
# regex engine unchanged instead of being an invalid string escape sequence.
SPACE_NORMALIZER = re.compile(r"\s+")
# Lightweight record with three fields: srcs, tokens, lengths.
Batch = namedtuple('Batch', 'srcs tokens lengths')

In [5]:
def MarginRatio(em, ofp, params, args, stats):
    """Write the top-1 search hit for every row of `em` that passes a ratio test.

    `params.idx.search(em, args.margin_k)` is queried once; it must return two
    2-D arrays `D` and `I` (presumably distances and neighbour indices, as
    returned by a FAISS index -- confirm against the caller). A row `n` counts
    as a match when `D[n, 0] / mean(D[n, :]) <= args.threshold`.

    Args:
        em: query batch passed straight to `params.idx.search`.
        ofp: writable text stream that receives tab-separated output lines.
        params: object exposing `idx` (searchable index) and `T`, `R`, which
            are forwarded to `IndexTextQuery` to fetch the matched text.
        args: object exposing `margin_k`, `threshold` and `include_source`
            ('always' writes the source line for every row, 'matches' only for
            rows that match; any other value never writes it).
        stats: object whose counters are updated in place: `nbs` is incremented
            once per row, `nbp` once per match.

    Note:
        The source text is read from a module-level sequence named `sentences`
        that is not defined in this function; it must exist (row-aligned with
        `em`) whenever `include_source` is 'always' or 'matches'.
        NOTE(review): confirm where `sentences` is expected to come from.
    """
    D, I = params.idx.search(em, args.margin_k)
    Mean = D.mean(axis=1)
    for n in range(D.shape[0]):
        is_match = D[n, 0] / Mean[n] <= args.threshold

        # The source line is emitted per row. Previously the 'always' case ran
        # before the loop and referenced `n` before it was assigned.
        if args.include_source == 'always' or (
                is_match and args.include_source == 'matches'):
            ofp.write('{:d}\t{:6.1f}\t{}\n'
                      .format(stats.nbs, 0.0, sentences[n].replace('@@ ', '')))

        if is_match:
            txt = IndexTextQuery(params.T, params.R, I[n, 0])
            ofp.write('{:d}\t{:7.5f}\t{}\n'.format(stats.nbs, D[n, 0], txt))
            stats.nbp += 1

        stats.nbs += 1

In [6]:
def buffered_read(fp, buffer_size):
    """Yield stripped lines from `fp` grouped into lists.

    Each line is stripped of surrounding whitespace. A list is yielded as soon
    as it holds at least `buffer_size` entries; whatever is left once `fp` is
    exhausted is yielded as a final, shorter list. Nothing is yielded for an
    empty `fp`.
    """
    chunk = []
    for line in fp:
        chunk.append(line.strip())
        if len(chunk) < buffer_size:
            continue
        yield chunk
        chunk = []

    if chunk:
        yield chunk

### Tokenization and BPE extraction
We are working with two input files:

1) tatoeba.hin-eng.hin ( Tatoeba test data set Hindi Sentences) 

2) tatoeba.hin-eng.eng  ( Tatoeba test data set English Sentences) 

In [7]:
# Tokenize the Hindi side of the Tatoeba test set into the cache directory.
Token(
    str(DATA_PATH.joinpath("tatoeba.hin-eng.hin")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin")),
    lang="hin",
    romanize=False,
    lower_case=True,
    gzip=False,
    verbose=True,
    over_write=False,
)

 - Tokenizer: tatoeba.hin-eng.hin in language hin  


In [8]:
# Tokenize the English side of the Tatoeba test set into the cache directory.
Token(
    str(DATA_PATH.joinpath("tatoeba.hin-eng.eng")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng")),
    lang="en",
    romanize=False,
    lower_case=True,
    gzip=False,
    verbose=True,
    over_write=False,
)

 - Tokenizer: tatoeba.hin-eng.eng in language en  


In [9]:
# Apply the 93-language BPE codes to the tokenized English sentences.
bpe_codes = str(MODEL_PATH.joinpath("93langs.fcodes"))
BPEfastApply(
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng.bpe")),
    bpe_codes,
    verbose=True,
    over_write=False,
)

 - fast BPE: processing tatoeba.hin-eng.eng


In [10]:
# Apply the same 93-language BPE codes to the tokenized Hindi sentences.
bpe_codes = str(MODEL_PATH.joinpath("93langs.fcodes"))
BPEfastApply(
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin.bpe")),
    bpe_codes,
    verbose=True,
    over_write=False,
)

 - fast BPE: processing tatoeba.hin-eng.hin


## Extract Sentence Embeddings

In [11]:
# Load the pretrained BiLSTM sentence encoder.
# NOTE(review): cpu=False presumably requires a GPU -- confirm before running on CPU-only hosts.
encoder = SentenceEncoder(
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    max_sentences=None,
    max_tokens=10000,
    cpu=False,
)

# Encode the BPE-processed Hindi sentences into the cache directory.
EncodeFile(
    encoder,
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin.bpe")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin.enc")),
    verbose=True,
    over_write=False,
)

 - Encoder: tatoeba.hin-eng.hin.bpe to tatoeba.hin-eng.hin.enc
 - Encoder: 1000 sentences in 0s


In [12]:
# Encode the BPE-processed English sentences with the same encoder.
EncodeFile(
    encoder,
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng.bpe")),
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng.enc")),
    verbose=True,
    over_write=False,
)

 - Encoder: tatoeba.hin-eng.eng.bpe to tatoeba.hin-eng.eng.enc
 - Encoder: 1000 sentences in 0s


## Create Index

In [13]:
# Load the English embeddings and build an in-memory 'FlatL2' index over them.
data_en, index_en = IndexCreate(
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.eng.enc")),
    'FlatL2',
    verbose=True,
    save_index=False,
)


 - embedding: cache/tatoeba.hin-eng.eng.enc 1000 examples of dim 1024
 - creating FAISS index


In [14]:
# Load the Hindi embeddings and build an in-memory 'FlatL2' index over them.
data_hin, index_hin = IndexCreate(
    str(CACHE_PATH.joinpath("tatoeba.hin-eng.hin.enc")),
    'FlatL2',
    verbose=True,
    save_index=False,
)

 - embedding: cache/tatoeba.hin-eng.hin.enc 1000 examples of dim 1024
 - creating FAISS index


## Evaluation

In [15]:
# Cross-lingual similarity search in both directions (en=>hin and hin=>en).
err = IndexSearchMultiple(
    [data_en, data_hin],
    [index_en, index_hin],
    langs=["en", "hin"],
    verbose=True,
)

Calculating similarity error (indices):
 - similarity error en=>hin  5.80%
 - similarity error hin=>en  4.80%


### Error Analysis

Read in the tokenized documents:

In [16]:
# Load the tokenized sentences, one list of lines per language, so that
# mismatched search results can be inspected as text.
documents = {}
for lang in ("eng", "hin"):
    # Decode explicitly as UTF-8: the Hindi file contains Devanagari text, and
    # relying on the platform's default encoding can fail or garble it.
    # NOTE(review): assumes the tokenizer writes UTF-8 -- confirm.
    with open(CACHE_PATH / f"tatoeba.hin-eng.{lang}", encoding="utf-8") as fin:
        documents[lang] = fin.readlines()
print(len(documents["eng"]), len(documents["hin"]))

1000 1000


### Misclassified examples in English to Hindi Similarity Search

In [17]:
# Top-1 search of every English embedding against the Hindi index.
_, matched_indices = index_hin.search(data_en, 1)
print(matched_indices.shape)
# A hit is correct when the returned row index equals the query's own row index.
print('Total english sentences with correct index in hindi: ',
      (matched_indices[:, 0] == np.arange(matched_indices.shape[0])).sum())

(1000, 1)
Total english sentences with correct index in hindi:  942


In [18]:

# Print every English sentence whose top-1 Hindi hit is not the aligned sentence.
for idx in np.flatnonzero(matched_indices[:, 0] != np.arange(matched_indices.shape[0])):
    print(
        f'source:  {documents["eng"][idx].strip()}\n'
        f'predict: {documents["hin"][matched_indices[idx, 0]].strip()}\n'
        f'correct: {documents["hin"][idx].strip()}\n'
    )

source:  i accompanied her on a walk .
predict: मैं सफ ़ र पर उसके साथ गया ।
correct: मैं उसके साथ सैर के लिए गया ।

source:  he is possessed of intelligence .
predict: वह हरफनमौला है ।
correct: वह अकलमंद है ।

source:  it 's for free .
predict: यह मुफ ़ ् त का है ।
correct: यह मुफ ़ ् त है ।

source:  this lion is very tame .
predict: टॉम बहुत स ् वार ् थी है ।
correct: यह शेर बहुत पालतू है ।

source:  the novel is centered on the civil war .
predict: शीतयुद ् ध द ् वितीय विश ् वयुद ् ध के बाद शुरू हुआ ।
correct: यह उपन ् यास सिविल वॉर के बारे में है ।

source:  to make matters worse , he fell ill .
predict: कहा जाता है कि वह बीमार है ।
correct: और-तो-और वह बीमार भी पड ़ गया ।

source:  what is the name of this bird ?
predict: इस पक ् षी का नाम क ् या है ?
correct: इस चिड ़ िया का नाम क ् या है ?

source:  that is my book .
predict: वह मेरी किताब है ।
correct: वह मेरा पुस ् तक है ।

source:  she is dressed in white .
predict: उसने लाल कपड ़ े पहने हैं ।
correct: उसने सफ ़ ेद कपड ़ े प

### Misclassified examples in Hindi to English Similarity Search

In [19]:
# Top-1 search of every Hindi embedding against the English index.
_, matched_indices = index_en.search(data_hin, 1)
print(matched_indices.shape)
# A hit is correct when the returned row index equals the query's own row index.
print('Total hindi sentences with correct index in english: ',
      (matched_indices[:, 0] == np.arange(matched_indices.shape[0])).sum())

(1000, 1)
Total hindi sentences with correct index in english:  952


In [20]:
# Print every Hindi sentence whose top-1 English hit is not the aligned sentence.
for idx in np.flatnonzero(matched_indices[:, 0] != np.arange(matched_indices.shape[0])):
    print(
        f'source:  {documents["hin"][idx].strip()}\n'
        f'predict: {documents["eng"][matched_indices[idx, 0]].strip()}\n'
        f'correct: {documents["eng"][idx].strip()}\n'
    )

source:  मैं उसके साथ सैर के लिए गया ।
predict: i 'm with him .
correct: i accompanied her on a walk .

source:  वह अकलमंद है ।
predict: it 's for free .
correct: he is possessed of intelligence .

source:  यह मुफ ़ ् त का है ।
predict: it 's for free .
correct: it is free of charge .

source:  यह उपन ् यास सिविल वॉर के बारे में है ।
predict: are osakans greedy ?
correct: the novel is centered on the civil war .

source:  वह अमीर है हो है पर कंजूस भी है ।
predict: my family is wealthy .
correct: he may be rich , but he is stingy .

source:  और-तो-और वह बीमार भी पड ़ गया ।
predict: it is said that he is sick .
correct: to make matters worse , he fell ill .

source:  इस पक ् षी का नाम क ् या है ?
predict: what is the name of this bird ?
correct: what do you call this bird ?

source:  वह मेरा पुस ् तक है ।
predict: that 's my book .
correct: that is my book .

source:  उसने सफ ़ ेद कपड ़ े पहने हुएँ हैं ।
predict: she 's wearing red .
correct: she is dressed in white .

source:  वह हरफनमौ