In [2]:
import requests
from io import StringIO
import pandas as pd


In [3]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
data = StringIO(res.text)
data = pd.read_csv(data, sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [4]:
sentences1 = data['sentence_A'].tolist()
sentences2 = data['sentence_B'].tolist()
sentences1[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [5]:
sentences = sentences1 + sentences2

In [6]:
len(sentences)

9000

In [7]:
# This isn't a particularly large number, so let's pull in a few more similar datasets.
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]


In [8]:

for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())




In [9]:
len(sentences)

20470

In [10]:
len(set(sentences)  )

14505

In [11]:
sentences = [sentence for sentence in set(sentences) if isinstance(sentence, str) and len(sentence) > 0]
sentences[0:5]

['She insisted, though, that it not be published until after her death.',
 'reverse (a direction, attitude, or course of action).',
 'He playfully chided the Senate\'s "little bitty tax relief plan."',
 'The boy is lying on a couch with a puppy.',
 'A cruise ship is in front of a docking area.']

In [12]:
len(sentences)

14504

In [13]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [14]:
from sentence_transformers import SentenceTransformer
# all-MiniLM-L6-v2 is fast, but not as accurate as bert-base-nli-mean-tokens
# You can change the model to see how it affects the results
model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence_embeddings = model.encode(sentences, show_progress_bar=True)
sentence_embeddings.shape

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 454/454 [02:12<00:00,  3.44it/s]


(14504, 768)

In [30]:
import faiss
d = sentence_embeddings.shape[1]  # dimension
d

768

In [31]:
index = faiss.IndexFlatL2(d)

In [32]:
index.is_trained

True

In [33]:
index.add(sentence_embeddings)

In [34]:
k = 4 # Top 4 results
xq = model.encode(["A man running in the football field"])

In [35]:
%%time
D, I = index.search(xq, k)  # actual search
I


CPU times: total: 0 ns
Wall time: 3.19 ms


array([[ 7952, 12890,  5329, 12861]], dtype=int64)

In [36]:
for i in range(k):
    print(f"Result {i + 1} : {sentences[I[0][i]]}")

Result 1 : A group of football players is running in the field
Result 2 : A group of football players running down the field.
Result 3 : A group of people playing football is running in the field
Result 4 : A man in a football uniform is running with a football during a game.


In [37]:
baseline = I[0]

# Partioning
Using this method, we would take a query vector xq, identify the cell it belongs to, and then use our IndexFlatL2 (or another metric) to search between the query vector and all other vectors belonging to that specific cell.

So, we are reducing the scope of our search, producing an approximate answer, rather than exact (as produced through exhaustive search).

To implement this, we first initialize our index using IndexFlatL2 — but this time, we are using the L2 index as a quantizer step — which we feed into the partitioning IndexIVFFlat index.

In [46]:
nlist = 20
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [49]:
index.is_trained

True

In [None]:
index.train(sentence_embeddings)
index.is_trained

In [50]:
index.add(sentence_embeddings)
index.ntotal

14504

In [51]:
%%time
D, I = index.search(xq, k)
I

CPU times: total: 0 ns
Wall time: 1.01 ms


array([[ 7952, 12890,  5329, 12861]], dtype=int64)

In [52]:
for i in range(k):
    print(f"Result {i + 1} : {sentences[I[0][i]]}")

Result 1 : A group of football players is running in the field
Result 2 : A group of football players running down the field.
Result 3 : A group of people playing football is running in the field
Result 4 : A man in a football uniform is running with a football during a game.


In [53]:
test_compare = [ True if baseline[i] == I[0][i] else False  for i in range(k)]
test_compare
# if you increase the value of nlist speed is gonna increase/ accuracy will decrease 

[True, True, True, True]

In [54]:
print(baseline, " ", I)



[ 7952 12890  5329 12861]   [[ 7952 12890  5329 12861]]


In [55]:
# This is Inverted File with Product Quantization (IVFPQ)
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits) 

In [56]:
index.train(sentence_embeddings)
index.is_trained

True

In [57]:
index.add(sentence_embeddings)
index.ntotal

14504

In [58]:
%%time
D, I = index.search(xq, k)
I

CPU times: total: 0 ns
Wall time: 0 ns


array([[ 7168, 10773, 12861, 12890]], dtype=int64)

In [59]:
for i in range(k):
    print(f"Result {i + 1} : {sentences[I[0][i]]}")

Result 1 : A football player is running past an official carrying a football
Result 2 : A person playing football is running past an official carrying a football
Result 3 : A man in a football uniform is running with a football during a game.
Result 4 : A group of football players running down the field.


In [60]:
test_compare = [ True if baseline[i] == I[0][i] else False  for i in range(k)]
test_compare
# if you increase the value of nlist speed is gonna increase/ accuracy will decrease 

[False, False, False, False]

In [61]:
print(baseline, " ", I)



[ 7952 12890  5329 12861]   [[ 7168 10773 12861 12890]]
