In [1]:
import requests
from io import StringIO
import pandas as pd

In [2]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
# create dataframe
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [3]:
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [4]:
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have ~4.5K unique sentences

4802

In [5]:
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [6]:
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

In [7]:
len(set(sentences))

14505

In [8]:
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [9]:
from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(14504, 768)

In [10]:
!pip install faiss-gpu



In [11]:
import faiss


In [12]:
d = sentence_embeddings.shape[1]
d

768

In [13]:
index = faiss.IndexFlatL2(d)

In [14]:
index.is_trained

True

In [15]:
index.add(sentence_embeddings)

In [16]:
index.ntotal

14504

In [18]:
k = 4
xq = model.encode(["Someone sprints with a football"])

In [19]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[1465 4653 1339 9118]]
CPU times: user 16.4 ms, sys: 899 µs, total: 17.3 ms
Wall time: 22.6 ms


In [20]:
[f'{i}: {sentences[i]}' for i in I[0]]

['1465: A group of football players is running in the field',
 '4653: A group of people playing football is running in the field',
 '1339: Two groups of people are playing football',
 '9118: A person playing football is running past an official carrying a football']

In [21]:
import numpy as np

vecs = np.zeros((k, d))
# then iterate through each ID from I and add the reconstructed vector to our zero-array
for i, val in enumerate(I[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

In [35]:
vecs.shape

(4, 768)

In [36]:
vecs[0][:100]

array([ 0.01627072,  0.2232592 , -0.15037425, -0.30747271, -0.27122465,
       -0.10593155, -0.06460934,  0.04738171, -0.73349047, -0.37657681,
       -0.76762789,  0.16902882,  0.53107649,  0.51176697,  1.14415824,
       -0.08562881, -0.67240071, -0.96637076,  0.02545465, -0.21559809,
       -1.25656545, -0.82982159, -0.09825006, -0.21850838,  0.50610238,
        0.10527924,  0.50396848,  0.6524294 , -1.39458752,  0.65847486,
       -0.21525319, -0.22487473,  0.818183  ,  0.08464295, -0.76141769,
       -0.28928289, -0.09825794, -0.73046207,  0.07855801, -0.84354597,
       -0.59242105,  0.7747131 , -1.20920527, -0.22757922, -1.30733585,
       -0.23081493, -1.31322539,  0.01629098, -0.97285485,  0.19308187,
        0.47424555,  1.18920887, -1.96741295, -0.70061141, -0.2963869 ,
        0.60533738,  0.62407422, -0.70340371, -0.86754245,  0.17673171,
       -0.19170482, -0.02951987,  0.22623563, -0.16695446, -0.80402559,
       -0.45918921,  0.69675452, -0.24928184, -1.01478708, -0.92

In [37]:
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [38]:
index.is_trained

False

In [39]:
index.train(sentence_embeddings)
index.is_trained  # check if index is now trained

True

In [40]:
index.add(sentence_embeddings)
index.ntotal  # number of embeddings indexed

14504

In [41]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[1465 4653 1339 9118]]
CPU times: user 729 µs, sys: 984 µs, total: 1.71 ms
Wall time: 1.74 ms


In [42]:
[f'{i}: {sentences[i]}' for i in I[0]]

['1465: A group of football players is running in the field',
 '4653: A group of people playing football is running in the field',
 '1339: Two groups of people are playing football',
 '9118: A person playing football is running past an official carrying a football']

In [43]:
index.nprobe = 10

In [44]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[1465 4653 1339 9118]]
CPU times: user 4.4 ms, sys: 29 µs, total: 4.43 ms
Wall time: 3.67 ms


In [45]:
[f'{i}: {sentences[i]}' for i in I[0]]

['1465: A group of football players is running in the field',
 '4653: A group of people playing football is running in the field',
 '1339: Two groups of people are playing football',
 '9118: A person playing football is running past an official carrying a football']

In [46]:
index.make_direct_map()

In [47]:
index.reconstruct(7460)[:100]

array([ 0.3074015 ,  0.29784325,  1.169849  ,  0.49171206,  0.15579304,
        0.2688289 ,  0.7620417 ,  0.20051564,  0.27143443, -0.3085456 ,
       -1.0383936 ,  0.5424543 , -0.07457361,  0.7191684 ,  0.98483235,
       -0.3183309 , -0.80500513, -0.03861646,  1.7780961 ,  0.06267159,
       -0.4335138 ,  0.536375  , -0.10101058, -0.7650174 ,  0.20436549,
        0.68602496,  0.12260347, -0.33803013, -1.3555557 , -0.32722074,
        0.16515845,  0.626782  ,  0.2441674 , -0.45406246, -0.53605086,
        0.27161562,  0.7422892 , -0.51232   ,  0.3075459 ,  0.00270357,
        0.42943513,  0.45058873,  0.06177604, -0.177378  , -0.6183112 ,
       -0.44597647,  0.6259822 ,  0.38553518, -1.0830989 , -0.11827786,
       -0.3288778 , -0.261656  ,  0.9885873 , -0.35565   , -0.6618367 ,
       -0.369806  , -0.40991232, -0.8843489 , -0.4786378 ,  0.3552321 ,
       -0.34558553, -0.6263227 ,  0.5656237 ,  0.9058484 , -0.58895785,
       -0.02673524,  0.25150937, -0.2671459 , -1.2325639 , -0.54

In [48]:
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

In [49]:
index.is_trained

False

In [50]:
index.train(sentence_embeddings)

In [51]:
index.add(sentence_embeddings)

In [52]:
index.nprobe = 10

In [53]:
%%time
D, I = index.search(xq, k)
print(I)

[[ 644  706 1444 1339]]
CPU times: user 1.48 ms, sys: 29 µs, total: 1.51 ms
Wall time: 1.52 ms


In [54]:
[f'{i}: {sentences[i]}' for i in I[0]]

['644: Football players are on the field.',
 '706: The crowd is watching the football at the game',
 '1444: position in football played by a team member',
 '1339: Two groups of people are playing football']