In [3]:
import hnswlib
import numpy as np
import pandas as pd
import time
import pickle
from sentence_transformers import SentenceTransformer, util

In [4]:
##Load Model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
##Load data
data=pd.read_csv("data.csv",index_col=[0]).reset_index()
data.shape

(100000, 4)

In [9]:
data.head(10)

Unnamed: 0,index,JobTitle
0,0,production packaging
1,1,contract sr business analyst
2,2,senior digital growth marketing strategist
3,3,senior machine learning engineer
4,4,android developer
5,5,corporate staff accountant
6,6,regional sales manager
7,7,outside sales representative
8,8,technical support specialist
9,9,fire truck inspector


In [12]:
##Query for similarity match
query=['nurse practitioner']

## 1. Elastic Search

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
embeddings1 = model.encode(data['JobTitle'].tolist(),convert_to_tensor=False, convert_to_numpy=True,normalize_embeddings=True)
embeddings2 = model.encode(query,convert_to_tensor=False, convert_to_numpy=True,normalize_embeddings=True)

In [15]:
%%time
topn=10
data['distances'] = cosine_similarity(embeddings1, embeddings2) 
n_largest = data['distances'].nlargest(topn + 1) 
cos_list=list(n_largest.index)

CPU times: user 371 ms, sys: 295 ms, total: 666 ms
Wall time: 250 ms


In [16]:
data[data['index'].isin(cos_list)]

Unnamed: 0,index,JobTitle,distances
4294,4294,nurse practitioner,1.0
11377,11377,nurse practitioner,1.0
33932,33932,nurse practitioner,1.0
42671,42671,advanced registered nurse practitioner,0.821825
49928,49928,nurse practitioner,1.0
50008,50008,nurse practitioner primary care,0.822325
51547,51547,nurse practitioner,1.0
52211,52211,nurse practitioner,1.0
57771,57771,nurse practitioner or physician assistant,0.826532
85303,85303,nurse practitioner,1.0


## 2.Sentence Transformer Util.semantic_search 

In [37]:
#Compute embeddings
embeddings1_util = model.encode(data['JobTitle'].tolist(), convert_to_tensor=True)
embeddings2_util = model.encode(query, convert_to_tensor=True)

In [63]:
%%time
hits = util.semantic_search(embeddings1_util, embeddings2_util, top_k=min(10, topn),
                                    score_function=util.dot_score)


CPU times: user 1.33 s, sys: 3.01 ms, total: 1.33 s
Wall time: 436 ms


In [56]:
s=[]
for i in hits:
    for j in i:
        s.append(j['score'])
data['score']=s
n_largest = data['score'].nlargest(topn + 1) 
sem_list=list(n_largest.index)
data[data['index'].isin(cos_list)]

Unnamed: 0,index,JobTitle,distances,score
4294,4294,nurse practitioner,1.0,1.0
11377,11377,nurse practitioner,1.0,1.0
33932,33932,nurse practitioner,1.0,1.0
42671,42671,advanced registered nurse practitioner,0.821825,0.821824
49928,49928,nurse practitioner,1.0,1.0
50008,50008,nurse practitioner primary care,0.822325,0.822324
51547,51547,nurse practitioner,1.0,1.0
52211,52211,nurse practitioner,1.0,1.0
57771,57771,nurse practitioner or physician assistant,0.826532,0.826533
85303,85303,nurse practitioner,1.0,1.0


## 3. Faiss 

https://towardsdatascience.com/getting-started-with-faiss-93e19e887a0c

In [99]:
##Installl Faiss Library
# pip install faiss-cpu

In [17]:
import faiss

### IndexFlatL2 
IndexFlatL2 measures the L2 (or Euclidean) distance between all given points between our query vector, and the vectors loaded into the index. It’s simple, very accurate, but not too fast.

In [19]:
# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings1.shape[1])

# Step 3: Add the index
index.add(embeddings1)
index.ntotal

100000

In [20]:
%%time
# # Retrieve the 10 nearest neighbours
num_results=10
vector = model.encode(query)
D, I = index.search(np.array(vector).astype("float32"), k=num_results)

CPU times: user 156 ms, sys: 2.01 ms, total: 158 ms
Wall time: 62.1 ms


In [21]:
data[data['index'].isin(I[0])]

Unnamed: 0,index,JobTitle,distances
4294,4294,nurse practitioner,1.0
11377,11377,nurse practitioner,1.0
33932,33932,nurse practitioner,1.0
49928,49928,nurse practitioner,1.0
50008,50008,nurse practitioner primary care,0.822325
51547,51547,nurse practitioner,1.0
52211,52211,nurse practitioner,1.0
57771,57771,nurse practitioner or physician assistant,0.826532
85303,85303,nurse practitioner,1.0
92013,92013,inpatient nurse practitioner,0.863217


### IndexIVFFlat 
IndexIVFFlat returns suboptimal results from approximate search.We do this by increasing the nprobe attribute value — which defines how many nearby cells to search.

In [23]:
nlist = 50  # how many cells
d=embeddings1.shape[1] ##How many dimension
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [24]:
index.train(embeddings1)
index.is_trained  # check if index is now trained

True

In [25]:
index.add(embeddings1)
index.ntotal  # number of embeddings indexed

100000

In [26]:
%%time
# # Retrieve the 10 nearest neighbours
num_results=10
vector = model.encode(query)
D, I = index.search(np.array(vector).astype("float32"), k=num_results)

CPU times: user 75.4 ms, sys: 2.09 ms, total: 77.5 ms
Wall time: 12.7 ms


### IndexIVFPQ

Where IVF allowed us to approximate by reducing the scope of our search, PQ approximates the distance/similarity calculation instead.PQ achieves this approximated similarity operation by compressing the vectors themselves, which consists of three steps.
1. We split the original vector into several subvectors.

2. For each set of subverters, we perform a clustering operation — creating multiple centroids for each sub-vector set.

3. In our vector of sub-vectors, we replace each sub-vector with the ID of it’s nearest set-specific centroid.

In [27]:
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits) 

In [28]:
index.is_trained

False

In [29]:
index.train(embeddings1)
index.is_trained  # check if index is now trained

True

In [30]:
index.add(embeddings1)
index.ntotal  # number of embeddings indexed

100000

In [31]:
index.nprobe = 50  # align to previous IndexIVFFlat nprobe value

In [32]:
%%time
# # Retrieve the 10 nearest neighbours
num_results=10
vector = model.encode(query)
D, I = index.search(np.array(vector).astype("float32"), k=num_results)

CPU times: user 79 ms, sys: 935 µs, total: 79.9 ms
Wall time: 13.2 ms


## 4. HNSW
https://www.pinecone.io/learn/hnsw/


In [36]:
import hnswlib

In [33]:
# Declaring index
d=embeddings1.shape[1] ##How many dimension
M = 32
p = hnswlib.Index(space = 'l2', dim = 384) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = data.shape[0], ef_construction = 200, M = 16)

# Element insertion (can be called several times):
p.add_items(embeddings1, data['index'].tolist())

# Controlling the recall by setting ef:
p.set_ef(50) # ef should always be > k


In [34]:
%%time
# Query dataset, k - number of closest elements (returns 2 numpy arrays)
labels, distances = p.knn_query(embeddings2, k = 10)

CPU times: user 117 µs, sys: 1 ms, total: 1.12 ms
Wall time: 527 µs


In [35]:
data[data['index'].isin(labels[0].tolist())]

Unnamed: 0,index,JobTitle,distances
4294,4294,nurse practitioner,1.0
11377,11377,nurse practitioner,1.0
33932,33932,nurse practitioner,1.0
49928,49928,nurse practitioner,1.0
50008,50008,nurse practitioner primary care,0.822325
51547,51547,nurse practitioner,1.0
52211,52211,nurse practitioner,1.0
57771,57771,nurse practitioner or physician assistant,0.826532
85303,85303,nurse practitioner,1.0
92013,92013,inpatient nurse practitioner,0.863217


As you can see that time taken for similarity match is :
   > HNSW < FAISS-IVFPQ/FAISSIVF < FAISSIVFL2 < Elastic Search