This notebook compares search results and similarity scores across the different index types used to store text-document embeddings, using FAISS directly (the original library) versus the FAISS vectorstore integrated with LangChain.

In [None]:
! pip install langchain faiss-cpu sentence-transformers scipy==1.8.0

In [2]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
import faiss

import numpy as np
from numpy.linalg import norm
from scipy import spatial

In [3]:
# Embedding model wrapper shared by every section of this notebook: it embeds the
# documents and the query directly, and is also handed to the LangChain vectorstore.
sentence_encoder = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2')

In [4]:
# Two toy documents and one query. Every search in this notebook ranks text_B
# ahead of text_A for this query.
text_A = "The cat is sitting outside the house"
text_B = "A woman is playing guitar in the auditorium"
query = "girl with music instrument"

### Using cosine similarity in Python

In [5]:
# Embed the two documents and the query with the same encoder, in that order.

sent_A_emb, sent_B_emb, query_emb = (
    sentence_encoder.embed_query(text) for text in (text_A, text_B, query)
)

In [6]:
 # Cosine distance = 1 - cosine similarity.
def cosine_dist(u, v):
    """Return the cosine distance (1 - cosine similarity) between two 1-D vectors.

    `np.linalg.norm` is referenced by its qualified name on purpose: a later cell
    rebinds the bare name `norm` to a float, which would make this cell fail with
    "'float' object is not callable" if it were re-run afterwards.
    """
    return 1 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


cosine_distance = cosine_dist(sent_A_emb, query_emb)
print("Cosine Distance between query and A:", cosine_distance)

cosine_distance = cosine_dist(query_emb, sent_B_emb)
print("Cosine Distance between query and B:", cosine_distance)

Cosine Distance between query and A: 1.0357309680895828
Cosine Distance between query and B: 0.6514501500416301


### Using FAISS standalone with Inner Product based index

In [7]:
# Dimensionality of the sentence embeddings, read off an actual embedding.
# It is used below to size the FAISS indexes.

dim = len(sent_A_emb)

In [8]:
# Index type that scores by inner product (IP). For unit-length vectors the inner
# product equals the cosine similarity, which is why the embeddings are normalized
# before being added.

faiss_index = faiss.IndexFlatIP(dim)

In [9]:
# Check whether the index is ready to accept vectors (True here straight after
# construction, i.e. no separate training step was needed).
faiss_index.is_trained

True

Convert the embeddings into numpy arrays and normalize them

In [10]:
def as_row_and_unit(embedding):
    """Return `embedding` as a (1, d) row together with its unit-length version.

    The row shape lets the vectors be stacked into a 2-D matrix and passed to the
    FAISS index. The width is inferred with -1 rather than hard-coding 768, so the
    cell keeps working if a model with a different embedding size is used.
    """
    row = np.reshape(np.array(embedding), (1, -1))
    return row, row / np.linalg.norm(row)


arr_emb_a, norm_emb_a = as_row_and_unit(sent_A_emb)
arr_emb_b, norm_emb_b = as_row_and_unit(sent_B_emb)
arr_emb_query, norm_emb_query = as_row_and_unit(query_emb)

# L2 norm of the raw query embedding; a later cell displays it to show that it is
# ~1. NOTE: this rebinds `norm`, hiding the function imported from numpy.linalg,
# so from here on the bare name `norm` is a float, not a callable.
norm = np.linalg.norm(arr_emb_query)


In [11]:
# First six components of the raw (un-normalized) embedding of text A.
arr_emb_a[0][:6]

array([-0.03011965,  0.00670151,  0.00719493, -0.04349164,  0.02373334,
       -0.01662551])

In [12]:
# The same six components after normalization, for comparison with the raw values.
norm_emb_a[0][:6]

array([-0.03011965,  0.00670151,  0.00719493, -0.04349164,  0.02373334,
       -0.01662551])

In [13]:
# The embeddings produced by the sentence-transformer model are already unit
# length: normalizing text A's embedding leaves it unchanged within allclose tolerance.

np.allclose(norm_emb_a, arr_emb_a)

True

In [14]:
# `norm` here is the float left behind by the normalization cell (the L2 norm of
# the query embedding), not the `norm` function imported from numpy.linalg.
norm

1.0000000289648816

In [25]:
# Build the document matrix for the index: row 0 is text A, row 1 is text B.
# The raw rows (arr_emb_a, arr_emb_b) should be interchangeable here, since the
# model's embeddings were found to be unit length already.

doc_embeddings = np.vstack([norm_emb_a, norm_emb_b])

In [16]:
# Add the document embeddings to the index. Their row positions in
# doc_embeddings (0 for A, 1 for B) are the ids that search() reports back.

faiss_index.add(doc_embeddings)

In [17]:
# Number of vectors currently stored in the index (one per document).

faiss_index.ntotal

2

In [26]:
# Ask the inner-product index for the 2 best matches of the normalized query row.
# The raw query row (arr_emb_query) should give the same result, because the
# embeddings are already unit length.
search_results = faiss_index.search(norm_emb_query, k=2)

In [35]:
# search() returns a pair of arrays, each with one row per query and K columns:
#   1st array: score of each of the K nearest neighbours. With the inner-product
#              index and unit-length vectors this is the cosine similarity to the
#              query, best match first.
#   2nd array: position of each neighbour in the original doc_embeddings array.

search_results

(array([[ 0.34854984, -0.03573097]], dtype=float32), array([[1, 0]]))

In [38]:
# Cross-check the FAISS inner-product score for text A against SciPy.
# scipy's cosine() is documented for 1-D vectors, while the embeddings here are
# (1, d) rows; row 0 is passed explicitly instead of relying on SciPy implicitly
# squeezing 2-D input, which is version-dependent behaviour.

result = 1 - spatial.distance.cosine(norm_emb_a[0], norm_emb_query[0])
print('Cosine similarity between A and query:{}'.format(result))

Cosine similarity between A and query:-0.03573096808958276


In [39]:
# Cross-check the FAISS inner-product score for text B against SciPy.
# scipy's cosine() is documented for 1-D vectors, while the embeddings here are
# (1, d) rows; row 0 is passed explicitly instead of relying on SciPy implicitly
# squeezing 2-D input, which is version-dependent behaviour.
result = 1 - spatial.distance.cosine(norm_emb_b[0], norm_emb_query[0])
print('Cosine similarity between B and query:{}'.format(result))

Cosine similarity between B and query:0.34854984995837


#### Using the LangChain FAISS vectorstore with its default L2-distance score

In [43]:
# Build a LangChain FAISS vectorstore from the two documents; LangChain embeds
# the texts itself using the encoder it is given.
faiss_db = FAISS.from_texts(
    texts=[text_A, text_B],
    embedding=sentence_encoder,
)

In [44]:
# Top-2 documents with the vectorstore's raw score, where lower means closer.
# The values returned here equal the distances reported by the standalone
# IndexFlatL2 search later in the notebook, and numerically they are
# 2 * (1 - cosine similarity) for these unit-length embeddings.
search_result_docs = faiss_db.similarity_search_with_score(query=query, k=2)
search_result_docs

[(Document(page_content='A woman is playing guitar in the auditorium', metadata={}),
  1.3029001),
 (Document(page_content='The cat is sitting outside the house', metadata={}),
  2.0714622)]

In [45]:
# Same search, but with scores mapped to a "relevance" scale where higher is better.
# NOTE(review): the values shown are consistent with 1 - score / sqrt(2) applied to
# the raw scores from the previous cell, which is why one of them is negative --
# confirm against the installed LangChain version.
# This rebinds search_result_docs, replacing the previous cell's result.
search_result_docs = faiss_db.similarity_search_with_relevance_scores(query=query, k=2)
search_result_docs

[(Document(page_content='A woman is playing guitar in the auditorium', metadata={}),
  0.07871052111382104),
 (Document(page_content='The cat is sitting outside the house', metadata={}),
  -0.4647449363393521)]

### Using FAISS standalone with L2 distance based index

In [46]:
# Index type that scores by L2 distance instead of inner product.
# NOTE: this rebinds `faiss_index`, so the inner-product index built earlier is no
# longer reachable; re-running the earlier search cells after this one would query
# this L2 index instead.
faiss_index = faiss.IndexFlatL2(dim)

In [47]:
# Index the same document matrix that was used for the inner-product index.
faiss_index.add(doc_embeddings)

In [48]:
# Number of vectors stored in the L2 index.
faiss_index.ntotal

2

In [49]:
# Query the L2 index with the raw query row (its norm is ~1, so it is effectively
# the same as the normalized row). The result has the same (scores, ids) layout as
# before, but scores are now distances: smaller means closer, closest first.
search_results = faiss_index.search(arr_emb_query, 2)
search_results

(array([[1.3029003, 2.0714622]], dtype=float32), array([[1, 0]]))

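Note that the L2 distances of the nearest neighbours above match the scores returned by the LangChain FAISS vectorstore (`similarity_search_with_score`) in the previous section, i.e. LangChain's FAISS vectorstore uses an L2-distance index by default.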

#### References
* https://youtu.be/sKyvsdEv6rk
* https://github.com/facebookresearch/faiss/blob/main/tutorial/python/1-Flat.py
* https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
* https://github.com/facebookresearch/faiss/wiki/Getting-started
* https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances#how-can-i-index-vectors-for-cosine-similarity