### FAISS - Facebook AI Similarity Search

In [7]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [8]:
import os
from dotenv import load_dotenv
load_dotenv()  ## load all the environment variables
from google import genai

In [9]:
os.environ['GOOGLE_API_KEY']=os.getenv("GOOGLE_API_KEY")

In [10]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embedding = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")
embedding

GoogleGenerativeAIEmbeddings(client=<google.genai.client.Client object at 0x0000023CB99C6320>, model='gemini-embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, vertexai=None, project=None, location=None, base_url=None, additional_headers=None, client_args=None, request_options=None, output_dimensionality=None)

In [20]:
from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader('sample.txt')
document = loader.load()

text_splitter = CharacterTextSplitter(separator= "",chunk_size=300, chunk_overlap=30)
docs = text_splitter.split_documents(document)


In [21]:
docs

[Document(metadata={'source': 'sample.txt'}, page_content='1. Artificial Intelligence (AI)\nThe Concept: The broadest umbrella term for any technique that enables computers to mimic human intelligence.\nThe Approach: It doesn\'t always require "learning." Early AI used "Expert Systems" or "If-Then" logicâ€”hard-coded rules written by humans to help a machine m'),
 Document(metadata={'source': 'sample.txt'}, page_content='by humans to help a machine make decisions (e.g., a chess program that follows a specific set of programmed moves).\nThe Goal: To create a system that can execute tasks that would normally require a human brain, such as reasoning, planning, or understanding language.\n2. Machine Learning (ML)\nThe Con'),
 Document(metadata={'source': 'sample.txt'}, page_content='Machine Learning (ML)\nThe Concept: A subset of AI defined by the ability to learn from data rather than following rigid instructions.\nThe Approach: Instead of a human writing code for every possibility, we pr

In [22]:
db = FAISS.from_documents(docs, embedding)

In [None]:
# 1. Get the document ID from the FAISS index position
doc_id = db.index_to_docstore_id[1]

In [None]:
# 2. Retrieve the actual document content from the docstore

document = db.docstore.search(doc_id)
print(document)


page_content='by humans to help a machine make decisions (e.g., a chess program that follows a specific set of programmed moves).
The Goal: To create a system that can execute tasks that would normally require a human brain, such as reasoning, planning, or understanding language.
2. Machine Learning (ML)
The Con' metadata={'source': 'sample.txt'}


In [33]:
## querying

query = "What is machine learning?"

query_replay = db.similarity_search(query)
query_replay[0].page_content

'Machine Learning (ML)\nThe Concept: A subset of AI defined by the ability to learn from data rather than following rigid instructions.\nThe Approach: Instead of a human writing code for every possibility, we provide the machine with examples (data). The machine uses algorithms (like Linear Regression'

#### Retriever
We can also convert the vectorstore into a Retriever class. This allows us to easily use it in other LangChain methods, which largly work with retrievers.

In LLMs we can not acces the vectorstore DB directly, so we have to convert it into retriever then we can give the query.

In [35]:
retriever = db.as_retriever()
docs = retriever.invoke(query)
docs[0].page_content


'Machine Learning (ML)\nThe Concept: A subset of AI defined by the ability to learn from data rather than following rigid instructions.\nThe Approach: Instead of a human writing code for every possibility, we provide the machine with examples (data). The machine uses algorithms (like Linear Regression'

#### Similarity Search with score
In FAISS we can do similarity search along with the respective score.
For that we can use the function similarity_search_with_score.

It calculate the Manhatton distance between the documents, so the less the distance is better.

In [36]:
docs_and_score = db.similarity_search_with_score(query)
docs_and_score

[(Document(id='e7fa1a57-b7b7-4b89-b0c5-45cfd795e3e9', metadata={'source': 'sample.txt'}, page_content='Machine Learning (ML)\nThe Concept: A subset of AI defined by the ability to learn from data rather than following rigid instructions.\nThe Approach: Instead of a human writing code for every possibility, we provide the machine with examples (data). The machine uses algorithms (like Linear Regression'),
  np.float32(0.5030667)),
 (Document(id='d6b26cb6-3593-4263-86f3-b3811322fe57', metadata={'source': 'sample.txt'}, page_content='rithms (like Linear Regression, Decision Trees, or Random Forests) to find patterns. For instance, to identify a "spam" email, the algorithm looks at thousands of emails and notices that the word "Lottery" frequently appears in spam.\nKey Constraint: ML often requires Feature Engineering. A human exp'),
  np.float32(0.5556675)),
 (Document(id='df97e672-2666-4e6e-8c4b-ccf764b4f043', metadata={'source': 'sample.txt'}, page_content='1. Artificial Intelligence (A

In [None]:
## Saving and loading

# db.save_local("faiss_index")

# new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True) 

# docs=new_db.similarity_search(query)