VectorDB - FAISS

In [1]:
# Load environment variables from a local .env file. With override=True the
# values from the file replace any that are already set in the process.
import os

from dotenv import load_dotenv

load_dotenv(override=True)

# Re-export the Hugging Face token only when one is actually configured.
# os.environ accepts str values only, so assigning the None that os.getenv
# returns for a missing variable would raise a TypeError.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token


In [2]:
# Sentence-Transformers embedding model, accessed through the LangChain
# Hugging Face integration.
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [3]:
#Converting a given sentence into a embedding
embeddings.embed_query("Hello AI")

[-0.033388182520866394,
 0.03453972190618515,
 0.059474531561136246,
 0.05928609147667885,
 -0.0635354220867157,
 -0.06819586455821991,
 0.08823323994874954,
 0.03444080427289009,
 -0.03278516232967377,
 -0.015814989805221558,
 0.02098178118467331,
 -0.01834029331803322,
 -0.03983215242624283,
 -0.0804707482457161,
 -0.014469144865870476,
 0.0332648828625679,
 0.014259284362196922,
 -0.03404996916651726,
 -0.142915740609169,
 -0.023083344101905823,
 -0.021380102261900902,
 0.002633501309901476,
 -0.047292742878198624,
 -0.010752756148576736,
 -0.06866798549890518,
 0.031125057488679886,
 0.0759458914399147,
 0.0011283254716545343,
 0.011631987057626247,
 -0.03603919595479965,
 0.04483763128519058,
 0.018390750512480736,
 0.12672801315784454,
 -0.0013597895158454776,
 0.008206663653254509,
 0.06909968703985214,
 -0.08076353371143341,
 -0.05841314047574997,
 0.053754497319459915,
 0.026227595284581184,
 -0.006828607991337776,
 -0.056358352303504944,
 0.0032930178567767143,
 -0.0725017860

In [4]:
#Determining Cosine Simillarity 
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
#Building Documents
documents=[
    "What is the capital of USA?", 
    "Who is the president of USA?",
    "Who is the prime minister of india?"
]



In [6]:
my_query="Narendra Modi"

In [7]:
# Embed all documents in a single call, then show how many vectors came back
# (one per input document).
document_embedding = embeddings.embed_documents(documents)
len(document_embedding)

3

In [8]:
query_embedding=embeddings.embed_query(my_query)

In [9]:
# Cosine similarity between the query vector and each document vector.
# The query is wrapped in a list because scikit-learn expects 2-D inputs.
cosine_similarity(
    [query_embedding],
    document_embedding,
)

array([[0.06368394, 0.27326598, 0.50869447]])

In [10]:
# Euclidean distance between the query vector and each document vector.
from sklearn.metrics.pairwise import euclidean_distances

# The query is wrapped in a list so that both arguments are 2-D.
euclidean_distances(
    [query_embedding],
    document_embedding,
)

array([[1.36844148, 1.20559864, 0.99126742]])

VectorDB

In [11]:
import faiss 
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore 

In [12]:
# Create a flat L2 FAISS index. The dimensionality is read from the embedding
# model instead of being hard-coded, so the index still matches the vectors if
# a model with a different output size is swapped in.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

In [13]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000261D5CE1F20> >

In [14]:
# Wrap the FAISS index in a LangChain vector store. The docstore and the
# index-position -> docstore-id mapping both start out empty and are filled
# as texts are added.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [15]:
vector_store.add_texts(["AI is future", "AI is Powerful"])

['4c49790c-7767-464d-9bb9-c44b74655c49',
 '20f81f1a-a9fc-4ba9-af87-e9ceeff85c77']

In [16]:
vector_store.index_to_docstore_id

{0: '4c49790c-7767-464d-9bb9-c44b74655c49',
 1: '20f81f1a-a9fc-4ba9-af87-e9ceeff85c77'}

In [19]:
results = vector_store.similarity_search("Tell me about AI", k=1)

In [20]:
results

[Document(id='20f81f1a-a9fc-4ba9-af87-e9ceeff85c77', metadata={}, page_content='AI is Powerful')]

In [21]:
vector_store.add_texts(["Thanvi is a good girl", "Tara is a good girl"])
vector_store.index_to_docstore_id

{0: '4c49790c-7767-464d-9bb9-c44b74655c49',
 1: '20f81f1a-a9fc-4ba9-af87-e9ceeff85c77',
 2: '64f49249-75c6-4f17-818a-7a9b0ab2c225',
 3: 'c7ba6c99-7121-4c0a-8730-fe554b781de9'}

In [23]:

results = vector_store.similarity_search("Tell me about AI")
results

[Document(id='20f81f1a-a9fc-4ba9-af87-e9ceeff85c77', metadata={}, page_content='AI is Powerful'),
 Document(id='4c49790c-7767-464d-9bb9-c44b74655c49', metadata={}, page_content='AI is future'),
 Document(id='c7ba6c99-7121-4c0a-8730-fe554b781de9', metadata={}, page_content='Tara is a good girl'),
 Document(id='64f49249-75c6-4f17-818a-7a9b0ab2c225', metadata={}, page_content='Thanvi is a good girl')]

In [24]:
results = vector_store.similarity_search("Tell me about AI", k=2)
results

[Document(id='20f81f1a-a9fc-4ba9-af87-e9ceeff85c77', metadata={}, page_content='AI is Powerful'),
 Document(id='4c49790c-7767-464d-9bb9-c44b74655c49', metadata={}, page_content='AI is future')]

In [25]:
# Ten sample documents, each tagged with a "source" metadata field
# ("tweet", "news" or "website") that later cells use for filtered search.
from langchain_core.documents import Document

document_1 = Document(page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", metadata={"source": "tweet"})
document_2 = Document(page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", metadata={"source": "news"})
document_3 = Document(page_content="Building an exciting new project with LangChain - come check it out!", metadata={"source": "tweet"})
document_4 = Document(page_content="Robbers broke into the city bank and stole $1 million in cash.", metadata={"source": "news"})
document_5 = Document(page_content="Wow! That was an amazing movie. I can't wait to see it again.", metadata={"source": "tweet"})
document_6 = Document(page_content="Is the new iPhone worth the price? Read this review to find out.", metadata={"source": "website"})
document_7 = Document(page_content="The top 10 soccer players in the world right now.", metadata={"source": "website"})
document_8 = Document(page_content="LangGraph is the best framework for building stateful, agentic applications!", metadata={"source": "tweet"})
document_9 = Document(page_content="The stock market is down 500 points today due to fears of a recession.", metadata={"source": "news"})
document_10 = Document(page_content="I have a bad feeling I am going to get deleted :(", metadata={"source": "tweet"})

# Collected in insertion order; this list is what gets added to the store.
documents = [
    document_1, document_2, document_3, document_4, document_5,
    document_6, document_7, document_8, document_9, document_10,
]

In [None]:
#Importing the required libary 
import faiss 
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore 

#Creating the embedding object
#Use Sentence Transfromer 
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [26]:
# Initialise a vector store backed by an inner-product (IP) index.
from langchain_community.vectorstores.utils import DistanceStrategy

# Take the dimensionality from the embedding model rather than hard-coding it.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatIP(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper defaults to EUCLIDEAN_DISTANCE (lower score = closer).
    # An inner-product index returns higher-is-better scores, so the strategy
    # must be declared explicitly; otherwise score thresholds and relevance
    # scores are interpreted the wrong way round.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [27]:
#Adding Documents
vector_store.add_documents(documents=documents)

['9a6138e8-fc2e-466c-b6f4-60039df56db6',
 '37ffdc5c-e03a-4fee-be57-27c946566030',
 '09526b56-fe07-4b19-bfaa-e365a55779ad',
 '312edf1b-9ef7-4b32-b504-b1acad7a6cc2',
 '01f2cefa-ea29-47e9-a599-95bb424f66a0',
 '3db48931-f28c-40c4-9dc5-594fdc294608',
 '9812b286-b07f-4ad1-be9d-e53bdbf198a0',
 'a823b817-630f-49cd-958c-6672098a8714',
 '46d79f74-b37e-4b25-953a-403b42ea3562',
 '156ca1cd-2c92-4258-9716-dd5d7a5ec011']

In [28]:
#Performing Simillarity search 
vector_store.similarity_search(
    "Langchain Provides Abstractions to make working with LLM easy", 
    k=2
)

[Document(id='09526b56-fe07-4b19-bfaa-e365a55779ad', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='a823b817-630f-49cd-958c-6672098a8714', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [32]:
#Performing Simillarity search 
vector_store.similarity_search(
    "Tell me about Steve Jobs", 
    k=2
)

[Document(id='01f2cefa-ea29-47e9-a599-95bb424f66a0', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='156ca1cd-2c92-4258-9716-dd5d7a5ec011', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [33]:
# Same query as before, but without an explicit k: the store falls back to
# its default result count (four documents are returned in the output below).
vector_store.similarity_search(
    "Langchain Provides Abstractions to make working with LLM easy"
)

[Document(id='09526b56-fe07-4b19-bfaa-e365a55779ad', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='a823b817-630f-49cd-958c-6672098a8714', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='156ca1cd-2c92-4258-9716-dd5d7a5ec011', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='312edf1b-9ef7-4b32-b504-b1acad7a6cc2', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]

In [35]:
vector_store.similarity_search(
    "Langchain provides abstractions to make working with LLM easy", 
    k=2, 
    filter={"source":{"$eq":"tweet"}}
)

[Document(id='09526b56-fe07-4b19-bfaa-e365a55779ad', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='a823b817-630f-49cd-958c-6672098a8714', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [37]:
result=vector_store.similarity_search(
    "Langchain provides abstractions to make working with LLM easy",
    filter={"source":"news"}     
)

In [38]:
result[0].page_content

'Robbers broke into the city bank and stole $1 million in cash.'

In [39]:
result[0].metadata

{'source': 'news'}

In [40]:
# Expose the vector store through the retriever interface; search_kwargs is
# forwarded to the underlying search, so each query returns up to 3 documents.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 3},
)

In [42]:
#Retriving data using retriever object
retriever.invoke("Langchain provides abstractions to make working with LLM easy")


[Document(id='09526b56-fe07-4b19-bfaa-e365a55779ad', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='a823b817-630f-49cd-958c-6672098a8714', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='156ca1cd-2c92-4258-9716-dd5d7a5ec011', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [43]:
# Persist the index and docstore to disk. The folder name must match the one
# passed to FAISS.load_local in the next cell ("First FAISS Index"); the
# previous spelling "First FIASS Index" made the subsequent load fail.
vector_store.save_local("First FAISS Index")

In [None]:
# Reload a previously saved store from disk.
# NOTE(security): allow_dangerous_deserialization=True lets LangChain unpickle
# the stored docstore. Only enable it for index folders you created yourself
# or otherwise trust, since unpickling can execute arbitrary code.
new_vector_store = FAISS.load_local(
    folder_path="First FAISS Index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Building First RAG 

In [3]:
#Importing libraries 
from langchain_community.document_loaders import PyPDFLoader


In [4]:
#Capturing the location of the document
FILE_PATH="llama2.pdf"

In [5]:
# Build a PDF loader for the configured file and read it eagerly;
# `pages` holds whatever loader.load() returns for the document.
loader = PyPDFLoader(FILE_PATH)
pages = loader.load()

Asynchronous data loading can also be done as follows:

In [None]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [6]:
# Count the pages that were already loaded into `pages` instead of calling
# loader.load() again, which would re-parse the whole PDF just to get a length.
len(pages)

77

There are 77 pages within the given document. 

In [7]:
# Chunking with RecursiveCharacterTextSplitter.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Both values are tunable hyperparameters: the maximum chunk size and the
# overlap carried over between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

In [8]:
#Splitting the documents 
split_docs=splitter.split_documents(pages)

In [9]:
#Checking the splitted documents
len(split_docs)

615


We can see that the 77 pages were further split into 615 chunks.

In [12]:
import faiss 
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore 

In [13]:
#Use Sentence Transfromer 
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [14]:

# Build the vector store that will hold the split PDF chunks, backed by an
# inner-product (IP) index.
from langchain_community.vectorstores.utils import DistanceStrategy

# Take the dimensionality from the embedding model rather than hard-coding it.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatIP(embedding_dim)

# Create the vector store object.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper defaults to EUCLIDEAN_DISTANCE (lower score = closer).
    # An inner-product index returns higher-is-better scores, so the strategy
    # must be declared explicitly; otherwise score thresholds and relevance
    # scores are interpreted the wrong way round.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [15]:
#Loading the dataonto Vector DB
vector_store.add_documents(documents=split_docs)

['83df6fe6-faa4-4ada-a401-9ed89f04ed62',
 'bef6db34-24dc-4d98-9602-428a45cccabb',
 'a9d1c510-f8fd-4fe7-ac55-4be4627d965b',
 'b2aba2a5-3ab3-4e5f-9a2a-a7c6dd55825e',
 'f4156da5-48c8-4048-a048-5c9491f0b3bb',
 '63e45c4a-6664-46d2-8809-baf57686d304',
 '5f1c19a7-ed8a-4686-b7b3-61c879e9a613',
 '21282fc3-15fd-4009-bfbf-5821f5dbad91',
 '703c4343-e9d9-46c9-bc89-46782665c809',
 'b7ca7423-fac0-4cbb-912f-6e13e864fc5a',
 '4c8c36d4-5190-44c0-8aaa-6674a14d8bdb',
 '6119b302-d40f-482f-8971-e9f10e142412',
 '712f9c1e-0dc4-432a-bcd1-f63667ab4391',
 '5838b08e-a688-4d08-81fe-12c3b10c53ce',
 'b8907660-9202-4d44-acf1-a206cb023691',
 '90b1e93a-2c63-477b-8f4b-f118aeeb1bca',
 'd786aad4-bea3-473c-9ce7-e0bca995cb77',
 'a3af306c-a940-480f-9f13-6c01d750c085',
 '8469f18e-dd0c-4a04-bae0-4c1844dfb8ba',
 '97405fdc-b941-4ba3-bf09-66f031a65836',
 '3068a2df-3911-4c61-b155-20ae8cc41dd8',
 '557fca1c-0467-4acd-b46e-6a3ca7ae03ae',
 'db12b41b-941d-46be-a37e-c1d4aa1c612f',
 '951fb711-2aab-4cd2-a517-f6adc1836e63',
 'ca6204f9-df36-

In [16]:
# Retriever over the PDF chunks. "k" is a tunable hyperparameter: the number
# of chunks handed back for each query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 10},
)

In [17]:
#Ask Retriver a question 
retriever.invoke("What is a llama model?")

[Document(id='ca6204f9-df36-4600-a367-259b7930a6c6', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'llama2.pdf', 'total_pages': 77, 'page': 3, 'page_label': '4'}, page_content='work (Section 6), and conclusions (Section 7).\n‡https://ai.meta.com/resources/models-and-libraries/llama/\n§We are delaying the release of the 34B model due to a lack of time to sufficiently red team.\n¶https://ai.meta.com/llama\n‖https://github.com/facebookresearch/llama\n4'),
 Document(id='8b9391ca-6085-4de1-aa3d-53327976378b', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:3

In [22]:
# Chat model used as the generator in the RAG chain.
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(
    model='gemini-1.5-flash',
)

In [23]:
#Building a RAG Prompt 
from langchain import hub 
prompt=hub.pull("rlm/rag-prompt")

In [24]:
#Display the prompt output 
import pprint
pprint.pprint(prompt.messages)


[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [25]:
#Defining String output pharser 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

Create Chain

In [28]:
# Helper that flattens retrieved documents into a single context string.
def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents concatenated in order, separated by ``"\\n \\n"``;
        an empty string when ``docs`` is empty.
    """
    separator = "\n \n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

In [29]:
# Assemble the RAG pipeline:
#   1. the incoming question is sent to the retriever, whose documents are
#      flattened by format_docs into "context", while the question itself is
#      passed through unchanged as "question";
#   2. both values fill the prompt template;
#   3. the chat model answers;
#   4. the output parser reduces the model message to a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [30]:
#Executing the pipeline 
rag_chain.invoke("What is llama model?")

'Llama is a large language model developed by Meta.  There are different versions, including Llama 1 and Llama 2, with varying parameter sizes.  Llama 2 models are intended for commercial and research use.'

In [31]:
pprint.pprint(rag_chain.invoke("What is llama model?"))

('Llama is a large language model developed by Meta.  It has various versions, '
 'including tuned models for chat and pretrained models adaptable to different '
 'natural language tasks.  Llama 2, a newer version, is intended for '
 'commercial and research use.')
