In [68]:
import os

import faiss
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
# Credentials are read from the process environment; nothing is hard-coded here.
# os.environ only accepts str values, so assigning the None that os.getenv()
# returns for an unset HF_TOKEN would raise TypeError. Only re-export it when set.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

# Same fallback as before: OPENAI_API_KEY becomes "" when it is not already set.
os.environ.setdefault("OPENAI_API_KEY", "")

In [13]:
# Chat model used as the final generation step of the RAG chain.
# No model name is passed, so the library's default model is used;
# temperature=0 keeps sampling randomness to a minimum.
llm = ChatOpenAI(temperature=0)


In [14]:
# Embedding model handed to the FAISS vector store as its embedding_function.
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Location of the syllabus PDF. Set the SYLLABUS_PDF_PATH environment variable to
# run the notebook on another machine; otherwise the original local path is used.
FILE_PATH=os.getenv(
    "SYLLABUS_PDF_PATH",
    r"D:\PROJECT_PRACTISE_DIRS\AGENTICAI\2-Langchain_Basics\2.2-DataTransformer\syllabus.pdf",
)
loader=PyPDFLoader(FILE_PATH)
# Eagerly load the PDF once just to display how many page documents it yields.
len(loader.load())

34

In [16]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [17]:
# Both values are tunable hyperparameters:
#   chunk_size    - upper bound on the size of each chunk
#   chunk_overlap - amount of content shared between consecutive chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

In [18]:
# Break the loaded pages into chunks with the splitter configured above;
# these chunks are what gets embedded and indexed.
documents = splitter.split_documents(pages)

In [19]:
# Size the index from the embedding model itself instead of hard-coding 384,
# so changing the model does not require editing this cell.
embedding_dim = len(embeddings.embed_query("dimension probe"))

# Flat (exact) inner-product index: a higher score means a closer match.
index = faiss.IndexFlatIP(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper defaults to a Euclidean-distance interpretation of scores;
    # declare the inner-product metric so score handling matches the index.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [20]:

# Add every chunk to the vector store; the call returns the generated
# document IDs, which is the list displayed below.
# NOTE(review): re-running this cell presumably adds the same chunks again — confirm.
vector_store.add_documents(documents=documents)


['541d32ee-dd79-4207-bdb7-ae84b912f41f',
 'a4ebe207-33c7-412c-b151-a7b05029115f',
 '82788d07-e695-4372-bb67-f7fd35a1bec8',
 '3ee9ce23-b909-4091-89bb-2fb3497102f8',
 '9e8dc4af-56dd-43bb-ac8a-44f5d22f30e4',
 '2f97ce4f-dde9-4443-8f51-ce6c3a354d35',
 '4a35ceb2-92e2-4cdb-81fe-7a045a685dd2',
 '66896476-7bb2-4973-9642-0c204be630cd',
 '1320e58a-ef44-47ff-869a-09e7ee662ead',
 '59ecccff-af91-4d73-9f66-e50095695de5',
 '9ae07d1d-b778-4004-90f6-7aaf264156d3',
 '35e01210-1587-4df0-81a4-64c8cb575d91',
 '4de8c1ae-7e26-44cc-9999-b13f559a5690',
 '6517e7bf-a535-465b-9d70-202631d3b379',
 'c9d31cb7-00f1-4103-9b56-ca653524421a',
 'fba173cf-0a33-4e67-9b38-395e65aa2305',
 '8ab5b55c-6f7f-4d80-aca1-f55047ebee25',
 'df7b81bf-91b2-4327-8fd5-c884d5cee8ce',
 'dc1376bf-b599-4062-b6bf-fe271c34a7ff',
 'd514beda-2ded-4efc-9088-c998ae0fc9cc',
 '2791d985-1069-4657-aabd-877bd8382358',
 '20c86732-b0f2-4d07-89bc-aaa1faa6c668',
 'adc8c244-c1dd-4241-923c-ab0ee2793a58',
 '83b552e7-5734-46df-8e74-7cacde66bd2f',
 '7b1e59b9-dd31-

In [72]:
# Retriever over the vector store using MMR search, returning 3 chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3}, search_type="mmr")

In [73]:
# Sanity check: run a sample query through the retriever and inspect the
# Document objects it returns (shown below).
retriever.invoke("what is llama model?")

[Document(id='8768aa5c-9a56-47eb-9120-5f2ae3468b9d', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'D:\\PROJECT_PRACTISE_DIRS\\AGENTICAI\\2-Langchain_Basics\\2.2-DataTransformer\\syllabus.pdf', 'total_pages': 34, 'page': 31, 'page_label': '32'}, page_content='Databases for RAG (e.g., Pinecone,\nFAISS, Chroma DB)\nRole of LLMs in RAG How LLMs (Large Language Models)\nEnhance Generation in RAG, Fine-\nTuning LLMs for Retrieval-Augmented\nTasks'),
 Document(id='694c748c-d086-4bb4-b787-b84af7a6f1b3', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal

In [74]:
# Prompt: the template text is kept in its own constant so it is easy to read
# and tweak; {context} and {question} are filled in by the RAG chain.
RAG_TEMPLATE = """
You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, say you don't know.

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(template=RAG_TEMPLATE, input_variables=["context", "question"])

In [77]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)
    

In [79]:
# Step 1: build the prompt inputs from the raw question string.
#   "context"  - retrieved documents, flattened to text by format_docs
#   "question" - the question itself, passed through unchanged
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Step 2: fill the prompt, call the chat model, and reduce the reply to a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [80]:

# Run the whole chain end to end; the StrOutputParser at the end of the chain
# means the result shown below is a plain string.
rag_chain.invoke("Tell about The information you have")

'I have information about Sourangshu Pal, Monal Kumar, Mayank Aggrawal, and Darius B. I also have information about the types of statistics, types of data, levels of measurement, measures of central tendency, measures of dispersion, exploring random variables, probability, random variables, and set theory.'