In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# python-dotenv reads KEY=value pairs from a .env file
from dotenv import load_dotenv
import os
# Load the variables from .env into the process environment; the next cell
# reads OPENAI_API_KEY from there.
load_dotenv() 




True

In [36]:
# Read the OpenAI API key from the environment (populated by load_dotenv())
# and fail fast with an actionable message when it is absent, instead of
# letting a None value surface later as an opaque TypeError or auth failure.
openai_api_keys = os.getenv('OPENAI_API_KEY')
if not openai_api_keys:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [3]:
# Load the knowledge-base text file as LangChain documents.
loader = TextLoader('knowledgeBase1.txt')
docs = loader.load()
# Split the text into overlapping chunks (size 500, overlap 20; the splitter
# measures length in characters by default) so each piece can be embedded
# and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=500,
chunk_overlap=20
)
total_split_docs= text_splitter.split_documents(docs)
# Display how many chunks were produced.
len(total_split_docs)

114

In [4]:
# Embed every chunk with the OpenAI 'text-embedding-3-large' model and build
# a FAISS vector index over the resulting vectors.
# NOTE: this sends all chunks to the OpenAI embeddings API each time it runs.
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
db =   FAISS.from_documents(total_split_docs,embedding)
# Display the embeddings object attached to the store to confirm its settings.
db.embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x103185700>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1222acb50>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [5]:
# Sanity-check retrieval: fetch the chunks most similar to a sample question
# and print the text of the first (closest) hit.
query1 = 'what does LLM stand for?'
query1_answer = db.similarity_search(query1)
print(query1_answer[0].page_content)

A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.


In [6]:
# Wrap the vector store in a retriever so it can be plugged into LangChain
# chains (the RAG pipeline further down uses it), then try it on the sample query.

retriever= db.as_retriever()
docs_2 = retriever.invoke(query1)
docs_2

[Document(id='6a00f4e3-add3-4439-8991-8fce3d09e373', metadata={'source': 'knowledgeBase1.txt'}, page_content='A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.'),
 Document(id='504d5801-3319-4374-9739-98326d35781c', metadata={'source': 'knowledgeBase1.txt'}, page_content='An LLM is typically not an autonomous agent by itself, as it lacks the ability to interact with dynamic environments, recall past behaviors, and plan future actions, but can be transformed into one by integrating modules like profiling, memory, planning, and action.[64]'),
 Document(id='c2918401-4fb7-456e-94dc-c7d804d93d3a', metadata={'source': 'knowledgeBase1.txt'}, page_content='Generally, in order to get an LLM to use tools, one must fine-tune it for tool-use. If the number of tools is finite, then fine-tunin

In [7]:
# similarity_search_with_score also returns each chunk's distance to the query.
# With the default FAISS setup this is an L2 (Euclidean) distance - not a
# Manhattan distance - so a lower score means a closer match.
docs_and_score = db.similarity_search_with_score(query1)
docs_and_score

[(Document(id='6a00f4e3-add3-4439-8991-8fce3d09e373', metadata={'source': 'knowledgeBase1.txt'}, page_content='A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.'),
  0.8950789),
 (Document(id='504d5801-3319-4374-9739-98326d35781c', metadata={'source': 'knowledgeBase1.txt'}, page_content='An LLM is typically not an autonomous agent by itself, as it lacks the ability to interact with dynamic environments, recall past behaviors, and plan future actions, but can be transformed into one by integrating modules like profiling, memory, planning, and action.[64]'),
  0.91972095),
 (Document(id='c2918401-4fb7-456e-94dc-c7d804d93d3a', metadata={'source': 'knowledgeBase1.txt'}, page_content='Generally, in order to get an LLM to use tools, one must fine-tune it for tool-use. If the number of 

# RAG PIPELINE


In [37]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")


def format_docs(docs):
	"""Concatenate the text of retrieved documents into one context string.

	Args:
		docs: Iterable of objects exposing a ``page_content`` string.

	Returns:
		The ``page_content`` values joined with a blank line between them;
		an empty string when ``docs`` is empty.
	"""
	contents = [document.page_content for document in docs]
	return "\n\n".join(contents)

# Prompt for the answer-generation step. The {question} and {context}
# placeholders are filled in by the RAG chain at invocation time. The text is
# sent to the model verbatim, so its wording and whitespace are left untouched.

prompt = """You are an assistant for question-answering tasks.
    	Use the following pieces of retrieved context to answer the question.
    	If you don't know the answer,say NAI MALUM .

    	Question: {question}

    	Context: {context}

    	Answer:
    	"""

# Build the chat prompt from the template string. from_template() wraps the
# text in a single human message, which is the documented way to create a
# one-message chat prompt (rather than passing a set literal to the constructor).
prompt_template = ChatPromptTemplate.from_template(prompt)

# Initialize the chat model used for answer generation.
# temperature=0 is meant to keep answers as repeatable as possible; the API key
# comes from the `openai_api_keys` variable set in an earlier cell.

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    api_key = openai_api_keys
)


# Answer-generation chain: replace the list of retrieved documents stored under
# "context" with one formatted string, render the prompt template with it, and
# send the rendered prompt to the chat model.
context_as_text = RunnablePassthrough.assign(
    context=lambda inputs: format_docs(inputs["context"])
)
rag_chain_from_docs = context_as_text | prompt_template | llm
# Entry-point chain. For an input question it runs two branches in parallel:
# the retriever fetches matching documents ("context") while the question is
# passed through unchanged ("question"). The generated reply is then added
# under the "answer" key, so the output keeps the source documents next to it.
retrieve_and_forward = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)
rag_chain_with_source = retrieve_and_forward.assign(answer=rag_chain_from_docs)

In [46]:
# Run the full RAG chain on a sample question. The result is a dict holding the
# retrieved documents ("context"), the question, and the model reply ("answer").
response = rag_chain_with_source.invoke("what is llm")
# The reply is a chat message object; .content is its text.
print(response['answer'].content)
# Dump the whole response, including the source documents used as context.
print(response)

A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters and are trained with self-supervised learning on a vast amount of text.
{'context': [Document(id='504d5801-3319-4374-9739-98326d35781c', metadata={'source': 'knowledgeBase1.txt'}, page_content='An LLM is typically not an autonomous agent by itself, as it lacks the ability to interact with dynamic environments, recall past behaviors, and plan future actions, but can be transformed into one by integrating modules like profiling, memory, planning, and action.[64]'), Document(id='6a00f4e3-add3-4439-8991-8fce3d09e373', metadata={'source': 'knowledgeBase1.txt'}, page_content='A large language model (LLM) is a type of machine learning model designed for natural language processing tasks such as language generation. LLMs are language models with many parameters, and are trained with self-supervised learning on