In [2]:
import os
from dotenv import load_dotenv

# Actually call load_dotenv(): a bare `load_dotenv` only names the function and
# never reads the .env file into the process environment.
load_dotenv()

# load_dotenv() already places the key in os.environ, so no re-assignment is
# needed. Fail early with a readable message when the key is missing instead of
# the opaque TypeError raised by `os.environ[...] = None`.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [3]:
# Name of the OpenAI chat model used when constructing the LLM for this notebook.
llm_model: str = "gpt-3.5-turbo"

In [4]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# Folder that holds the FAISS index files.
persistent_dictionary = "docs/faiss/"

# Load the vector store from disk. Loading unpickles the stored docstore, which
# is why allow_dangerous_deserialization has to be switched on explicitly --
# only do this for index files you created yourself.
vectordb = FAISS.load_local(
    persistent_dictionary,
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)

  embeddings=OpenAIEmbeddings(),


In [6]:
# Number of vectors held by the index, read through the public FAISS index
# attribute rather than the docstore's private `_dict`.
vectordb.index.ntotal

208


In [7]:
# Sample query used to check retrieval against the loaded index.
question = "What are major topics for this class?"

# Retrieve the three most similar chunks and display how many came back.
docs = vectordb.similarity_search(query=question, k=3)
len(docs)

3

In [8]:
from langchain.chat_models import ChatOpenAI
# Chat model for the QA chains; sampling temperature is set to 0.
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_model,
)

  llm = ChatOpenAI(model_name=llm_model, temperature=0)


In [9]:
from langchain.chains import RetrievalQA

In [10]:
# RetrievalQA chain with the default chain type, backed by the FAISS retriever.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [11]:
# Run the chain via .invoke(); calling the chain object directly
# (Chain.__call__) is deprecated and emits a deprecation warning.
result = qa_chain.invoke({"query": question})

  result = qa_chain({"query": question})


In [12]:
# Display the value stored under the "result" key of the chain output.
result["result"]

'The major topics for this class include machine learning, statistics, and algebra. In addition to these main topics, there will be discussions covering extensions of the material taught in the main lectures.'

In [13]:
from langchain.prompts import PromptTemplate

# Build the prompt text. It has two placeholders: {context} and {question}.
# The wording (including trailing spaces and line breaks) is part of what the
# model sees, so keep the literal exactly as written.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an 
answer. Use three sentences maximum. Keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:
"""

# Wrap the raw string in a PromptTemplate so it can be passed to a chain.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [14]:
# Rebuild the QA chain with the custom prompt, and have it return the
# retrieved source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

In [15]:
# Query for the prompt-customised chain.
question: str = "Is probability a class topic?"

In [16]:
# Run the chain via .invoke(); calling the chain object directly
# (Chain.__call__) is deprecated.
result = qa_chain.invoke({"query": question})

In [17]:
# Display the value stored under the "result" key of the chain output.
result["result"]

'Yes, probability is a class topic as the instructor assumes familiarity with basic probability and statistics. Thanks for asking!'

In [18]:
# Display the first entry of the "source_documents" list in the chain output.
result["source_documents"][0]

Document(id='cdf83743-8799-449e-91e8-9cbf906f1a2d', metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'total_pages': 22, 'page': 4, 'page_label': '5'}, page_content="of this class will not be very programming intensive, although we will do some \nprogramming, mostly in either MATLAB or Octave. I'll say a bit more about that later.  \nI also assume familiarity with basic probability and statistics. So most undergraduate \nstatistics class, like Stat 116 taught here at Stanford, will be more than enough. I'm gonna \nassume all of you know what random variables are, that all of you know what expectation \nis, what a variance or a random variable is. And in case of some of you, it's been a while \nsince you've seen some of this material. At some of the discussion secti

The chains above use the `stuff` chain type of RetrievalQA, where all retrieved documents are given to the LLM at once.
-> Stuffing (All retrieved documents (or chunks) are stuffed into a single prompt and passed to the LLM.)

If the retrieved documents are too large to fit into a single prompt, other chain types are available:
-> Map_reduce (Map step: LLM processes each document/chunk independently and generates a partial answer. Reduce step: It then combines all partial answers to form the final answer.)
-> Refine (Starts with one document/chunk to generate an initial answer. Iteratively refines the answer by passing each additional document with the current answer to the LLM.)
-> Map_rerank (Map step: the LLM answers the query from each chunk independently and assigns each answer a score for how well it answers the query. Rerank step: the answer with the highest score is returned as the final answer.)

In [19]:
# map_reduce: the LLM answers from each retrieved chunk separately, then a final
# call merges the partial answers. Slower than "stuff" (more LLM calls), and no
# single call sees all of the retrieved context at once.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

# Use .invoke(); calling the chain object directly (Chain.__call__) is deprecated.
result = qa_chain_mr.invoke({"query": question})

In [20]:
# Display the value stored under the "result" key of the chain output.
result["result"]

'Yes, probability is a class topic in the context of machine learning algorithms. The instructor assumes familiarity with basic probability and statistics, and a probabilistic interpretation is used to derive the first classification algorithm, indicating that probability is covered in the class.'

LangSmith setup to trace the chain runs

In [None]:
# Optional LangSmith tracing configuration, currently disabled.
# Uncomment the lines below to enable it.
# NOTE(review): api.langchain.plus looks like the legacy LangSmith host; current
# docs appear to use https://api.smith.langchain.com -- confirm before enabling.
# load_dotenv()

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
# os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
# os.environ["LANGSMITH_PROJECT"] = os.getenv("LANGSMITH_PROJECT")

In [28]:
# Build the map_reduce chain again and run the current question through it.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)
# Use .invoke(); calling the chain object directly (Chain.__call__) is deprecated.
result = qa_chain_mr.invoke({"query": question})
result["result"]

'Yes, probability is a class topic in the context of machine learning algorithms.'

In [29]:
# refine: an answer is drafted from the first retrieved chunk and then revised
# with each remaining chunk in turn.
# NOTE: the variable keeps the `qa_chain_mr` name even though it now holds a
# refine chain, not a map_reduce chain.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine"
)
# Use .invoke(); calling the chain object directly (Chain.__call__) is deprecated.
result = qa_chain_mr.invoke({"query": question})
result["result"]

'The original answer already provides a comprehensive explanation of how probability is a class topic in the course, including its relevance to classification algorithms and its application in various real-world scenarios. The additional context provided does not significantly impact the original answer, so it remains relevant and accurate.'

### RetrievalQA limitations
 
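RetrievalQA does not preserve conversational history: each query is answered on its own, so a follow-up question cannot refer back to an earlier question or answer.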

In [30]:
# Plain RetrievalQA chain (default chain type, no custom prompt) for the
# conversational-history demonstration.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [31]:
# First question of the two-turn exchange.
question = "Is probability a class topic?"
# Use .invoke(); calling the chain object directly (Chain.__call__) is deprecated.
result = qa_chain.invoke({"query": question})
result["result"]

'Yes, probability is a class topic in the course being described. The instructor assumes familiarity with basic probability and statistics, so it will likely be covered in the context of machine learning and related algorithms.'

In [32]:
# Follow-up question that refers back to the previous turn ("those"); the chain
# receives only this query string.
question = "why are those prerequesites needed?"
# Use .invoke(); calling the chain object directly (Chain.__call__) is deprecated.
result = qa_chain.invoke({"query": question})
result["result"]

'The prerequisites mentioned in the context are needed because the course assumes a certain level of familiarity with basic concepts in probability and statistics, linear algebra, and computer science. These prerequisites are essential for understanding the material covered in the course and for being able to apply machine learning algorithms effectively.'