In [1]:
import os
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment.
# The function has to be *called*; the bare name `load_dotenv` is an
# expression with no effect, so the .env file was never read.
load_dotenv()

# load_dotenv() already places the value in os.environ, so it does not need to
# be copied back. Fail early with a readable message when the key is missing,
# instead of the TypeError raised by assigning None into os.environ.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in a .env file or export it "
        "before running this notebook."
    )

In [3]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
# Folder that holds the saved FAISS index (passed to FAISS.load_local as folder_path).
persistent_directory = "docs/faiss"

In [5]:
# Embedding model; the same object is reused by later cells that build new stores.
embedding = OpenAIEmbeddings()

# Load the previously saved FAISS store from persistent_directory.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading (per the LangChain FAISS docs) - only load index folders you trust.
vectordb = FAISS.load_local(
    persistent_directory,
    embedding,
    allow_dangerous_deserialization=True,
)

In [7]:
# Number of vectors held by the FAISS index, read through the public `index`
# attribute instead of the docstore's private `_dict`.
# Assumes one vector per stored document - TODO confirm for this index.
print(vectordb.index.ntotal)


208


In [8]:
# Three short example passages used to build a tiny in-memory vector store.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [9]:
# Embed the example passages and index them in a new FAISS store.
small_db = FAISS.from_texts(texts=texts, embedding=embedding)

In [10]:
# Query used for the searches against small_db.
question = (
    "Tell me about all-white mushrooms "
    "with large fruiting bodies"
)

In [11]:
# Plain similarity search: return the 2 passages closest to the question.
small_db.similarity_search(query=question, k=2)

[Document(id='9214f2e7-09df-4dcf-87cb-374615e162c0', metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(id='e9b6eb23-70c7-491d-a8dc-3130e3463459', metadata={}, page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

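## Max Marginal Relevance (MMR) search returns a diverse set of documents
MMR does not simply return the k most similar documents: after the best match, it prefers results that differ from those already selected.
In the example below: first fetch the 3 most similar documents (`fetch_k=3`), then choose the 2 most diverse among them (`k=2`).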

In [12]:
# MMR search: consider 3 candidates (fetch_k) and return 2 of them (k).
small_db.max_marginal_relevance_search(
    query=question,
    k=2,
    fetch_k=3,
)

[Document(id='9214f2e7-09df-4dcf-87cb-374615e162c0', metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(id='5f21d41b-2f20-458c-815d-a8e4308300d3', metadata={}, page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [13]:
# Baseline for the MMR comparison: plain similarity search over the loaded store.
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(query=question, k=3)

In [14]:
# Preview the first 100 characters of the first similarity-search result.
docs_ss[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [15]:
# Preview the first 100 characters of the second similarity-search result.
docs_ss[1].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [16]:
# Retrieve 3 documents for the current question with max marginal relevance.
docs_mmr = vectordb.max_marginal_relevance_search(query=question, k=3)

In [17]:
# Preview the first 100 characters of the first MMR result.
docs_mmr[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [18]:
# Preview the first 100 characters of the second MMR result.
docs_mmr[1].page_content[:100]

'he says it in sort of a really touching, sincere way, and then he has this — you can see it \nin his '

## Self-Query or LLM-Aided Retrieval
The user's question is converted into a structured query by an LLM.
One part is a filter applied to the metadata; the other part is the query/search term.

In [None]:
question = "what did they say about regression in the third lecture?"
# Intended split of this question: "third lecture" should become a metadata
# filter, while "regression" is the actual search term.

In [20]:
# Similarity search restricted to chunks whose `source` metadata is the Lecture03 PDF.
docs = vectordb.similarity_search(
    query=question,
    k=3,
    filter=dict(source="docs/cs229_lectures/MachineLearning-Lecture03.pdf"),
)

In [21]:
# Show each hit's metadata to check that the source filter was applied.
for doc in docs:
    print(doc.metadata)

{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:03-07:00', 'author': '', 'moddate': '2008-07-11T11:25:03-07:00', 'title': '', 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}
{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:03-07:00', 'author': '', 'moddate': '2008-07-11T11:25:03-07:00', 'title': '', 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'total_pages': 16, 'page': 13, 'page_label': '14'}
{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:03-07:00', 'author': '', 'moddate': '2008-07-11T11:25:03-07:00', 'title': '', 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'total_pages': 16, 'page': 4, 'page_label': '5'}


## Addressing Specificity: working with metadata using self-query retriever
To address this, we can use SelfQueryRetriever, which uses an LLM to extract:

-> The query string to use for vector search
-> A metadata filter to pass in as well

In [29]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [23]:
# Describe the metadata fields (name, description, type) that the self-query
# retriever is allowed to filter on.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        (
            "source",
            "The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
            "string",
        ),
        (
            "page",
            "The page from the lecture",
            "integer",
        ),
    )
]

In [None]:
# Chat model used to turn a question into a search string plus metadata filter.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Short description of what the stored documents contain.
document_content_description = "Lecture notes"

# Self-query retriever over the loaded store, using the field descriptions above.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

In [31]:
# Question for the self-query retriever; it names both a topic and a lecture.
question = (
    "what did they say about regression "
    "in the third lecture?"
)

In [None]:
# Run the question through the self-query retriever.
docs = retriever.get_relevant_documents(query=question)

In [None]:
# Show each retrieved document's metadata (source, page, ...).
for doc in docs:
    print(doc.metadata)

## Compression
Shrinking the response to only the relevant information

In [32]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [33]:
def pretty_print_docs(docs):
    """Print each document's page_content under a numbered heading.

    Every entry is written as "Document <n>:" followed by a blank line and the
    text; consecutive entries are separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    entries = []
    for position, document in enumerate(docs, start=1):
        entries.append("Document " + str(position) + ":\n\n" + document.page_content)
    print(divider.join(entries))


In [37]:
from langchain.llms import OpenAI

In [38]:
# Build the LLM-backed document compressor.
# Note: this rebinds `llm`, replacing the chat model created for the self-query retriever.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
compressor = LLMChainExtractor.from_llm(llm=llm)

  llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")


In [39]:
# Wrap the vector store's default retriever so every hit passes through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [40]:
# Retrieve with compression and print the compressed documents.
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(query=question)
pretty_print_docs(compressed_docs)

Document 1:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data
- it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms
- there's also a software package called Octave that you can download for free off the Internet
- it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything
- once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course
----------------------------------------------------------------------------------------------------
Document 2:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is

In [41]:
# Same compression wrapper, but the base retriever now uses search_type "mmr".
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [42]:
# Ask the same question through the MMR-based compression retriever.
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(query=question)
pretty_print_docs(compressed_docs)

Document 1:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data
- it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms
- there's also a software package called Octave that you can download for free off the Internet
- it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything
- once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course
----------------------------------------------------------------------------------------------------
Document 2:

"Oh, it was the MATLAB."
----------------------------------------------------------------------------------------------------


Other types of retrieval
- SVM
- TF-IDF

In [44]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [45]:
# Load every page of the first lecture PDF and merge the page texts into one string.
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split the merged text into chunks (chunk_size=1500, chunk_overlap=150).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)
splits = text_splitter.split_text(joined_page_text)


In [None]:
# %pip install scikit-learn  # presumably required by the SVM and TF-IDF retrievers used below

In [48]:
# Build two alternative retrievers over the same text chunks:
# the SVM retriever takes the embedding model, the TF-IDF retriever only the texts.
svm_retriever = SVMRetriever.from_texts(splits, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(texts=splits)

In [50]:
# Query the SVM retriever and display its first result.
question = "What are major topics for this class?"
docs_svm = svm_retriever.get_relevant_documents(query=question)
docs_svm[0]

Document(metadata={}, page_content="Testing, testing. Okay, cool. Thanks. So all right, online resources. The class has a home page, so it's in on the handouts. I \nwon't write on the chalkboard — http:// cs229.stanford.edu. And so when there are \nhomework assignments or things like that, we usually won't sort of — in the mission of \nsaving trees, we will usually not give out many handouts in class. So homework \nassignments, homework solutions will be posted online at the course home page.  \nAs far as this class, I've also written, and I guess I've also revised every year a set of \nfairly detailed lecture notes that cover the technical content of this class. And so if you \nvisit the course homepage, you'll also find the detailed lecture notes that go over in detail \nall the math and equations and so on that I'll be doing in class.  \nThere's also a newsgroup, su.class.cs229, also written on the handout. This is a \nnewsgroup that's sort of a forum for people in the class to get 

In [51]:
# Query the TF-IDF retriever and display its first result.
question = "What are major topics for this class?"
docs_tfidf = tfidf_retriever.get_relevant_documents(query=question)
docs_tfidf[0]

Document(metadata={}, page_content="personally could, and this is an instance of maybe computers learning to do things that \nthey were not programmed explicitly to do.  \nHere's a more recent, a more modern, more formal definition of machine learning due to \nTom Mitchell, who says that a well-posed learning problem is defined as follows: He \nsays that a computer program is set to learn from an experience E with respect to some \ntask T and some performance measure P if its performance on T as measured by P \nimproves with experience E. Okay. So not only is it a definition, it even rhymes.  \nSo, for example, in the case of checkers, the experience E that a program has would be \nthe experience of playing lots of games of checkers against itself, say. The task T is the \ntask of playing checkers, and the performance measure P will be something like the \nfraction of games it wins against a certain set of human opponents. And by this \ndefinition, we'll say that Arthur Samuel's checke