In [1]:
from openai_key import API_KEY
import openai
openai.api_key=API_KEY

<h3>Embeddings</h3>

In [2]:
# Read data from pdf
from langchain.document_loaders import PyPDFLoader

loaders=[PyPDFLoader('input_data/MachineLearning-Lecture01.pdf'),
         PyPDFLoader('input_data/MachineLearning-Lecture02.pdf'),
         PyPDFLoader('input_data/MachineLearning-Lecture03.pdf')
         ]
docs=[]
for loader in loaders:
    docs.extend(loader.load())

In [3]:
# Splitting docs
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150
)
splits=text_splitter.split_documents(docs)
len(splits)

152

In [6]:
# Defining the embedding model
from langchain.embeddings.openai import OpenAIEmbeddings
embedding=OpenAIEmbeddings(api_key=API_KEY)

In [7]:
sentences=["I like dogs","I like canines","the weather is ugly outside"]
sent_embeddings=[]
for sent in sentences:
    sent_embeddings.append(embedding.embed_query(sent))

In [8]:
import numpy as np
np.dot(sent_embeddings[0],sent_embeddings[1])

0.9666694936097293

In [9]:
np.dot(sent_embeddings[1],sent_embeddings[2])

0.7558295095982451

<h3>Vector_stores</h3>

In [10]:
# ! pip install chromadb
from langchain.vectorstores import Chroma
persist_directory='chroma/'


In [11]:
vector_db=Chroma.from_documents(documents=splits,
                                embedding=embedding,
                                persist_directory=persist_directory)


In [12]:
vector_db._collection.count()

304

In [13]:
# Similarity search
question="is there an email i can ask for help"
docs=vector_db.similarity_search(question)

In [14]:
docs

[Document(page_content="cs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework probl ems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me  appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thi ng that I think will help you to succeed and \ndo well in this class and even help you to enjoy this cla ss more is if you form a study \ngroup.  \nSo start looking around where you' re sitting now or at the end of class today, mingle a \nlittle bit and get to know your classmates. I stro

<h3>max marginal Relevance Search</h3>

In [15]:
# Max marginal relevance search
question="what did they say about matlab?"

docs_ss=vector_db.similarity_search(question,k=3)
docs_mmr=vector_db.max_marginal_relevance_search(question,k=3)

In [None]:
docs_ss

[Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\'s free, and for the purposes of  this class

In [None]:
docs_mmr

[Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\'s free, and for the purposes of  this class

<h3>Adding specificity</h3>
<ul>
    <li>Working with metadata</li>
</ul>

In [16]:
question="what did they say about regression in the third lecture?"
docs=vector_db.similarity_search(
question,k=3,
filter={
"source":"input_data/MachineLearning-Lecture03.pdf"
}
)

In [17]:
for d in docs:
    print(d.metadata)

{'page': 0, 'source': 'input_data/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': 'input_data/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'input_data/MachineLearning-Lecture03.pdf'}


In [18]:
# Self query
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [21]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `input_data/MachineLearning-Lecture01.pdf`, `input_data/MachineLearning-Lecture02.pdf`, or `input_data/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

document_content_description = "Lecture notes"
llm = OpenAI(temperature=0,api_key=API_KEY)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vector_db,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [22]:
# !pip install lark
question = "what did they say about regression in the third lecture?"
docs = retriever.get_relevant_documents(question)

In [23]:
for d in docs:
    print(d.metadata)

{'page': 14, 'source': 'input_data/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'input_data/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': 'input_data/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': 'input_data/MachineLearning-Lecture03.pdf'}


<h3>Compression</h3>

In [24]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [25]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [26]:
# Wrap our vectorstore
llm = OpenAI(temperature=0,api_key=API_KEY)
compressor = LLMChainExtractor.from_llm(llm)

In [27]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_db.as_retriever()
)

In [28]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms."
----------------------------------------------------------------------------------------------------
Document 2:

"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms."
----------------------------------------------------------------------------------------------------
Document 3:

"And the student said, "Oh, it was the MATLAB." So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one o

### Combining retreival techniques

In [30]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_db.as_retriever(search_type = "mmr")
)

In [31]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms."
----------------------------------------------------------------------------------------------------
Document 2:

"And the student said, "Oh, it was the MATLAB." So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it."
----------------------------------------------------------------------------------------------------
Document 3:

"all the homeworks can be done in MATLAB or Octave"
