# Generating & Storing Embeddings for a Corpus

In [1]:
import os
import api_keys
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [2]:
# Publish the OpenAI key, read from the api_keys module, through the environment variable
os.environ.update({'OPENAI_API_KEY': api_keys.keys['openai']})

# Folder holding the source PDFs, and folder where the vector database is persisted
sources_directory, persist_directory = 'sources/', 'db'

### Load all the PDFs in a folder, split into one document per page

In [3]:
%%time

# load and process the corpus from the sources directory
loader = PyPDFDirectoryLoader(sources_directory)
docs = loader.load()

CPU times: total: 13.9 s
Wall time: 14.3 s


In [4]:
# Sanity check: number of loaded documents, a separator, then the first document.
print(len(docs), '----', docs[0], sep='\n')

745
----
page_content='MARCUS AURELIUS AT LANUVIUM  \n \n \n_Letter from Celsus to Lucian_  \n \nI arrived at Lanuvium last night. The Court are here for the summer;  \nthat is to say, the Emperor, the Empress, the Heir Apparent, and the  \nEmperor’s nephew, Ummidius Quadratus, and the Senator who is on duty.  \nAs soon as I arrived I was taken by Eclectus, the Chamberlain, to my  \napartments, which are small, but from which one obtains a beautiful  \nview of the Alban Hills. I was told that I would be expected to come to  \nsupper, and that I must take care not to be late, as the Emperor was  \npunctual to a minute, and the water clocks in the villa were purposely  \nan hour fast according t o ordinary time.  \n \nA few minutes before the hour of supper a slave was sent to fetch me,  \nand I was ushered into a large room, opening on to a portico from  \nwhence you have a gorgeous view of the whole country, where the  \nEmperor and his family meet before goi ng into the dining -room. 

### Generate embeddings and store them in a vector database

In [5]:
%%time

# set up the embeddings model
embedding = OpenAIEmbeddings()
# generate the local vector database from the processed corpus
vectordb = Chroma.from_documents(documents=docs, embedding=embedding, persist_directory=persist_directory)

# In a notebook, we should call persist() to ensure the embeddings are written to disk. 
vectordb.persist()

CPU times: total: 2.58 s
Wall time: 15.9 s


# Querying the Corpus

In [7]:
# Query-time settings: the OpenAI model that answers questions, and how many
# nearest documents each search / retrieval returns.
model_name, k = 'gpt-4', 1

In [8]:
# set up the LLM to use
# set up the LLM to use
# model_name is a chat model ('gpt-4'), which is served by OpenAI's chat-completions
# endpoint, so it must be wrapped with ChatOpenAI rather than the completion-style
# OpenAI class. If model_name is ever switched to a completion-only model, use OpenAI.
llm = ChatOpenAI(model_name=model_name, temperature=0.1, verbose=True)

# set up the embeddings model (queries are embedded with this before searching the index)
embedding = OpenAIEmbeddings()

# create the connection to the local vector database
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)



### Basic Similarity Search

In [9]:
# Fetch the k stored documents closest to the question; no LLM is involved here.
resp = vectordb.similarity_search(
    query='When was Marcus Aurelius Born?',
    k=k,
)
resp

[Document(page_content="THE THOUGHTS  OF THE EMPEROR  MARCUS AURELIUS ANTONINUS  \n \n \n \n \n \nCONTENTS.  \n \n \nBIOGRAPHICAL SKETCH      9  \n \nPHILOSOPHY OF MARCUS AURELIUS ANTONINUS      45  \n \nTHE THOUGHTS      99  \n \nINDEX OF TERMS      305  \n \nGENERAL INDEX      311  \n \n \n \nBIOGRAPHICA L SKETCH  \n \nOF \n \nMARCUS AURELIUS ANTONINUS.  \n \n \nM. Antoninus was born at Rome, A.D. 121, on the 26th of April. His  \nfather, Annius Verus, died while he was praetor. His mother was Domitia  \nCalvilla, also named Lucilla. The Emperor T. Antoninus Pius married  \nAnnia Galeria Faustina, the sister of Annius Verus, and was consequently  \nthe uncle of M. Antoninus. When Hadrian adopted Antoninus Pius and  \ndeclared him his successor in the empire, Antoni nus Pius adopted both L.  \nCeionius Commodus, the son of Aelius Caesar, and M. Antoninus, whose  \noriginal name was M. Annius Verus. Antoninus then took the name of M.  \nAelius Aurelius Verus, to which was added the tit

In [10]:
# Same search, but each hit comes back as a (Document, score) pair.
# The score is a distance, so lower is better.
# NOTE(review): described here as cosine distance; the metric actually used depends on
# how the Chroma collection is configured -- confirm.
resp = vectordb.similarity_search_with_score(
    query='What was marcus aurelius real name?',
    k=k,
)
resp

[(Document(page_content="THE THOUGHTS  OF THE EMPEROR  MARCUS AURELIUS ANTONINUS  \n \n \n \n \n \nCONTENTS.  \n \n \nBIOGRAPHICAL SKETCH      9  \n \nPHILOSOPHY OF MARCUS AURELIUS ANTONINUS      45  \n \nTHE THOUGHTS      99  \n \nINDEX OF TERMS      305  \n \nGENERAL INDEX      311  \n \n \n \nBIOGRAPHICA L SKETCH  \n \nOF \n \nMARCUS AURELIUS ANTONINUS.  \n \n \nM. Antoninus was born at Rome, A.D. 121, on the 26th of April. His  \nfather, Annius Verus, died while he was praetor. His mother was Domitia  \nCalvilla, also named Lucilla. The Emperor T. Antoninus Pius married  \nAnnia Galeria Faustina, the sister of Annius Verus, and was consequently  \nthe uncle of M. Antoninus. When Hadrian adopted Antoninus Pius and  \ndeclared him his successor in the empire, Antoni nus Pius adopted both L.  \nCeionius Commodus, the son of Aelius Caesar, and M. Antoninus, whose  \noriginal name was M. Annius Verus. Antoninus then took the name of M.  \nAelius Aurelius Verus, to which was added the ti

### Query OpenAI's API using the Similarity Search results as Context

In [11]:
# Expose the vector database through the retriever interface (unstructured query in,
# documents out), limited to the k best matches.
retriever = vectordb.as_retriever(search_kwargs=dict(k=k))

# RetrievalQA pairs that retriever with the question-answering model: it looks up
# relevant documents and answers the question from them.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # other chain types exist, e.g. map-reduce, refine
    return_source_documents=True,  # include the retrieved documents in the result
)

In [12]:
# Ask a question through the QA chain; the cell output is the chain's result dict
# (query, result and source_documents).
query = (
    "Was marcus aurelius punctual? "
    "Can you describe a few instances where is that shown?"
)
qa(query)

{'query': 'Was marcus aurelius punctual? Can you describe a few instances where is that shown?',
 'result': 'Yes, Marcus Aurelius was described as being punctual. This is shown when Celsus is told upon his arrival at Lanuvium that he must not be late for supper as the Emperor was punctual to a minute. The water clocks in the villa were even set an hour fast to ensure punctuality.',
 'source_documents': [Document(page_content='MARCUS AURELIUS AT LANUVIUM  \n \n \n_Letter from Celsus to Lucian_  \n \nI arrived at Lanuvium last night. The Court are here for the summer;  \nthat is to say, the Emperor, the Empress, the Heir Apparent, and the  \nEmperor’s nephew, Ummidius Quadratus, and the Senator who is on duty.  \nAs soon as I arrived I was taken by Eclectus, the Chamberlain, to my  \napartments, which are small, but from which one obtains a beautiful  \nview of the Alban Hills. I was told that I would be expected to come to  \nsupper, and that I must take care not to be late, as the Empe