In [1]:
#############################################################
# # Vectorstores and Embeddings
# 
# Recall the overall workflow for 
#    Retrieval Augmented Generation (RAG):
#
# 1. Load documents 
# 2. Split the documents into small, 
#    semantically meaningful chunks
# 3. Create an index for each chunk by embeddings
#    - The index is created by embeddings which are 
#      numerical representations of text.
#    - Text with semantically similar content has similar 
#      vectors in this numeric space.
# 4. Store these index in a vector stores for 
#    easy retrieval when answering questions
# 5. Search answer of a question. 
#    - Both should have similar index
# 6. Edge Cases - Failure
#    - 2 types of failures in similarity search
#      + Diversity (Example)
#      + Specifity (Example)
#    - Solved by Advanced Retrieval
#############################################################

In [2]:
%env OPENAI_API_KEY=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

env: OPENAI_API_KEY=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


In [3]:
import os
from openai import OpenAI
# read the api key from environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
from langchain_community.document_loaders.pdf import PyPDFLoader

In [6]:
#############################################################
# 1. Load PDF
#
# References of different loading:
# - PDF
# - Youtube
# - URL
# - Notion DB
#############################################################

In [8]:
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader(
      "2023Catalog.pdf"),
    PyPDFLoader(
      "2023Catalog.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [9]:
#############################################################
# 2. Split the content to create chunks
#
# References
# - Document Splitting
#############################################################

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [12]:
splits = text_splitter.split_documents(docs)

In [13]:
len(splits)

1136

In [14]:
#############################################################
# 3. Create an index for each chunk by embeddings
# 
# Let's take our splits and embed them.
#############################################################

In [17]:
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [18]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [19]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [21]:
import numpy as np

In [22]:
# numpy.dot(vector_a, vector_b, out = None) 
# returns the dot product of vectors a and b.
np.dot(embedding1, embedding2)

0.9630350414845891

In [23]:
np.dot(embedding1, embedding3)

0.7701147991091326

In [24]:
np.dot(embedding2, embedding3)

0.7591130000177128

In [25]:
#############################################################
# 4. Vectorstores
#############################################################

In [26]:
! pip install chromadb

Collecting chromadb
  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/cc/63/b7d76109331318423f9cfb89bd89c99e19f5d0b47a5105439a629224d297/chromadb-0.4.24-py3-none-any.whl.metadata
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/4f/81/4849059526d02fcc9708e19346dd740e8b9edd2f0675ea7c38302d6729df/build-1.1.1-py3-none-any.whl.metadata
  Downloading build-1.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Obtaining dependency information for chroma-hnswlib==0.7.3 from https://files.pythonhosted.org/packages/11/7a/673ccb9bb2faf9cf655d9040e970c02a96645966e06837fde7d10edf242a/chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from c

In [27]:
from langchain.vectorstores import Chroma

In [28]:
persist_directory = 'docs/chroma/'

In [29]:
# remove old database files if any

get_ipython().system('rm -rf ./docs/chroma')  


In [30]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [31]:
print(vectordb._collection.count())


1136


In [32]:
#############################################################
# 5. Similarity Search
#############################################################

In [33]:
question = "is there an email i can ask for help"

In [34]:
docs = vectordb.similarity_search(question,k=3)


In [35]:
len(docs)

3

In [36]:
docs[0].page_content

'Report Checker  \n43-4051  43-4051  Customer Service Representatives: Complaint Clerk, Contact Center Specialists, \nCustomer Complaint Clerk, Customer Contact Specialist, Customer Relations \nRepresentative, Customer Support Representative, Gas Distribution and Emergency \nClerk, Passenger Relations Representative, Policyholder I nformation Clerk, Warranty \nClerk  \n 43-4199  Information and Record Clerks, All Other: Election Clerk, Flight Crew Scheduler, \nProbate Clerk, Student Admissions Clerk  \n 51-1011  First -Line Supervisors of Production and Operating Workers: Assembly Line \nSupervisor, Printing Supervisor, Printing Worker Supervisor'

In [37]:
vectordb.persist()


In [38]:
#############################################################
# 6. Edge Case - Failure modes
# 
# This seems great, and basic similarity 
# search will get you 80% of the way there 
# very easily. 
# 
# But there are some failure modes that can creep up. 
# 
# Here are some edge cases that can arise - we'll fix 
# them in the next class.
#############################################################

In [52]:
question = "what did they say about departments?"


In [53]:
docs = vectordb.similarity_search(question,k=5)

In [41]:
#############################################################
# 6.1 Edge Case 1 - Failure modes: Diversity
# 
# Notice that we're getting duplicate chunks 
# (because of the duplicate 
# `MachineLearning-Lecture01.pdf` in the index).
# 
# Semantic search fetches all similar documents, 
# but does not enforce diversity.
# 
# `docs[0]` and `docs[1]` are indentical.
#############################################################

In [54]:
docs[0]

Document(page_content='Catalog 202 3 37 ver. 202 3.09.24 the decision, advise the student of his or her right to a hearing, and provide additional information regarding \nthe hearing.  \n \n9. Document Destruction  \n \nThe Compliance Department is responsible for the ongoing process of identifying its records, which have \nmet the required retention period, and overseeing their destruction. Destruction of financial and personnel -\nrelated documents will be accomplished by sh redding.  \n \n10. Legal Hold  \n \nFrom time to time, the President may issue a notice, known as a “legal hold,” suspending the destruction of \nrecords due to pending, threatened, or otherwise reasonably foreseeable litigation, audits, government \ninvestigations, or similar proceedings. No re cords specified in any legal hold may be destroyed, even if the \nscheduled destruction date has passed, until the legal hold is withdrawn in writing by the President.  \n11. Compliance  \n \nFailure on the part of employ

In [55]:
docs[1]

Document(page_content='Catalog 202 3 37 ver. 202 3.09.24 the decision, advise the student of his or her right to a hearing, and provide additional information regarding \nthe hearing.  \n \n9. Document Destruction  \n \nThe Compliance Department is responsible for the ongoing process of identifying its records, which have \nmet the required retention period, and overseeing their destruction. Destruction of financial and personnel -\nrelated documents will be accomplished by sh redding.  \n \n10. Legal Hold  \n \nFrom time to time, the President may issue a notice, known as a “legal hold,” suspending the destruction of \nrecords due to pending, threatened, or otherwise reasonably foreseeable litigation, audits, government \ninvestigations, or similar proceedings. No re cords specified in any legal hold may be destroyed, even if the \nscheduled destruction date has passed, until the legal hold is withdrawn in writing by the President.  \n11. Compliance  \n \nFailure on the part of employ

In [47]:
#############################################################
# 6.2 Edge Case 2 - Failure modes: Specifity
#
# We can see a new failure mode.
# 
# The question below asks a question about 
# the third lecture, 
# but includes results from other lectures 
# as well.
#############################################################


In [48]:
question = "what did they say about scholarship \
  for MSEE?"


In [49]:
docs = vectordb.similarity_search(question,k=5)


In [50]:

for doc in docs:
    print(doc.metadata)


{'page': 90, 'source': '2023Catalog.pdf'}
{'page': 90, 'source': '2023Catalog.pdf'}
{'page': 23, 'source': '2023Catalog.pdf'}
{'page': 23, 'source': '2023Catalog.pdf'}
{'page': 90, 'source': '2023Catalog.pdf'}


In [51]:

print(docs[4].page_content)


Information Literacy - Demonstrate the expertise and resourcefulness in utilizing multiple sources of 
information to research and strategize solutions necessary to complete engineering projects.  
Integrative Learning, Problem Solving & Creative Thinking - Produce robust hardware/software 
solutions to meet industry needs in the modern technology areas by utilizing existing technology in a 
novel manner.  
 
Background Preparation  
Students admitted into the MSEE degree program are required to have a bachelor's degree (B S / BA / BE) in 
electrical or in an other  field with a sufficient background in engineering,  mathematics  and science , including 
course work and/or experience equivalent  to (as deemed appropriate by the Academic team)  all the following 
subjects:    
 
1. Mathematics:  Calculus, Linear Algebra,  and Statistics /Probability;   
2. Sciences: Physics;  
3. Electrical and Computer Engineering Subjects: C Programming, Python Programming , Circuit 
Theory,  and Logi

In [56]:
#############################################################
# Retrieval
# 
#  - Retrieval is the centerpiece of our retrieval 
#    augmented generation (RAG) flow. 
#    + Let's get our vectorDB from before.
#  - Vectorstore Retrieval by Similarity Search
#    + Could have 2 types of Edge Failures
#      - Diversity
#        + Solved by Maximum Marginal Relevance
#      - Specifity 
#        + Solved by working with metadata using
#          - Self-Query Retriever
#          - Compression
# - Traditional approaches which does not use Vectorstore
#   + SVM Retrieval
#   + TF-IDF Retrieval
#############################################################


#############################################################
# Vectorstore retrieval
# 
#############################################################



!pip install lark


Collecting lark
  Obtaining dependency information for lark from https://files.pythonhosted.org/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl.metadata
  Downloading lark-1.1.9-py3-none-any.whl.metadata (1.9 kB)
Downloading lark-1.1.9-py3-none-any.whl (111 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.9


In [57]:
#############################################################
# Similarity Search
#############################################################

# In[ ]:


from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/'

embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)


print(vectordb._collection.count())

1136


In [58]:
texts = [
    """The Amanita phalloides has a large and \
       imposing epigeous (aboveground) fruiting \
       body (basidiocarp).""",
    """A mushroom with a large fruiting body is \
       the Amanita phalloides. Some varieties are \
       all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of \
       the most poisonous of all known mushrooms.""",
]


smalldb = Chroma.from_texts(texts, embedding=embedding)

question = "Tell me about all-white mushrooms with \
       large fruiting bodies"

smalldb.similarity_search(question, k=2)


smalldb.max_marginal_relevance_search(question,k=2, 
       fetch_k=3)


[Document(page_content='A mushroom with a large fruiting body is        the Amanita phalloides. Some varieties are        all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of        the most poisonous of all known mushrooms.')]

In [59]:
#############################################################
# Addressing Diversity: Maximum marginal relevance
# 
# Last class we introduced one problem: how to enforce 
# diversity in the search results.
#  
# `Maximum marginal relevance` strives to achieve 
# both relevance to the query *and diversity* 
# among the results.
#############################################################

question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question,k=3)

docs_ss[0].page_content[:100]

docs_ss[1].page_content[:100]


'machine learning, search, Markov decision processes , constraint satisfaction, graphical models, log'

In [60]:
#############################################################
# Note the difference in results with `MMR`.
#############################################################
docs_mmr = vectordb.max_marginal_relevance_search(
              question,k=3)

docs_mmr[0].page_content[:100]

docs_mmr[1].page_content[:100]

'Catalog 202 3 174 ver. 202 3.09.24  \nPragati Dharmale  \nM.S.: Master of Science, Information Technol'

In [97]:
#############################################################
# ### Addressing Specificity: working with metadata
# 
# In last lecture, we showed that a question about 
# the third lecture can include results from other 
# lectures as well.
# 
# To address this, many vectorstores support 
# operations on `metadata`.
# 
# `metadata` provides context for each embedded chunk.
#############################################################


question = "what did they say about CPT \
            in the third trimester?"


docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":
     "2023Catalog.pdf"}
)


for d in docs:
    print(d.metadata)

{'page': 125, 'source': '2023Catalog.pdf'}
{'page': 125, 'source': '2023Catalog.pdf'}
{'page': 71, 'source': '2023Catalog.pdf'}


In [99]:
print(docs[0].page_content)

Prerequisite : Open to School of Business Undergraduate Students who have earned 90 trimester units before starting 
their senior project.  
 
 
Curricular Practicum  
 
CPT401 Curricular Practicum  (1 unit)  
Curricular practicum, or curricular practical training, is a supervised practical experience that is the application of 
previously studied theory.  The curricular practicum must provide students a valuable learning experience and must 
significantly increase their knowledge in their program of study . It is defined as alternative work/study, internship, 
cooperative education, or any other type of required internship or practicum that is offered by sponsoring employers 
through cooperative agreements with the school and the course is an integral part of an established curriculum.  At 
least three hours of work in a practical setting has the credit equivalency of one hour of classroom lecture (1 unit).  To 
be eligible to take this course, the student must have completed at least

In [95]:
#############################################################
# Addressing Specificity: working with metadata 
#                     using Self-Query Retriever
# 
# But we have an interesting challenge: we often 
# want to infer the metadata from the query itself.
# 
# To address this, we can use `SelfQueryRetriever`, 
# which uses an LLM to extract:
#  
# 1. The `query` string to use for vector search
# 2. A metadata filter to pass in as well
# 
# Most vector databases support metadata filters, 
# so this doesn't require any new databases or indexes.
############################################################# 

from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [

 AttributeInfo(
   name="source",
   description="The lecture the chunk is from, should \
      be one of \
      `2023Catalog.pdf`",
   type="string",
   ),

 AttributeInfo(
   name="page",
   description="The page from the lecture",
   type="integer",
 ),

]


document_content_description = "Lecture notes"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)



question = "what did they say about IEEE?"

In [96]:
#############################################################
# You will receive a warning about predict_and_parse 
# being deprecated the first time you executing the 
# next line. This can be safely ignored.
#############################################################

docs = retriever.get_relevant_documents(question)

for d in docs:
    print(d.metadata)


{'page': 57, 'source': '2023Catalog.pdf'}
{'page': 57, 'source': '2023Catalog.pdf'}
{'page': 57, 'source': '2023Catalog.pdf'}
{'page': 57, 'source': '2023Catalog.pdf'}


In [82]:
#############################################################
# Additional tricks: compression
# 
# Another approach for improving the quality of 
# retrieved docs is compression.
# 
# Information most relevant to a query may be 
# buried in a document with a lot of irrelevant text. 
# 
# Passing that full document through your application 
# can lead to more expensive LLM calls and poorer 
# responses.
# 
# Contextual compression is meant to fix this. 
#############################################################

from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def pretty_print_docs(docs):
  print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" 
   + d.page_content for i, d in enumerate(docs)]))

In [85]:
#############################################################
# Wrap our vectorstore 
#############################################################
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)


question = "what did they say about CPT?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)






Document 1:

F-1 International students  must observe additional rules required by the U.S. Immigration & Customs 
Enforcement on Curricular Practical Training (CPT).
----------------------------------------------------------------------------------------------------
Document 2:

F-1 International students  must observe additional rules required by the U.S. Immigration & Customs 
Enforcement on Curricular Practical Training (CPT).


In [87]:
#############################################################
# Combining various techniques
#############################################################
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(
        search_type = "mmr")
)


question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)




Document 1:

- "machine learning, search, Markov decision processes, constraint satisfaction, graphical models, logic, and optimize"
- "design system by plotting data process curves and error analysis in the model"
- "Prerequisite: CS250L"
- "CS485G JavaScript and Internet Programming (3 units)"
- "This course is designed to provide students with advanced programming knowledge and skills for application development on the Internet."
- "Students study both client-side and server-side scripting including HTML, JavaScript, and CSS to develop interactive and responsive web sites."
- "Other topics covered include jQuery, Bootstrap, Node.js Express Framework, RESTful API, MongoDB (NoSQL) and various JavaScript frameworks such as Angular and React."
- "Hands-on exercises are required."
- "Prerequisite: CS250"
----------------------------------------------------------------------------------------------------
Document 2:

- Introduction to Python Programming Language and Programming Logic
- Da

In [90]:
#############################################################
# Other types of retrieval
# 
# Traditional approaches which does not use Vectorstore
# It's worth noting that vectordb as not the only 
#    kind of tool to retrieve documents. 
# 
# The `LangChain` retriever abstraction includes 
#    other ways to retrieve documents, such as 
#     - TF-IDF 
#     - SVM
#############################################################

from langchain_community.retrievers import SVMRetriever
from langchain_community.retrievers import TFIDFRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter


#############################################################
# Load PDF
#############################################################
loader = PyPDFLoader(
  "2023Catalog.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)


In [94]:
#############################################################
# Split
#############################################################
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)


#############################################################
# Retrieve
#############################################################

#############################################################
# SVM Retriever
#############################################################
svm_retriever = SVMRetriever.from_texts(splits,embedding)

#############################################################
# TFIDF Retriever
#############################################################
tfidf_retriever = TFIDFRetriever.from_texts(splits)

#############################################################
# Retrieve with SVM Retriever
#############################################################
question = "What are major topics for genAI class?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]



Document(page_content='to analyze, design, develop, and implement solutions to challenging novel and existing data science problems . \nPrerequisite:  MATH208  \n \nCS483G Fundament als of Artificial Intelligence  (3 units)  \nThis course covers artificial intelligence applications in problem solving, reasoning, planning, natural language \nunderstanding, computer vision, autonomous car navigation, machine learning, business intelligence, robot design, \nand so on. In order to solve  artificial intelligence problems, the major algorithms include machine learning, search, \nMarkov decision processes, constraint satisfaction, graphical models, and logic. The main goal of the course is to equip \nstudents with the tools in Python library to tackle a variety of AI problems in the industries . \nPrerequisite:  CS250  \n \nCS483LG Artificial Intel ligence & Machine Learning Lab (1 unit)  \nStudents will learn python programming in Google colab platform with numpy, pandas, matplotlib, scikit 

In [93]:
#############################################################
# Retrieve with TFIDF Retriever
#############################################################
question = "what did they say about graduation?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content="• Bulletin Requirements  \n \nThe SFBU  catalog serves as the school's contract with the students.  Therefore, students fall under the \ngraduation requirements written in the catalog used at the time of the student’s entrance to the program as a \ndegree  or academic certificate  seeking student.  The section on “Study Plan” in “Academic Information” \ndescribes the rules for the student to follow for the graduation requirements.  \n \n• Petition to Graduate   \n \nAs a student approaches the end of his/her undergraduate/graduate study, he/she must initiate a review \nprocess for the Records Officers to verify the student’s eligibility for graduation.  The student must file an \nonline petition  form  one trimester  in advance  - prior to his/her last registration – by using the MySFBU  \nstudent portal  to submit this request.  The Records Office staff will then make a graduation evaluation in time \nfor the petitioner to register for the last time before gradu