# Vectorstores and Embeddings

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    PyPDFLoader("attachment/Job_Description.pdf"),
    PyPDFLoader("attachment/Resume.pdf")
    ]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [3]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

len(splits)

5

## Embeddings

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [5]:
import numpy as np

In [6]:
np.dot(embedding1, embedding2)

0.9631853877103518

In [7]:
np.dot(embedding1, embedding3)

0.7709997651294672

In [8]:
np.dot(embedding2, embedding3)

0.7596334120325523

## Vectorstores

In [None]:
# ! pip install chromadb

In [28]:
!rm -rf ./chroma  # remove old database files if any

/Users/siva/.zshenv:1: command not found: Export


In [29]:
from langchain.vectorstores import Chroma

#persist_directory = 'chroma/'

# !rm -rf ./docs/chroma  # remove old database files if any

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    #persist_directory=persist_directory
)

print(vectordb._collection.count())

35


In [21]:
### Similarity Search
question = "is there an email i can ask for help"

docs = vectordb.similarity_search(question,k=3) # k implies number of docs to return

len(docs)



3

In [22]:
docs[0].page_content

'• \n \n   Sivakumarraju.skr@gmail.com  \n   +91 81210 36916  \n    Hyderabad,  IN. Siva Kumar Raju Paidi  \nMachine Learning Engineer  \nMachine Learning/Deep Learning \nEngineer seeking to solve \nchallenging business problems. \nExperience  with data wrangling, \nweb scrapping and data \nexploration using Python and SQL . Professional Experience  \nQuality Specialist            Jun ’19 – Nov ‘21  \nAmazon  | Hyderabad  \nWeb Scrapping         \n• Developed a n automated web scraping tool using  Python \nand Beautiful Soup to find missing changesets in OSM and \nsaved 8.3 manhours/per week .  \nData Visualization  \n• Developed an Excel based dashboard to project productivity \nmetrics to team.  \nJavaScript  \n• Develope d chrome extension s based on HTML and \nJavaScript and automated addition of mandatory comment \nto edits made in OSM and saved 24 manhours/week and \nimproved quality by reducing 21% mistakes/week for the \nteam . \nAwards : \n• Hidden Einstein Award - R&R_ Q2_202

In [None]:
# Let's save this so we can use it later!

vectordb.persist() #to save vector db


In [None]:









## Failure modes
# This seems great, and basic similarity search will get you 80% of the way there very easily. 

# But there are some failure modes that can creep up. 

# Here are some edge cases that can arise - we'll fix them in the next class.

question = "what did they say about matlab?"

docs = vectordb.similarity_search(question,k=5)

# Notice that we're getting duplicate chunks (because of the duplicate `MachineLearning-Lecture01.pdf` in the index).

# Semantic search fetches all similar documents, but does not enforce diversity.

# `docs[0]` and `docs[1]` are indentical.

print(docs[0])

print(docs[1])

# We can see a new failure mode.

# The question below asks a question about the third lecture, but includes results from other lectures as well.

question = "what did they say about regression in the third lecture?"

docs = vectordb.similarity_search(question,k=5)

for doc in docs:
    print(doc.metadata)

print(docs[4].page_content)

# Approaches discussed in the next lecture can be used to address both!



## Chroma DB

In [31]:
#! pip install chromadb

In [41]:
import chromadb
client = chromadb.PersistentClient(path="chroma_007")

In [42]:
client.heartbeat()

1694436852799910000

In [54]:
client.delete_collection(name="my_collection")

In [55]:
collection = client.create_collection(name="my_collection")

In [56]:
collection.count()

0

In [57]:
collection.add(
    documents=splits
    )

TypeError: Collection.add() missing 1 required positional argument: 'ids'

In [13]:
collection.add(
    documents=splits,
    embeddings=embedding,
    )

TypeError: Collection.add() got an unexpected keyword argument 'embedding'

In [52]:
chroma_client.get()

AttributeError: 'SegmentAPI' object has no attribute 'get'

In [53]:
client.get()

AttributeError: 'SegmentAPI' object has no attribute 'get'