## Chroma

Chroma is an AI-native open-source vector database focused on developer productivity and happiness.

In [1]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
loader = TextLoader("data/about.txt")
data = loader.load()
data

[Document(metadata={'source': 'data/about.txt'}, page_content='Cricket is a bat-and-ball game that is played between two teams of eleven players on a field, at the centre of which is a 22-yard (20-metre; 66-foot) pitch with a wicket at each end, each comprising two bails (small sticks) balanced on three stumps. Two players from the batting team, the striker and nonstriker, stand in front of either wicket holding bats, while one player from the fielding team, the bowler, bowls the ball toward the striker\'s wicket from the opposite end of the pitch. The striker\'s goal is to hit the bowled ball with the bat and then switch places with the nonstriker, with the batting team scoring one run for each of these swaps. Runs are also scored when the ball reaches the boundary of the field or when the ball is bowled illegally.\n\nThe fielding team aims to prevent runs by dismissing batters (so they are "out"). Dismissal can occur in various ways, including being bowled (when the ball hits the str

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=150,chunk_overlap=10)
splits = text_splitter.split_documents(data)
splits

[Document(metadata={'source': 'data/about.txt'}, page_content='Cricket is a bat-and-ball game that is played between two teams of eleven players on a field, at the centre of which is a 22-yard (20-metre; 66-foot)'),
 Document(metadata={'source': 'data/about.txt'}, page_content='66-foot) pitch with a wicket at each end, each comprising two bails (small sticks) balanced on three stumps. Two players from the batting team, the'),
 Document(metadata={'source': 'data/about.txt'}, page_content='team, the striker and nonstriker, stand in front of either wicket holding bats, while one player from the fielding team, the bowler, bowls the ball'),
 Document(metadata={'source': 'data/about.txt'}, page_content="the ball toward the striker's wicket from the opposite end of the pitch. The striker's goal is to hit the bowled ball with the bat and then switch"),
 Document(metadata={'source': 'data/about.txt'}, page_content='switch places with the nonstriker, with the batting team scoring one run for eac

In [4]:
embeddings = OllamaEmbeddings(model="gemma:2b")
vectordb = Chroma.from_documents(splits,embeddings)
vectordb

  embeddings = OllamaEmbeddings(model="gemma:2b")


<langchain_chroma.vectorstores.Chroma at 0x211ad84c9e0>

In [5]:
query = "Where is the sport primarily followed"
docs = vectordb.similarity_search(query)
docs[0].page_content

'Cups, more than any other country, and has been the top-rated Test side more than any other country.'

### Saving the database and loading from local

In [6]:
vectordb = Chroma.from_documents(splits,embeddings,persist_directory="./chroma_db")

In [7]:
# Loading
db2 = Chroma(persist_directory="./chroma_db",embedding_function=embeddings)
docs = db2.similarity_search(query)
print(docs[0].page_content)

Cups, more than any other country, and has been the top-rated Test side more than any other country.


In [8]:
docs_with_score = db2.similarity_search_with_score(query)
print(docs_with_score[0])

(Document(id='19019b29-41df-4cea-9ef9-402c548187fa', metadata={'source': 'data/about.txt'}, page_content='Cups, more than any other country, and has been the top-rated Test side more than any other country.'), 2738.2666015625)


### Chroma DB as retriever

In [10]:
retriever = vectordb.as_retriever()
retriever.invoke(query)[0].page_content

'Cups, more than any other country, and has been the top-rated Test side more than any other country.'