In [1]:
# building a sample vectordb
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
loader = TextLoader("speech.txt")
data = loader.load()
data

[Document(metadata={'source': 'speech.txt'}, page_content='This speech txt file I am just added for test Text Loader')]

In [4]:
#Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)
splits

[Document(metadata={'source': 'speech.txt'}, page_content='This speech txt file I am just added for test Text Loader')]

In [6]:
embedding = OllamaEmbeddings(model="nomic-embed-text")
vectorrdb = Chroma.from_documents(documents=splits, embedding=embedding)
vectorrdb

<langchain_chroma.vectorstores.Chroma at 0x236cba347f0>

In [9]:
# query it
query = "What does the speaker belive is the main reason the United States should enter the was?"
docs = vectorrdb.similarity_search(query)
docs[0].page_content

'This speech txt file I am just added for test Text Loader'

In [11]:
# saving to the disk
vectorrdb = Chroma.from_documents(documents=splits,embedding=embedding, persist_directory="./chroma_db" )


In [12]:
# load from disk
db2 = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
docs=db2.similarity_search(query)
print(docs[0].page_content)

This speech txt file I am just added for test Text Loader


In [13]:
# similarity Search with score
docs = vectorrdb.similarity_search_with_score(query)
docs

[(Document(id='cafd994e-9f81-46cf-a6be-9de501124071', metadata={'source': 'speech.txt'}, page_content='This speech txt file I am just added for test Text Loader'),
  406.303955078125),
 (Document(id='83e3359d-8720-4397-91fe-e25bb8d12241', metadata={'source': 'speech.txt'}, page_content='This speech txt file I am just added for test Text Loader'),
  406.303955078125)]

In [14]:
# Retriever option
retriever = vectorrdb.as_retriever()
retriever.invoke(query)[0].page_content

'This speech txt file I am just added for test Text Loader'