In [1]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
text = TextLoader('text.txt').load() 

In [5]:
text

[Document(metadata={'source': 'text.txt'}, page_content='In the heart of a forgotten city where shadows danced on crumbling walls,\na lone algorithm sifted through mountains of corrupted data,\nseeking the last unbroken fragment of truth. It wasn’t built for emotion,\nyet some lines of code wept silently, haunted by the ghosts of deleted memories.\nThe servers buzzed like distant thunder, echoing remnants of a civilization too arrogant to fail.\nAmidst the digital ruins, a strange sentence emerged—garbled, half-English, half-gibberish—but unmistakably alive.\nWas it a bug? Or something more?')]

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=30)
text = text_splitter.split_documents(text)

In [7]:
text

[Document(metadata={'source': 'text.txt'}, page_content='In the heart of a forgotten city where shadows danced on crumbling walls,'),
 Document(metadata={'source': 'text.txt'}, page_content='a lone algorithm sifted through mountains of corrupted data,'),
 Document(metadata={'source': 'text.txt'}, page_content='seeking the last unbroken fragment of truth. It wasn’t built for emotion,'),
 Document(metadata={'source': 'text.txt'}, page_content='yet some lines of code wept silently, haunted by the ghosts of deleted memories.'),
 Document(metadata={'source': 'text.txt'}, page_content='The servers buzzed like distant thunder, echoing remnants of a civilization too arrogant to fail.'),
 Document(metadata={'source': 'text.txt'}, page_content='Amidst the digital ruins, a strange sentence emerged—garbled, half-English, half-gibberish—but'),
 Document(metadata={'source': 'text.txt'}, page_content='half-gibberish—but unmistakably alive.'),
 Document(metadata={'source': 'text.txt'}, page_content='W

In [8]:
embedding = OllamaEmbeddings(model='nomic-embed-text')

In [13]:
chromadb = Chroma.from_documents(text,embedding)

In [14]:
query = 'who sifted through mountains of corrupted data'

In [16]:
result = chromadb.similarity_search(query)

In [18]:
result[0].page_content

'a lone algorithm sifted through mountains of corrupted data,'

In [19]:
retriever = chromadb.as_retriever()
retrieved = retriever.invoke(query)
retrieved[0].page_content

'a lone algorithm sifted through mountains of corrupted data,'

In [20]:
chromadb = Chroma.from_documents(text, embedding, persist_directory='./chroma_db')

In [22]:
# load
db2 = Chroma(persist_directory='./chroma_db', embedding_function=embedding)

In [24]:
result_w_vector = db2.similarity_search_with_score(query)
result_w_vector[0]

(Document(id='e3b96cff-8a3f-4d95-9fc9-2e6c511c85fa', metadata={'source': 'text.txt'}, page_content='a lone algorithm sifted through mountains of corrupted data,'),
 0.35764482617378235)