In [None]:
from langchain.document_loaders import TextLoader

In [8]:
%pip install langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [None]:
loader = TextLoader("sample.txt")
data = loader.load()

[Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the medieval period. He was born in 980 AD in Bukhara (present-day Uzbekistan) and died in 1037 AD in Hamadan.\n\nHe specialized in medicine, philosophy, mathematics, and astronomy. His most famous work, *The Canon of Medicine*, was taught in European universities for several centuries.\n\nAvicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with Rhazes, he is considered one of the founders of modern medicine.\n')]


In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=20
)
splitted_documents = text_splitter.split_documents(data)

In [18]:
from langchain.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

In [19]:
from langchain.vectorstores import Chroma


vectorstore = Chroma.from_documents(splitted_documents, embeddings, persist_directory='./db')

In [20]:
retriever = vectorstore.as_retriever()
retriever.invoke("In what year was Avicenna born?")

[Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the medieval period. He was born in 980 AD in Bukhara (present-day Uzbekistan) and died in 1037 AD in Hamadan.'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with Rhazes, he is considered one of the founders of modern medicine.')]

In [21]:
vectorstore.similarity_search("In what year was Avicenna born?")

[Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the medieval period. He was born in 980 AD in Bukhara (present-day Uzbekistan) and died in 1037 AD in Hamadan.'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with'),
 Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with Rhazes, he is considered one of the founders of modern medicine.')]

In [22]:
vectorstore.similarity_search_with_score("In what year was Avicenna born?")

[(Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the medieval period. He was born in 980 AD in Bukhara (present-day Uzbekistan) and died in 1037 AD in Hamadan.'),
  148.9939727783203),
 (Document(metadata={'source': 'sample.txt'}, page_content='Avicenna, also known as Ibn Sina, was one of the greatest Persian scientists and philosophers of the'),
  172.8038330078125),
 (Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with'),
  176.81492614746094),
 (Document(metadata={'source': 'sample.txt'}, page_content='Avicenna wrote more than 450 books and treatises, of which around 240 have survived. Along with Rhazes, he is considered one of the founders of modern medicine.'),
  177.467041015625)]