In [10]:
import re
from uuid import uuid4

import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

In [11]:
# max_length
def max_token_length(txt_list: list):
    max_length = 0
    for txt in txt_list:
        token_count = len(re.findall(r"\w+", txt))
        if token_count > max_length:
            max_length = token_count
    return f"Max Token Length: {max_length} tokens"

In [12]:
lorem_ipsum = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Augue mauris augue neque gravida in fermentum et. Felis bibendum ut tristique et egestas quis ipsum suspendisse ultrices. Duis tristique sollicitudin nibh sit amet commodo nulla facilisi nullam. Pretium aenean pharetra magna ac placerat. Quis risus sed vulputate odio ut. Consectetur adipiscing elit duis tristique sollicitudin nibh. Nec nam aliquam sem et. Sed blandit libero volutpat sed cras. Faucibus pulvinar elementum integer enim neque volutpat ac. Mi in nulla posuere sollicitudin aliquam ultrices sagittis. Eget egestas purus viverra accumsan. Diam vel quam elementum pulvinar etiam non quam. Arcu cursus euismod quis viverra nibh cras. A scelerisque purus semper eget duis at. Lectus vestibulum mattis ullamcorper velit sed ullamcorper. Eget felis eget nunc lobortis mattis aliquam faucibus purus in. Elit scelerisque mauris pellentesque pulvinar pellentesque habitant. Ornare suspendisse sed nisi lacus sed. Interdum velit laoreet id donec ultrices. Ipsum a arcu cursus vitae congue mauris rhoncus aenean vel. Faucibus nisl tincidunt eget nullam non nisi. Urna condimentum mattis pellentesque id nibh. Tellus in hac habitasse platea dictumst vestibulum. Eget est lorem ipsum dolor. Enim eu turpis egestas pretium aenean pharetra magna ac placerat. Ac turpis egestas integer eget aliquet nibh. Vivamus arcu felis bibendum ut tristique et egestas. Nisi lacus sed viverra tellus in hac habitasse platea dictumst. Odio ut enim blandit volutpat maecenas volutpat. Turpis egestas sed tempus urna et pharetra pharetra massa. Dui nunc mattis enim ut tellus elementum sagittis vitae et. Nunc sed velit dignissim sodales ut eu. Aliquam ut porttitor leo a diam sollicitudin tempor id. At quis risus sed vulputate odio ut enim blandit volutpat. Gravida quis blandit turpis cursus in hac habitasse platea dictumst. Sit amet nulla facilisi morbi tempus iaculis urna. Diam maecenas sed enim ut sem viverra aliquet eget. Turpis egestas pretium aenean pharetra. At varius vel pharetra vel turpis nunc eget lorem. Integer quis auctor elit sed. Eget nunc lobortis mattis aliquam. Et magnis dis parturient montes nascetur ridiculus mus mauris vitae. Sollicitudin nibh sit amet commodo. Integer quis auctor elit sed vulputate mi sit amet mauris. Est placerat in egestas erat imperdiet. Ornare quam viverra orci sagittis eu volutpat odio facilisis mauris. Semper quis lectus nulla at volutpat diam. Amet volutpat consequat mauris nunc congue nisi. Ipsum nunc aliquet bibendum enim facilisis gravida neque convallis a. Et pharetra pharetra massa massa ultricies. Nunc eget lorem dolor sed viverra ipsum nunc aliquet bibendum."

In [13]:
# Sentence splitter
# chroma default sentence model "all-MiniLM-L6-v2"
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# max input length: 256 characters
model_max_chunk_length = 256
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=model_max_chunk_length,
    model_name="all-MiniLM-L6-v2",
    chunk_overlap=0,
)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [14]:
lorem_ipsum_tokens = token_splitter.split_text(lorem_ipsum)

In [15]:
lorem_ipsum_tokens

['lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. augue mauris augue neque gravida in fermentum et. felis bibendum ut tristique et egestas quis ipsum suspendisse ultrices. duis tristique sollicitudin nibh sit amet commodo nulla facilisi nullam. pretium aenean pharetra magna ac placerat. quis risus sed vulputate odio ut. consectetur adipiscing elit duis tristique sollicitudin nibh. nec nam aliquam sem et. sed blandit libero volutpat sed cras. faucibus pulvinar elementum integer enim neque volutpat ac. mi in nulla posuere sollicitudin aliquam ultrices sagittis. eget egestas purus viverra accumsan. diam vel quam elementum pul',
 '##vinar etiam non quam. arcu cursus euismod quis viverra nibh cras. a scelerisque purus semper eget duis at. lectus vestibulum mattis ullamcorper velit sed ullamcorper. eget felis eget nunc lobortis mattis aliquam faucibus purus in. elit scelerisque mauris pellentesque pulvinar pellentesq

In [16]:
# max token length
max_token_length(lorem_ipsum_tokens)

'Max Token Length: 110 tokens'

In [17]:
text_path = "data/bible.txt"
with open(text_path, "r", encoding="utf-8") as f:
    text_raw = f.read()
print(text_raw[:350])

The Project Gutenberg eBook of The King James Version of the Bible
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or on


In [35]:
# Character splitter
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
text_splitted = character_splitter.split_text(text_raw)

In [37]:
text_splitted[:5]

['\ufeffThe Project Gutenberg eBook of The King James Version of the Bible\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: The King James Version of the Bible\n\n\nRelease date: August 1, 1989 [eBook #10]\n                Most recently updated: February 1, 2024\n\nLanguage: English',
 '*** START OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***\nThe Old Testament of the King James Version of the Bible\nThe First Book of Moses: Called Genesis\nThe Second Book of Moses: Called Exodus\nThe Third Book of Moses: Called Leviticus\nThe Fourth Book of Moses: Call

In [36]:
print(f"Total number of splitted chunks: {len(text_splitted)}")
max_token_length(text_splitted)

Total number of splitted chunks: 4985


'Max Token Length: 219 tokens'

In [41]:
text_tokens = []
for text in text_splitted:
    temp = token_splitter.split_text(text)
    text_tokens.extend(temp)
print(f"Total number of tokens: {len(text_tokens)}")

Total number of tokens: 5362


In [43]:
text_tokens[:5]

['the project gutenberg ebook of the king james version of the bible this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever. you may copy it, give it away or re - use it under the terms of the project gutenberg license included with this ebook or online at www. gutenberg. org. if you are not located in the united states, you will have to check the laws of the country where you are located before using this ebook. title : the king james version of the bible release date : august 1, 1989 [ ebook # 10 ] most recently updated : february 1, 2024 language : english',
 '* * * start of the project gutenberg ebook the king james version of the bible * * * the old testament of the king james version of the bible the first book of moses : called genesis the second book of moses : called exodus the third book of moses : called leviticus the fourth book of moses : called numbers the fifth book of mose

In [46]:
# Check the token length
# reference: model card "By default, input text longer than 256 word pieces is truncated."
max_token_length(text_tokens)

'Max Token Length: 216 tokens'

In [47]:
embedding_fn = SentenceTransformerEmbeddingFunction()

384

In [55]:
# Size of embedding vector
vector = embedding_fn([lorem_ipsum_tokens])
len(vector[0])

384

In [56]:
chroma_db = chromadb.Client()
chroma_collection = chroma_db.create_collection(
    "bible", embedding_function=embedding_fn
)

In [57]:
# add all tokens to collection
ids = [str(uuid4()) for _ in range(len(text_tokens))]
chroma_collection.add(chroma_collection=text_tokens, ids=ids)

In [60]:
# Save the chroma collection
# Run a Query
res = chroma_collection.query(query_texts=["what did noah do?"], n_results=10)

In [65]:
res["documents"]

[['7 : 11 in the six hundredth year of noah ’ s life, in the second month, the seventeenth day of the month, the same day were all the fountains of the great deep broken up, and the windows of heaven were opened. 7 : 12 and the rain was upon the earth forty days and forty nights. 7 : 13 in the selfsame day entered noah, and shem, and ham, and japheth, the sons of noah, and noah ’ s wife, and the three wives of his sons with them, into the ark ; 7 : 14 they, and every beast after his kind, and all the cattle after their kind, and every creeping thing that creepeth upon the earth after his kind, and every fowl after his kind, every bird of every sort. 7 : 15 and they went in unto noah into the ark, two and two of all flesh, wherein is the breath of life. 7 : 16 and they that went in, went in male and female of all flesh, as god had commanded him : and the lord shut him in.',
  '7 : 24 and the waters prevailed upon the earth an hundred and fifty days. 8 : 1 and god remembered noah, and ev