In [1]:
# NOTE: This sample uses The Count of Monte Cristo to create a RAG
# vector database. This is almost certainly a completely pointless
# exercise since the entire book is probably included in the training
# data of every major AI model. Nevertheless, it shows the principle
# of how RAG can be used with a vector database to give a model access
# to additional data that it was not trained on.

import ollama
from qdrant_client import QdrantClient, models

# Name of the model passed to ollama.embed() to generate embeddings.
embedding_model = "nomic-embed-text"
# Size of the vectors stored in the Qdrant collection; this has to match the
# length of the embeddings that embedding_model produces.
embeddings_vector_size = 768
# Name of the Qdrant collection that holds the embedded text blocks.
vector_collection_name = "textblobs"

In [2]:
# The embeddings live in an in-memory Qdrant instance.
qdrant_client = QdrantClient(location=":memory:")

# Vectors in the collection have embeddings_vector_size dimensions and are
# compared using cosine distance.
collection_vectors_config = models.VectorParams(
    size=embeddings_vector_size,
    distance=models.Distance.COSINE,
)

qdrant_client.create_collection(
    collection_name=vector_collection_name,
    vectors_config=collection_vectors_config,
)

True

In [3]:
# Read in the contents of The Count of Monte Cristo.
# The context manager closes the file handle as soon as the text is loaded.
# The encoding is given explicitly because the text contains non-ASCII
# characters (accented letters, curly quotes) and the platform default
# encoding is not the same everywhere.
# NOTE(review): assumes the book file is UTF-8 encoded - confirm.
with open("../../assets/books/CountOfMonteCristo.txt", "r", encoding="utf-8") as f_book:
    book_text = f_book.read()

In [None]:
# Split text and load embeddings into the vector database.
from langchain.text_splitter import CharacterTextSplitter

# Consecutive chunks share up to 256 characters of overlap so that a passage
# cut at a chunk boundary still appears with some of its surrounding context.
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
text_blocks = text_splitter.split_text(book_text)

ollama_embed_result = ollama.embed(
    model=embedding_model,
    input=text_blocks
)

text_embeddings_array = ollama_embed_result.embeddings

# Every text block needs exactly one embedding, because the point id below is
# later used as an index back into text_blocks.
assert len(text_embeddings_array) == len(text_blocks), (
    "expected one embedding per text block"
)

# Insert all points with a single upsert call instead of one call per block.
qdrant_client.upsert(
    collection_name=vector_collection_name,
    points=[
        models.PointStruct(id=block_index, vector=block_embedding)
        for block_index, block_embedding in enumerate(text_embeddings_array)
    ],
)
    

In [13]:
def get_related_text_blocks(prompt, max_blocks):
    """Return the stored text blocks that best match ``prompt``.

    The prompt is embedded with ``embedding_model`` through Ollama, and the
    resulting vector is used to query the Qdrant collection named by
    ``vector_collection_name``. The id of each returned point is used as an
    index into the module-level ``text_blocks`` list.

    Args:
        prompt: Text to find related blocks for.
        max_blocks: Maximum number of text blocks to return.

    Returns:
        A list of text blocks, in the order the points were returned by the
        vector database.
    """
    ollama_embed_result = ollama.embed(
        model=embedding_model,
        input=prompt
    )

    # Only one input was embedded, so the first embedding is the prompt's.
    prompt_embedding = ollama_embed_result.embeddings[0]

    search_result = qdrant_client.query_points(
        collection_name=vector_collection_name,
        query=prompt_embedding,
        # Honor the caller's cap; this was previously hard-coded to 5, which
        # silently ignored max_blocks.
        limit=max_blocks,
    )

    return [text_blocks[point.id] for point in search_result.points]

In [15]:
# Question for which related passages are looked up in the vector database.
prompt = "Tell me about when Mercedes recognizes the Count of Monte Cristo's true identity."

result_list = get_related_text_blocks(prompt, 3)

# Print every retrieved passage underneath a separator line.
for result in result_list:
    print("------------------", result, sep="\n")

------------------
“Yours!” cried she, throwing back her veil,—“yours, which I alone,
perhaps, have not forgotten. Edmond, it is not Madame de Morcerf who is
come to you, it is Mercédès.”

“Mercédès is dead, madame,” said Monte Cristo; “I know no one now of
that name.”

“Mercédès lives, sir, and she remembers, for she alone recognized you
when she saw you, and even before she saw you, by your voice,
Edmond,—by the simple sound of your voice; and from that moment she has
followed your steps, watched you, feared you, and she needs not to
inquire what hand has dealt the blow which now strikes M. de Morcerf.”

“Fernand, do you mean?” replied Monte Cristo, with bitter irony; “since
we are recalling names, let us remember them all.” Monte Cristo had
pronounced the name of Fernand with such an expression of hatred that
Mercédès felt a thrill of horror run through every vein.

“You see, Edmond, I am not mistaken, and have cause to say, ‘Spare my
son!’”

“And who told you, madame, that I have a