In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

from nlp_chat_bot.rag.classic_rag import ClassicRAG
from nlp_chat_bot.model.embedding.late_chunking_embedding import LateChunkingEmbedding
from langchain_google_genai import ChatGoogleGenerativeAI

from nlp_chat_bot.vector_store.late_chunking_chroma_vector_store_builder import LateChunkingChromaVectorStoreBuilder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables from a .env file, if one is found (python-dotenv).
# NOTE(review): presumably this supplies the API key that ChatGoogleGenerativeAI
# needs later in the notebook — confirm which variables are required.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# --- Filesystem locations, relative to this notebook ---
dataset_path = "../data"
model_download_path = "../models"
vector_store_path = "../chromadb"

# Embedding function; it receives `model_download_path` as its only argument.
embedding_function = LateChunkingEmbedding(model_download_path)

# Character-based splitter: 1000-character chunks, no overlap between chunks,
# and each chunk tracks its start index in the original document.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, add_start_index=True)

# Chat model handed to the RAG pipeline below.
llm_gemini = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# Configure the Chroma vector-store builder first, then build the store from it.
store_builder = LateChunkingChromaVectorStoreBuilder(
    dataset_path,
    embedding_function,
    vector_store_path,
    splitter=splitter,
)
vector_store = store_builder.build()

# RAG pipeline combining the vector store with the chat model.
rag = ClassicRAG(vector_store, llm_gemini)
# Show the document count reported by the RAG pipeline.
print("LENGTH", rag.get_num_docs())

# Call the retrieval step for a sample question, starting from an empty context.
retrieval_state = {
    "question": "What is my conclusion in my project report on image inpainting?",
    "context": [],
}
docs_retrieved = rag.retrieve(state=retrieval_state)

retrieved_context = docs_retrieved["context"]
print("Num docs:", len(retrieved_context))

# Print every retrieved document with the score stored in its metadata.
for i, doc in enumerate(retrieved_context):
    print("\n\n", "#"*30,"\n")
    print(f"doc {i}: (score: {doc.metadata['score']})")
    print(doc.page_content)

100%|██████████| 1/1 [00:00<00:00,  3.17it/s]
100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
100%|██████████| 1/1 [00:03<00:00,  3.94s/it]
0it [00:00, ?it/s]
Filtering existing documents: 100%|██████████| 15344/15344 [00:00<00:00, 34916.24it/s]
Storing 15344 documents embeddings (batch size is 100): 15400it [10:41, 23.99it/s]                           


Documents are now loaded
LENGTH 109627
Num docs: 3


 ############################## 

doc 0: (score: 44.04249572753906)
imdb_id: tt2113148
title: Arne Dahl: Misterioso
plot_synopsis: Arne Dahl - Misterioso
A failed robbery attempt at Sydbanken outside Avesta leaves one robber lying dead with a dart through his eye, but there are no witnesses to the incident and no perpetrator is ever identified.
In Stockholm, three high-profile businessmen are assassinated in a short period of time, and Jenny Hultin of National CID is assigned the task of putting together a special team to solve the case before the assassin strikes again. Dependable Paul Hjelm, computer expert Jorge Chavez, old warhorse Viggo Norlander, freethinker Arto Söderstedt, muscle-man Gunnar Nyberg and outstanding interrogator Kerstin Holm together form the A-group.


 ############################## 

doc 1: (score: 44.096435546875)
imdb_id: tt0478813
title: Klopka
plot_synopsis: === Intro ===
The film opens with Mladen Pavlov

In [9]:
# Invoke the RAG pipeline on a question and display only the "answer" entry of the result.
ico_response = rag.invoke(query={"question":"What is the plot of the film titled Ico? (イコ, pronounced Ee-ko)"})
ico_response["answer"]

'In Ico, a horned boy escapes a castle where he was to be sacrificed and encounters a captive girl, Yorda.  They attempt to escape together, facing shadow creatures and the Queen, who prevents their escape.  Ultimately, they are separated, and Ico falls from a bridge.\n'