In [None]:
import os

import torch
from langchain_cohere import CohereRerank
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAI

from rag_chain import RAG_chain
from utils import json_to_documents
from vectorstore import VectorStore

In [None]:
# Load variables from a local .env file into the process environment before any
# API client is created. COHERE_API_KEY is read from the environment further
# down in this notebook; the Gemini client presumably expects its key there as
# well — TODO confirm which variable name it needs.
from dotenv import load_dotenv
load_dotenv()

## Test chain

In [None]:
# Load the source documents from a JSON file via the project helper in utils.
# The path is relative to the kernel's working directory.
# NOTE(review): `docs` is passed to `split_documents` below, so it is presumably
# a list of LangChain Document objects — confirm in utils.json_to_documents.
docs = json_to_documents('scraper/tmp_docs.json')

In [None]:
# Device for the embedding model. Apple-Silicon MPS stays the first choice (it
# was the original hard-coded value), but the notebook now falls back to CUDA
# and then CPU so it also runs on machines without MPS.
# `getattr` guards against torch builds that predate `torch.backends.mps`.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    EMBEDDING_DEVICE = 'mps'
elif torch.cuda.is_available():
    EMBEDDING_DEVICE = 'cuda'
else:
    EMBEDDING_DEVICE = 'cpu'

# Where downloaded model weights are cached. Previously an absolute path inside
# one user's home directory; now overridable through HF_CACHE_DIR and otherwise
# `./cache` under the working directory.
# NOTE(review): this matches the old location only when the kernel runs from the
# project root — confirm, or set HF_CACHE_DIR to keep reusing the old cache.
EMBEDDING_CACHE_DIR = os.getenv('HF_CACHE_DIR', os.path.abspath('cache'))

# Embedding model shared by the semantic splitter and the vector store.
embedding = HuggingFaceEmbeddings(
    model_name='bkai-foundation-models/vietnamese-bi-encoder',
    cache_folder=EMBEDDING_CACHE_DIR,
    model_kwargs={'device': EMBEDDING_DEVICE},
)

In [None]:
# Chunk the loaded documents with SemanticChunker, which uses the embedding
# model above and the 'percentile' breakpoint-threshold strategy to decide
# where one chunk ends and the next begins.
splitter = SemanticChunker(
    embedding,
    breakpoint_threshold_type='percentile',
)
chunks = splitter.split_documents(docs)

In [None]:
# Create the project's VectorStore (see vectorstore.py) with the embedding model
# and index the chunks produced above.
# NOTE(review): `source_id_key='source'` presumably names the metadata field that
# identifies each chunk's originating document — confirm in VectorStore.indexing.
vectorstore = VectorStore(embedding)
vectorstore.indexing(chunks, source_id_key='source')

In [None]:
# Generator LLM and retriever that are handed to the RAG chain below.
# NOTE(review): k=20 is presumably the number of candidates fetched per query
# before reranking — confirm against VectorStore.get_retriever.
llm = GoogleGenerativeAI(model='gemini-1.5-flash-latest')
retriever = vectorstore.get_retriever(k=20)

In [None]:
# Cohere reranker; the API key is taken from the COHERE_API_KEY environment
# variable (None is passed through if the variable is unset).
reranker = CohereRerank(
    model='rerank-multilingual-v3.0',
    cohere_api_key=os.getenv('COHERE_API_KEY'),
)

In [None]:
# Assemble the RAG chain from the components built in the cells above.
chain = RAG_chain(llm=llm, retriever=retriever, reranker=reranker)

# Typing this exact text (a backslash followed by "exit") ends the session.
EXIT_COMMAND = '\\exit'

# Minimal interactive loop for manually testing the chain.
while True:
    try:
        # Surrounding whitespace is trimmed so stray spaces neither defeat the
        # exit command nor end up in the question sent to the chain.
        user_input = input('Input: ').strip()
    except (EOFError, KeyboardInterrupt):
        # Kernel interrupt or closed stdin while waiting for input: stop the
        # loop cleanly instead of leaving a traceback in the cell output.
        break
    if user_input == EXIT_COMMAND:
        break
    if not user_input:
        # Skip blank lines rather than invoking the chain on an empty question.
        continue
    print(chain.invoke(user_input))