In [None]:
import os
import json
import torch
import faiss
import logging
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from Libraries import A0_Widgets, A1_Define, A2_MyUtils
from Libraries import B1_ExtractData, B2_MergeData,B3_GetStructures
from Libraries import B4_ChunkMaster, B5_ChunkFlex, B6_ChunkFixed
from Libraries import C1_CreateSchema, C2_Embedding, C3_CheckConstruct
from Libraries import D0_FaissConvert, D1_Search, D2_Rerank, D3_Respond

In [None]:
# Build the widgets via A0_Widgets.create_name_form(). The returned object is
# passed to A1_Define.WidgetValues(...) in a later cell to produce `config`.
widgets_list = A0_Widgets.create_name_form()

In [None]:
# Notebook-wide runtime settings.

# Passed as `force_download=` to the from_pretrained(...) calls in the
# model-loading cell.
force_download = True

# Environment switches, all set to "1" for the current process.
os.environ.update(
    {
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
        "CUDA_LAUNCH_BLOCKING": "1",
        "TORCH_USE_CUDA_DSA": "1",
    }
)

# INFO-level logging with timestamp, level name and message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Build `config` from the widget list and unpack every entry into a
# module-level name so later cells can use them directly.
config = A1_Define.WidgetValues(widgets_list)

# --- Paths --------------------------------------------------------------------
# NOTE: the local name `data_foler` (sic) is kept exactly as spelled because
# other cells may reference it; the config key itself is "data_folder".
data_foler, dcmt_path = config["data_folder"], config["dcmt_path"]
base_folder, base_path = config["base_folder"], config["base_path"]
extracted_path, merged_path, struct_path = (
    config["extracted_path"],
    config["merged_path"],
    config["struct_path"],
)
chunks_base, chunks_segment = config["chunks_base"], config["chunks_segment"]
schema_ex_path, embedding_path = config["schema_ex_path"], config["embedding_path"]
torch_path, faiss_path = config["torch_path"], config["faiss_path"]
mapping_path, mapping_data = config["mapping_path"], config["mapping_data"]

# --- Keys, switches and model identifiers ---------------------------------------
FILE_TYPE, DATA_KEY, EMBE_KEY = (
    config["FILE_TYPE"],
    config["DATA_KEY"],
    config["EMBE_KEY"],
)
SWITCH = config["SWITCH"]
EMBEDD_MODEL, SEARCH_EGINE, RERANK_MODEL, RESPON_MODEL = (
    config["EMBEDD_MODEL"],
    config["SEARCH_EGINE"],
    config["RERANK_MODEL"],
    config["RESPON_MODEL"],
)
MERGE, API_KEY = config["MERGE"], config["API_KEY"]

# --- Chunking parameters --------------------------------------------------------
WORD_LIMIT, LEVEL_INPUT, LEVEL_VALUES = (
    config["WORD_LIMIT"],
    config["LEVEL_INPUT"],
    config["LEVEL_VALUES"],
)

# Distinct from SEARCH_EGINE (sic) read from `config` above: this name holds
# the faiss.IndexFlatIP class itself.
SEARCH_ENGINE = faiss.IndexFlatIP

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the embedding backend selected by SWITCH and move it to `device`.
# Loader errors propagate unchanged (the former try/except blocks only
# re-raised them).
if SWITCH == "Auto Model":
    tokenizer = AutoTokenizer.from_pretrained(EMBEDD_MODEL, force_download=force_download)
    model = AutoModel.from_pretrained(EMBEDD_MODEL, force_download=force_download)
    model = model.to(device)
    print("Model and tokenizer loaded successfully")
elif SWITCH == "Sentence Transformer":
    model = SentenceTransformer(EMBEDD_MODEL).to(device)
    # Alternative kept from the original cell:
    # model = SentenceTransformer("../../cached_model")
    print("SentenceTransformer loaded successfully")
else:
    # Fail fast: otherwise `model` would be undefined on a fresh kernel, or
    # silently left over from an earlier run with a different SWITCH.
    raise ValueError(
        f"Unsupported SWITCH value {SWITCH!r}; "
        "expected 'Auto Model' or 'Sentence Transformer'"
    )

print(f"Using: {device}")

In [None]:
# Create the schema file from the segmented chunks, unless the chunks are
# missing or the schema file is already on disk.
if not os.path.exists(chunks_segment):
    print(f"{chunks_segment} does not exist")
elif os.path.exists(schema_ex_path):
    print(f"{schema_ex_path} alredy existed")
else:
    C1_CreateSchema.create_schema(chunks_segment, schema_ex_path)

In [None]:
# Embed the segmented chunks into `torch_path`. If that file already exists,
# skip the embedding step and print its contents via C3_CheckConstruct instead.
if not os.path.exists(chunks_segment):
    print(f"{chunks_segment} does not exist")
elif os.path.exists(torch_path):
    print(f"{torch_path} alredy existed")
    C3_CheckConstruct.print_json(DATA_KEY, EMBE_KEY, torch_path)
else:
    # os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
    C2_Embedding.json_embeddings(
        MERGE,
        chunks_segment,
        torch_path,
        schema_ex_path,
        model,
        device,
        DATA_KEY,
        EMBE_KEY,
        batches=False,
    )

In [None]:
# Convert the saved torch embeddings into a FAISS index (plus mapping files),
# unless the embeddings are missing or the index file already exists.
if not os.path.exists(torch_path):
    print(f"{torch_path} does not exist")
elif os.path.exists(faiss_path):
    print(f"{faiss_path} alredy existed")
else:
    D0_FaissConvert.convert_pt_to_faiss(
        torch_path,
        faiss_path,
        mapping_path,
        mapping_data,
        DATA_KEY,
        nlist=100,
        use_pickle=False,
    )

In [None]:
import json
def convert_to_json(preliminary_results):
    """Render ``preliminary_results`` as indented JSON text meant for reading.

    The value is serialized with ``json.dumps`` (``ensure_ascii=False``,
    ``indent=4``). Every escaped newline (``\\n``) inside a JSON string is then
    expanded into a real line break followed by ``" \\t \\t "`` so multi-line
    text is easier to read. Because of that expansion the result is intended
    for display and is generally not valid JSON any more.

    Escaped backslashes are left untouched, so data containing a literal
    backslash followed by ``n`` (for example ``C:\\new``) is no longer broken
    apart as if it were a newline.

    Args:
        preliminary_results: Any JSON-serializable object.

    Returns:
        str: The formatted text.

    Raises:
        TypeError: If the object is not JSON-serializable (from ``json.dumps``).
    """
    json_text = json.dumps(preliminary_results, ensure_ascii=False, indent=4)

    # An escaped backslash is two backslash characters in the JSON text.
    # Splitting on those pairs first guarantees that any remaining
    # backslash + "n" is a genuine newline escape and not the tail of an
    # escaped backslash that happens to be followed by the letter "n".
    escaped_backslash = "\\\\"
    return escaped_backslash.join(
        piece.replace("\\n", "\n \t \t ")
        for piece in json_text.split(escaped_backslash)
    )

In [None]:
""" MAIN """

# Prompt read for the (currently disabled) response step when documents
# were retrieved.
with open("Prompts/Docs_Prompt.txt", "r", encoding="utf-8") as docs_file:
    docs_prompt = docs_file.read()

# NOTE(review): this reads the same file as `docs_prompt`, which looks like a
# copy-paste slip -- presumably a separate "natural" prompt file was intended.
# The path is left unchanged because the intended file name is not visible here.
with open("Prompts/Docs_Prompt.txt", "r", encoding="utf-8") as natr_file:
    natr_prompt = natr_file.read()

print("<< Enter 'exit', 'quit', 'escape', 'bye' or Press ESC to exit >>")
print("Chatbot: Hello there! I'm here to help you!\n\n")

# Canned questions used for a scripted (non-interactive) run.
user_inputs = [
    "Quy chế này quy định những gì và áp dụng cho đối tượng nào",
    "Sinh viên có thể được thi lại bao nhiêu lần?",
]

# Inputs (after strip + lower) that end the session; "" covers an empty line.
EXIT_COMMANDS = ("exit", "quit", "escape", "bye", "")

# Feed the canned questions one at a time; once they run out, the "exit"
# sentinel is returned so the loop terminates on its own.
scripted_inputs = iter(user_inputs)

while True:
    try:
        user_input = next(scripted_inputs, "exit")

        # user_input = input("You: ")

        # Check for an exit command before doing any work on the text, so the
        # sentinel itself is never sent through preprocessing.
        if user_input.strip().lower() in EXIT_COMMANDS:
            print("Chatbot: Goodbye!")
            break

        user_question = C2_Embedding.preprocess_text(user_input)
        print(f"Query: {user_question}")

        # Step 1: Search
        preliminary_results = D1_Search.search_faiss_index(
            MERGE=MERGE,
            query=user_question,
            embedd_model=EMBEDD_MODEL,
            faiss_path=faiss_path,
            mapping_path=mapping_path,
            mapping_data=mapping_data,
            device=device,
            k=2,
            min_score=5,
            batches=False,
        )
        print(preliminary_results)

        # --- Disabled pipeline stages (kept for reference) ------------------
        # # Step 2: Rerank
        # reranked_results = D2_Rerank.rerank_results(
        #     query= user_question,
        #     results=preliminary_results,
        #     reranker_model=RERANK_MODEL,
        #     device=device,
        #     k=5,
        #     batches = False,
        # )
        # print(reranked_results)
        # context = '\n\n'.join(item['text'] for item in reranked_results)
        # print(f"\nContext:\n")
        # print(context)

        # if (reranked_results):
        #     system_prompt = docs_prompt
        #     doc = True
        # else:
        #     system_prompt = natr_prompt
        #     doc = False

        # # Step 3: Generate Response
        # response = D3_Respond.respond_naturally(
        #     user_question = user_question,
        #     # results=reranked_results,
        #     context = context,
        #     system_prompt = system_prompt,
        #     responser_model=RESPON_MODEL,
        #     score_threshold=0.85,
        #     max_results=3,
        #     doc = doc,
        #     gemini_api_key=API_KEY,
        # )

        # print(f"\nYou: {user_question}")
        # print(f"Chatbot: {response}\n\n")

        # Visual separator between consecutive queries.
        print("=" * 200)
        print("\n\n")

    except KeyboardInterrupt:
        # Interrupting the kernel / Ctrl+C ends the session cleanly.
        print("\nChatbot: Goodbye!")
        break