In [None]:
import os
import re
import torch
import faiss
import logging
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from RAGLibrary import A0_Widgets, A1_Define
from RAGLibrary import B1_ExtractData, B2_MergeData, B3_Chunking
from RAGLibrary import C1_CreateSchema, C2_Embedding, C3_CheckConstruct
from RAGLibrary import D0_FaissConvert, D1_Search, D2_Rerank, D3_Respond

In [None]:
# Set HF_HUB_DISABLE_SYMLINKS_WARNING to "1" for this process; this is the Hugging Face Hub
# switch for suppressing its cache-symlink warning.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# Build the input form. The returned widgets are read into `config` by the next cell, so
# the form has to be filled in before that cell runs.
widgets_list = A0_Widgets.create_name_form()

In [None]:
# Collect the values entered in the widget form; `config` is read by string key below.
config = A1_Define.WidgetValues(widgets_list)

# Locations used by the pipeline (presumably file-system paths, judging by the
# `_path` / `_folder` names — confirm in A1_Define). `dcmt_path` is the document passed to
# the extractor and `chunks_base` is the JSON file written by the export cell.
# NOTE(review): `data_foler` is spelled without the "d"; left unchanged because cells
# outside this view may reference the name.
data_foler = config["data_folder"]
dcmt_path = config["dcmt_path"]
base_folder = config["base_folder"]
base_path = config["base_path"]
chunks_base = config["chunks_base"]
json_file_path = config["json_file_path"]
schema_ex_path = config["schema_ex_path"]
embedding_path = config["embedding_path"]
torch_path = config["torch_path"]
faiss_path = config["faiss_path"]
mapping_path = config["mapping_path"]
mapping_data = config["mapping_data"]

# Pipeline options and model selections (meaning of each value is defined by the widget
# form — TODO confirm against A0_Widgets / A1_Define).
# NOTE(review): the key and the variable are both spelled `SEARCH_EGINE`; this is a
# different name from `SEARCH_ENGINE`, which is assigned at the end of this cell.
FILE_TYPE = config["FILE_TYPE"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SWITCH = config["SWITCH"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
MERGE = config["MERGE"]
API_KEY = config["API_KEY"]

# Chunking parameters. The last element of LEVEL_VALUES becomes the key under which
# chunk content is stored (see the cell that defines `Contents`).
WORD_LIMIT = config["WORD_LIMIT"]
LEVEL_INPUT = config["LEVEL_INPUT"]
LEVEL_VALUES = config["LEVEL_VALUES"]

# Not taken from `config`: the FAISS flat inner-product index class itself
# (the class object, not an instance).
SEARCH_ENGINE = faiss.IndexFlatIP

In [None]:
# Alias of the document path (not referenced by the other cells visible here).
path = dcmt_path

# `Contents` is the dictionary key under which each chunk's content list is stored:
# the last entry of LEVEL_VALUES, or None when LEVEL_VALUES is empty.
if LEVEL_VALUES:
    Contents = LEVEL_VALUES[-1]
else:
    Contents = None
print(Contents)

In [None]:
# EXTRACT DATA
# Extract the document at `dcmt_path` into `text_data`. main() later reads the "text"
# entry of every item, so this is presumably a list of dicts — confirm in B1_ExtractData.
text_data = B1_ExtractData.extractData(dcmt_path)
# Uncomment to inspect the extracted records:
# text_data

BASE


In [None]:
# ADD CHUNKS
def add_chunk(chunks, content):
    """Flush the pending content into ``chunks`` if there is anything to flush.

    Nothing happens unless ``content["Chương"]`` is set and the list stored under
    the module-level ``Contents`` key is non-empty. Otherwise ``content["Index"]``
    is incremented, a shallow copy of ``content`` is appended to ``chunks``, and
    ``content`` gets a fresh empty list (the appended copy keeps the old list).

    Args:
        chunks: List that receives the chunk snapshots; mutated in place.
        content: Working chunk state; mutated in place.
    """
    has_chapter = content["Chương"]
    if not (has_chapter and content[Contents]):
        return

    content["Index"] += 1
    snapshot = content.copy()
    chunks.append(snapshot)
    # Rebind rather than clear(): the snapshot shares the previous list object.
    content[Contents] = []
        
def is_chapter(text):
    """Tell whether ``text`` is a chapter heading.

    After stripping surrounding whitespace, the text must start with "Chương",
    optionally followed by whitespace, then one or more Roman-numeral letters or
    digits ending at a word boundary. Matching is case-insensitive.
    """
    heading = re.match(r"^Chương\s*[IVXLCDM\d]+\b", text.strip(), flags=re.I)
    return heading is not None

def is_article(text):
    """Tell whether ``text`` is an article heading.

    After stripping, the text must start with "Điều", at least one whitespace
    character, a Roman-numeral/digit number, a period, and then at least one more
    character of title text. Matching is case-insensitive.
    """
    heading = re.match(r"^Điều\s+([IVXLCDM\d]+)\.\s*(.+)", text.strip(), flags=re.I)
    return heading is not None

def is_clause(text):
    """Tell whether ``text`` is a numbered clause.

    After stripping, the text must start with digits, a period, at least one
    whitespace character, and then at least one more character.
    """
    numbered = re.match(r"^\d+\.\s+.+", text.strip())
    return numbered is not None

def is_content(text):
    """Tell whether ``text`` looks like a list item.

    After stripping, the text must begin with one of:
      * a single bullet character followed by a space;
      * a run of ASCII letters / ``-`` / ``+`` / ``*`` followed by one of
        ``.``, ``)``, ``]``, ``:`` and a space (e.g. ``a) ``);
      * a parenthesised word followed by a space (e.g. ``(a) ``).

    The ``^`` in the pattern belongs to the first alternative only, but
    ``re.match`` anchors every alternative at the start of the string anyway.
    """
    item = re.match(r'^([-+*•●◦○] )|([a-zA-Z\-\+\*]+[.)\]:] )|(\(\w+\) )', text.strip())
    return item is not None

In [None]:
# MAIN FUNCTION
def main(text_data):
    """Walk the extracted records and group them into hierarchical chunks.

    Each record's stripped "text" is classified with is_chapter, is_article,
    is_clause and is_content (checked in that order). A new chapter, article or
    clause first flushes the content collected so far through add_chunk and then
    updates the matching heading field; list items are appended to the list stored
    under the module-level ``Contents`` key. Records that appear before the first
    chapter, and records that match no classifier, are skipped. Content still
    pending when the input ends is flushed as the final chunk.

    Args:
        text_data: Sequence of mappings, each providing a "text" string.

    Returns:
        list[dict]: One snapshot per flushed chunk, with the keys "Index",
        "Chương", "Điều", "Khoản" and ``Contents``.
    """
    chunks = []
    content = {"Index": 0, "Chương": None, "Điều": None, "Khoản": None, Contents: []}
    i = 0
    while i < len(text_data):
        # The classifiers strip before matching, so strip here too. Otherwise an
        # indented line is classified correctly but the extraction regexes below,
        # which anchor at position 0, fail to match it.
        chunk = text_data[i]["text"].strip()

        if is_chapter(chunk):
            # The record right after a chapter heading is taken as the chapter title.
            if i + 1 < len(text_data):
                chunk += f": {text_data[i + 1]['text']}"
            add_chunk(chunks, content)
            content["Chương"] = chunk
            content["Điều"] = None
            content["Khoản"] = None
            i += 1  # skip the title record consumed above

        elif is_article(chunk):
            match = re.match(r"^(Điều\s*[IVXLCDM\d]+)\.\s*(.+)", chunk, re.IGNORECASE)
            if content["Chương"]:
                if match:
                    # Normalise "Điều N. title" to "Điều N: title".
                    chunk = f"{match.group(1)}: {match.group(2)}"
                add_chunk(chunks, content)
                content["Điều"] = chunk
                content["Khoản"] = None

        elif is_clause(chunk):
            match = re.match(r"^(\d+)\.\s*(.+)", chunk)
            if content["Chương"]:
                if match:
                    clause_number = match.group(1)
                    clause_content = match.group(2)
                    followed_by_items = (
                        i + 1 < len(text_data)
                        and is_content(text_data[i + 1]["text"])
                    )

                    add_chunk(chunks, content)
                    if followed_by_items:
                        # The clause text introduces a list: keep it in the heading
                        # and let the following items become the content.
                        content["Khoản"] = f"Khoản {clause_number}: {clause_content}"
                    else:
                        # Stand-alone clause: its own text is the content.
                        content["Khoản"] = f"Khoản {clause_number}"
                        content[Contents].append(clause_content)
                else:
                    # Defensive: not expected now that `chunk` is stripped.
                    print(chunk)

        elif is_content(chunk):
            # NOTE(review): group 1 already ends with a space and group 2 starts with
            # `\s`, so this only matches when the marker is followed by two whitespace
            # characters; single-spaced items keep their marker. Confirm whether the
            # marker is meant to be stripped before changing the pattern.
            match = re.match(r'^([-+*•●◦○a-zA-Z\-\+\*]+[.)\]:] )(\s.+)', chunk)
            if content["Chương"]:
                if match:
                    chunk = match.group(2)
                content[Contents].append(chunk)
        i += 1

    # Flush whatever is still pending; without this the last clause/article of the
    # document never reaches `chunks`.
    add_chunk(chunks, content)
    return chunks

In [None]:
# CHUNKS BASE
# Run the chunker over the extracted records; the export cell below writes the
# resulting list to `chunks_base`.
chunks = main(text_data)

In [None]:
# EXPORT BASE
import json

# Write the chunk list to `chunks_base` as 4-space-indented UTF-8 JSON.
# ensure_ascii=False keeps non-ASCII text as-is instead of \u-escaping it.
with open(chunks_base, mode="w", encoding="utf-8") as base_file:
    json.dump(chunks, base_file, ensure_ascii=False, indent=4)

print(f"Base data saved to {chunks_base}")

FINAL


In [None]:
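# Disabled: loads the spaCy pipeline that is passed as `nlp` to the (also disabled)
# segmentation call in the next cell.
# import spacy
# nlp = spacy.load("en_core_web_sm")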

In [None]:
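# Disabled: final segmentation step. Before re-enabling, import `SegmentChunks` (it is not
# among the imports in the first cell) and load `nlp` in the previous cell.
# SegmentChunks.process_json(chunks_base, json_file_path, Contents, WORD_LIMIT, nlp)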