In [None]:
import os
import re
import json
import fitz
import spacy
import pickle
import win32com.client
from docx import Document
import ipywidgets as widgets
from IPython.display import display, Javascript

# Set the HF_HUB_DISABLE_SYMLINKS_WARNING environment variable for this kernel process.
# NOTE(review): the name suggests it silences a Hugging Face Hub symlink warning —
# confirm it is still needed by this notebook.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [None]:
# Pickle file (relative to the working directory) used by save_state() / load_state()
# to persist the widget values between sessions.
state_file = "#Chunks_Mark.pkl"

def save_state():
    """Pickle the current widget values to ``state_file``.

    The snapshot holds the selected input file, the word-limit text, the
    selected maximum level, and the text of every per-level input box keyed
    by its position inside ``input_box``.
    """
    level_texts = dict(enumerate(box.value for box in input_box.children))
    snapshot = {
        "input_drop": doc_drop.value,
        "word_input": word_input.value,
        "level_input": level_input.value,
        "level_values": level_texts,
    }
    with open(state_file, "wb") as handle:
        pickle.dump(snapshot, handle)

def load_state():
    """Return the previously saved widget state, or ``{}`` when none is usable.

    Returns:
        dict: the unpickled state when ``state_file`` exists and contains a
        dict; an empty dict when the file is missing, empty, truncated,
        otherwise unreadable, or holds something other than a dict.
    """
    if not os.path.exists(state_file):
        return {}

    try:
        with open(state_file, "rb") as f:
            # SECURITY: pickle.load can execute arbitrary code. Only load a
            # state file that this notebook wrote itself.
            loaded = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as exc:
        # An interrupted save leaves an empty/truncated file; fall back to the
        # widget defaults instead of failing the whole UI cell.
        print(f"Ignoring unreadable state file {state_file!r}: {exc}")
        return {}

    # Callers use .get() on the result, so anything but a dict is unusable.
    return loaded if isinstance(loaded, dict) else {}

# Folder whose entries populate the "Input File" dropdown.
input_folder_path = "../Doc"

input_folder = os.listdir(input_folder_path)

state = load_state()

# A saved selection can be stale (file renamed or removed since the last save).
# A Dropdown rejects a value that is not among its options, so fall back to the
# first available entry instead of letting widget construction raise.
saved_doc = state.get("input_drop")
if saved_doc not in input_folder:
    saved_doc = input_folder[0] if input_folder else None

doc_drop = widgets.Dropdown(
    options=input_folder,
    description="Input File:  ",
    disabled=False,
    layout=widgets.Layout(width="50%"),
    value=saved_doc,
)

word_input = widgets.Text(
    description="Word Limit: ",
    placeholder="Default: 200",
    layout=widgets.Layout(width="50%"),
    value=state.get("word_input", ""),
)

# Same guard for the level: only accept a saved value that is a valid option.
level_options = [str(i) for i in range(0, 10)]
saved_level = state.get("level_input", "1")
if saved_level not in level_options:
    saved_level = "1"

level_input = widgets.Dropdown(
    description="Max Level: ",
    options=level_options,
    layout=widgets.Layout(width="50%"),
    value=saved_level,
)

# Container for the per-level text boxes; filled by update_text_inputs.
input_box = widgets.VBox([])

def update_text_inputs(change):
    """Rebuild the per-level text boxes when the "Max Level" value changes.

    ``change.new`` is converted to an int and that many ``Text`` widgets are
    created. Each box is pre-filled from the ``"level_values"`` entry of the
    loaded ``state`` when one exists for its position, otherwise left empty.
    """
    level_number = int(change.new)
    saved_values = state.get("level_values", [])

    boxes = []
    for idx in range(level_number):
        initial = saved_values[idx] if idx < len(saved_values) else ""
        boxes.append(
            widgets.Text(
                description=f"Level {idx+1}: ",
                layout=widgets.Layout(width="50%"),
                value=initial,
            )
        )

    input_box.children = boxes

# Call update_text_inputs for "value" notifications of the level dropdown.
level_input.observe(update_text_inputs, names="value")

# Action buttons; their click handlers are attached further down in this cell.
save_button = widgets.Button(description="Save State", button_style="success")
run_button = widgets.Button(description="Run All Below", button_style="primary")

def on_save_clicked(b):
    """Click handler for the "Save State" button: persist the current widget values.

    ``b`` is the argument passed in by the click callback; it is not used.
    """
    save_state()

def on_run_clicked(b):
    """Click handler for the "Run All Below" button.

    Saves the widget state, then displays a Javascript snippet that asks the
    notebook front end to execute the cells below. ``b`` is not used.
    """
    save_state()
    # NOTE(review): relies on a global `Jupyter.notebook` object in the front
    # end — confirm it exists in the environment this notebook is run in.
    run_below = Javascript("Jupyter.notebook.execute_cells_below()")
    display(run_below)

# Attach the click handlers to the two buttons.
save_button.on_click(on_save_clicked)
run_button.on_click(on_run_clicked)

# Row holding both buttons.
button_box = widgets.HBox(
    [save_button, run_button],
    layout=widgets.Layout(
        width="50%", 
        justify_content="space-between", 
        padding="0px 4% 0px 12%",
    )
)

# Outer row wrapping the button row with extra padding.
footer_display = widgets.HBox(
    [button_box],
    layout=widgets.Layout(
        width="90%", 
        justify_content="space-between", 
        padding="10px 5% 10px 5%",
    )
)

# Render the whole form in the cell output.
display(doc_drop, word_input, level_input, input_box, footer_display)

# NOTE(review): setting "0" and then the saved level looks like a way to force the
# value observer (update_text_inputs) to run so the level boxes are built on first
# display — confirm. With no saved state the dropdown ends at "0", not the "1"
# used as the default when the widget is constructed.
level_input.value = "0"
level_input.value = state.get("level_input", "0")

In [None]:
# Collect the text typed into each per-level box, keyed by 1-based level number,
# and echo every entry so the chosen values are visible in the cell output.
levels = {}

for i, text_input in enumerate(input_box.children):
    level_no = i + 1
    levels[level_no] = text_input.value
    print(f"level {level_no}: {levels[level_no]}")

In [None]:
# Fail early with a clear message instead of a TypeError from os.path.splitext(None)
# when the dropdown has no selection (e.g. the input folder is empty).
if not doc_drop.value:
    raise ValueError(
        f"No input file selected; add a document to {input_folder_path!r} and re-run the widget cell."
    )

# Base name of the selected document, used to name the output folder and files.
file_name = os.path.splitext(doc_drop.value)[0]

# Build the path from the same folder the dropdown was populated from, so the
# two cannot drift apart if input_folder_path is changed.
input_path = f"{input_folder_path}/{doc_drop.value}"

output_folder = f"../Data/{file_name}"
os.makedirs(output_folder, exist_ok=True)

# Intermediate (structured) and final (word-limited) JSON outputs.
chunks_base = f"{output_folder}/Data_{file_name}_Base.json"
chunks_final = f"{output_folder}/Data_{file_name}_Chunk.json"

print(input_path)
print(output_folder)
print(chunks_base)
path = input_path

BASE


In [None]:
def sentence_end(text):
    """Tell whether ``text`` looks like a finished sentence.

    Returns True when the text ends with one of ``. ! ? : ;`` or when it both
    starts with an opening bracket/quote and ends with the matching closing
    one; otherwise False.
    """
    if text.endswith(('.', '!', '?', ':', ';')):
        return True

    for opener, closer in ["()", "''", '""', "[]", "{}", "«»", "“”", "‘’"]:
        if text.startswith(opener) and text.endswith(closer):
            return True

    return False

def markers(text):
    """Return True when ``text`` begins with a list/enumeration marker.

    Recognised openings (each followed by a space): a bullet character, a run
    of letters/digits/``-+*`` ending in one of ``. ) ] :``, a parenthesised
    number or word, or a numeric range such as ``10 - 20``.
    """
    marker_pattern = r'^([-+*•●◦○] )|([0-9a-zA-Z\-\+\*ivxIVX]+[.)\]:] )|(\(\d+\) )|(\(\w+\) )|([0-9]+\s+-\s+[0-9]+ )'
    return re.match(marker_pattern, text) is not None

def unclosed(text):
    """Return True when ``text`` ends with a bracket or quote still open.

    Brackets with distinct open/close characters are matched with a stack.
    The straight quotes ``"`` and ``'`` use the same character to open and to
    close, so they are handled separately: a quote closes the group when the
    innermost open delimiter is that same quote, otherwise it opens one.
    (Previously every straight quote was pushed as an opener, so any text with
    a balanced pair of quotes was reported as unclosed.)

    A closing bracket that does not match the innermost open delimiter makes
    the function return False immediately, as before.

    Note: an apostrophe inside a word (``'``) is still treated as a quote.
    """
    pairs = {"(": ")", "[": "]", "{": "}", "«": "»", "“": "”", "‘": "’"}
    closers = set(pairs.values())
    symmetric = {'"', "'"}

    stack = []
    for char in text:
        if char in symmetric:
            if stack and stack[-1] == char:
                stack.pop()          # closing quote of the innermost group
            else:
                stack.append(char)   # opening quote
        elif char in pairs:
            stack.append(char)
        elif char in closers:
            if stack and pairs.get(stack[-1]) == char:
                stack.pop()
            else:
                # Stray or mismatched closer: treat the text as not "open".
                return False
    return bool(stack)

def merge_text(para, new_para):
    """Decide whether ``new_para`` should be appended to ``para``.

    The two are merged when they agree on being all-uppercase or not,
    ``new_para`` does not start with a list marker, and it is not the case
    that ``new_para`` starts with a capital while ``para`` already looks like
    a finished sentence. Independently of that, they are merged whenever
    ``para`` still has an unclosed bracket or quote.
    """
    same_case = new_para.isupper() == para.isupper()
    if same_case and not markers(new_para):
        starts_new_sentence = new_para[0].isupper() and sentence_end(para)
        if not starts_new_sentence:
            return True
    return unclosed(para)

def _paragraphs_from_lines(lines):
    """Collapse whitespace in each line and merge continuation lines into paragraphs."""
    text_data = []
    paragraph = ""
    for line in lines:
        cleaned_text = " ".join(line.split())
        if not cleaned_text:
            continue
        if paragraph and merge_text(paragraph, cleaned_text):
            paragraph += " " + cleaned_text
        else:
            if paragraph:
                text_data.append({"text": paragraph})
            paragraph = cleaned_text
    if paragraph:
        text_data.append({"text": paragraph})
    return text_data


def _docx_lines(path):
    """Return the text lines of a .docx file, paragraph by paragraph."""
    lines = []
    for para in Document(path).paragraphs:
        lines.extend(para.text.split("\n"))
    return lines


def _doc_lines(path):
    """Return the text lines of a legacy .doc file, read through Word via COM."""
    word = win32com.client.Dispatch("Word.Application")
    try:
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(path))
        try:
            text = doc.Content.Text
        finally:
            doc.Close()
    finally:
        # Always shut Word down, even if opening or reading the document failed.
        word.Quit()
    # Word separates paragraphs with "\r" (and uses other control characters for
    # manual line/page breaks), not "\n"; splitlines() splits on all of them.
    # Splitting on "\n" alone left the whole document as a single line.
    return text.splitlines()


def _pdf_lines(path):
    """Return the text lines of a PDF, page by page, blocks sorted by (b[1], b[0])."""
    lines = []
    doc = fitz.open(path)
    try:
        for page in doc:
            # NOTE(review): b[1], b[0] are presumably the block's top and left
            # coordinates (reading order) — confirm against the PyMuPDF docs.
            blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
            for block in blocks:
                lines.extend(block[4].split("\n"))
    finally:
        doc.close()
    return lines


def extracted(path):
    """Extract merged paragraphs from a .docx, .doc or .pdf file.

    Args:
        path: path to the document; the extension (case-insensitive) selects
            the reader. ``.doc`` files are opened through a Word COM object.

    Returns:
        list[dict]: one ``{"text": paragraph}`` entry per paragraph, where
        consecutive lines are joined whenever ``merge_text`` says so. An
        empty list is returned for any other extension.
    """
    file_ext = os.path.splitext(path)[1].lower()

    if file_ext == ".docx":
        lines = _docx_lines(path)
    elif file_ext == ".doc":
        lines = _doc_lines(path)
    elif file_ext == ".pdf":
        lines = _pdf_lines(path)
    else:
        # Unsupported type: keep the original behaviour of returning no data.
        return []

    return _paragraphs_from_lines(lines)

In [None]:
# ADD CHUNKS
def add_chunk(chunks, content):
    """Flush the running ``content`` record into ``chunks`` when it is complete.

    Nothing happens unless both the chapter ("Chương") and the collected
    content list ("Nội dung") are non-empty. Otherwise the index is bumped, a
    shallow copy of the record is appended to ``chunks``, and the record's
    content list is replaced by a fresh empty list.
    """
    if not content["Chương"] or not content["Nội dung"]:
        return

    content["Index"] += 1
    chunks.append(dict(content))
    content["Nội dung"] = []
        
def is_chapter(text):
    """Return True when the stripped text starts with "Chương" plus a Roman or Arabic number (case-insensitive)."""
    candidate = text.strip()
    return re.match(r"^Chương\s*[IVXLCDM\d]+\b", candidate, re.IGNORECASE) is not None

def is_article(text):
    """Return True when the stripped text looks like "Điều <number>. <title>" (case-insensitive)."""
    candidate = text.strip()
    return re.match(r"^Điều\s+([IVXLCDM\d]+)\.\s*(.+)", candidate, re.IGNORECASE) is not None

def is_clause(text):
    """Return True when the stripped text starts with digits, a dot, whitespace and then some text."""
    candidate = text.strip()
    return re.match(r"^\d+\.\s+.+", candidate) is not None

def is_content(text):
    """Return True when the stripped text starts with a bullet, a lettered marker, or a parenthesised word, followed by a space."""
    candidate = text.strip()
    return re.match(r'^([-+*•●◦○] )|([a-zA-Z\-\+\*]+[.)\]:] )|(\(\w+\) )', candidate) is not None


In [None]:
# MAIN FUNCTION
def main(text_data):
    """Group extracted paragraphs into chunks keyed by chapter / article / clause.

    Args:
        text_data: list of ``{"text": str}`` paragraphs in document order.

    Returns:
        list[dict]: one record per flushed group with the keys "Index",
        "Chương" (chapter), "Điều" (article), "Khoản" (clause) and
        "Nội dung" (list of content paragraphs).

    Paragraphs seen before the first chapter heading are ignored.
    NOTE(review): paragraphs that match none of the four recognisers (plain
    body text) are also skipped — confirm that this is intended.
    """
    chunks = []
    content = {"Index": 0, "Chương": None, "Điều": None, "Khoản": None, "Nội dung": []}
    i = 0
    while i < len(text_data):
        chunk = text_data[i]["text"]

        if is_chapter(chunk):
            # The chapter title is expected on the following paragraph. Only
            # absorb it when it is not itself a heading; otherwise an article
            # or clause directly after the chapter line would be swallowed
            # into the title and skipped.
            if i + 1 < len(text_data):
                next_text = text_data[i + 1]["text"]
                if not (is_chapter(next_text) or is_article(next_text) or is_clause(next_text)):
                    chunk += f": {next_text}"
                    i += 1
            add_chunk(chunks, content)
            content["Chương"] = chunk
            content["Điều"] = None
            content["Khoản"] = None

        elif is_article(chunk):
            match = re.match(r"^(Điều\s*[IVXLCDM\d]+)\.\s*(.+)", chunk, re.IGNORECASE)
            if content["Chương"]:
                if match:
                    chunk = f"{match.group(1)}: {match.group(2)}"
                add_chunk(chunks, content)
                content["Điều"] = chunk
                content["Khoản"] = None

        elif is_clause(chunk):
            match = re.match(r"^(\d+)\.\s*(.+)", chunk)
            if content["Chương"]:
                if match:
                    clause_number = match.group(1)
                    clause_content = match.group(2)

                    if i + 1 < len(text_data) and is_content(text_data[i + 1]["text"]):
                        # Clause introduces a list: keep its text in the heading.
                        chunk = f"Khoản {clause_number}: {clause_content}"
                        add_chunk(chunks, content)
                        content["Khoản"] = chunk
                    else:
                        # Stand-alone clause: heading is the number, text is content.
                        chunk = f"Khoản {clause_number}"
                        add_chunk(chunks, content)
                        content["Khoản"] = chunk

                        chunk = clause_content
                        content["Nội dung"].append(chunk)
                else:
                    print(chunk)

        elif is_content(chunk):
            # NOTE(review): this pattern needs a second whitespace character
            # after the marker's trailing space, so for single-spaced text it
            # does not match and the paragraph is stored with its marker —
            # confirm whether the marker was meant to be stripped.
            match = re.match(r'^([-+*•●◦○a-zA-Z\-\+\*]+[.)\]:] )(\s.+)', chunk)
            if content["Chương"]:
                if match:
                    chunk = match.group(2)
                content["Nội dung"].append(chunk)
        i += 1

    # Flush the group that was still being collected when the input ended;
    # without this the last clause's content never reaches the output.
    add_chunk(chunks, content)

    return chunks

In [None]:
# CHUNKS BASE
# Extract merged paragraphs from the selected document, then group them into
# the structured base chunks.
text_data = extracted(input_path)
chunks = main(text_data)

In [None]:
# EXPORT BASE
# Write the base chunks as pretty-printed UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \u escapes.
with open(chunks_base, "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=4, ensure_ascii=False)

print(f"Base data saved to {chunks_base}")

FINAL


In [None]:
# spaCy pipeline used by semantic_chunking to split text into sentences.
# NOTE(review): this is an English model while the heading patterns in this
# notebook are Vietnamese — confirm the sentence splitting is good enough.
nlp = spacy.load("en_core_web_sm")
# Word limit per chunk: the "Word Limit" box when it contains only digits, else 200.
# NOTE(review): input with surrounding spaces fails isdigit() and silently
# falls back to 200 — confirm this is acceptable.
WORD_LIMIT = int(word_input.value) if word_input.value.isdigit() else 200

In [None]:
# COUNT WORDS
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

In [None]:
# CHUNKING IF WORD LIMIT EXCEEDED
def semantic_chunking(text, max_words=WORD_LIMIT):
    """Split ``text`` into sentence-aligned segments of at most ``max_words`` words.

    Sentences come from the ``nlp`` pipeline. They are accumulated in order
    and a segment is closed just before the sentence that would push its word
    count past ``max_words``. A single sentence longer than the limit is kept
    whole as its own segment.

    Returns:
        list[str]: the segments, each a space-joined run of sentences.
    """
    segments = []
    pending = []
    pending_words = 0

    for span in nlp(text).sents:
        sentence = span.text.strip()
        n_words = len(sentence.split())

        # Close the running segment before it would exceed the limit.
        if pending_words + n_words > max_words and pending:
            segments.append(" ".join(pending))
            pending, pending_words = [], 0

        pending.append(sentence)
        pending_words += n_words

    if pending:
        segments.append(" ".join(pending))

    return segments

In [None]:
# MAIN PROCESSING FUNCTION
def process_json(chunks_base, chunks_final):
    """Re-chunk over-long content paragraphs of the base JSON and write the result.

    Args:
        chunks_base: path of the JSON file with the base chunks to read.
        chunks_final: path of the JSON file to write.

    For every record whose "Nội dung" is a list, each paragraph longer than
    ``WORD_LIMIT`` words that does not start with a list marker is replaced
    by the segments returned by ``semantic_chunking``. The output file is
    rewritten after every record so progress is kept on disk.
    """
    with open(chunks_base, "r", encoding="utf-8") as src:
        data = json.load(src)

    total = len(data)
    processed_data = []

    for position, chunk in enumerate(data, start=1):

        if "Nội dung" in chunk and isinstance(chunk["Nội dung"], list):
            rebuilt = []

            for paragraph in chunk["Nội dung"]:
                needs_split = count_words(paragraph) > WORD_LIMIT and not markers(paragraph)
                if not needs_split:
                    rebuilt.append(paragraph)
                    continue

                pieces = semantic_chunking(paragraph)
                rebuilt.extend(pieces)

                # Report how many segments this paragraph was split into.
                print(f"{position:04} / {total:04}: {len(pieces):02} segments.")

            chunk["Nội dung"] = rebuilt

        processed_data.append(chunk)

        # Save everything processed so far.
        with open(chunks_final, "w", encoding="utf-8") as dst:
            json.dump(processed_data, dst, indent=4, ensure_ascii=False)
        print(f"{position:04} / {total:04}: Saved!\n")

    print(f"Final data saved to {chunks_final}.")

In [None]:
# Read the base chunks written earlier and produce the word-limited final file.
process_json(chunks_base, chunks_final)