In [None]:
import os
import re
import fitz
import json
import spacy
import pickle
import win32com.client
from docx import Document
import ipywidgets as widgets
from IPython.display import display, Javascript

# Set HF_HUB_DISABLE_SYMLINKS_WARNING to "1" in this process's environment.
# NOTE(review): presumably silences the Hugging Face Hub symlink warning for a
# later (embedding) step that is not visible in this section - confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [None]:
# Minor words that format_text() keeps in lowercase when it title-cases a
# heading. One string per group: articles, conjunctions, short prepositions,
# longer prepositions.
exceptions = set(
    (
        "a an the "
        "and but or nor for so yet "
        "at by in of on to from with as "
        "into like over under up down out upon onto "
        "amid among between before after against"
    ).split()
)

In [None]:
state_file = "#Chunks_Size.pkl"

def save_state():
    """Pickle the current widget values to `state_file`.

    The snapshot holds the selected document, the word-limit text, the
    selected max level, and the text of every per-level box keyed by its
    position in `input_box.children`.
    """
    snapshot = {
        "input_drop": doc_drop.value,
        "word_input": word_input.value,
        "level_input": level_input.value,
    }
    # Position -> text, in the order the boxes appear in the VBox.
    snapshot["level_values"] = dict(
        enumerate(box.value for box in input_box.children)
    )

    with open(state_file, "wb") as handle:
        pickle.dump(snapshot, handle)

def load_state(path=None):
    """Return the previously saved widget state, or `{}` when none is usable.

    Args:
        path: Pickle file to read. Defaults to the module-level `state_file`.

    Returns:
        The saved state dict. An empty dict is returned when the file does not
        exist, is empty/truncated/corrupt, or does not contain a dict, so the
        widgets fall back to their defaults instead of the notebook failing.

    SECURITY: `pickle.load` can execute arbitrary code. Only load a state file
    that this notebook wrote itself; never one obtained from someone else.
    """
    import pickle  # local import keeps the function usable on its own

    if path is None:
        path = state_file

    try:
        with open(path, "rb") as f:
            loaded = pickle.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        return {}
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError,
            IndexError, ValueError, TypeError, KeyError) as exc:
        # The saved state is only a UI convenience, so an unreadable file must
        # not block the notebook; report it and start from defaults.
        print(f"Ignoring unreadable state file {path!r}: {exc}")
        return {}

    # Callers use `state.get(...)`, so anything but a dict is unusable.
    return loaded if isinstance(loaded, dict) else {}

input_folder_path = "../Doc"

# Sorted so the dropdown order (and therefore the default selection) does not
# depend on the arbitrary order in which the OS lists the directory.
input_folder = sorted(os.listdir(input_folder_path))

state = load_state()

# A saved selection may name a file that has since been renamed or removed.
# A Dropdown rejects a value that is not one of its options, so fall back to
# the first available file (or None when the folder is empty).
saved_doc = state.get("input_drop")
if saved_doc not in input_folder:
    saved_doc = input_folder[0] if input_folder else None

doc_drop = widgets.Dropdown(
    options=input_folder,
    description="Input File:  ",
    disabled=False,
    layout=widgets.Layout(width="50%"),
    value=saved_doc,
)

word_input = widgets.Text(
    description="Word Limit: ",
    placeholder="Default: 200",
    layout=widgets.Layout(width="50%"),
    value=state.get("word_input", ""),
)

# Same validation for the level: only "0".."9" are selectable.
level_options = [str(i) for i in range(0, 10)]
saved_level = state.get("level_input", "1")
if saved_level not in level_options:
    saved_level = "1"

level_input = widgets.Dropdown(
    description="Max Level: ",
    options=level_options,
    layout=widgets.Layout(width="50%"),
    value=saved_level,
)

# Container for the per-level text boxes; filled by update_text_inputs().
input_box = widgets.VBox([])

def update_text_inputs(change):
    """Rebuild the per-level text boxes when the "Max Level" dropdown changes.

    Args:
        change: Change notification from the dropdown; `change.new` is the
            newly selected level as a string.

    One box is created per level. Each box is pre-filled, in order of
    preference, with what is currently typed in the box at that position,
    else with the value saved in `state["level_values"]`, else with "".
    Preferring the live value means changing the level no longer discards
    text the user has typed but not yet saved.
    """
    level_number = int(change.new)

    # save_state() stores a {position: text} dict; also accept a plain
    # sequence so both shapes are read the same way.
    saved_values = state.get("level_values") or {}
    if not isinstance(saved_values, dict):
        saved_values = dict(enumerate(saved_values))

    typed_values = [box.value for box in input_box.children]

    text_inputs = [
        widgets.Text(
            description=f"Level {i+1}: ",
            layout=widgets.Layout(width="50%"),
            value=typed_values[i] if i < len(typed_values) else saved_values.get(i, ""),
        )
        for i in range(level_number)
    ]
    input_box.children = text_inputs

# Rebuild the per-level text boxes whenever the dropdown's "value" changes.
level_input.observe(update_text_inputs, names="value")

# Action buttons; their click handlers are attached further down in this cell.
save_button = widgets.Button(description="Save State", button_style="success")
run_button = widgets.Button(description="Run All Below", button_style="primary")

def on_save_clicked(b):
    """Click handler for the "Save State" button: persist the widget values.

    `b` is the argument passed by the click callback; it is not used.
    """
    save_state()

def on_run_clicked(b):
    """Click handler for the "Run All Below" button.

    Saves the widget values first, then displays a Javascript snippet that
    asks the front-end to execute the cells below this one. `b` is the
    argument passed by the click callback; it is not used.
    """
    save_state()
    # NOTE(review): the snippet relies on a global `Jupyter.notebook` object in
    # the browser; presumably only the classic Notebook UI provides it -
    # confirm before using this in JupyterLab or another front-end.
    display(Javascript("Jupyter.notebook.execute_cells_below()"))

# Attach the click handlers to their buttons.
run_button.on_click(on_run_clicked)
save_button.on_click(on_save_clicked)

# Row holding the two buttons, spread across half of the width.
button_box = widgets.HBox(
    children=[save_button, run_button],
    layout=widgets.Layout(
        padding="0px 4% 0px 12%",
        justify_content="space-between",
        width="50%",
    ),
)

# Outer footer row that wraps the button row.
footer_display = widgets.HBox(
    children=[button_box],
    layout=widgets.Layout(
        padding="10px 5% 10px 5%",
        justify_content="space-between",
        width="90%",
    ),
)

# Render the form: file picker, word limit, max level, level boxes, buttons.
display(doc_drop, word_input, level_input, input_box, footer_display)

# Populate the per-level boxes for the saved level. The value is first set to
# "0" and then to the saved level, presumably so that the second assignment is
# an actual change and therefore triggers update_text_inputs - confirm.
# Note: the fallback here is "0", while the dropdown itself is constructed
# with a fallback of "1"; without saved state the dropdown ends up at "0".
level_input.value = "0"
level_input.value = state.get("level_input", "0")

In [None]:
# levels = {}

# for i, text_input in enumerate(input_box.children):
#     levels[i+1] = text_input.value
#     print(f"level {i+1}: {levels[i+1]}")

In [None]:
# Fail early with a readable message when no document is selected. The
# dropdown value is None when the input folder is empty, and os.path.splitext
# would otherwise raise an opaque TypeError.
if not doc_drop.value:
    raise ValueError("No input file selected: add a document to '../Doc' and re-run the widget cell.")

# Base name of the selected document, without its extension.
file_name = os.path.splitext(doc_drop.value)[0]

input_path = f"../Doc/{doc_drop.value}"

# Every output for this document goes into its own folder under ../Data.
output_folder = f"../Data/{file_name}"
os.makedirs(output_folder, exist_ok=True)

chunks_base = f"{output_folder}/Data_{file_name}_Base.json"
chunks_final = f"{output_folder}/Data_{file_name}_Chunk.json"
# NOTE(review): this expands to "Embeddings_../Data/<name>/<name>", which does
# not follow the pattern of the two paths above and looks unintended (perhaps
# f"{output_folder}/Embeddings_{file_name}" was meant). Left unchanged because
# the code that consumes it is outside this section - confirm and fix there.
embedding_file = f"Embeddings_{output_folder}/{file_name}"

print(input_path)
print(output_folder)
# Alias kept for any later cell that still refers to `path`.
path = input_path

BASE


In [None]:
# EXTRACT INPUT TEXT
def extracted(path):
    """Read a .docx file and return its non-empty paragraphs.

    Args:
        path: Location of the document, passed to `Document`.

    Returns:
        A list of `{"text": ..., "font_size": ...}` dicts in document order.
        `text` has every run of whitespace collapsed to a single space.
        `font_size` is the point size of the paragraph's first run, or 0 when
        the paragraph has no runs or that run carries no explicit size.
    """
    document = Document(path)
    records = []

    for paragraph in document.paragraphs:
        normalized = " ".join(paragraph.text.split())
        if not normalized:
            continue

        runs = paragraph.runs
        first_size = runs[0].font.size if runs else None
        point_size = first_size.pt if first_size else 0

        records.append({"text": normalized, "font_size": point_size})

    return records

In [None]:
# FORMAT TEXT
def format_text(text, case="upper", minor_words=None):
    """Return `text` converted to the requested letter case.

    Args:
        text: The heading text to format.
        case: "upper" upper-cases the whole text. "Chapter" title-cases it:
            every word is capitalized except the minor words, which are
            lower-cased. Any other value returns `text` unchanged.
        minor_words: Lower-case words to keep in lowercase in "Chapter" mode.
            Defaults to the module-level `exceptions` set.

    Returns:
        The formatted string. In "Chapter" mode, runs of whitespace are
        collapsed to single spaces.

    The first word is always capitalized, even when it is a minor word, so a
    title such as "the scope of application" becomes "The Scope of
    Application" rather than starting with a lowercase letter.
    """
    if case == "upper":
        return text.upper()

    if case == "Chapter":
        if minor_words is None:
            minor_words = exceptions

        formatted = []
        for position, word in enumerate(text.split()):
            lowered = word.lower()
            if position > 0 and lowered in minor_words:
                formatted.append(lowered)
            else:
                formatted.append(word.capitalize())
        return " ".join(formatted)

    return text

In [None]:
# SIZE OF Chapter AND Title
def find_sizes(text_data):
    """Find the largest font size among upper-case and among other entries.

    Args:
        text_data: Iterable of dicts with "text" and "font_size" keys.

    Returns:
        `(Chapter_size, sub_size)`: the largest font size of the entries whose
        text is all upper-case, and the largest of all remaining entries.
        Each value is 0 when its group is empty.
    """
    # Both groups start from 0, the floor for the result.
    upper_sizes = [0]
    other_sizes = [0]

    for entry in text_data:
        bucket = upper_sizes if entry["text"].isupper() else other_sizes
        bucket.append(entry["font_size"])

    return max(upper_sizes), max(other_sizes)

In [None]:
# ADD CHUNKS
def add_chunk(chunks, content):
    """Flush the accumulated paragraphs of `content` into `chunks`.

    When `content["Content"]` is non-empty, a shallow copy of `content` is
    appended to `chunks` and `content["Content"]` is replaced by a new empty
    list, so the appended copy keeps the paragraphs collected so far. Nothing
    happens when there are no paragraphs.
    """
    if not content["Content"]:
        return

    chunks.append(dict(content))
    # Rebind rather than clear: the snapshot above still references the old list.
    content["Content"] = []

In [None]:
# MAIN FUNCTION
def main(text_data, Chapter_size, sub_size):
    """Group the extracted paragraphs into chunks under their headings.

    Args:
        text_data: Entries with "text" and "font_size", in document order.
        Chapter_size: Font size that marks an upper-case entry as a chapter.
        sub_size: Font size that marks a non-upper-case entry as a title.

    Returns:
        A list of dicts with the keys "Chapter", "Title", "Article" and
        "Content" (a list of paragraph strings).

    Classification of each entry:
      * upper-case at `Chapter_size` -> new chapter; title and article reset
      * upper-case at another size   -> new article
      * other text at `sub_size`     -> new title; article reset
      * anything else                -> body text
    Every heading first flushes the paragraphs collected so far. Body text is
    glued onto the previous paragraph when that paragraph does not end with
    ".", "!" or "?"; otherwise it starts a new paragraph.
    """
    chunks = []
    content = dict.fromkeys(("Chapter", "Title", "Article"))
    content["Content"] = []

    for entry in text_data:
        text = entry["text"]
        font_size = entry["font_size"]
        if not text:
            continue

        heading_is_upper = text.isupper()

        if heading_is_upper and font_size == Chapter_size:
            add_chunk(chunks, content)
            content["Chapter"] = format_text(text, "upper")
            content["Title"] = None
            content["Article"] = None
            continue

        if heading_is_upper:
            add_chunk(chunks, content)
            content["Article"] = format_text(text, "upper")
            continue

        if font_size == sub_size:
            add_chunk(chunks, content)
            content["Title"] = format_text(text, "Chapter")
            content["Article"] = None
            continue

        # Body text: continue an unfinished paragraph or start a new one.
        paragraphs = content["Content"]
        if paragraphs and not paragraphs[-1].endswith((".", "!", "?")):
            paragraphs[-1] += " " + text
        else:
            paragraphs.append(text)

    # Flush whatever is left after the last heading.
    add_chunk(chunks, content)
    return chunks

In [None]:
# CHUNKS BASE
# Run the base pipeline on the selected document:
# 1) read its non-empty paragraphs together with a font size,
# 2) derive the chapter and title font sizes from them,
# 3) group the paragraphs into chunks under their headings.
text_data = extracted(input_path)
Chapter_size, sub_size = find_sizes(text_data)
chunks = main(text_data, Chapter_size, sub_size)

In [None]:
# EXPORT BASE
# Write the base chunks as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of escaping them.
with open(chunks_base, "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=4, ensure_ascii=False)

print(f"Base data saved to {chunks_base}")

FINAL


In [None]:
# spaCy pipeline whose sentence boundaries are used to split long paragraphs.
nlp = spacy.load("en_core_web_sm")

# Word limit typed into the form; fall back to the default when the field is
# empty, not a positive whole number, or otherwise unparsable.
# - strip(): stray spaces around the number should not discard it.
# - isdecimal() rather than isdigit(): isdigit() also accepts characters such
#   as "²" that int() rejects with a ValueError.
# - a limit of 0 is meaningless for chunking, so it is treated as invalid.
DEFAULT_WORD_LIMIT = 200
_word_limit_text = word_input.value.strip()
WORD_LIMIT = (
    int(_word_limit_text)
    if _word_limit_text.isdecimal() and int(_word_limit_text) > 0
    else DEFAULT_WORD_LIMIT
)

In [None]:
# COUNT WORDS
def count_words(text):
    """Return the number of whitespace-separated words in `text`."""
    words = text.split()
    return len(words)

In [None]:
# CHUNKING IF WORD LIMIT EXCEEDED
def semantic_chunking(text, max_words=None):
    """Split `text` into chunks of whole sentences of at most `max_words` words.

    Args:
        text: The paragraph to split.
        max_words: Word budget per chunk. Defaults to the current value of the
            module-level `WORD_LIMIT`, looked up at call time so that
            re-running the cell that sets the limit takes effect without
            re-defining this function.

    Returns:
        A list of strings. Sentences (as segmented by `nlp`) are packed
        greedily in order and joined with single spaces. Sentences are never
        split, so a single sentence longer than `max_words` becomes a chunk of
        its own that exceeds the budget.
    """
    if max_words is None:
        max_words = WORD_LIMIT

    doc = nlp(text)
    chunks, current_chunk = [], []
    word_count = 0

    for sent in doc.sents:
        sentence = sent.text.strip()
        if not sentence:
            # Whitespace-only segments would only add stray spaces when joined.
            continue
        sentence_length = len(sentence.split())

        # Close the current chunk when this sentence would overflow it. The
        # `current_chunk` check avoids emitting an empty chunk when the very
        # first sentence is already over the budget.
        if word_count + sentence_length > max_words and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            word_count = 0

        current_chunk.append(sentence)
        word_count += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
# MAIN PROCESSING FUNCTION
def process_json(chunks_base, chunks_final):
    """Re-chunk over-long paragraphs of the base JSON and write the result.

    Args:
        chunks_base: Path of the JSON file to read (a list of chunk dicts).
        chunks_final: Path of the JSON file to write.

    For every chunk whose "Content" is a list, each paragraph with more than
    `WORD_LIMIT` words is replaced by the pieces returned by
    `semantic_chunking`; other paragraphs and chunks are kept as they are.
    The output file is rewritten after every chunk, so an interrupted run
    leaves the chunks processed so far on disk.
    """
    with open(chunks_base, "r", encoding="utf-8") as f:
        data = json.load(f)

    def _save(records):
        """Write `records` to `chunks_final` as indented UTF-8 JSON."""
        with open(chunks_final, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=4, ensure_ascii=False)

    processed_data = []

    # Write once up front: this replaces any stale output from an earlier run
    # and guarantees the file exists even when `data` is empty.
    _save(processed_data)

    for idx, chunk in enumerate(data):

        # Only chunks that carry a list of paragraphs are re-chunked.
        if isinstance(chunk, dict) and isinstance(chunk.get("Content"), list):
            new_content = []

            for para_idx, paragraph in enumerate(chunk["Content"]):
                word_count = count_words(paragraph)

                if word_count > WORD_LIMIT:

                    # PRINT WORDS NUMBER
                    print(f"{idx+1:04} / {len(data):04}: {para_idx+1:02}: {word_count} words.")

                    # Pass the limit explicitly so the threshold above and the
                    # chunker always use the same value.
                    chunked_paragraphs = semantic_chunking(paragraph, max_words=WORD_LIMIT)
                    new_content.extend(chunked_paragraphs)

                    # PRINT SEGMENTS NUMBER
                    print(f"{idx+1:04} / {len(data):04}: {len(chunked_paragraphs):02} segments.")

                else:
                    new_content.append(paragraph)

            chunk["Content"] = new_content

        processed_data.append(chunk)

        # SAVE PROGRESS
        _save(processed_data)

    # FINISHED
    print(f"Final data saved to {chunks_final}.")

In [None]:
# Produce the final chunk file from the base chunk file written above.
process_json(chunks_base, chunks_final)

EMBEDDING
