In [6]:
import json
import re
from itertools import groupby

data_foler = "HNMU_Regulations"

merged_path = data_foler + "_dataMerged.json"
marklist_path = data_foler + "_marklist.jsonl"

# Matches a string that consists only of bullet/dash glyphs, optionally
# surrounded by whitespace.
bullet_pattern = re.compile(r"^\s*[-•●♦▪‣–—]+\s*$")

# ===== 1. Read the source data =====
with open(merged_path, "r", encoding="utf-8") as f:
    data = json.load(f)

paragraphs = data.get("paragraphs", [])
common_markers = set(data.get("general", {}).get("commonMarkers", []))

# ===== 2. Drop bullets + keep only MarkerTypes listed in commonMarkers =====
raw_markers = []
for p in paragraphs:
    marker_text = p.get("MarkerText")
    marker_type = p.get("MarkerType")

    # Skip bullets: either field being a pure bullet glyph disqualifies the paragraph.
    if bullet_pattern.match(marker_text or "") or bullet_pattern.match(marker_type or ""):
        continue

    # Keep only marker types present in commonMarkers. The comparison is exact
    # (whitespace is preserved, nothing is stripped). This membership test also
    # covers a missing/None marker type when None is itself listed in
    # commonMarkers, so no separate None branch is needed.
    if marker_type in common_markers:
        raw_markers.append(marker_type)

# ===== 3. Collapse runs of adjacent identical MarkerTypes =====
# groupby() groups consecutive equal items, so taking each group's key keeps
# exactly one entry per run while preserving order.
cleaned = [marker_type for marker_type, _ in groupby(raw_markers)]

# ===== 4. Write straight to JSONL (one {"MarkerType": ...} object per line) =====
with open(marklist_path, "w", encoding="utf-8") as f:
    for marker_type in cleaned:
        record = {"MarkerType": marker_type}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("✅")


✅


In [7]:
import json
from collections import Counter

try:
    data_foler
except NameError:
    # Fallback so this cell can run even if the cell defining data_foler was not executed.
    data_foler = "HNMU_Regulations"

marklist_path = data_foler + "_marklist.jsonl"
struct_path = data_foler + "_structures.jsonl"

# Load the marker sequence: one JSON object per line. Blank lines are skipped
# so a stray empty line does not make json.loads fail.
with open(marklist_path, "r", encoding="utf-8") as f:
    cleaned = [json.loads(line) for line in f if line.strip()]

marker_types = [item.get("MarkerType") for item in cleaned]

# Maps depth -> list of {"Structure": [...], "Count": n}.
results = {}

# --- Depth 1: every distinct marker type, in first-seen order ---
unique_markers = list(dict.fromkeys(marker_types))
counter1 = Counter(marker_types)
results[1] = [{"Structure": [str(m)], "Count": counter1[m]} for m in unique_markers]


# --- Depth >= 2: sliding windows that extend a structure kept at the previous depth ---
max_depth = len(unique_markers)
prev_structures = {(m,) for m in unique_markers}

for i in range(2, max_depth + 1):
    counter = Counter()
    for j in range(len(marker_types) - i + 1):
        seq = tuple(marker_types[j:j + i])
        prefix = seq[:-1]

        # Only extend structures that survived the previous depth.
        if prefix not in prev_structures:
            continue

        # A structure must not contain the same marker type twice.
        if len(seq) != len(set(seq)):
            continue

        counter[seq] += 1

    if not counter:
        break

    min_count = min(counter.values())
    max_count = max(counter.values())

    # Keep everything except the least-frequent structures, unless all
    # structures share the same count (then keep them all).
    filtered = {s: c for s, c in counter.items() if not (c == min_count and c != max_count)}

    # Sort by Count, descending (stable, so ties keep first-seen order).
    sorted_structs = sorted(filtered.items(), key=lambda x: x[1], reverse=True)

    # Marker values are stringified only for the output records.
    results[i] = [{"Structure": [str(x) for x in s], "Count": c} for s, c in sorted_structs]

    # Carry forward the RAW tuples, not the stringified output: prefixes above
    # are built from raw marker values, so a stringified set would never match
    # sequences containing non-string markers such as None.
    prev_structures = set(filtered)


with open(struct_path, "w", encoding="utf-8") as f:
    for depth, structs in results.items():
        for s in structs:
            line = {
                "Depth": depth,
                "Structure": s["Structure"],
                "Count": s["Count"]
            }
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

print("✅")

✅


In [5]:
import json

try:
    data_foler
except NameError:
    # Fallback so this cell can run even if the cell defining data_foler was not executed.
    data_foler = "HNMU_Regulations"

struct_path = data_foler + "_structures.jsonl"

# Re-read the JSONL file of structures (one JSON object per line); blank lines are skipped.
with open(struct_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Deepest depth present. default=None keeps an empty file from raising
# ValueError; every later step then yields an empty result.
max_depth = max((item["Depth"] for item in data), default=None)

# Structures recorded at that deepest depth.
at_max_depth = [item for item in data if item["Depth"] == max_depth]

# Highest count among the structures at that depth.
max_count = max((item["Count"] for item in at_max_depth), default=None)

# All structures at the deepest depth that share the highest count.
top_structures = [item for item in at_max_depth if item["Count"] == max_count]

print(top_structures)

[{'Depth': 4, 'Structure': ['Chương XVI', 'Điều 123. ', '123. ', 'abc) '], 'Count': 3}]
