In [8]:
import json
import os
import re
from datetime import datetime
import unicodedata
from typing import List, Dict, Any

In [9]:
# ===============================
# 1Ô∏è‚É£ L√†m s·∫°ch vƒÉn b·∫£n
# ===============================
def clean_formatting(text: str) -> str:
    if not text:
        return ""
    text = text.replace('\r', '\n')
    text = re.sub(r"(?m)^[\*\=\-‚Äì_\.]{3,}\s*$", "", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"([^\n])\n([^\n])", r"\1 \2", text)
    text = re.sub(r"KH√îNGS·ªê", "KH√îNG S·ªê", text, flags=re.IGNORECASE)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

# ===============================
# 2Ô∏è‚É£ Suy lu·∫≠n lƒ©nh v·ª±c (d·ª± ph√≤ng)
# ===============================
def infer_field_category(title: str | None, agency: str | None, content: str | None) -> str | None:
    combined = " ".join(filter(None, [title, agency, content])).lower()
    domain_keywords = {
        "gi√°o d·ª•c": ["gi√°o d·ª•c", "tr∆∞·ªùng h·ªçc", "h·ªçc sinh", "ƒë·∫°i h·ªçc", "b·ªô gi√°o d·ª•c"],
        "y t·∫ø": ["y t·∫ø", "b·ªánh vi·ªán", "s·ª©c kh·ªèe", "b·ªô y t·∫ø"],
        "t√†i ch√≠nh": ["t√†i ch√≠nh", "ng√¢n s√°ch", "thu·∫ø", "kho b·∫°c", "b·ªô t√†i ch√≠nh"],
        "ƒë·∫•t ƒëai": ["ƒë·∫•t ƒëai", "b·∫•t ƒë·ªông s·∫£n", "x√¢y d·ª±ng", "b·ªô x√¢y d·ª±ng"],
        "lao ƒë·ªông": ["lao ƒë·ªông", "vi·ªác l√†m", "ti·ªÅn l∆∞∆°ng", "b·ªô lao ƒë·ªông"],
        "n√¥ng nghi·ªáp": ["n√¥ng nghi·ªáp", "th·ªßy s·∫£n", "chƒÉn nu√¥i", "b·ªô n√¥ng nghi·ªáp"],
        "giao th√¥ng": ["giao th√¥ng", "v·∫≠n t·∫£i", "ƒë∆∞·ªùng b·ªô", "b·ªô giao th√¥ng"],
        "c√¥ng th∆∞∆°ng": ["c√¥ng th∆∞∆°ng", "th∆∞∆°ng m·∫°i", "xu·∫•t nh·∫≠p kh·∫©u", "b·ªô c√¥ng th∆∞∆°ng"],
        "qu·ªëc ph√≤ng": ["qu·ªëc ph√≤ng", "qu√¢n ƒë·ªôi", "b·ªô qu·ªëc ph√≤ng"],
        "c√¥ng an": ["c√¥ng an", "an ninh", "b·ªô c√¥ng an"],
        "vƒÉn h√≥a": ["vƒÉn h√≥a", "th·ªÉ thao", "du l·ªãch", "b·ªô vƒÉn h√≥a"],
        "m√¥i tr∆∞·ªùng": ["m√¥i tr∆∞·ªùng", "t√†i nguy√™n", "b·ªô t√†i nguy√™n"],
    }
    for field, keywords in domain_keywords.items():
        if any(kw in combined for kw in keywords):
            return field.capitalize()
    return "Kh√°c"

In [10]:
# ===============================
# 4Ô∏è‚É£ Chia vƒÉn b·∫£n th√†nh c√°c chunk
# ===============================
def chunk_document(content: str, max_chunk_size: int = 1000, overlap_words: int = 50) -> List[str]:
    """Chia vƒÉn b·∫£n th√†nh c√°c ƒëo·∫°n nh·ªè d·ª±a tr√™n c√¢u, ƒëi·ªÅu lu·∫≠t, ch∆∞∆°ng,..."""
    content = clean_formatting(content)

    # --- T√°ch theo c√°c ch·ªâ m·ª•c ph√°p l√Ω (ƒêi·ªÅu, Kho·∫£n, Ch∆∞∆°ng, M·ª•c)
    parts = re.split(r'(?=(ƒêi·ªÅu\s+th·ª©|KHO·∫¢N\s+TH·ª®|CH∆Ø∆†NG\s+|M·ª§C\s+))', content, flags=re.IGNORECASE)
    if len(parts) == 1:
        # N·∫øu kh√¥ng c√≥ 'ƒêi·ªÅu...', chia theo c√¢u
        sentences = re.split(r'(?<=[.!?])\s+', content)
        parts = []
        current = ""
        for s in sentences:
            if len(current) + len(s) < max_chunk_size:
                current += " " + s
            else:
                parts.append(current.strip())
                current = s
        if current:
            parts.append(current.strip())
    else:
        merged = []
        for i in range(0, len(parts), 2):
            section = "".join(parts[i:i+2]).strip()
            if section:
                merged.append(section)
        parts = merged

    # --- G·ªôp nh·ªè n·∫øu ƒëo·∫°n qu√° ng·∫Øn
    chunks = []
    buffer = ""
    for p in parts:
        if len(buffer) + len(p) < max_chunk_size:
            buffer += " " + p
        else:
            chunks.append(buffer.strip())
            buffer = p
    if buffer:
        chunks.append(buffer.strip())

    # --- Overlap gi·ªØa c√°c chunk
    if overlap_words > 0 and len(chunks) > 1:
        result = [chunks[0]]
        for i in range(1, len(chunks)):
            prev = chunks[i-1].split()
            overlap = " ".join(prev[-overlap_words:]) if len(prev) > overlap_words else chunks[i-1]
            result.append((overlap + " " + chunks[i]).strip())
        chunks = result

    return chunks

# ===============================
# 5Ô∏è‚É£ Chu·∫©n h√≥a t·ª´ng record
# ===============================
def normalize_record(doc: Dict[str, Any]) -> Dict[str, Any]:
    content = doc.get("mt6_text") or doc.get("content") or ""
    content = clean_formatting(content)
    law_id = str(doc.get("law_id") or doc.get("lawid") or "unknown")
    return {
        "law_id": law_id,
        "raw_title": doc.get("title"),
        "source_url": doc.get("href"),
        "category": doc.get("category"),
        "content": content,
        "provided_issue_date": doc.get("date")
    }

In [11]:
# ===============================
# 3Ô∏è‚É£ Tr√≠ch xu·∫•t metadata
# ===============================
def extract_metadata(content: str, law_id: str, title: str | None = None) -> Dict[str, Any]:
    metadata = {
        "law_id": law_id,
        "document_type": None,
        "issuing_agency": None,
        "issue_date": None,
        "title": None,
    }

    DOC_TYPES = [
        "C√îNG ƒêI·ªÜN", "NGH·ªä ƒê·ªäNH", "TH√îNG T∆Ø", "QUY·∫æT ƒê·ªäNH", "LU·∫¨T",
        "NGH·ªä QUY·∫æT", "CH·ªà TH·ªä", "TH√îNG B√ÅO", "K·∫æ HO·∫†CH", "B√ÅO C√ÅO",
        "VƒÇN B·∫¢N", "C√îNG VƒÇN", "S·∫ÆC L·ªÜNH"
    ]
    AGENCIES = [
        "CH·ª¶ T·ªäCH", "CH√çNH PH·ª¶", "B·ªò", "QU·ªêC H·ªòI", "TH·ª¶ T∆Ø·ªöNG",
        "T√íA √ÅN", "VI·ªÜN KI·ªÇM S√ÅT", "KI·ªÇM TO√ÅN", "UBND", "HƒêND"
    ]

    def _norm(s: str) -> str:
        s = unicodedata.normalize("NFC", s or "").lower()
        return re.sub(r"\s+", " ", s).strip()

    # 1Ô∏è‚É£ Lo·∫°i vƒÉn b·∫£n
    if title:
        tnorm = _norm(title)
        for dt in DOC_TYPES:
            if dt.lower() in tnorm:
                metadata["document_type"] = dt
                break
    if not metadata["document_type"]:
        head = "\n".join(content.splitlines()[:20]).upper()
        for dt in DOC_TYPES:
            if re.search(rf"\b{dt}\b", head, re.IGNORECASE):
                metadata["document_type"] = dt
                break

    # 2Ô∏è‚É£ C∆° quan ban h√†nh
    if title:
        tnorm = _norm(title)
        if "ch·ªß t·ªãch" in tnorm:
            metadata["issuing_agency"] = "CH·ª¶ T·ªäCH N∆Ø·ªöC"
        elif "b·ªô" in tnorm:
            m = re.search(r"b·ªô\s+([^\d,.;]+)", tnorm)
            if m:
                metadata["issuing_agency"] = "B·ªò " + m.group(1).upper()
        elif "ch√≠nh ph·ªß" in tnorm:
            metadata["issuing_agency"] = "CH√çNH PH·ª¶"
    if not metadata["issuing_agency"]:
        scan = content[:800].upper()
        for ag in AGENCIES:
            if ag in scan:
                metadata["issuing_agency"] = ag
                break

    # 3Ô∏è‚É£ Ng√†y ban h√†nh
    date_patterns = [
        r'ng√†y\s+(\d{1,2})\s+th√°ng\s+(\d{1,2})\s+nƒÉm\s+(\d{4})',
        r'(\d{1,2})[./-](\d{1,2})[./-](\d{4})'
    ]
    for pat in date_patterns:
        m = re.search(pat, content, re.IGNORECASE)
        if m:
            try:
                d, mth, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
                metadata["issue_date"] = datetime(y, mth, d).strftime("%Y-%m-%d")
                break
            except ValueError:
                continue

    # 4Ô∏è‚É£ Ti√™u ƒë·ªÅ
    metadata["title"] = title.strip() if title else None
    return metadata


In [12]:
# ===============================
# 4Ô∏è‚É£ Chunking
# ===============================
def chunk_document(content: str, max_chunk_size: int = 1000, overlap_words: int = 50) -> List[str]:
    content = clean_formatting(content)
    parts = re.split(r'(?=(ƒêi·ªÅu\s+th·ª©|KHO·∫¢N\s+TH·ª®|CH∆Ø∆†NG\s+|M·ª§C\s+))', content, flags=re.IGNORECASE)

    if len(parts) == 1:
        sentences = re.split(r'(?<=[.!?])\s+', content)
        parts = []
        current = ""
        for s in sentences:
            if len(current) + len(s) < max_chunk_size:
                current += " " + s
            else:
                parts.append(current.strip())
                current = s
        if current:
            parts.append(current.strip())
    else:
        merged = []
        for i in range(0, len(parts), 2):
            section = "".join(parts[i:i+2]).strip()
            if section:
                merged.append(section)
        parts = merged

    chunks = []
    buffer = ""
    for p in parts:
        if len(buffer) + len(p) < max_chunk_size:
            buffer += " " + p
        else:
            chunks.append(buffer.strip())
            buffer = p
    if buffer:
        chunks.append(buffer.strip())

    if overlap_words > 0 and len(chunks) > 1:
        result = [chunks[0]]
        for i in range(1, len(chunks)):
            prev = chunks[i-1].split()
            overlap = " ".join(prev[-overlap_words:]) if len(prev) > overlap_words else chunks[i-1]
            result.append((overlap + " " + chunks[i]).strip())
        chunks = result

    return chunks

# ===============================
# 5Ô∏è‚É£ Chu·∫©n h√≥a b·∫£n ghi
# ===============================
def normalize_record(doc: Dict[str, Any]) -> Dict[str, Any]:
    """D·ªØ li·ªáu g·ªëc: content = category, mt6_text = full text"""
    category = doc.get("content") or ""
    content = doc.get("mt6_text") or ""
    content = clean_formatting(content)
    law_id = str(doc.get("law_id") or doc.get("lawid") or "unknown")
    return {
        "law_id": law_id,
        "raw_title": doc.get("title"),
        "source_url": doc.get("href"),
        "category": category.strip(),
        "content": content,
        "provided_issue_date": doc.get("date")
    }



In [13]:
# ===============================
# 6Ô∏è‚É£ Pipeline ch√≠nh
# ===============================
def process_law_data(input_path: str, max_chunk_size: int = 400, overlap_words: int = 30) -> str:
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Kh√¥ng t√¨m th·∫•y file: {input_path}")

    base = os.path.splitext(os.path.basename(input_path))[0]
    output_dir = os.path.dirname(input_path) or "."
    processed_path = os.path.join(output_dir, f"{base}_processedv6.json")

    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    if isinstance(raw_data, dict):
        raw_data = [raw_data]

    processed: List[Dict[str, Any]] = []

    for doc in raw_data:
        norm = normalize_record(doc)
        metadata = extract_metadata(norm["content"], norm["law_id"], norm.get("raw_title"))
        chunks = chunk_document(norm["content"], max_chunk_size=max_chunk_size, overlap_words=overlap_words)

        # ‚úÖ ∆Øu ti√™n category t·ª´ d·ªØ li·ªáu g·ªëc
        if norm["category"]:
            final_category = norm["category"]
        else:
            final_category = infer_field_category(norm["raw_title"], metadata["issuing_agency"], norm["content"])

        for chunk_num, chunk_text in enumerate(chunks, start=1):
            record = {
                "law_id": norm["law_id"],
                "chunk_num": chunk_num,
                "document_type": metadata["document_type"],
                "issuing_agency": metadata["issuing_agency"],
                "issue_date": metadata["issue_date"] or norm.get("provided_issue_date"),
                "title": metadata["title"],
                "source_url": norm["source_url"],
                "raw_title": norm["raw_title"],
                "category": final_category,
                "chunk": chunk_text
            }
            record = {k: v for k, v in record.items() if v not in [None, ""]}
            processed.append(record)


    with open(processed_path, "w", encoding="utf-8") as f:
        json.dump(processed, f, ensure_ascii=False, indent=2)

    return processed_path


In [14]:
# ===============================
# 7Ô∏è‚É£ G·ªçi th·ª≠ pipeline
# ===============================
input_path = r"C:\Code\Ky5\SEG\Crawl\Dataprocessing\Process\tvpl_congvan3days.json"
output_path = process_law_data(input_path)

print("‚úÖ ƒê√£ x·ª≠ l√Ω xong!")
print(f"üìÑ File ƒë·∫ßu ra: {output_path}")

with open(output_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f"üì¶ T·ªïng s·ªë record (chunk): {len(data)}")
print("\n--- V√≠ d·ª• record ƒë·∫ßu ti√™n ---")
print(json.dumps(data[0], ensure_ascii=False, indent=2))

‚úÖ ƒê√£ x·ª≠ l√Ω xong!
üìÑ File ƒë·∫ßu ra: C:\Code\Ky5\SEG\Crawl\Dataprocessing\Process\tvpl_congvan3days_processedv6.json
üì¶ T·ªïng s·ªë record (chunk): 374251

--- V√≠ d·ª• record ƒë·∫ßu ti√™n ---
{
  "law_id": "22714",
  "chunk_num": 1,
  "document_type": "NGH·ªä ƒê·ªäNH",
  "issuing_agency": "B·ªò TR∆Ø·ªûNG B·ªò QU·ªêC GIA GI√ÅO D·ª§C BAN H√ÄNH",
  "issue_date": "1945-10-15",
  "title": "Ngh·ªã ƒë·ªãnh nƒÉm 1945 v·ªÅ H·ªôi ƒë·ªìng c·ªë v·∫•n h·ªçc ch√≠nh do B·ªô tr∆∞·ªüng B·ªô Qu·ªëc Gia Gi√°o D·ª•c ban h√†nh.",
  "source_url": "https://thuvienphapluat.vn/van-ban/Giao-duc/Nghi-dinh-Hoi-dong-co-van-hoc-chinh-22714.aspx",
  "raw_title": "Ngh·ªã ƒë·ªãnh nƒÉm 1945 v·ªÅ H·ªôi ƒë·ªìng c·ªë v·∫•n h·ªçc ch√≠nh do B·ªô tr∆∞·ªüng B·ªô Qu·ªëc Gia Gi√°o D·ª•c ban h√†nh.",
  "category": "Gi√°o d·ª•c",
  "chunk": "B·ªò QU·ªêC GIA GI√ÅO D·ª§C VI·ªÜT NAM D√ÇN CH·ª¶ C·ªòNG H√íA ƒê·ªôc l·∫≠p - T·ª± do - H·∫°nh ph√∫c S·ªë: KH√îNG S·ªê1 H√† N·ªôi, ng√†y 15 th√°ng 10 nƒÉm 1945 B·ªò TR∆Ø·ªûNG B·ªò Q