In [17]:
import io
import requests
import docx
import json
import re


In [34]:
import json
import os
import re

def clean_line(line):
    """Removes unnecessary whitespace and BOM characters from a line."""
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_law_text(file_path):
    """Reads and parses a Vietnamese law document, structuring it into JSON format."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    law_data = {
        "chapters": []
    }

    current_chapter = None
    current_article = None

    for line in lines:
        line = clean_line(line)

        if not line:
            continue

        if line.startswith("Chương"):
            if current_chapter:
                law_data["chapters"].append(current_chapter)
            current_chapter = {
                "chapter_title": line,
                "articles": []
            }
            # Reset current_article when a new chapter starts
            current_article = None 
        elif line.startswith("Điều"):
            # Use regex to match "Điều" followed by a number
            match = re.match(r"(Điều \d+)\.", line)  # Keep "Điều" in the captured group
            if match:
                article_number = match.group(1)  # article_number now includes "Điều"

                if current_article:
                    if current_chapter:
                        current_chapter["articles"].append(current_article)
                    else:
                        # Handle articles before chapters (create a default chapter)
                        print(f"Info: Article '{line}' found before any chapter in {file_path}. Creating a default chapter.")
                        current_chapter = {
                            "chapter_title": "Chương Không Xác Định",  # Default chapter name
                            "articles": []
                        }
                        current_chapter["articles"].append(current_article)
                        law_data["chapters"].append(current_chapter)

                current_article = {
                    "article_number": article_number,  # Now includes "Điều"
                    "title": line.split(".", 1)[1].strip() if "." in line else "",
                    "content": ""
                }
            else:
                print(f"Warning: Line starts with 'Điều' but does not match expected format: {line} in {file_path}")
        else:
            if current_article:
                current_article["content"] += line + " "

    # Append the last article and chapter
    if current_article:
        if current_chapter:
            current_chapter["articles"].append(current_article)
        else:
            # If the file ends with an article but no chapter, create a default chapter
            print(f"Info: File {file_path} ended with an article but no chapter. Creating a default chapter.")
            current_chapter = {
                "chapter_title": "Chương Không Xác Định",
                "articles": [current_article]
            }
            
    if current_chapter:
        law_data["chapters"].append(current_chapter)

    return law_data

def process_law_files(directory):
    """Processes specific law files in a directory and returns a combined JSON structure."""
    law_files = {
        "luat-giao-duc.txt": {
            "title": "Luật Giáo Dục",
            "number": "Luật số: 43/2019/QH14"  # Now with "Luật số:"
        },
        "luat-hon-nhan-va-gia-dinh.txt": {
            "title": "Luật Hôn Nhân Và Gia Đình",
            "number": "Số: 52/2014/QH13"  # Now with "Số:"
        },
        "luat-giao-thong-duong-bo.txt": {
            "title": "Luật Giao Thông Đường Bộ",
            "number": "Luật số: 23/2008/QH12"  # Now with "Luật số:"
        }
    }

    all_laws_data = {}

    for filename, law_info in law_files.items():
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            law_data = read_law_text(file_path)
            # Use the updated law_info["number"] (the one with "Luật số:" or "Số:")
            all_laws_data[law_info["title"]] = {
                "law_title": law_info["title"],
                "law_number": law_info["number"],  # This line is changed
                "chapters": law_data["chapters"]
            }
        else:
            print(f"Error: File not found: {file_path}")

    return all_laws_data

# Directory containing the law text files
directory = "./"  # Replace with your directory if needed

# Process the files and get the combined JSON data
all_laws_json = process_law_files(directory)

# Save the output as a JSON file
with open("law-corpus.json", "w", encoding="utf-8") as f:
    json.dump(all_laws_json, f, ensure_ascii=False, indent=4)

print("Conversion to JSON complete. Output saved as 'law-corpus.json'.")

Info: Article 'Điều 2. Đối tượng áp dụng' found before any chapter in ./luat-giao-thong-duong-bo.txt. Creating a default chapter.
Conversion to JSON complete. Output saved as 'law-corpus.json'.


In [38]:
import json
import os
import re

def clean_line(line):
    """Removes unnecessary whitespace and BOM characters from a line."""
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_law_text(file_path):
    """Reads and parses a Vietnamese law document, structuring it into JSON format."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    law_data = {
        "chapters": []
    }

    current_chapter = None
    current_article = None

    for line in lines:
        line = clean_line(line)

        if not line:
            continue

        # Use regex to match "Chương" (uppercase or lowercase)
        chapter_match = re.match(r"(Chương|CHƯƠNG)\s+\w+", line)  # Match "Chương" or "CHƯƠNG" followed by whitespace and a word character
        if chapter_match:
            if current_chapter:
                law_data["chapters"].append(current_chapter)
            current_chapter = {
                "chapter_title": line,
                "articles": []
            }
            current_article = None  # Reset current_article for a new chapter
        elif line.startswith("Điều"):
            match = re.match(r"(Điều \d+)\.", line)
            if match:
                article_number = match.group(1)

                if current_article:
                    if current_chapter:
                        current_chapter["articles"].append(current_article)
                    else:
                        # Handle articles before chapters
                        print(f"Info: Article '{line}' found before any chapter in {file_path}. Creating a default chapter.")
                        current_chapter = {
                            "chapter_title": "Chương Không Xác Định",
                            "articles": []
                        }
                        current_chapter["articles"].append(current_article)
                        law_data["chapters"].append(current_chapter)

                current_article = {
                    "article_number": article_number,
                    "title": line.split(".", 1)[1].strip() if "." in line else "",
                    "content": ""
                }
            else:
                print(f"Warning: Line starts with 'Điều' but does not match expected format: {line} in {file_path}")
        else:
            if current_article:
                current_article["content"] += line + " "

    # Append the last article and chapter
    if current_article:
        if current_chapter:
            current_chapter["articles"].append(current_article)
        else:
            # Handle file ending with an article but no chapter
            print(f"Info: File {file_path} ended with an article but no chapter. Creating a default chapter.")
            current_chapter = {
                "chapter_title": "Chương Không Xác Định",
                "articles": [current_article]
            }

    if current_chapter:
        law_data["chapters"].append(current_chapter)

    return law_data

def process_law_files(directory):
    """Processes specific law files in a directory and returns a combined JSON structure."""
    law_files = {
        "luat-giao-duc.txt": {
            "title": "Luật Giáo Dục",
            "number": "Luật số: 43/2019/QH14"
        },
        "luat-hon-nhan-va-gia-dinh.txt": {
            "title": "Luật Hôn Nhân Và Gia Đình",
            "number": "Số: 52/2014/QH13"
        },
        "luat-giao-thong-duong-bo.txt": {
            "title": "Luật Giao Thông Đường Bộ",
            "number": "Luật số: 23/2008/QH12"
        }
    }

    all_laws_data = {}

    for filename, law_info in law_files.items():
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            law_data = read_law_text(file_path)
            all_laws_data[law_info["title"]] = {
                "law_title": law_info["title"],
                "law_number": law_info["number"],
                "chapters": law_data["chapters"]
            }
        else:
            print(f"Error: File not found: {file_path}")

    return all_laws_data

# Directory containing the law text files
directory = "./"  # Replace with your directory if needed

# Process the files and get the combined JSON data
all_laws_json = process_law_files(directory)

# Save the output as a JSON file
with open("law-corpus.json", "w", encoding="utf-8") as f:
    json.dump(all_laws_json, f, ensure_ascii=False, indent=4)

print("Conversion to JSON complete. Output saved as 'law-corpus.json'.")

Conversion to JSON complete. Output saved as 'law-corpus.json'.
