In [None]:
%pip install azure-ai-documentintelligence==1.0.0b4
%pip install azure-ai-formrecognizer

In [None]:
# Data ratios: civil-law ratio, criminal-law ratio — keep them unbiased and flat (evenly balanced).

"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
"""
import json
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest


"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
# Connection settings, intentionally left blank here; fill them in before running.
# NOTE(review): presumably the Azure resource endpoint URL and its API key — confirm.
endpoint = ""
key = ""

# NOTE(review): presumably the URL of the document to analyze — confirm.
formUrl = ""
# Client for the analysis service, authenticated with the key above.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)
    
# Submit the document at formUrl to the "prebuilt-layout" model and fetch the
# analysis result from the returned poller.
poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
result = poller.result()


# Keep the result's `content` for the later cells, which treat it as a string
# (it is displayed, then passed through re.sub and the parser).
response_content=result.content



In [None]:
# Show the extracted text so it can be checked by eye before it is parsed.
response_content

In [None]:
import re
import json


def parse_immigration_law(text):
    """Split Korean statute text into one record per article (제N조).

    Three kinds of markers are recognized, each only at the start of a line:

    * chapter headings ``제N장 <title>``
    * section headings ``제N절 <title>``
    * article headers ``제N조(<title>)`` or ``제N조의M(<title>)``

    Args:
        text (str): Full text of the statute.

    Returns:
        list[dict]: One dict per article header, in document order, with keys

        * ``chapter_id`` -- ``"<chapter>-<article>"`` or
          ``"<chapter>-<article>-<sub>"``. The chapter part is left out for
          articles that appear before any chapter heading.
        * ``chapter_title`` -- the current chapter title, or
          ``"<chapter title> - 제N절 <section title>"`` when the article is
          inside a section.
        * ``title`` -- the text inside the article header's parentheses.
        * ``content`` -- the text between the end of the article header and
          the next article header (or the end of ``text``), with chapter and
          section heading lines removed and surrounding whitespace stripped.

        An empty list is returned when ``text`` has no article header.
    """
    # Regex patterns for the structural markers.
    chapter_pattern = re.compile(r'^제(\d+)장\s+(.+)', re.MULTILINE)
    section_pattern = re.compile(r'^제(\d+)절\s+(.+)', re.MULTILINE)
    article_pattern = re.compile(r'^제(\d+)조(?:의(\d+))?\s*\(([^)]+)\)', re.MULTILINE)

    articles = list(article_pattern.finditer(text))

    # Chapters and sections are merged into ONE list in document order.
    # Handling them in two independent passes would let a section heading
    # that precedes a chapter heading be applied after the chapter's reset,
    # leaking the old section into the new chapter.
    headings = sorted(
        [("chapter", m) for m in chapter_pattern.finditer(text)]
        + [("section", m) for m in section_pattern.finditer(text)],
        key=lambda item: item[1].start(),
    )

    result = []
    heading_idx = 0

    current_chapter_num = None
    current_chapter_title = ""
    current_section_title = ""

    for i, article in enumerate(articles):
        article_start = article.start()
        article_end = article.end()
        if i + 1 < len(articles):
            next_article_start = articles[i + 1].start()
        else:
            next_article_start = len(text)

        # Apply every heading located before this article, in document order.
        while heading_idx < len(headings) and headings[heading_idx][1].start() < article_start:
            kind, heading = headings[heading_idx]
            if kind == "chapter":
                current_chapter_num = heading.group(1)
                # Keep only the chapter name (the "제N장" prefix is dropped).
                current_chapter_title = heading.group(2).strip()
                # A new chapter ends whatever section was open.
                current_section_title = ""
            else:
                # Section titles keep their "제N절" prefix.
                current_section_title = f"제{heading.group(1)}절 {heading.group(2).strip()}"
            heading_idx += 1

        # Collect the article body. Headings that sit between this article and
        # the next one belong to the following articles, so their text is cut
        # out instead of being left at the tail of this article's content.
        pieces = []
        cursor = article_end
        lookahead = heading_idx
        while lookahead < len(headings) and headings[lookahead][1].start() < next_article_start:
            heading = headings[lookahead][1]
            # Skip headings that overlap text already consumed (for example a
            # match that begins inside the article header itself).
            if heading.start() >= cursor:
                pieces.append(text[cursor:heading.start()])
                cursor = heading.end()
            lookahead += 1
        pieces.append(text[cursor:next_article_start])
        content = "\n".join(piece.strip() for piece in pieces if piece.strip())

        # Build chapter_id from the parts that exist. Skipping missing parts
        # avoids ids such as "None-1" for articles that precede every chapter.
        id_parts = [current_chapter_num, article.group(1), article.group(2)]
        chapter_id = "-".join(part for part in id_parts if part)

        # With a section open the title is "<chapter title> - 제N절 <section title>".
        if current_section_title:
            chapter_title = f"{current_chapter_title} - {current_section_title}"
        else:
            chapter_title = current_chapter_title

        result.append({
            "chapter_id": chapter_id,
            "chapter_title": chapter_title,
            "title": article.group(3).strip(),
            "content": content,
        })

    return result

# Run the script
# Pre-process: when a chapter heading (제N장 ...) is followed, after whitespace, by a
# section marker (제N절), replace that whitespace with '\n' so the section heading
# begins its own line; parse_immigration_law only matches headings at a line start.
response_content = re.sub(r'(제\d+장[^\n]+?)\s+(제\d+절)', r'\1\n\2', response_content) # chapter and section that were simply run together: put '\n' between them 

parsed_law = parse_immigration_law(response_content)


# Save as JSON
output_file = r"immigration.json"

with open(output_file, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes the Korean text as-is instead of \uXXXX escapes.
    # Note: json.dump writes to f and returns None, so json_output is always None.
    json_output = json.dump(parsed_law, f, ensure_ascii=False, indent=2) # convert to JSON and write it out

print(f"Parsed data saved to {output_file}")
