In [None]:
# Install the packages listed in requirements.txt (run once per environment)
%pip install -r requirements.txt

In [None]:
import re
import os
from typing import List, Dict, Optional, Tuple
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat
from azure.identity import ClientSecretCredential
import json
from dotenv import load_dotenv

In [None]:
# Pull the settings defined in the local .env file into the process environment
load_dotenv()

# Azure Document Intelligence endpoint and API key (None when the variable is unset)
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# URL of the source PDF that will be sent for layout analysis
nureg_pdf_url = "https://czvgnalcs00dsta001.blob.core.usgovcloudapi.net/non-eci/nureg/ML13032A220.pdf"

In [None]:
def analyze_layout(nureg_pdf_url):
    """Analyze a PDF with the Document Intelligence "prebuilt-layout" model.

    Args:
        nureg_pdf_url: URL of the PDF, passed to the service as ``url_source``.

    Returns:
        Whatever ``poller.result()`` yields for the analysis request; the
        request asks for Markdown as the output content format.

    The endpoint and key are read from the module-level
    ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`` / ``AZURE_DOCUMENT_INTELLIGENCE_KEY``.
    """
    # Build the key-based credential and the service client
    key_credential = AzureKeyCredential(AZURE_DOCUMENT_INTELLIGENCE_KEY)
    client = DocumentIntelligenceClient(
        endpoint=AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
        credential=key_credential,
    )

    # Submit the document by URL and hand back the poller's final result
    request = AnalyzeDocumentRequest(url_source=nureg_pdf_url)
    return client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=request,
        output_content_format=DocumentContentFormat.MARKDOWN,
    ).result()

In [None]:
def extract_page_number(section_text):
    """Return the page number on which a section starts, or None.

    The first ``page_no: N`` marker found in ``section_text`` supplies N.
    When the text also contains a ``<!-- PageNumber=... -->`` comment the
    result is N - 1; otherwise it is N. With no ``page_no`` marker at all
    the result is None.
    """
    marker = re.search(r'page_no:\s*(\d+)', section_text, re.IGNORECASE)
    if marker is None:
        return None

    page_no = int(marker.group(1))
    has_page_comment = re.search(
        r'<!--\s*PageNumber\s*=\s*"?\d+"?\s*-->', section_text, re.IGNORECASE
    ) is not None

    # A PageNumber comment shifts the reported page back by one
    return page_no - 1 if has_page_comment else page_no


# Matches "§ 50.72" / "§ 50.73" with any amount of whitespace after "§",
# followed by zero or more parenthesised qualifiers such as "(b)(2)(i)".
_CFR_CITATION_RE = re.compile(r'§\s*50\.(72|73)(?:\([^)]+\))*')


def _split_cfr_citations(text: str) -> Tuple[List[str], List[str]]:
    """Return (50.72 citations, 50.73 citations) found in ``text``, in order."""
    list_50_72: List[str] = []
    list_50_73: List[str] = []
    for citation in _CFR_CITATION_RE.finditer(text):
        # Classify on the captured section number instead of a literal
        # "§ 50.72" prefix, so "§50.72" and "§  50.72" are not dropped.
        target = list_50_72 if citation.group(1) == '72' else list_50_73
        target.append(citation.group(0))
    return list_50_72, list_50_73


def extract_sub_section(section_text: str) -> Tuple[List[str], List[str]]:
    """
    Extract the § 50.72 and § 50.73 citations of a section.

    The first ``<table>...</table>`` block is searched first. Any list that
    is still empty afterwards (including when there is no table at all) is
    filled from the text preceding the "Discussion" heading.

    Args:
        section_text: Markdown text of one section.

    Returns:
        A tuple ``(list_50_72, list_50_73)`` of citation strings exactly as
        they appear in the text; either list may be empty.
    """
    table_match = re.search(r'<table>.*?</table>', section_text, re.DOTALL)
    if table_match:
        list_50_72, list_50_73 = _split_cfr_citations(table_match.group(0))
    else:
        # No table in this section: rely entirely on the fallback below
        list_50_72, list_50_73 = [], []

    # Edge case: a citation list was not captured from the table.
    # (The previous `len(a)==0 | len(b)==0` parsed as a chained comparison
    # and only triggered when BOTH lists were empty.)
    if not list_50_72 or not list_50_73:
        fallback_50_72, fallback_50_73 = _extract_sub_section_edgecase(section_text)
        # Keep what the table provided; only fill the gaps
        list_50_72 = list_50_72 or fallback_50_72
        list_50_73 = list_50_73 or fallback_50_73
    return list_50_72, list_50_73


def _extract_sub_section_edgecase(section_text):
    """Collect § 50.72 / § 50.73 citations from the text before "Discussion".

    Lines are gathered until the first Markdown heading (any level) that
    starts with "Discussion"; if there is no such heading the whole text is
    scanned.

    Returns:
        A tuple ``(list_50_72, list_50_73)`` of citation strings.
    """
    collected = []
    for line in section_text.splitlines():
        # Stop when we hit a heading like "### Discussion" (any level)
        if re.match(r'#{1,6}\s+Discussion', line.strip()):
            break
        collected.append(line)
    return _split_cfr_citations("\n".join(collected))

    
def clean_text(raw_text):
    """Flatten a Markdown fragment into a single line of plain text.

    Removes HTML comments, stray ``PageNumber=...`` tokens, lines that
    contain only digits (orphaned page numbers) and leading Markdown heading
    markers, then collapses line breaks and runs of whitespace into single
    spaces.

    Args:
        raw_text: Markdown text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text. Leading/trailing whitespace is not stripped.
    """
    if not raw_text:
        return ''

    text_clean = re.sub(r'<!--.*?-->', '', raw_text)        # remove HTML comments
    text_clean = re.sub(r'\bPageNumber="?[\dixv]+"?', '', text_clean, flags=re.IGNORECASE)

    # The line-anchored clean-ups must run while the text still has its line
    # breaks; once newlines are collapsed, ^ and $ only match at the very
    # start/end of the text and these patterns would almost never fire.
    text_clean = re.sub(r'^[ \t]*\d+[ \t]*$', '', text_clean, flags=re.MULTILINE)  # orphaned page numbers
    text_clean = re.sub(r'^#{1,10}\s+', '', text_clean, flags=re.MULTILINE)        # heading markers

    text_clean = re.sub(r'\n+', ' ', text_clean)            # collapse line breaks
    text_clean = re.sub(r'\s{2,}', ' ', text_clean)         # collapse extra spaces

    return text_clean

def extract_description(section_text: str, table_gap: int = 70) -> Optional[str]:
    """
    Extract the description that follows the section's leading table(s).

    The first ``</table>`` marks the end of the leading table. While another
    ``<table>`` opens within ``table_gap`` characters (e.g. a table continued
    after a page break), that table is skipped as well. The description is
    the text from there up to the next heading, ``---`` rule, or a line
    starting with "Table", "Discussion" or "Examples".

    Args:
        section_text: Markdown text of one section.
        table_gap: Maximum number of characters between a ``</table>`` and
            the next ``<table>`` for the two to count as consecutive.

    Returns:
        The cleaned description, or ``None`` when the section has no table or
        the description is empty.
    """
    # 1. Locate the end of the first table
    first_close = re.search(r'</table>', section_text, re.IGNORECASE)
    if not first_close:
        return None
    pos = first_close.end()

    # 2. Skip every directly following table. Each pass consumes exactly one
    #    table and then re-checks from the new position, so a section with no
    #    further tables no longer falls through to "return None".
    while re.search(r'<table>', section_text[pos:pos + table_gap], re.IGNORECASE):
        next_close = re.search(r'</table>', section_text[pos:], re.IGNORECASE)
        if not next_close:
            break  # malformed (unclosed table), but safe exit
        pos += next_close.end()

    # 3. Extract the description from pos until the next header.
    #    The \Z alternative guarantees that this search always matches.
    desc_match = re.search(
        r"(.*?)(?=\n\s*#+|\n-{3,}|\n\s*Table|\n\s*Discussion|\n\s*Examples|\Z)",
        section_text[pos:],
        re.DOTALL | re.IGNORECASE
    )

    # 4. Cleanup
    description_clean = clean_text(desc_match.group(1)).strip()
    return description_clean or None


def extract_discussion(section_text: str) -> Optional[str]:
    """
    Extract the text under the 'Discussion' heading up to the 'Examples' heading.

    Args:
        section_text: Markdown text of one section.

    Returns:
        The cleaned discussion text, or ``None`` when there is no
        'Discussion' heading or the discussion is empty.
    """
    # Match only the heading keyword itself. The previous pattern had an
    # optional trailing group that consumed the first character of any text
    # following "Discussion" on the same line.
    start_match = re.search(r'#{1,6}\s+Discussion', section_text)
    if not start_match:
        return None

    # Collect lines until the "Examples" heading (any level) is reached
    collected = []
    for line in section_text[start_match.end():].splitlines():
        if re.match(r'#{1,6}\s+Examples?\b', line.strip()):
            break
        collected.append(line)

    discussion = "\n".join(collected).strip()
    # Strip again: removing comments can leave leading/trailing blanks
    discussion_clean = clean_text(discussion).strip()

    # Same convention as extract_description: empty text is reported as None
    return discussion_clean or None


def extract_examples(section_text: str) -> Optional[List[Dict[str, str]]]:
    """
    Collect the examples listed under the 'Examples' heading of a section.

    The text after the heading is read up to the next "3.2.x" heading (or the
    end of the text). Sub-headings of the form "(n) Title" each become one
    ``{"Title", "Description"}`` entry; when none are present, the text up to
    the first blank line is used as the title and the remainder as the
    description of a single entry.

    Returns:
        A list of dictionaries, or ``None`` when there is no 'Examples' heading.
    """
    heading = re.search(r'#{1,6}\s+Examples?', section_text)
    if heading is None:
        return None

    body_lines = section_text[heading.end():].splitlines()

    # Index of the first "3.2.x" heading line, or the end when there is none
    cutoff = next(
        (
            index
            for index, line in enumerate(body_lines)
            if re.match(r'#{1,6}\s+3\.2\.\d+', line.strip())
        ),
        len(body_lines),
    )
    examples_text = "\n".join(body_lines[:cutoff]).strip()

    # "(n) Title" headings at any level, each followed by its body text
    numbered = re.findall(
        r"#{1,6}\s+\(\d+\)\s+([^\n]+)\n(.*?)(?=(?:#{1,6}\s+\(\d+\)|\Z))",
        examples_text,
        re.DOTALL,
    )
    if numbered:
        return [
            {"Title": heading_text.strip(), "Description": body.strip()}
            for heading_text, body in numbered
        ]

    # No numbered headings: first paragraph is the title, the rest the body
    title, _, description = examples_text.partition("\n\n")
    return [{"Title": title.strip(), "Description": description.strip()}]

In [None]:
# Run layout analysis and take the Markdown rendering of the document
markdown_result = analyze_layout(nureg_pdf_url)
markdown_text = markdown_result.content

# Replace every PageBreak marker with an explicit, 1-based page banner
pages = markdown_text.split("<!-- PageBreak -->")
numbered_markdown = "\n\n".join(
    [
        f" ----- page_no: {page_no}\n\n{page.strip()}"
        for page_no, page in enumerate(pages, start=1)
    ]
)

# To inspect the intermediate text, write `numbered_markdown` to a
# UTF-8 text file (e.g. 'markdown_with_page_no.txt') at this point.

# Split on subsection headings such as "## 3.2.1 Some Title". With two capture
# groups the result is [preamble, number, title, body, number, title, body, ...]
subsections = re.split(r'(?:^|\n)###*\s*(3\.2\.\d+)\s+(.+)', numbered_markdown)

# One record per subsection
structured_data = []
section_triples = zip(subsections[1::3], subsections[2::3], subsections[3::3])
for number, title, section_text in section_triples:
    section_page_no = extract_page_number(section_text)

    # § 50.72 / § 50.73 citations, taken from the section's table
    subsection_5072, subsection_5073 = extract_sub_section(section_text)

    structured_data.append({
        'Section Number': number.strip(),
        'Section Title': title.strip(),
        'Section Page Number': section_page_no,
        'sub_section_5072': subsection_5072,
        'sub_section_5073': subsection_5073,
        # First paragraph after the table that is not a header
        'Description': extract_description(section_text),
        'Discussion': extract_discussion(section_text),
        'Examples': extract_examples(section_text),
    })

In [None]:
# Write the structured sections to disk as indented, non-ASCII-preserving JSON
with open('structure_output.json', 'w', encoding='utf-8') as output_file:
    json.dump(structured_data, output_file, indent=4, ensure_ascii=False)