# Step-by-Step: Extracting Regulatory Policy Rules as Structured JSON

This notebook will:

1. Parse the USDA NEPA XML (`title-7.xml`) using `xmltodict`.
2. Traverse all sections and paragraphs to identify regulatory requirements.
3. Extract each requirement as a structured JSON rule, including:
   - Regulatory text
   - Metadata (citation, effective date, authority)
   - Explanation of the rule's intent
4. Output all rules as a formatted JSON list for review.

This approach ensures all regulatory requirements are captured and ready for further translation into code logic.

In [9]:
%pip install xmltodict
import xmltodict

# Read and parse the XML file
# Source doc: https://www.ecfr.gov/current/title-7/subtitle-A/part-1b
# Source XML: https://www.ecfr.gov/api/versioner/v1/full/2025-08-26/title-7.xml?part=1b
with open("title-7.xml", "r", encoding="utf-8", errors="replace") as file:
    xml_content = file.read()

policy_dict = xmltodict.parse(xml_content)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
# Extract regulatory requirements from USDA NEPA XML and output as structured JSON rules
import re
import json
from collections.abc import Iterable

def extract_text_recursive(item):
    """Recursively extract all text from nested dicts/lists/strings."""
    if isinstance(item, str):
        return [item.strip()] if item.strip() else []
    elif isinstance(item, dict):
        texts = []
        # Common XML text keys
#text, HEAD, PSPACE, etc.
        for k, v in item.items():
            if k in ['#text', 'HEAD', 'PSPACE', 'P'] or isinstance(v, (dict, list, str)):
                texts.extend(extract_text_recursive(v))
        return texts
    elif isinstance(item, Iterable):
        texts = []
        for subitem in item:
            texts.extend(extract_text_recursive(subitem))
        return texts
    return []

def flatten_paragraphs(section):
    """Extracts all paragraphs from a section, handling nested structure and dicts."""
    paragraphs = []
    if 'P' in section:
        ps = section['P']
        paragraphs.extend(extract_text_recursive(ps))
    return paragraphs

def extract_rules_from_section(section, section_citation, effective_date, authority):
    requirement_keywords = [
        'must', 'shall', 'required', 'mandate', 'prohibit', 'forbid', 'ensure', 'direct', 'provision', 'deadline', 'limit', 'review', 'prepare', 'submit', 'report', 'determine', 'specify', 'clarify', 'add', 'remove', 'revise', 'publish', 'provide', 'consider', 'apply', 'adopt', 'implement', 'establish', 'define', 'categorical exclusion', 'environmental assessment', 'environmental impact statement'
    ]
    rules = []
    paragraphs = flatten_paragraphs(section)
    for idx, para in enumerate(paragraphs):
        if not isinstance(para, str):
            continue
        para_lower = para.lower()
        if any(kw in para_lower for kw in requirement_keywords):
            citation_match = re.search(r'(\d+\s*U\.S\.C\.\s*\d+[a-zA-Z0-9\(\)]*)', para)
            citation = citation_match.group(0) if citation_match else section_citation
            rule = {
                "id": f"{section_citation}_rule_{idx+1}",
                "text": para.strip(),
                "metadata": {
                    "citation": citation,
                    "effective_date": effective_date,
                    "authority": authority
                },
                "explanation": f"Requirement extracted from section {section_citation}."
            }
            rules.append(rule)
    return rules

# Get top-level part and sections
part = policy_dict.get('DIV5', {})
authority = part.get('AUTH', {}).get('PSPACE', None)
effective_date = re.search(r'(\d{4}-\d{2}-\d{2})', part.get('SOURCE', {}).get('PSPACE', ''))
effective_date = effective_date.group(0) if effective_date else None

# Debug: Inspect parsed XML structure
# print("Keys in part:", part.keys())
# print("Type of DIV8:", type(part.get('DIV8', None)))
# print("Sample DIV8:", part.get('DIV8', None))

# Normalize sections to a list
div8_raw = part.get('DIV8', None)
if isinstance(div8_raw, list):
    sections = div8_raw
elif isinstance(div8_raw, dict):
    sections = [div8_raw]
else:
    sections = []

all_rules = []
for section in sections:
    meta = section.get('hierarchy_metadata', None)
    section_citation = None
    if isinstance(meta, str):
        try:
            meta_dict = json.loads(meta)
            section_citation = meta_dict.get('citation', None)
        except Exception as e:
            print(f"Failed to parse hierarchy_metadata for section {section.get('@N', '')}: {e}")
            section_citation = None
    elif isinstance(meta, dict):
        section_citation = meta.get('citation', None)
    # Fallback: use section number or header if citation is missing
    if not section_citation:
        section_citation = section.get('@N', section.get('HEAD', 'Unknown'))
    rules = extract_rules_from_section(section, section_citation, effective_date, authority)
    all_rules.extend(rules)

print(f"Total rules extracted: {len(all_rules)}")
print(json.dumps(all_rules, indent=2))

Total rules extracted: 456
[
  {
    "id": "1b.1_rule_2",
    "text": "(a)  The purpose of this part is to outline the procedures by which the U.S. Department of Agriculture (hereinafter USDA or the Department) will integrate the National Environmental Policy Act (NEPA) into decision-making processes. Specifically, this part: describes the process by which USDA determines what actions are subject to NEPA's procedural requirements and the applicable level of NEPA review; ensures that relevant environmental information is identified and considered early in the process in order to ensure informed decision making; enables USDA to conduct coordinated, consistent, predictable and timely environmental reviews; reduces unnecessary burdens and delays; and implements NEPA's mandates regarding lead and cooperating agency roles, page and time limits, and sponsor preparation of environmental assessments and environmental impact statements.",
    "metadata": {
      "citation": "1b.1",
      "effect