In [0]:
import requests

# Step 1: Download XML from URL
url = "https://www.federalregister.gov/documents/full_text/xml/2025/07/03/2025-12326.xml"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors so an error page is never saved as if it were the XML document.
response.raise_for_status()
xml_content = response.content  # raw response body as bytes

# Step 2: Save XML to DBFS
with open("/dbfs/tmp/2025-12326.xml", "wb") as f:
    f.write(xml_content)

In [0]:
import xml.etree.ElementTree as ET

# Load the XML document that the download step wrote to /dbfs/tmp
tree = ET.parse("/dbfs/tmp/2025-12326.xml")
root = tree.getroot()

# Show the top-level structure of the document
print("Root tag:", root.tag)
print("Immediate child tags under root:")
for child in root:
    print(child.tag)

# Or, if you have the XML content as a string:
# root = ET.fromstring(xml_content)
# print("Root tag:", root.tag)
# print("Immediate child tags under root:")
# for child in root:
#     print(child.tag)

In [0]:
# Step 3: Read XML to Spark DataFrame (requires Databricks XML library)
# The chain is wrapped in parentheses because a comment placed after a
# backslash line continuation is a SyntaxError.
df = (
    spark.read.format("xml")
    .option("rowTag", "YOUR_ROW_TAG")  # Replace with the XML's main row tag
    .load("dbfs:/tmp/2025-12326.xml")  # the file written earlier via /dbfs/tmp/
)

In [0]:
# Step 4: Convert DataFrame to Python Dictionary
# collect() brings every row to the driver. recursive=True also converts
# nested Row values into dicts; without it nested structs stay as Row objects.
dict_list = [row.asDict(recursive=True) for row in df.collect()]

# Step 5: (Optional) Custom Parsing with ElementTree
import xml.etree.ElementTree as ET
root = ET.fromstring(xml_content)
# Traverse and parse as needed to build dicts

In [0]:
%pip install xmltodict
import xmltodict

# Read and parse the XML file from the location the download step saved it to.
# errors="replace" substitutes undecodable bytes instead of raising.
with open("/dbfs/tmp/2025-12326.xml", "r", encoding="utf-8", errors="replace") as file:
    xml_content = file.read()

# Nested dict representation of the whole document, keyed by element name
policy_dict = xmltodict.parse(xml_content)

In [0]:
# Refined extraction logic for regulatory requirements and metadata from NEPA XML
import re
import json

# Keywords and phrases whose presence marks a paragraph as requirement-like.
# They are matched as substrings of the lowercased paragraph text.
requirement_keywords = [
    # obligation / prohibition wording
    'must', 'shall', 'required', 'mandate', 'prohibit', 'forbid', 'ensure',
    'direct', 'provision', 'deadline', 'limit',
    # procedural verbs
    'review', 'prepare', 'submit', 'report', 'determine', 'specify',
    'clarify', 'add', 'remove', 'revise', 'publish', 'provide', 'consider',
    'apply', 'adopt', 'implement', 'establish', 'define',
    # NEPA document types
    'categorical exclusion',
    'environmental assessment',
    'environmental impact statement',
]

# Flatten paragraphs from SUPLINF
suplinf = policy_dict['RULE']['SUPLINF']
raw_paragraphs = suplinf.get('P', [])
# xmltodict yields a bare value rather than a list when an element occurs only
# once; wrap it so a lone string paragraph is not iterated character by character.
if not isinstance(raw_paragraphs, list):
    raw_paragraphs = [raw_paragraphs]
# Keep only non-empty plain-string paragraphs, stripped of surrounding
# whitespace. Entries that are not strings are skipped.
paragraphs = [p.strip() for p in raw_paragraphs if isinstance(p, str) and p.strip()]

# Extract requirements and decision points
# Matches citations such as "42 U.S.C. 4321"; compiled once, outside the loop.
citation_pattern = re.compile(r'(\d+\s*U\.S\.C\.\s*\d+[a-zA-Z0-9\(\)]*)')
# The effective date is NOT parsed from the text: every rule gets this fixed value.
effective_date = '2025-04-11'

extracted_rules = []
for idx, para in enumerate(paragraphs):
    # Lowercase once per paragraph instead of once per keyword
    para_lower = para.lower()
    # Skip paragraphs that contain none of the requirement keywords
    if not any(kw in para_lower for kw in requirement_keywords):
        continue
    # First U.S.C. citation in the paragraph, or None when there is none
    citation_match = citation_pattern.search(para)
    citation = citation_match.group(0) if citation_match else None
    # idx is the position in `paragraphs`, so ids are 1-based and may have
    # gaps where paragraphs were skipped.
    extracted_rules.append({
        "id": f"rule_{idx+1}",
        "text": para,
        "logic": "To be refined (AI-assisted translation)",
        "metadata": {
            "citation": citation,
            "effective_date": effective_date
        }
    })

# Display the extracted rules as JSON
print(json.dumps(extracted_rules, indent=2))

# Summary: This cell refines extraction by scanning paragraphs for regulatory keywords and attempts to capture a U.S.C. citation for each rule; the effective date is a fixed value and is not parsed from the text. You can further refine the logic or prompt an AI to translate 'text' into executable code logic.