In [5]:
# National Archives Data Pipeline Demo

This notebook demonstrates the full pipeline for processing legislation data from legislation.gov.uk.


SyntaxError: invalid syntax (3588283786.py, line 3)

In [6]:
import importlib
import sys

import pandas as pd

# Make the parent directory importable so the `src` package resolves when the
# notebook is run from its own folder. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import src.api_client
import src.atom_parser
import src.metadata_extractor
import src.reporting
import src.xml_validator

# Force reload of every project module used below so edits made on disk are
# picked up without restarting the kernel. All five modules are reloaded;
# reloading only some of them would leave the remaining names stale.
for _module in (
    src.api_client,
    src.atom_parser,
    src.metadata_extractor,
    src.reporting,
    src.xml_validator,
):
    importlib.reload(_module)

# Bind the public names only AFTER reloading, so they refer to the fresh
# module objects rather than the previously loaded ones.
from src.api_client import LegislationAPIClient
from src.atom_parser import parse_atom_feed
from src.xml_validator import XMLValidator
from src.metadata_extractor import MetadataExtractor
from src.reporting import generate_pdf_report

# Initialize extractor early; later cells reuse this single instance.
extractor = MetadataExtractor()

print("✅ All modules imported successfully!")
print("✅ Modules reloaded to get latest changes")
print("✅ Metadata extractor initialized")


✅ All modules imported successfully!
✅ Modules reloaded to get latest changes
✅ Metadata extractor initialized


In [None]:
## 1. Fetch Atom Feed


In [7]:
# Client for the legislation API; later cells reuse this same instance.
client = LegislationAPIClient()

print("Fetching Atom feed for UK legislation...")
# Search UK Public General Acts ('ukpga') for the term 'coronavirus'.
atom_feed = client.get_atom_feed(
    legislation_type='ukpga',
    query='coronavirus',
)

# Convert the raw feed into a DataFrame, report its size, and preview it
# (the trailing expression is what the notebook renders as the cell output).
feed_df = parse_atom_feed(atom_feed)
print(f"✅ Found {len(feed_df)} items in the feed")
feed_df.head()


Fetching Atom feed for UK legislation...
✅ Found 3 items in the feed


Unnamed: 0,uri,title,date
0,http://www.legislation.gov.uk/id/ukpga/2022/12,Commercial Rent (Coronavirus) Act 2022,2024-05-20 12:59:01+01:00
1,http://www.legislation.gov.uk/id/ukpga/2021/34,Rating (Coronavirus) and Directors Disqualific...,2024-05-20 13:20:32+01:00
2,http://www.legislation.gov.uk/id/ukpga/2020/7,Coronavirus Act 2020,2025-03-26 11:14:03+01:00


In [None]:
## 2. Download and Validate Sample XML


In [8]:
# Pick a sample document: walk the candidate IDs in order and stop at the
# first one whose title mentions "coronavirus". If none matches, fall back to
# the first candidate that was downloaded and parsed successfully.
sample_ids = ['ukpga/2020/7', 'ukpga/2020/1']  # Try different IDs

xml_content = None
sample_id = None

for test_id in sample_ids:
    # Only the network fetch and the extraction are treated as "fetch
    # failures"; the title check below can no longer raise and be misreported.
    try:
        print(f"Trying {test_id}...")
        test_xml = client.get_legislation_xml(test_id)
        test_metadata = extractor.extract_metadata(test_xml)
    except Exception as e:
        print(f"  Failed to fetch {test_id}: {e}")
        continue

    # A missing or None title must not discard an otherwise usable document,
    # so normalise it to a (possibly empty) string before inspecting it.
    title = str(test_metadata.get('title') or '')
    print(f"  Title: {title or 'N/A'}")

    if 'coronavirus' in title.lower():
        print(f"✅ Found Coronavirus Act: {test_id}")
        xml_content = test_xml
        sample_id = test_id
        break

    print("  Not Coronavirus Act, continuing search...")
    # Use first successful one as fallback
    if xml_content is None:
        xml_content = test_xml
        sample_id = test_id

if xml_content:
    print(f"\n✅ Using legislation: {sample_id}")
    print(f"📄 Downloaded XML content ({len(xml_content)} characters)")
else:
    print("❌ Could not download any legislation")


INFO:src.metadata_extractor:Successfully extracted metadata for: Coronavirus Act 2020


Trying ukpga/2020/7...
  Title: Coronavirus Act 2020
✅ Found Coronavirus Act: ukpga/2020/7

✅ Using legislation: ukpga/2020/7
📄 Downloaded XML content (1251391 characters)


In [None]:
## 3. Extract Metadata


In [9]:
# Run extraction on the selected document, reusing the `extractor` instance
# created in the setup cell.
metadata = extractor.extract_metadata(xml_content)

# DataFrame views of the metadata; indexed by name below (e.g. 'main').
dfs = extractor.to_dataframe(metadata)

# Headline fields as (label, metadata key) pairs, printed one per line in
# this order; any key absent from `metadata` is shown as 'N/A'.
print("📋 Main Metadata:")
print("\n".join(
    f"{label}: {metadata.get(key, 'N/A')}"
    for label, key in (
        ("Title", "title"),
        ("Year", "year"),
        ("Type", "legislation_type"),
        ("Number", "legislation_number"),
        ("Sections Count", "sections_count"),
        ("Document URI", "document_uri"),
    )
))

# Final expression: rendered by the notebook as the cell's rich output.
dfs['main']


INFO:src.metadata_extractor:Successfully extracted metadata for: Coronavirus Act 2020


📋 Main Metadata:
Title: Coronavirus Act 2020
Year: 2020
Type: ukpga
Number: 7
Sections Count: 579
Document URI: http://www.legislation.gov.uk/ukpga/2020/7


Unnamed: 0,title,year,document_uri,legislation_type,number
0,Coronavirus Act 2020,2020,http://www.legislation.gov.uk/ukpga/2020/7,ukpga,7


In [10]:
# Display sections information
sections_df = dfs['sections']
print(f"📄 Found {len(sections_df)} sections:")
if sections_df.empty:
    print("No sections found")
else:
    # A bare expression is rendered only when it is the last top-level
    # statement of a cell. Inside an if/else branch its value is discarded,
    # so the preview has to be shown explicitly. `display` is provided by the
    # IPython kernel without an import.
    display(sections_df.head())


📄 Found 4 sections:


In [None]:
## 4. Generate PDF Report


In [11]:
# Generate PDF report from the extracted metadata
output_path = '../data/processed/legislation_report.pdf'
generate_pdf_report(metadata, output_path)
print(f"📄 Report generated: {output_path}")

# Preview of the long title: cut to 100 characters, with an ellipsis added
# only when something was actually cut off. `or` (rather than a .get default)
# also covers a key that is present but holds None/empty.
LONG_TITLE_PREVIEW_CHARS = 100
long_title_preview = str(metadata.get('long_title') or 'N/A')
if len(long_title_preview) > LONG_TITLE_PREVIEW_CHARS:
    long_title_preview = long_title_preview[:LONG_TITLE_PREVIEW_CHARS] + '...'

# Treat a missing or None list of key sections as empty so len() cannot fail.
key_sections_count = len(metadata.get('key_sections') or [])

# Display summary
print(f"""
📊 Summary:
- Title: {metadata.get('title', 'N/A')}
- Year: {metadata.get('year', 'N/A')}
- Type: {metadata.get('legislation_type', 'N/A')}
- Number: {metadata.get('legislation_number', 'N/A')}
- Sections: {metadata.get('sections_count', 0)}
- Key Sections: {key_sections_count}
- Long Title: {long_title_preview}
""")


INFO:src.reporting:PDF report successfully generated: ../data/processed/legislation_report.pdf


📄 Report generated: ../data/processed/legislation_report.pdf

📊 Summary:
- Title: Coronavirus Act 2020
- Year: 2020
- Type: ukpga
- Number: 7
- Sections: 579
- Key Sections: 4
- Long Title: An Act to make provision in connection with coronavirus; and for connected purposes....



In [20]:
## 5. Debug XML Structure (Optional)


In [21]:
# Let's debug what we're actually getting
from lxml import etree

# Parse XML to investigate structure. The text is encoded to bytes first:
# lxml refuses str input that carries an XML encoding declaration.
root = etree.fromstring(xml_content.encode('utf-8'))

print("🔍 XML Root Information:")
print(f"Root tag: {root.tag}")
print(f"Root attributes: {root.attrib}")
print()

print("🔍 Available Namespaces:")
for prefix, uri in root.nsmap.items():
    # The default namespace is stored under the key None.
    prefix_name = prefix if prefix else "default"
    print(f"  {prefix_name}: {uri}")
print()

# Check if this is actually the right legislation
print("🔍 Looking for title elements:")
for elem in root.iter():
    # iter() also yields comments and processing instructions, whose .tag is
    # a factory function rather than a string; skip those so .lower() is safe.
    if isinstance(elem.tag, str) and 'title' in elem.tag.lower():
        print(f"  {elem.tag}: '{elem.text}'")

print()
print("🔍 Try a different legislation ID:")
print("Let's try ukpga/2020/7 (might be Coronavirus Act)")

# Alternative - let's check ukpga/2020/7 which might be the Coronavirus Act
try:
    covid_xml = client.get_legislation_xml('ukpga/2020/7')
    covid_metadata = extractor.extract_metadata(covid_xml)
    print(f"ukpga/2020/7 title: {covid_metadata['title']}")
except Exception as e:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # still propagates, and report the real cause instead of assuming the
    # document does not exist.
    print(f"ukpga/2020/7 could not be loaded: {e}")


🔍 XML Root Information:
Root tag: {http://www.legislation.gov.uk/namespaces/legislation}Legislation
Root attributes: {'DocumentURI': 'http://www.legislation.gov.uk/ukpga/2020/7', 'IdURI': 'http://www.legislation.gov.uk/id/ukpga/2020/7', 'NumberOfProvisions': '579', '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.legislation.gov.uk/namespaces/legislation http://www.legislation.gov.uk/schema/legislation.xsd', 'SchemaVersion': '1.0', 'RestrictExtent': 'E+W+S+N.I.', 'RestrictStartDate': '2025-03-20'}

🔍 Available Namespaces:
  default: http://www.legislation.gov.uk/namespaces/legislation
  xsi: http://www.w3.org/2001/XMLSchema-instance

🔍 Looking for title elements:
  {http://purl.org/dc/elements/1.1/}title: 'Coronavirus Act 2020'
  {http://www.legislation.gov.uk/namespaces/metadata}AffectedTitle: 'Coronavirus Act 2020'
  {http://www.legislation.gov.uk/namespaces/metadata}AffectingTitle: 'Health and Care Act 2022'
  {http://www.legislation.gov.uk/namespaces/metadat