# National Archives Data Pipeline Demo

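This notebook demonstrates the full pipeline for processing legislation data from legislation.gov.uk: fetching the Atom feed, downloading and validating a sample XML document, extracting its metadata, analysing sections and amendments, and generating a PDF report.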

In [None]:
import sys
sys.path.append('..')

from src.api_client import LegislationAPIClient
from src.atom_parser import parse_atom_feed
from src.xml_validator import XMLValidator
from src.metadata_extractor import LegislationMetadataExtractor
from src.reporting import generate_pdf_report

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' name, so prefer the new name
# when it is available and fall back to the legacy one on older installs.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')


In [None]:
## 1. Fetch Atom Feed


In [None]:
# Initialize API client (constructed with no arguments, i.e. its defaults)
client = LegislationAPIClient()

# Fetch the Atom feed for the 'coronavirus' query
# NOTE(review): presumably a search against legislation.gov.uk — confirm in src.api_client
atom_feed = client.get_atom_feed(query='coronavirus')

# Parse feed into DataFrame
feed_df = parse_atom_feed(atom_feed)
# Last expression of the cell, so the notebook renders the head() preview
feed_df.head()


In [None]:
## 2. Download and Validate Sample XML


In [None]:
# Pick one piece of legislation and download its XML
sample_id = 'ukpga/2020/1'  # Coronavirus Act 2020
xml_content = client.get_legislation_xml(sample_id)

# Look up the schema name for this legislation and validate the XML against it
validator = XMLValidator()
schema_name = validator.get_schema_for_legislation(sample_id)
errors = validator.validate_xml(xml_content, schema_name)

# Report the outcome: a single success line, or one line per reported error
if not errors:
    print('XML is valid!')
else:
    print('Validation errors found:')
    for issue in errors:
        print(f'- {issue}')


In [None]:
## 3. Extract Metadata


In [None]:
# Extract metadata from the XML downloaded in the validation step
extractor = LegislationMetadataExtractor()
metadata = extractor.extract_metadata(xml_content)

# Convert to DataFrames; the result is indexed by name
# ('main' here, 'sections' and 'amendments' in later cells)
dfs = extractor.to_dataframe(metadata)

# Display main metadata (last expression of the cell, so the notebook renders it)
dfs['main']


In [None]:
## 4. Analyze Sections


In [None]:
# Measure each section by the length of its 'content' string
sections_df = dfs['sections']
sections_df['content_length'] = sections_df['content'].str.len()

# Histogram of the section lengths; histplot draws on the figure created here
# and hands back the Axes, which is then labelled directly
plt.figure(figsize=(12, 6))
hist_ax = sns.histplot(data=sections_df, x='content_length', bins=30)
hist_ax.set_title('Distribution of Section Lengths')
hist_ax.set_xlabel('Content Length (characters)')
hist_ax.set_ylabel('Count')
plt.show()


In [None]:
## 5. Generate Report


In [None]:
# Generate PDF report
from pathlib import Path

output_path = '../data/processed/legislation_report.pdf'
# Make sure the target directory exists first: on a fresh checkout
# ../data/processed may be missing, and writing the PDF there would then fail.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
generate_pdf_report(metadata, output_path)
print(f'Report generated: {output_path}')


In [None]:
## 6. Additional Analysis


In [None]:
# Plot the number of amendments per date, or say so when there are none
amendments_df = dfs['amendments']
if amendments_df.empty:
    print('No amendments found for this legislation.')
else:
    # Convert the 'date' column to datetimes before grouping on it
    amendments_df['date'] = pd.to_datetime(amendments_df['date'])
    per_date_counts = amendments_df.groupby('date').size()

    # One bar per distinct date; the Series plot reuses the figure opened here
    plt.figure(figsize=(12, 6))
    per_date_counts.plot(kind='bar')
    plt.title('Amendments Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Amendments')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
