# Congress.gov Data Exploration

Explore the congressional-domain datasets used for named-entity recognition (NER) training.

**Goals:**
- Load and inspect bills, members, committees
- Visualize entity distributions
- Test NER extraction patterns

In [None]:
# Install dependencies if needed
# !pip install -e ../corpus-core
# !pip install -e ../pipelines

In [None]:
import sys
from pathlib import Path

# Add the local corpus-core and pipelines source trees to the import path
sys.path.insert(0, str(Path('../corpus-core/src').resolve()))
sys.path.insert(0, str(Path('../pipelines/src').resolve()))

from corpus_core.loaders import ParquetLoader
import pandas as pd
import matplotlib.pyplot as plt

## Load Congressional Data

In [None]:
# Build the Parquet loader rooted at the ../datasets directory
datasets_root = Path('../datasets')
loader = ParquetLoader(datasets_root)

# Print one "domain/name" line for every dataset the loader reports
print("Available datasets:")
for entry in loader.list_datasets():
    domain = entry['domain']
    dataset_name = entry['name']
    print(f"  - {domain}/{dataset_name}")

In [None]:
# Bills dataset: read it when the loader reports it exists,
# otherwise point the user at the pipeline that produces it.
if not loader.exists('congress', 'congress_bills'):
    print("Run the Dagster pipeline first: dagster dev")
else:
    bills_df = loader.read_pandas('congress', 'congress_bills')
    bill_count = len(bills_df)
    print(f"Bills: {bill_count} records")
    # Show the first rows as a quick sanity check
    display(bills_df.head())

In [None]:
# Members dataset: read it when the loader reports it exists,
# otherwise remind the user to run the pipeline.
if not loader.exists('congress', 'congress_members'):
    print("Run the Dagster pipeline first")
else:
    members_df = loader.read_pandas('congress', 'congress_members')
    member_count = len(members_df)
    print(f"Members: {member_count} records")
    # Show the first rows as a quick sanity check
    display(members_df.head())

## Analyze Entity Distributions

In [None]:
# Bar chart of how many members belong to each party.
# Skipped entirely when the members cell above did not define members_df.
members_loaded = 'members_df' in globals()
if members_loaded:
    party_column = members_df['party']
    party_counts = party_column.value_counts()
    party_counts.plot(title='Members by Party', kind='bar')
    plt.xlabel('Party')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
# Horizontal bar chart of the 15 most frequent policy areas.
# Skipped entirely when the bills cell above did not define bills_df.
bills_loaded = 'bills_df' in globals()
if bills_loaded:
    area_frequencies = bills_df['policy_area'].value_counts()
    policy_counts = area_frequencies.head(15)
    policy_counts.plot(title='Bills by Policy Area (Top 15)', kind='barh')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()

## Test NER Extraction

In [None]:
# Sample text for NER testing
def build_sample_text(bill):
    """Join a bill's title and latest action text into one NER sample string.

    Args:
        bill: A mapping-like record (e.g. a pandas Series row or a dict) that
            has a 'title' entry and, optionally, a 'latest_action_text' entry.

    Returns:
        The title and the latest action text separated by a blank line.
        Missing values (None / NaN / NA) are rendered as empty strings so the
        literal words "None" or "nan" never end up in the NER input.

    Raises:
        KeyError: If the record has no 'title' entry.
    """
    def as_text(value):
        # Only scalars can be "missing"; the is_scalar guard keeps pd.isna
        # from returning an array for list-like cells.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    title = as_text(bill['title'])
    latest_action = as_text(bill.get('latest_action_text', ''))
    return f"{title}\n\n{latest_action}"


if 'bills_df' in dir():
    if len(bills_df) == 0:
        # iloc[0] would raise IndexError on an empty frame
        print("bills_df is empty; no sample text available.")
    else:
        sample_bill = bills_df.iloc[0]
        sample_text = build_sample_text(sample_bill)
        print("Sample text for NER:")
        print("-" * 50)
        # Cap the preview at 500 characters
        print(sample_text[:500])

In [None]:
# Test with spaCy (if available)
def load_spacy_pipeline(model_name='en_core_web_sm'):
    """Load a spaCy pipeline, or return None with a hint when it is unavailable.

    Args:
        model_name: Name of the spaCy model package to load.

    Returns:
        The loaded pipeline, or None when spaCy itself is not installed or the
        requested model cannot be found. In both cases an installation hint
        is printed instead of raising.
    """
    try:
        import spacy
    except ImportError:
        print("spaCy not installed. Run: pip install spacy && python -m spacy download en_core_web_sm")
        return None

    try:
        return spacy.load(model_name)
    except OSError:
        # spacy.load raises OSError (not ImportError) when the package is
        # installed but the model has not been downloaded.
        print(f"spaCy model '{model_name}' not found. Run: python -m spacy download {model_name}")
        return None


loaded_pipeline = load_spacy_pipeline('en_core_web_sm')
if loaded_pipeline is not None:
    # Only bind nlp on success, so a failed load leaves it undefined as before
    nlp = loaded_pipeline

    if 'sample_text' in dir():
        # Limit the input to the first 1000 characters of the sample
        doc = nlp(sample_text[:1000])

        print("\nExtracted entities:")
        for ent in doc.ents:
            print(f"  {ent.text:30} -> {ent.label_}")

## Entity Type Analysis

Expected entity types in congressional data:
- **PERSON**: Congress members, presidents, officials
- **ORG**: Committees, agencies, departments
- **GPE**: States, countries, districts
- **DATE**: Bill dates, term dates
- **LAW**: Bill numbers, acts

In [None]:
# Print the first ten member names to eyeball how PERSON entities are formatted.
# Skipped entirely when the members cell above did not define members_df.
members_loaded = 'members_df' in globals()
if members_loaded:
    print("Sample member names:")
    first_names = members_df['name'].head(10)
    for member_name in first_names:
        print(f"  - {member_name}")

## Next Steps

1. Run Dagster pipeline to populate datasets
2. Analyze entity extraction accuracy
3. Create training data annotations
4. Export to spaCy/HuggingFace format