# SEC EDGAR Data Exploration

Explore the EDGAR domain data for NER training.

**Goals:**
- Load and inspect 10-K filing sections
- Extract financial entities (MONEY, PERCENT, DATE via regex; ORG and other types via spaCy)
- Analyze risk factor patterns

In [None]:
import sys
from pathlib import Path

# Prepend two sibling source trees to the module search path, presumably so
# that the project import below (`corpus_core`) resolves without the package
# being installed -- TODO confirm. The relative paths are resolved against the
# current working directory, so the notebook must be run from a folder where
# '../corpus-core/src' and '../pipelines/src' exist.
sys.path.insert(0, str(Path('../corpus-core/src').resolve()))
sys.path.insert(0, str(Path('../pipelines/src').resolve()))

from corpus_core.loaders import ParquetLoader
import pandas as pd
import matplotlib.pyplot as plt

## Load EDGAR Data

In [None]:
# Loader rooted at the datasets directory, one level above the notebook.
datasets_root = Path('../datasets')
loader = ParquetLoader(datasets_root)

# List EDGAR datasets: keep only the entries whose 'domain' is 'edgar'.
edgar_datasets = list(
    filter(lambda entry: entry['domain'] == 'edgar', loader.list_datasets())
)

print("EDGAR datasets:")
for ds in edgar_datasets:
    print(f"  - {ds['name']}")

In [None]:
# Load companies
if not loader.exists('edgar', 'edgar_companies'):
    # The dataset is not available yet; tell the user how to produce it.
    print("Run the Dagster pipeline first: dagster dev")
else:
    companies_df = loader.read_pandas('edgar', 'edgar_companies')
    print(f"Companies: {len(companies_df)} records")
    # Preview the first rows.
    display(companies_df.head())

In [None]:
# Load document sections
if loader.exists('edgar', 'edgar_sections'):
    sections_df = loader.read_pandas('edgar', 'edgar_sections')
    print(f"Sections: {len(sections_df)} records")
    # Preview only the identifying columns, not the full section text.
    display(sections_df[['company_name', 'section', 'section_title']].head(10))
else:
    # Without this branch a missing dataset produced no output at all, and
    # every later cell (all guarded on `sections_df` existing) silently
    # skipped itself. Print the same hint the companies cell prints.
    print("Run the Dagster pipeline first: dagster dev")

## Analyze Section Distribution

In [None]:
# Only plot when the sections cell actually loaded data.
if 'sections_df' in globals():
    # Number of rows per section title, drawn as horizontal bars.
    section_counts = sections_df['section_title'].value_counts()
    ax = section_counts.plot.barh(title='Documents by Section Type')
    ax.set_xlabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
# Content length distribution
if 'sections_df' in globals():
    # Character count of each row's content, stored as a new column.
    sections_df['content_length'] = sections_df['content'].str.len()

    # Mean character count per section code, drawn as vertical bars.
    mean_length_by_section = sections_df.groupby('section')['content_length'].mean()
    ax = mean_length_by_section.plot(kind='bar')
    ax.set_ylabel('Avg Content Length (chars)')
    ax.set_title('Average Section Length')
    plt.tight_layout()
    plt.show()

## Extract Financial Entities

In [None]:
# Sample Risk Factors section
if 'sections_df' in globals():
    # Rows whose section code is 'item_1a'.
    risk_sections = sections_df.loc[sections_df['section'] == 'item_1a']
    if not risk_sections.empty:
        # Use the first matching row as the running example for later cells.
        sample_risk = risk_sections.iloc[0]
        divider = "-" * 50
        print(f"Company: {sample_risk['company_name']}")
        print(f"Section: {sample_risk['section_title']}")
        print(divider)
        # Show only the first 2000 characters of the section text.
        print(sample_risk['content'][:2000])

In [None]:
import re

# (label, compiled pattern) pairs. They are scanned in this order, so the
# returned list is grouped by type: MONEY first, then PERCENT, then DATE.
_FINANCIAL_ENTITY_PATTERNS = (
    # Dollar amounts such as "$500,000" or "$1.5 million". A digit is required
    # right after "$" and commas only count between digit groups, so "$," and
    # the comma in "$500, and" are not captured. The trailing \b keeps the
    # one-letter units from matching the first letter of the next word
    # ("$500 by check" must not become "$500 b").
    ('MONEY', re.compile(
        r'\$\d+(?:,\d+)*(?:\.\d+)?'
        r'(?:\s*(?:million|billion|thousand|M|B|K)\b)?',
        re.IGNORECASE,
    )),
    # Percentages such as "15%", ".5%" or "3.5 percent". The number must
    # contain a digit, so a sentence-ending "." followed by "Percent" is not a
    # match. "percentage" is matched as a whole word instead of being cut to
    # "percent", and "percentile" is rejected by the word boundary.
    ('PERCENT', re.compile(
        r'(?:\d+(?:\.\d+)?|\.\d+)\s*(?:%|percent(?:age)?\b)',
        re.IGNORECASE,
    )),
    # Fiscal years and quarters such as "fiscal year 2023", "FY2021" or
    # "Q4 2022". The keyword must start a word and the year must be exactly
    # four digits (no trailing digit).
    ('DATE', re.compile(
        r'\b(?:fiscal\s+)?(?:year|FY)\s*20\d{2}(?!\d)'
        r'|\bQ[1-4]\s*20\d{2}(?!\d)',
        re.IGNORECASE,
    )),
)


def extract_financial_entities(text):
    """Extract financial entities using regex patterns.

    Args:
        text: The string to scan. ``None`` or an empty string yields no
            entities.

    Returns:
        A list of ``(label, matched_text)`` tuples where ``label`` is one of
        ``'MONEY'``, ``'PERCENT'`` or ``'DATE'``. All MONEY matches come
        first, then PERCENT, then DATE; within one label the matches are in
        order of appearance. Matching is case-insensitive.
    """
    if not text:
        return []

    entities = []
    for label, pattern in _FINANCIAL_ENTITY_PATTERNS:
        entities.extend((label, match.group()) for match in pattern.finditer(text))
    return entities

# Test extraction
if 'sample_risk' in globals():
    # Scan only the first 5000 characters of the sample section.
    snippet = sample_risk['content'][:5000]
    entities = extract_financial_entities(snippet)
    print(f"\nExtracted {len(entities)} entities:")

    # Show at most the first 20 matches.
    preview = entities[:20]
    for label, value in preview:
        print(f"  {label:10} | {value}")

## Entity Type Analysis

Expected entity types in EDGAR data:
- **ORG**: Company names, subsidiaries, competitors
- **MONEY**: Revenue, expenses, financial figures
- **PERCENT**: Growth rates, market share, ratios
- **DATE**: Fiscal years, reporting periods
- **GPE**: Headquarters, market locations
- **PRODUCT**: Products, services, brands

In [None]:
# Test with spaCy for ORG entities (counts every entity label it finds)
from collections import Counter

# Both the library and the model are optional. Handle each failure on its own
# so that a missing model prints a hint instead of crashing the cell.
try:
    import spacy
except ImportError:
    nlp = None
    print("spaCy not installed")
else:
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        # spacy.load raises OSError when the model package cannot be found.
        nlp = None
        print(
            "spaCy model 'en_core_web_sm' not found; install it with: "
            "python -m spacy download en_core_web_sm"
        )

if nlp is not None and 'sample_risk' in dir():
    # Only the first 3000 characters of the sample section are analysed.
    doc = nlp(sample_risk['content'][:3000])

    # Count entity types
    ent_counts = Counter(ent.label_ for ent in doc.ents)

    print("Entity type distribution:")
    # most_common() lists labels by descending count; ties keep the order in
    # which the labels were first seen.
    for label, count in ent_counts.most_common():
        print(f"  {label:10}: {count}")

## Next Steps

1. Expand company list (full S&P 500)
2. Improve section parsing accuracy
3. Train custom NER model for financial entities
4. Compare with FiNER-ORD benchmark