In [1]:
## Cell 1: Install Libraries
# Install the third-party packages and the spaCy model used by later cells.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.

# requests and beautifulsoup4 are imported in Cell 2; sec_api and lxml are
# installed here but are not referenced by any cell visible in this notebook.
!pip install sec_api requests beautifulsoup4 lxml
# spaCy plus the large English model that Cell 5 loads via spacy.load("en_core_web_lg").
!pip install spacy
!python -m spacy download en_core_web_lg
# Hugging Face `pipeline` (imported in Cell 2) and torch, installed alongside it.
!pip install transformers torch


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
## Cell 2: Imports
import requests
import re
from bs4 import BeautifulSoup, NavigableString
from collections import defaultdict
import spacy
from spacy import displacy
from transformers import pipeline
import json

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Cell 3: Data Loading Functions
# --- Configuration ---
# Request headers passed to every requests.get call against sec.gov / data.sec.gov below.
HEADERS = {'User-Agent': 'MultiModelNLP-SEC-RAG-Project zhiminng03@gmail.com'}
# JSON index of companies (fields read below: 'ticker', 'cik_str', 'title').
CIK_MAP_URL = 'https://www.sec.gov/files/company_tickers.json'

def get_latest_10q(ticker: str, timeout: float = 30.0) -> str:
    """Fetch the HTML of the first 10-Q listed in a company's recent SEC filings.

    Args:
        ticker: Stock ticker symbol; surrounding whitespace and case are ignored.
        timeout: Seconds to wait on each HTTP request before giving up. Without
            a timeout a stalled connection would block the kernel indefinitely.

    Returns:
        The body text of the filing's primary document.

    Raises:
        ValueError: If the ticker is not in the SEC mapping, or no 10-Q appears
            in the company's recent filings.
        requests.RequestException: On HTTP errors or timeouts.
    """
    print(f"1. Fetching CIK for ticker: {ticker}...")
    response = requests.get(CIK_MAP_URL, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    company_data = response.json()

    wanted = ticker.strip().upper()
    # The submissions endpoint below is addressed by a 10-digit, zero-padded CIK.
    cik = next(
        (
            str(company['cik_str']).zfill(10)
            for company in company_data.values()
            if company['ticker'] == wanted
        ),
        None,
    )

    if not cik:
        raise ValueError(f"Ticker '{ticker}' not found in SEC CIK mapping.")

    print(f"   Found CIK: {cik}")

    print("2. Fetching submission history from SEC EDGAR...")
    submissions_url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    response = requests.get(submissions_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    recent = response.json()['filings']['recent']

    # 'recent' holds parallel arrays, one entry per filing. The first 10-Q hit
    # is taken as the latest — assumes EDGAR lists newest first; TODO confirm.
    latest_10q = None
    filings = zip(
        recent['form'],
        recent['accessionNumber'],
        recent['primaryDocument'],
        recent['filingDate'],
    )
    for form, accession_number, primary_document, filing_date in filings:
        if form == '10-Q':
            latest_10q = {
                # Archive paths use the accession number without dashes.
                'accession_number': accession_number.replace('-', ''),
                'primary_document': primary_document,
                'date': filing_date
            }
            break

    if not latest_10q:
        raise ValueError(f"No recent 10-Q filings found for ticker '{ticker}'.")

    print(f"   Found latest 10-Q filed on: {latest_10q['date']}")

    filing_url = (
        f"https://www.sec.gov/Archives/edgar/data/{cik}/"
        f"{latest_10q['accession_number']}/{latest_10q['primary_document']}"
    )

    print(f"3. Fetching 10-Q document from: {filing_url}")
    response = requests.get(filing_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()

    print("   Successfully fetched document.")
    return response.text

def _normalize_header_text(text: str) -> str | None:
    """Normalizes header text to a standard format."""
    text = text.strip().upper()
    part_match = re.search(r'^\s*(PART\s+I{1,2})', text)
    if part_match:
        return re.sub(r'\s+', ' ', part_match.group(1))
    item_match = re.search(r'^\s*(ITEM\s+\d[A-Z]?)', text)
    if item_match:
        return re.sub(r'\s+', ' ', item_match.group(1))
    return None

def _parse_html_table(table_tag: BeautifulSoup) -> str:
    """Converts a BeautifulSoup table Tag into a Markdown formatted string."""
    markdown_rows = []
    for tr in table_tag.find_all('tr'):
        cells = [" ".join(cell.get_text(strip=True).split()) for cell in tr.find_all(['td', 'th'])]
        if any(cells):
            markdown_rows.append(cells)
    if not markdown_rows:
        return ""
    md_output = []
    header = markdown_rows[0]
    md_output.append("| " + " | ".join(header) + " |")
    md_output.append("| " + " | ".join(['---'] * len(header)) + " |")
    for row in markdown_rows[1:]:
        while len(row) < len(header):
            row.append("")
        row = row[:len(header)]
        md_output.append("| " + " | ".join(row) + " |")
    return "\n" + "\n".join(md_output) + "\n"

def parse_10q(html_content: str) -> dict:
    """Split a 10-Q filing's HTML into its Parts and Items.

    Short (<= 100 character) <p>/<b>/<strong>/<div> elements whose text
    normalizes to a 'PART x' or 'ITEM n' key, and which are not inside a link,
    are treated as section headers. For each ITEM header, all text between it
    and the next detected header is collected; top-level tables in that range
    are rendered as Markdown instead of being flattened to text.

    Args:
        html_content: Raw HTML of the filing.

    Returns:
        A dict of the form {part_key: {item_key: content}}. Empty when no
        headers are detected. If the same item key is detected more than once
        within a part, the content of the last occurrence is kept.
    """
    print("4. Parsing HTML content...")
    soup = BeautifulSoup(html_content, 'html.parser')
    potential_headers = soup.find_all(['p', 'b', 'strong', 'div'])
    doc_headers = []
    for header in potential_headers:
        text = header.get_text(strip=True)
        if len(text) > 100:
            continue
        normalized_key = _normalize_header_text(text)
        # Headers wrapped in <a> are skipped (e.g. linked table-of-contents rows).
        if normalized_key and not header.find_parent('a'):
            doc_headers.append({'tag': header, 'key': normalized_key})

    if not doc_headers:
        print("   Warning: Could not find any standard Part/Item headers.")
        return {}

    parsed_data = defaultdict(lambda: defaultdict(str))
    current_part_key = None

    for i, header_info in enumerate(doc_headers):
        current_key = header_info['key']
        if 'PART' in current_key:
            current_part_key = current_key
            continue
        if 'ITEM' in current_key:
            if not current_part_key:
                # Items seen before any PART header are filed under PART I.
                current_part_key = "PART I"
            start_node = header_info['tag']
            end_node = doc_headers[i + 1]['tag'] if i + 1 < len(doc_headers) else None
            content_parts = []
            element = start_node.next_element
            # Compare by identity: bs4's ==/!= on tags is a structural
            # comparison, so a look-alike tag could end the walk early, and an
            # empty text node is falsy, so truthiness must not be used either.
            while element is not None and element is not end_node:
                if isinstance(element, NavigableString):
                    # Text inside tables is emitted via the table branch below.
                    if not element.find_parent('table'):
                        text = element.strip()
                        if text:
                            content_parts.append(text)
                elif element.name == 'table':
                    # Only outermost tables are rendered; nested ones are
                    # reached through their parent's rows.
                    if not element.find_parent('table'):
                        table_markdown = _parse_html_table(element)
                        if table_markdown:
                            content_parts.append(table_markdown)
                element = element.next_element

            full_content = "\n".join(content_parts)
            clean_content = re.sub(r'\n{3,}', '\n\n', full_content).strip()
            parsed_data[current_part_key][current_key] = clean_content

    print("   Parsing complete.")
    return {part: dict(items) for part, items in parsed_data.items()}


In [4]:
## Cell 4: Enhanced Knowledge Base and Normalization Mappings

class FinancialKnowledgeBase:
    """
    Knowledge base for financial entity normalization and linking.

    Holds lookup tables that map company names to tickers, metric phrases to
    canonical metric ids, and risk phrases to canonical risk ids, plus a few
    regex patterns for spotting numeric financial values.
    """

    # Corporate-form suffixes removed when deriving a company's "base" name.
    # The trailing \b stops a short suffix from matching the start of a longer
    # word ("Co" in "Communications", "Corp" in "Corporation"); the optional
    # leading comma handles forms such as "Tesla, Inc.".
    _CORPORATE_SUFFIX_RE = re.compile(
        r',?\s+(?:Inc|Corporation|Corp|Company|Co|Ltd|LLC|L\.P)\b\.?,?',
        re.IGNORECASE,
    )

    def __init__(self):
        # Build company mapping from SEC data
        self.company_to_ticker = self._build_company_ticker_map()

        # Financial metrics normalization (keys are lower-case phrases)
        self.financial_metrics = {
            # Revenue variants
            "revenue": "REVENUE",
            "revenues": "REVENUE",
            "total revenue": "REVENUE",
            "total revenues": "REVENUE",
            "net revenue": "REVENUE",
            "sales": "REVENUE",
            "total sales": "REVENUE",

            # Income variants
            "net income": "NET_INCOME",
            "net earnings": "NET_INCOME",
            "profit": "NET_INCOME",
            "net profit": "NET_INCOME",
            "earnings": "NET_INCOME",
            "income": "NET_INCOME",

            # Operating metrics
            "operating income": "OPERATING_INCOME",
            "operating profit": "OPERATING_INCOME",
            "ebit": "EBIT",
            "ebitda": "EBITDA",
            "operating margin": "OPERATING_MARGIN",
            "gross profit": "GROSS_PROFIT",
            "gross margin": "GROSS_MARGIN",

            # Cash flow
            "cash flow": "CASH_FLOW",
            "operating cash flow": "OPERATING_CASH_FLOW",
            "free cash flow": "FREE_CASH_FLOW",
            "fcf": "FREE_CASH_FLOW",

            # Balance sheet items
            "total assets": "TOTAL_ASSETS",
            "assets": "TOTAL_ASSETS",
            "total liabilities": "TOTAL_LIABILITIES",
            "liabilities": "TOTAL_LIABILITIES",
            "equity": "SHAREHOLDERS_EQUITY",
            "shareholders' equity": "SHAREHOLDERS_EQUITY",
            "stockholders' equity": "SHAREHOLDERS_EQUITY",
            "cash and cash equivalents": "CASH_AND_EQUIVALENTS",
            "cash": "CASH_AND_EQUIVALENTS",

            # Per share metrics
            "earnings per share": "EPS",
            "eps": "EPS",
            "diluted eps": "DILUTED_EPS",
            "basic eps": "BASIC_EPS",
            "book value per share": "BOOK_VALUE_PER_SHARE",

            # Ratios
            "p/e ratio": "PE_RATIO",
            "price to earnings": "PE_RATIO",
            "debt to equity": "DEBT_TO_EQUITY",
            "current ratio": "CURRENT_RATIO",
            "return on equity": "ROE",
            "roe": "ROE",
            "return on assets": "ROA",
            "roa": "ROA",

            # Other
            "market cap": "MARKET_CAP",
            "market capitalization": "MARKET_CAP",
            "dividend": "DIVIDEND",
            "dividend yield": "DIVIDEND_YIELD",
        }

        # Risk types normalization (keys are lower-case phrases)
        self.risk_types = {
            "market risk": "MARKET_RISK",
            "credit risk": "CREDIT_RISK",
            "operational risk": "OPERATIONAL_RISK",
            "liquidity risk": "LIQUIDITY_RISK",
            "interest rate risk": "INTEREST_RATE_RISK",
            "currency risk": "CURRENCY_RISK",
            "foreign exchange risk": "CURRENCY_RISK",
            "commodity risk": "COMMODITY_RISK",
            "regulatory risk": "REGULATORY_RISK",
            "compliance risk": "COMPLIANCE_RISK",
            "legal risk": "LEGAL_RISK",
            "reputational risk": "REPUTATIONAL_RISK",
            "strategic risk": "STRATEGIC_RISK",
            "cybersecurity risk": "CYBERSECURITY_RISK",
            "technology risk": "TECHNOLOGY_RISK",
            "counterparty risk": "COUNTERPARTY_RISK",
            "concentration risk": "CONCENTRATION_RISK",
            "political risk": "POLITICAL_RISK",
            "environmental risk": "ENVIRONMENTAL_RISK",
            "climate risk": "CLIMATE_RISK",
        }

        # Regex patterns for financial value detection, tried in this order
        self.metric_patterns = [
            (r'\$\s*\d+(?:,\d{3})*(?:\.\d+)?\s*(?:million|billion|thousand|M|B|K)', 'MONETARY_VALUE'),
            (r'\d+(?:\.\d+)?%', 'PERCENTAGE'),
            (r'\d+(?:,\d{3})*(?:\.\d+)?\s*shares?', 'SHARE_COUNT'),
        ]

    def _strip_corporate_suffix(self, name):
        """Return `name` with corporate-form suffixes (Inc., Corp., ...) removed."""
        return self._CORPORATE_SUFFIX_RE.sub('', name).strip(' ,')

    def _build_company_ticker_map(self, timeout=30):
        """Build the company-name -> ticker mapping from the SEC ticker file.

        For every company the mapping gets its exact title, its title with
        corporate suffixes removed, and its ticker, each in original and
        upper case. When two companies produce the same key, the later entry
        in the file wins.

        Args:
            timeout: Seconds to wait for the SEC response.

        Returns:
            dict mapping name variants to tickers. If the download or parsing
            fails for any reason, a small hard-coded mapping of major
            companies is returned instead (best-effort by design).
        """
        try:
            response = requests.get(CIK_MAP_URL, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            company_data = response.json()

            mapping = {}
            for company in company_data.values():
                ticker = company['ticker']
                title = company['title']

                # Add exact company name
                mapping[title] = ticker
                mapping[title.upper()] = ticker

                # Add the name without Inc., Corp., etc.
                base_name = self._strip_corporate_suffix(title)
                if base_name:
                    mapping[base_name] = ticker
                    mapping[base_name.upper()] = ticker

                # Add ticker itself
                mapping[ticker] = ticker
                mapping[ticker.upper()] = ticker

            print(f"   Built company mapping with {len(mapping)} entries")
            return mapping

        except Exception as e:
            print(f"   Warning: Could not build company map from SEC. Using fallback. Error: {e}")
            # Fallback to major companies
            return {
                "Alphabet": "GOOGL", "Alphabet Inc.": "GOOGL", "Google": "GOOGL",
                "Microsoft": "MSFT", "Microsoft Corporation": "MSFT",
                "Tesla": "TSLA", "Tesla, Inc.": "TSLA",
                "Apple": "AAPL", "Apple Inc.": "AAPL",
                "Amazon": "AMZN", "Amazon.com": "AMZN",
                "Meta": "META", "Meta Platforms": "META", "Facebook": "META",
                "NVIDIA": "NVDA", "Nvidia": "NVDA",
                "Berkshire Hathaway": "BRK.B",
                "JPMorgan": "JPM", "JPMorgan Chase": "JPM",
                "Visa": "V", "Mastercard": "MA",
            }

    def normalize_company(self, text):
        """Normalize a company name to its ticker.

        Lookup order: exact text, upper-cased text, then the text with
        corporate suffixes removed (as-is and upper-cased).

        Returns:
            The ticker string, or None when nothing matches or `text` is empty.
        """
        if not text:
            return None

        clean_text = self._strip_corporate_suffix(text)
        # Candidates are ordered from most to least specific.
        for candidate in (text, text.upper(), clean_text, clean_text.upper()):
            if candidate in self.company_to_ticker:
                return self.company_to_ticker[candidate]

        return None

    def normalize_metric(self, text):
        """Return the canonical metric id for `text` (case-insensitive), or None."""
        text_lower = text.lower().strip()
        return self.financial_metrics.get(text_lower)

    def normalize_risk(self, text):
        """Return the canonical risk id for `text` (case-insensitive), or None."""
        text_lower = text.lower().strip()
        return self.risk_types.get(text_lower)

    def detect_metric_pattern(self, text):
        """Return the label of the first value pattern found in `text`, or None."""
        for pattern, metric_type in self.metric_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return metric_type
        return None


In [5]:
## Cell 5: Enhanced Entity Extraction and Normalization

class FinancialEntityExtractor:
    """
    Entity extractor that combines spaCy NER, a transformer NER pipeline and
    regex patterns, then normalizes the results against a knowledge base.
    """

    # Maximum number of characters per slice handed to the transformer pipeline.
    _TRANSFORMER_CHUNK_CHARS = 512

    def __init__(self, knowledge_base):
        """Load the NER models.

        Args:
            knowledge_base: Object providing `risk_types`, `normalize_company`,
                `normalize_metric` and `normalize_risk`.
        """
        self.kb = knowledge_base

        # Load NER models
        print("Loading NER models...")
        self.spacy_nlp = spacy.load("en_core_web_lg")

        # The transformer model is optional: on any load failure the extractor
        # keeps working with spaCy and the regex patterns only.
        try:
            self.transformer_ner = pipeline(
                "token-classification",
                model="dslim/bert-base-NER",
                aggregation_strategy="simple"
            )
            print("   Transformer model loaded successfully")
        except Exception as e:
            print(f"   Warning: Could not load transformer model: {e}")
            self.transformer_ner = None

    def extract_entities_spacy(self, text):
        """Extract entities using SpaCy.

        Returns:
            A list of dicts with 'text', 'label', 'start', 'end' (character
            offsets into `text`) and 'source' set to 'spacy'.
        """
        doc = self.spacy_nlp(text)
        return [
            {
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                'source': 'spacy'
            }
            for ent in doc.ents
        ]

    @staticmethod
    def _iter_chunks(text, max_length):
        """Yield (offset, chunk) slices of `text`, each at most `max_length` long.

        A slice ends just after the last space, newline or tab inside the
        window when there is one, so words are not cut in half at the
        boundary. If the window contains no such character the slice is cut
        at `max_length` exactly.
        """
        start = 0
        total = len(text)
        while start < total:
            end = min(start + max_length, total)
            if end < total:
                window = text[start:end]
                cut = max(window.rfind(ws) for ws in (' ', '\n', '\t'))
                if cut >= 0:
                    # cut + 1 >= 1, so every iteration makes progress.
                    end = start + cut + 1
            yield start, text[start:end]
            start = end

    def extract_entities_transformer(self, text):
        """Extract entities using the transformer model.

        Long texts are processed in slices; reported offsets are relative to
        the full `text`. A slice that makes the pipeline raise is reported and
        skipped so the remaining slices are still processed.

        Returns:
            A list of dicts with 'text', 'label', 'start', 'end', 'score' and
            'source' set to 'transformer'; empty when no model is loaded.
        """
        if not self.transformer_ner:
            return []

        all_entities = []
        for offset, chunk in self._iter_chunks(text, self._TRANSFORMER_CHUNK_CHARS):
            try:
                for ent in self.transformer_ner(chunk):
                    all_entities.append({
                        'text': ent['word'],
                        'label': ent['entity_group'],
                        # `or 0` also covers pipelines that report None offsets.
                        'start': offset + (ent.get('start') or 0),
                        'end': offset + (ent.get('end') or 0),
                        'score': ent.get('score', 1.0),
                        'source': 'transformer'
                    })
            except Exception as e:
                print(f"   Warning: Error processing chunk: {e}")

        return all_entities

    def extract_financial_metrics_contextual(self, text):
        """Extract monetary values, risk mentions and percentage changes via regex.

        Returns:
            A list of dicts with 'text', 'label' (one of 'MONETARY_VALUE',
            'RISK_MENTION', 'PERCENTAGE_CHANGE'), 'start', 'end' and 'source'
            set to 'pattern'.
        """
        entities = []

        # Monetary values are either
        #   1) a dollar amount, with or without thousands separators
        #      (e.g. $185.81, $1,234.50, $1234.56), or
        #   2) a bare number that has thousands separators (e.g. 19,077).
        # Bare numbers without commas are deliberately not matched, which also
        # keeps plain years such as 2025 out.
        monetary_pattern = (
            r'(\$\s*(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'
            r'|(?<![\d,])\d{1,3}(?:,\d{3})+(?:\.\d+)?)'
        )
        for match in re.finditer(monetary_pattern, text):
            entities.append({
                'text': match.group(1),
                'label': 'MONETARY_VALUE',
                'start': match.start(),
                'end': match.end(),
                'source': 'pattern'
            })

        # Risk mentions: an alternation of the knowledge base's risk phrases.
        # Longest phrases first so a longer phrase is preferred over a shorter
        # one starting at the same position. Skipped when there are no
        # phrases, because an empty alternation would match everywhere.
        risk_phrases = sorted(self.kb.risk_types.keys(), key=len, reverse=True)
        if risk_phrases:
            risk_keys = '|'.join(re.escape(k) for k in risk_phrases)
            # Leading \b only: a trailing one would reject plurals ("market risks").
            risk_pattern = rf'\b({risk_keys})'
            for match in re.finditer(risk_pattern, text, re.IGNORECASE):
                entities.append({
                    'text': match.group(1),
                    'label': 'RISK_MENTION',
                    'start': match.start(),
                    'end': match.end(),
                    'source': 'pattern'
                })

        # Percentage changes, e.g. "increased by 12.5%". Single backslashes:
        # in a raw string a doubled backslash matches a literal backslash.
        percentage_pattern = r'(?:increased|decreased|grew|declined|rose|fell)[\s\w]*?by\s*(\d+(?:\.\d+)?%)'
        for match in re.finditer(percentage_pattern, text, re.IGNORECASE):
            entities.append({
                'text': match.group(0),
                'label': 'PERCENTAGE_CHANGE',
                'start': match.start(),
                'end': match.end(),
                'source': 'pattern'
            })

        return entities

    def normalize_and_link_entities(self, entities, text):
        """
        Normalize entities and link them to the knowledge base.

        Args:
            entities: Entity dicts as produced by the extract_* methods.
            text: The analysed text (currently unused; kept for the interface).

        Returns:
            One result dict per distinct (start, end) span, in input order,
            with 'original_text', 'normalized_text', 'entity_type',
            'linked_id', 'confidence', 'source' and 'span'. When several
            entities share a span, only the first one is kept.
        """
        normalized = []
        seen_spans = set()  # Avoid duplicate entities

        for ent in entities:
            # Skip if we've already processed this span
            span = (ent['start'], ent['end'])
            if span in seen_spans:
                continue
            seen_spans.add(span)

            ent_text = ent['text'].strip()
            ent_label = ent['label']

            result = {
                'original_text': ent_text,
                'normalized_text': None,
                'entity_type': ent_label,
                'linked_id': None,
                'confidence': ent.get('score', 1.0),
                'source': ent.get('source', 'unknown'),
                'span': span
            }

            # Normalize based on entity type
            if ent_label in ['ORG', 'ORGANIZATION']:
                # Try to normalize to ticker
                ticker = self.kb.normalize_company(ent_text)
                if ticker:
                    result['normalized_text'] = ticker
                    result['entity_type'] = 'COMPANY'
                    result['linked_id'] = f"TICKER:{ticker}"

            elif ent_label == 'CARDINAL':
                # A cardinal with thousands separators is treated as a monetary
                # value; one without a comma stays CARDINAL.
                if ',' in ent_text:
                    result['entity_type'] = 'MONETARY_VALUE'
                    result['normalized_text'] = ent_text.replace(',', '')

            elif ent_label in ['MONEY', 'MONETARY_VALUE']:
                result['entity_type'] = 'MONETARY_VALUE'
                # Extract the first numeric value, without separators
                numbers = re.findall(r'\d+(?:,\d{3})*(?:\.\d+)?', ent_text.replace('$', ''))
                if numbers:
                    result['normalized_text'] = numbers[0].replace(',', '')

            elif ent_label in ['PERCENT', 'PERCENTAGE', 'PERCENTAGE_CHANGE']:
                result['entity_type'] = 'PERCENTAGE'
                # Extract percentage value
                pct = re.search(r'(\d+(?:\.\d+)?)%', ent_text)
                if pct:
                    result['normalized_text'] = pct.group(1)

            elif ent_label == 'RISK_MENTION':
                # Try to normalize risk type
                risk_type = self.kb.normalize_risk(ent_text)
                if risk_type:
                    result['normalized_text'] = risk_type
                    result['entity_type'] = 'RISK_TYPE'
                    result['linked_id'] = f"RISK:{risk_type}"

            # Regardless of label: text that is exactly a known metric phrase
            # is re-typed as a financial metric.
            metric = self.kb.normalize_metric(ent_text)
            if metric:
                result['entity_type'] = 'FINANCIAL_METRIC'
                result['normalized_text'] = metric
                result['linked_id'] = f"METRIC:{metric}"

            normalized.append(result)

        return normalized

    def extract_and_normalize(self, text):
        """
        Complete pipeline: extract entities from all sources and normalize them.

        Entities are combined in the order spaCy, transformer, patterns, so on
        identical spans the earlier source is the one that is kept.
        """
        print("\nExtracting entities from multiple sources...")

        # Extract from SpaCy
        spacy_entities = self.extract_entities_spacy(text)
        print(f"   SpaCy found {len(spacy_entities)} entities")

        # Extract from Transformer
        transformer_entities = self.extract_entities_transformer(text)
        print(f"   Transformer found {len(transformer_entities)} entities")

        # Extract using patterns
        pattern_entities = self.extract_financial_metrics_contextual(text)
        print(f"   Pattern matching found {len(pattern_entities)} entities")

        # Combine all entities
        all_entities = spacy_entities + transformer_entities + pattern_entities

        # Normalize and link
        print("\nNormalizing and linking entities...")
        normalized = self.normalize_and_link_entities(all_entities, text)

        return normalized


In [6]:
## Cell 6: Load Data and Initialize System

print("="*60)
print("ENHANCED FINANCIAL NER SYSTEM")
print("="*60)

# Initialize knowledge base
print("\nInitializing Financial Knowledge Base...")
kb = FinancialKnowledgeBase()

# Initialize entity extractor
extractor = FinancialEntityExtractor(kb)

# Load 10-Q data
target_ticker = 'GOOGL'  # Change as needed
# target_ticker = 'MSFT'
# target_ticker = 'TSLA'

try:
    # Fetch and parse 10-Q
    html = get_latest_10q(target_ticker)
    report_data = parse_10q(html)

    # Prefer PART II / ITEM 2 when the parser found it; otherwise fall back to
    # the start of everything that was parsed.
    if 'PART II' in report_data and 'ITEM 2' in report_data['PART II']:
        text_to_analyze = report_data['PART II']['ITEM 2']
        analysis_label = "PART II, ITEM 2"
    else:
        all_content = [
            content
            for items in report_data.values()
            for content in items.values()
        ]
        text_to_analyze = "\n\n".join(all_content)[:10000]  # Limit for demo
        analysis_label = "First 10000 characters of full document"

    # An empty parse result would otherwise be analysed silently and yield no
    # entities; raising here routes it to the sample-text fallback below.
    if not text_to_analyze.strip():
        raise ValueError("Parsed filing contained no analyzable text.")

    print(f"\n   Analyzing: {analysis_label}")
    print(f"   Text length: {len(text_to_analyze)} characters")

except Exception as e:
    # Best-effort demo: any download/parse failure falls back to sample text.
    print(f"\nError loading data: {e}")
    print("Using sample text for demonstration...")
    text_to_analyze = """
    Apple Inc. reported total revenues of $394.3 billion for fiscal 2022,
    an increase of 7.8% compared to the prior year. Net income reached $99.8 billion.
    Microsoft Corporation saw its cloud revenue grow by 22%, while operating income
    increased to $83.4 billion. The company faces market risk related to foreign
    currency fluctuations and interest rate risk from its debt obligations.
    Tesla reported delivery of 1.31 million vehicles and faces regulatory risk
    in various markets. The company's liquidity risk is managed through cash reserves
    of $22.2 billion.
    """

ENHANCED FINANCIAL NER SYSTEM

Initializing Financial Knowledge Base...
   Built company mapping with 34397 entries
Loading NER models...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


   Transformer model loaded successfully
1. Fetching CIK for ticker: GOOGL...
   Found CIK: 0001652044
2. Fetching submission history from SEC EDGAR...
   Found latest 10-Q filed on: 2025-10-30
3. Fetching 10-Q document from: https://www.sec.gov/Archives/edgar/data/0001652044/000165204425000091/goog-20250930.htm
   Successfully fetched document.
4. Parsing HTML content...
   Parsing complete.

   Analyzing: PART II, ITEM 2
   Text length: 1608 characters


In [7]:
## Cell 7: Run Enhanced NER Pipeline

# Section banner, emitted in one call (one line per argument).
print("\n" + "="*60, "RUNNING ENHANCED NER PIPELINE", "="*60, sep="\n")

# Run every extractor over the selected text and normalize the combined result.
normalized_entities = extractor.extract_and_normalize(text_to_analyze)

print(f"\n   Total entities extracted and normalized: {len(normalized_entities)}")



RUNNING ENHANCED NER PIPELINE

Extracting entities from multiple sources...
   SpaCy found 27 entities
   Transformer found 9 entities
   Pattern matching found 8 entities

Normalizing and linking entities...

   Total entities extracted and normalized: 34


In [8]:
## Cell 8: Analyze and Display Results

def display_entity_analysis(entities):
    """Print a per-type listing of entities followed by summary counts.

    For each entity type (in sorted order) at most the first ten entities are
    printed with their original text, optional normalized text and linked id,
    confidence and source. The summary reports counts for companies, metrics,
    risk types and monetary values, plus the distinct normalized companies
    and risk types when there are any.

    Args:
        entities: Normalized entity dicts with the keys 'entity_type',
            'original_text', 'normalized_text', 'linked_id', 'confidence'
            and 'source'.
    """
    rule = "=" * 60
    preview_limit = 10  # entities shown per type before eliding the rest

    # Bucket entities by their type, preserving input order within a bucket.
    grouped = defaultdict(list)
    for record in entities:
        grouped[record['entity_type']].append(record)

    print("\n" + rule)
    print("ENTITY ANALYSIS BY TYPE")
    print(rule)

    for type_name in sorted(grouped):
        members = grouped[type_name]
        print(f"\n### {type_name} ({len(members)} found)")
        print("-" * 60)

        for record in members[:preview_limit]:
            print(f"  Original: {record['original_text']}")
            if record['normalized_text']:
                print(f"  Normalized: {record['normalized_text']}")
            if record['linked_id']:
                print(f"  Linked ID: {record['linked_id']}")
            print(f"  Confidence: {record['confidence']:.2f}")
            print(f"  Source: {record['source']}")
            print()

        hidden = len(members) - preview_limit
        if hidden > 0:
            print(f"  ... and {hidden} more")

    # Summary statistics
    print("\n" + rule)
    print("SUMMARY STATISTICS")
    print(rule)

    # .get avoids creating empty buckets for types that never occurred.
    companies = grouped.get('COMPANY', [])
    metrics = grouped.get('FINANCIAL_METRIC', [])
    risks = grouped.get('RISK_TYPE', [])
    monetary = grouped.get('MONETARY_VALUE', [])

    print(f"  Companies identified: {len(companies)}")
    print(f"  Financial metrics: {len(metrics)}")
    print(f"  Risk types: {len(risks)}")
    print(f"  Monetary values: {len(monetary)}")
    print(f"  Total entities: {len(entities)}")

    # Distinct normalized values, shown only when at least one exists.
    unique_companies = {c['normalized_text'] for c in companies if c['normalized_text']}
    if unique_companies:
        print(f"\n  Unique companies (tickers): {', '.join(sorted(unique_companies))}")

    unique_risks = {r['normalized_text'] for r in risks if r['normalized_text']}
    if unique_risks:
        print(f"  Unique risk types: {', '.join(sorted(unique_risks))}")

# Print the per-type listing and summary for the entities produced in Cell 7.
display_entity_analysis(normalized_entities)


ENTITY ANALYSIS BY TYPE

### CARDINAL (9 found)
------------------------------------------------------------
  Original: 2
  Confidence: 1.00
  Source: spacy

  Original: 185.81
  Confidence: 1.00
  Source: spacy

  Original: 203.08
  Confidence: 1.00
  Source: spacy

  Original: 551
  Confidence: 1.00
  Source: spacy

  Original: 241.96
  Confidence: 1.00
  Source: spacy

  Original: 1
  Confidence: 1.00
  Source: spacy

  Original: 10b5-1
  Confidence: 1.00
  Source: spacy

  Original: 11
  Confidence: 1.00
  Source: spacy

  Original: 2
  Confidence: 1.00
  Source: spacy


### COMPANY (1 found)
------------------------------------------------------------
  Original: Alphabet
  Normalized: GOOG
  Linked ID: TICKER:GOOG
  Confidence: 1.00
  Source: spacy


### DATE (6 found)
------------------------------------------------------------
  Original: the quarter ended September 30, 2025
  Confidence: 1.00
  Source: spacy

  Original: May
  Confidence: 1.00
  Source: spacy

  Original: Ju