# Import Needed Packages and then Pull Tickers from ESG Data

In [None]:
#%pip install sec-edgar-downloader
#%pip install pandas

# STEP 1: DOWNLOAD ALL FILINGS

In [15]:
from sec_edgar_downloader import Downloader
import pandas as pd
from datetime import datetime
import time

EMAIL       = "ecradeck@syr.edu"
ESG_PATH    = "../data/ESG_data.csv"
FILINGS_DIR = "../data/sec-filings"

df = pd.read_csv(ESG_PATH)
print(f"Loaded {len(df)} companies")

dl = Downloader("Syracuse University", EMAIL, download_folder=FILINGS_DIR)

success, failed = [], []

for ticker in df['ticker']:
    ticker_upper = ticker.upper()
    try:
        dl.get("10-K", ticker_upper, before=datetime(2022, 4, 1), limit=1)
        success.append(ticker_upper)
        print(f"✓ {ticker_upper}")
    except Exception as e:
        failed.append(ticker_upper)
        print(f"✗ {ticker_upper}: {e}")
    time.sleep(0.5)

print(f"\nSuccess: {len(success)}/{len(df)}")
print(f"Failed:  {len(failed)}")
print(f"Failed tickers: {failed}")

Loaded 722 companies
✓ DIS
✓ GM
✓ GWW
✓ MHK
✓ LYV
✓ LVS
✓ CLX
✓ AACG
✓ AAL
✓ AAME
✓ AAOI
✓ AAON
✓ AAPL
✓ AATC
✗ AAWW: Ticker 'AAWW' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✓ AACI
✗ AADI: Ticker 'AADI' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✓ ABEO
✓ ABNB
✗ ABIO: Ticker 'ABIO' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✗ ABMD: Ticker 'ABMD' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✓ ABOS
✓ ABSI
✗ ABTX: Ticker 'ABTX' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✓ ABUS
✓ ABVC
✓ ACAD
✗ ACAC: Ticker 'ACAC' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✗ ACCD: Ticker 'ACCD' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✓ ACET
✓ ABCL
✓ ABCB
✗ ACEV: Ticker 'ACEV' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.
✗ ACHL: Ticke

# STEP 2: EXTRACTION FUNCTIONS

In [16]:
from bs4 import BeautifulSoup
import re

FILINGS_BASE = "../data/sec-filings/sec-edgar-filings"

def load_filing_text(file_path):
    """Read a full-submission.txt file and return cleaned plain text."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    try:
        soup = BeautifulSoup(content, 'html.parser')
    except Exception:
        return None
    text = soup.get_text()
    return ' '.join(text.split())


def extract_section(text, pattern, next_pattern):
    """Take the second match to skip TOC, extract until the next section begins."""
    matches = list(re.finditer(pattern, text, re.IGNORECASE))
    if len(matches) >= 2:
        m = matches[1]
    elif len(matches) == 1:
        m = matches[0]
    else:
        return None

    start = m.start()

    next_match = re.search(next_pattern, text[start + 100:], re.IGNORECASE)
    if next_match:
        end = start + 100 + next_match.start()
    else:
        end = len(text)

    return text[start:end]


def extract_business(text):
    return extract_section(text, r'ITEM\s*1\.\s*BUSINESS', r'ITEM\s*1A\.')

def extract_risk_factors(text):
    return extract_section(text, r'ITEM\s*1A\.\s*RISK\s*FACTORS', r'ITEM\s*1B\.')

def extract_mda(text):
    return extract_section(text, r'ITEM\s*7\.\s*MANAGEMENT', r'ITEM\s*7A\.')


def extract_all_sections(file_path):
    """Run all three extractors on a single filing file."""
    text = load_filing_text(file_path)
    if text is None:
        return {'business': None, 'risk_factors': None, 'mda': None}
    return {
        'business':     extract_business(text),
        'risk_factors': extract_risk_factors(text),
        'mda':          extract_mda(text)
    }

print("Extraction functions loaded.")

Extraction functions loaded.


# STEP 3: TEST EXTRACTION ON MSFT

In [17]:
import glob

files = glob.glob(f"{FILINGS_BASE}/MSFT/10-K/*/full-submission.txt")

if not files:
    print("MSFT filing not found — make sure Cell 1 ran successfully.")
else:
    sections = extract_all_sections(files[0])

    print(f"Extracted {sum(v is not None for v in sections.values())}/3 sections\n")

    for name, text in sections.items():
        if text:
            print(f"{name.upper()}: {len(text):,} characters")
            print(f"  First 200: {text[:200]}")
            print(f"  Last 200:  {text[-200:]}")
        else:
            print(f"{name.upper()}: NOT FOUND")
        print()

Extracted 3/3 sections

BUSINESS: 74,757 characters
  First 200: ITEM 1. BUSINESS GENERAL Embracing Our Future Microsoft is a technology company whose mission is to empower every person and every organization on the planet to achieve more. We strive to create local
  Last 200:  stors. We encourage investors, the media, and others interested in Microsoft to review the information we post on the social media channels listed on our Investor Relations website. 21 PART I Item 1A 

RISK_FACTORS: 68,581 characters
  First 200: ITEM 1A. RISK FACTORS Our operations and financial results are subject to various risks and uncertainties, including those described below, that could adversely affect our business, financial conditio
  Last 200:  loyment-related laws are interpreted and applied to our workforce practices may result in increased operating costs and less flexibility in how we meet our workforce needs. 35 PART I Item 1B, 2, 3, 4 

MDA: 55,571 characters
  First 200: ITEM 7. MANAGEMENT’S D

# STEP 4: FULL EXTRACTION LOOP

In [None]:
import glob
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

OUTPUT_DIR = "../data/extracted_sections"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def process_ticker(ticker):
    files = glob.glob(f"{FILINGS_BASE}/{ticker}/10-K/*/full-submission.txt")
    if not files:
        return ticker, 'no_file', 0
    sections = extract_all_sections(files[0])
    n = sum(v is not None for v in sections.values())
    if n == 0:
        return ticker, 'no_sections', 0
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.json")
    with open(out_path, 'w') as f:
        json.dump({"ticker": ticker, "sections": sections}, f)
    return ticker, 'ok', n

extract_success, extract_partial, extract_failed = [], [], []

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(process_ticker, t): t for t in success}
    for future in as_completed(futures):
        ticker, status, n = future.result()
        if status in ('no_file', 'no_sections'):
            extract_failed.append(ticker)
            print(f"✗ {ticker}: {status}")
        elif n < 3:
            extract_partial.append(ticker)
            print(f"⚠ {ticker}: {n}/3 sections")
        else:
            extract_success.append(ticker)
            print(f"✓ {ticker}: 3/3 sections")

print(f"\n{'='*40}")
print(f"Full extractions (3/3): {len(extract_success)}")
print(f"Partial extractions:    {len(extract_partial)}")
print(f"Failed:                 {len(extract_failed)}")
print(f"\nPartial tickers: {extract_partial}")
print(f"Failed tickers:  {extract_failed}")

✗ AACG: no_file
✗ GWW: no_sections
✓ MHK: 3/3 sections
✗ LVS: no_sections
✓ DIS: 3/3 sections
✓ GM: 3/3 sections
✓ AAPL: 3/3 sections
✗ AACI: no_file
✓ AATC: 3/3 sections
✓ AAON: 3/3 sections
✓ ABEO: 3/3 sections
✓ ABOS: 3/3 sections
✓ ABNB: 3/3 sections
✓ ABSI: 3/3 sections
✓ ABUS: 3/3 sections
✓ ABVC: 3/3 sections
✓ ACAD: 3/3 sections
✓ ABCL: 3/3 sections
✓ ACET: 3/3 sections
✓ ACIW: 3/3 sections
✓ ABCB: 3/3 sections
✗ ACIU: no_file
✓ AAOI: 3/3 sections
✓ ACMR: 3/3 sections
✗ AAME: no_sections
✓ ACRS: 3/3 sections
✓ ACLS: 3/3 sections
✓ ACHV: 3/3 sections
⚠ CLX: 2/3 sections
✗ ADAG: no_file
✓ ACTG: 3/3 sections
✓ ACVA: 3/3 sections
✓ ACT: 3/3 sections
✓ ACXP: 3/3 sections
✓ ADBE: 3/3 sections
✓ ADIL: 3/3 sections
✓ ADI: 3/3 sections
✓ ADPT: 3/3 sections
✓ ADSK: 3/3 sections
✓ ACGL: 3/3 sections
✗ LYV: no_sections
✓ ADTX: 3/3 sections
✓ ADP: 3/3 sections
✗ AEHL: no_file
✓ AAL: 3/3 sections
✓ AEHR: 3/3 sections
✓ ADV: 3/3 sections
✓ AEIS: 3/3 sections
✓ AEI: 3/3 sections
✓ ADMA: 3/3 se

: 

# STEP 5: VERIFICATION

In [None]:
import json
import os
import pandas as pd

OUTPUT_DIR = "../data/extracted_sections"

records = []

for fname in os.listdir(OUTPUT_DIR):
    if not fname.endswith(".json"):
        continue

    with open(os.path.join(OUTPUT_DIR, fname)) as f:
        data = json.load(f)

    ticker   = data['ticker']
    sections = data['sections']

    records.append({
        'ticker':         ticker,
        'business_chars': len(sections['business'])     if sections['business']     else 0,
        'risk_chars':     len(sections['risk_factors'])  if sections['risk_factors'] else 0,
        'mda_chars':      len(sections['mda'])           if sections['mda']          else 0,
        'sections_found': sum(v is not None for v in sections.values())
    })

audit_df = pd.DataFrame(records).sort_values('ticker').reset_index(drop=True)

# ── Summary stats ─────────────────────────────────────────────────────────────
print("=== Section Size Summary (characters) ===")
for col, label in [('business_chars', 'BUSINESS'), ('risk_chars', 'RISK FACTORS'), ('mda_chars', 'MD&A')]:
    s = audit_df[col][audit_df[col] > 0]
    print(f"\n{label}")
    print(f"  Mean:   {s.mean():>10,.0f}")
    print(f"  Median: {s.median():>10,.0f}")
    print(f"  Min:    {s.min():>10,.0f}")
    print(f"  Max:    {s.max():>10,.0f}")

# ── Coverage ──────────────────────────────────────────────────────────────────
print(f"\n=== Extraction Coverage ===")
for n in [3, 2, 1, 0]:
    count = (audit_df['sections_found'] == n).sum()
    print(f"  {n}/3 sections found: {count} companies")

# ── Flag partial extractions ──────────────────────────────────────────────────
partial = audit_df[audit_df['sections_found'] < 3]
if not partial.empty:
    print(f"\n=== Partial Extractions ({len(partial)} companies) ===")
    print(partial[['ticker', 'sections_found', 'business_chars', 'risk_chars', 'mda_chars']].to_string(index=False))

audit_df