# Import Needed Packages and then Pull Tickers from ESG Data

In [None]:
#%pip install sec-edgar-downloader

**Testing download on single ticker (MSFT)**

In [1]:
from sec_edgar_downloader import Downloader
from datetime import datetime

# Smoke test: fetch one 10-K for a single ticker before running the full list.

# Initialize downloader (arguments: requester name, contact email)
dl = Downloader("Syracuse University", "ecradeck@syr.edu")

# Try Microsoft with lowercase ticker converted to uppercase
ticker = "msft"  # From your CSV
ticker_upper = ticker.upper()  # Convert to "MSFT"

try:
    # get() returns how many filings were saved; keep the count so that
    # "nothing matched" is not mistaken for a successful download.
    num_downloaded = dl.get("10-K", ticker_upper,
                            before=datetime(2022, 4, 1),
                            limit=1)
except Exception as e:
    print(f"✗ Failed to download: {e}")
else:
    if num_downloaded:
        print(f"✓ Successfully downloaded {ticker_upper} 10-K")
        # Build the hint from the ticker so it stays right for other symbols
        print(f"Check 'sec-edgar-filings/{ticker_upper}/10-K/' folder")
    else:
        print(f"✗ No 10-K found for {ticker_upper} filed before 2022-04-01")

✓ Successfully downloaded MSFT 10-K
Check 'sec-edgar-filings/MSFT/10-K/' folder


**Printing Table of Contents to find relevant ESG content/sections**

In [14]:
from bs4 import BeautifulSoup
import re

# Raw EDGAR submission file for the MSFT 10-K inspected in this cell
file_path = "sec-edgar-filings/MSFT/10-K/0001564590-21-039151/full-submission.txt"

# errors='ignore' drops undecodable bytes instead of raising
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    content = f.read()

# Strip markup, then collapse every run of whitespace into a single space
soup = BeautifulSoup(content, 'html.parser')
text = ' '.join(soup.get_text().split())

# Only search the first 100,000 characters for the table of contents
early_text = text[:100000]

# The TOC is located by either its title or the "PART I ... Item" lead-in
toc_start = re.search(r'(TABLE OF CONTENTS|PART I\s+Item)', early_text, re.IGNORECASE)

if toc_start is None:
    # No recognisable TOC marker: fall back to dumping the start of the filing
    print("No clear TOC found, showing first 20,000 characters:")
    print(text[:20000])
else:
    # Show a 10,000-character window beginning at the marker
    start_pos = toc_start.start()
    toc_section = early_text[start_pos:start_pos+10000]

    print("TABLE OF CONTENTS", toc_section, sep="\n")

TABLE OF CONTENTS
PART I Item 1. Business 3 Information about our Executive Officers 20 Item 1A. Risk Factors 22 Item 1B. Unresolved Staff Comments 36 Item 2. Properties 36 Item 3. Legal Proceedings 36 Item 4. Mine Safety Disclosures 36 PART II Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters, and Issuer Purchases of Equity Securities 37 Item 6. [Reserved] 38 Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations 39 Item 7A. Quantitative and Qualitative Disclosures about Market Risk 56 Item 8. Financial Statements and Supplementary Data 57 Item 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure 99 Item 9A. Controls and Procedures 99 Report of Management on Internal Control over Financial Reporting 99 Report of Independent Registered Public Accounting Firm 100 Item 9B. Other Information 101 PART III Item 10. Directors, Executive Officers and Corporate Governance 101 Item 11. Executive C

**Verify correct sections are located and ready for extraction**

In [13]:
from bs4 import BeautifulSoup
import re

# Load the filing and flatten it to whitespace-normalised plain text
file_path = "sec-edgar-filings/MSFT/10-K/0001564590-21-039151/full-submission.txt"

# errors='ignore' drops undecodable bytes instead of raising
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
    content = f.read()

soup = BeautifulSoup(content, 'html.parser')
text = ' '.join(soup.get_text().split())

# Section name -> 30,000-character slice starting at the section heading
sections = {}

# Item 1 (Business): the heading must be followed by "GENERAL"
business_match = re.search(r'ITEM\s*1\.\s*BUSINESS\s+GENERAL', text, re.IGNORECASE)
if business_match is not None:
    business_start = business_match.start()
    sections['business'] = text[business_start:business_start+30000]

# Item 1A (Risk Factors): the heading must be followed by two letters.
# With IGNORECASE the [A-Z][a-z] classes match letters of either case.
risk_match = re.search(r'ITEM\s*1A\.\s*RISK\s*FACTORS\s+[A-Z][a-z]', text, re.IGNORECASE)
if risk_match is not None:
    risk_start = risk_match.start()
    sections['risk_factors'] = text[risk_start:risk_start+30000]

# Item 7 (MD&A): take the first candidate that does not look like a TOC entry.
# NOTE(review): the "Item 8" test below is case-sensitive although the regex
# is not, and any candidate mentioning "Item 8" in its first 500 characters is
# skipped — confirm the kept slice is the real MD&A section.
mda_matches = list(re.finditer(r'ITEM\s*7\.\s*MANAGEMENT.*?DISCUSSION.*?ANALYSIS.*?OVERVIEW', 
                                text, re.IGNORECASE | re.DOTALL))
for match in mda_matches:
    mda_text = text[match.start():match.start()+30000]
    if "Item 8" in mda_text[:500]:  # Skip TOC
        continue
    sections['mda'] = mda_text
    break

# Report how many of the three sections were found, with a preview of each
print(f"Extracted {len(sections)}/3 sections\n")

for section_name in sections:
    section_text = sections[section_name]
    print(f"{section_name.upper()}: {len(section_text):,} characters")
    print(section_text[:1000])  # Show first 1000 characters
    print("\n[...]\n")

Extracted 3/3 sections

BUSINESS: 30,000 characters
ITEM 1. BUSINESS GENERAL Embracing Our Future Microsoft is a technology company whose mission is to empower every person and every organization on the planet to achieve more. We strive to create local opportunity, growth, and impact in every country around the world. Our platforms and tools help drive small business productivity, large business competitiveness, and public-sector efficiency. They also support new startups, improve educational and health outcomes, and empower human ingenuity. We bring technology and products together into experiences and solutions that unlock value for our customers. Our ecosystem of customers and partners has stepped up to help people and organizations in every country use technology to be resilient and transform during the most trying of circumstances. Amid rapid change we’ve witnessed technology empower telehealth, remote manufacturing, and new ways of working from home and serving customers. These c

**Pulling tickers from the ESG data, then repeating the 10-K download for each ticker**

In [15]:
#%pip install pandas

Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m48.4 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hDownloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m59.3 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hInstalling collected packages: numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed numpy-2.4.2 pandas-3.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is av

In [12]:
from sec_edgar_downloader import Downloader
import pandas as pd
from datetime import datetime

# Download the most recent 10-K filed before the cutoff for every ticker in
# the ESG dataset, recording which tickers succeeded and which failed.

# Configuration
EMAIL = "ecradeck@syr.edu"
ESG_DATA_PATH = "../data/ESG_data.csv"
FILED_BEFORE = datetime(2022, 4, 1)  # only filings before this date are considered

# Load data
df = pd.read_csv(ESG_DATA_PATH)
print(f"Loaded {len(df)} companies")

# Initialize downloader
dl = Downloader("Syracuse University", EMAIL, download_folder="../data/sec-filings")

# Download filings
success = []  # tickers with at least one filing saved
failed = []   # tickers that raised or returned no filing

for ticker in df['ticker']:
    ticker_upper = ticker.upper()
    try:
        # get() returns the number of filings saved for this ticker
        num_downloaded = dl.get("10-K", ticker_upper,
                                before=FILED_BEFORE,
                                limit=1)
    except Exception as e:
        # Best-effort loop: record the failure and move on to the next ticker
        failed.append(ticker_upper)
        print(f"✗ {ticker_upper}: {e}")
        continue

    if num_downloaded:
        success.append(ticker_upper)
        print(f"✓ {ticker_upper}")
    else:
        # No exception, but nothing matched the form type / date filter
        failed.append(ticker_upper)
        print(f"✗ {ticker_upper}: no 10-K found before {FILED_BEFORE:%Y-%m-%d}")

# Summary
print(f"\nSuccess: {len(success)}/{len(df)}")
print(f"Failed: {len(failed)}")

ModuleNotFoundError: No module named 'pandas'

**Extraction functions: Business, Risk Factors, MD&A**

def extract_business_section(text, max_chars=30000):
    """Extract the ITEM 1 BUSINESS section from 10-K text.

    The heading is matched case-insensitively and must be followed by the
    word "GENERAL". A table-of-contents entry such as "Item 1. Business 3"
    therefore does not match, and neither does a filing whose Business
    section does not open with "GENERAL".

    Args:
        text: Plain text of the filing (markup removed, whitespace collapsed).
        max_chars: Maximum number of characters to return, counted from the
            start of the heading. Defaults to 30000.

    Returns:
        The slice of ``text`` beginning at the heading, at most ``max_chars``
        long, or None when ``text`` is empty/None or the heading is not found.
    """
    # Guard against None or empty input so re.search never sees a non-string
    if not text:
        return None

    pattern = r'ITEM\s*1\.\s*BUSINESS\s+GENERAL'
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None

    # Fixed-size window from the heading; there is no end-of-section detection
    start = match.start()
    return text[start:start + max_chars]

