In [29]:
import logging
import os
import time

import requests
from bs4 import BeautifulSoup
from sec_edgar_api import EdgarClient

In [15]:
# Download the SEC's ticker -> CIK mapping and take a quick look at the reply.
url = "https://www.sec.gov/files/company_tickers.json"
headers = {
    "User-Agent": "Sanjay Srinivasa (ssrin054@ucr.edu)"
}
# A timeout keeps the cell from hanging forever if the server stops responding.
resp = requests.get(url, headers=headers, timeout=30)

print(resp.status_code)  # Check if you get 200
print(resp.text[:200])   # Print first 200 chars of the response

# Surface a 4xx/5xx reply as an HTTP error here, rather than as a confusing
# JSON decoding failure on the next line.
resp.raise_for_status()
data = resp.json()

200
{"0":{"cik_str":320193,"ticker":"AAPL","title":"Apple Inc."},"1":{"cik_str":1045810,"ticker":"NVDA","title":"NVIDIA CORP"},"2":{"cik_str":789019,"ticker":"MSFT","title":"MICROSOFT CORP"},"3":{"cik_str


In [16]:
def get_top_500_companies(limit=500):
    """
    Return (ticker, CIK, company_name) tuples for the first `limit` entries
    in the SEC's `company_tickers.json`.

    Note: The order is just the order the SEC lists them in the JSON,
    not sorted by market cap or anything else.

    Args:
        limit: Maximum number of companies to return (default 500). Values
            below zero are treated as zero.

    Returns:
        A list of (ticker, cik, company_name) tuples. The ticker is
        upper-cased and the CIK is a string zero-padded to 10 digits.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Defined locally so the function does not depend on a `url` global left
    # behind by another cell.
    tickers_url = "https://www.sec.gov/files/company_tickers.json"
    headers = {
        "User-Agent": "Sanjay Srinivasa (ssrin054@ucr.edu)"
    }
    # The timeout prevents an unresponsive server from blocking forever.
    resp = requests.get(tickers_url, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()  # structured as { "0": {...}, "1": {...}, ... }

    # Keep the JSON's own ordering and cut it off at `limit` entries.
    entries = list(data.values())[:max(limit, 0)]
    return [
        (
            info["ticker"].upper(),
            str(info["cik_str"]).zfill(10),  # zero-pad to 10 digits
            info["title"],
        )
        for info in entries
    ]

In [34]:
# Fetch the company list from the SEC and report how many entries came back.
top_500_companies = get_top_500_companies()
print(f"Retrieved {len(top_500_companies)} companies.")

Retrieved 500 companies.


In [31]:
# Filing form types to collect
DESIRED_FORMS = ["10-K", "10-Q"]

# Filing years to collect
TARGET_YEARS = ["2023", "2024"]

# Send progress and error messages to a log file
logging.basicConfig(
    format='%(asctime)s:%(levelname)s:%(message)s',
    level=logging.INFO,
    filename='edgar_download.log',
)

# EDGAR API client; the user_agent string identifies who is making the requests
# (replace it with your own name and e-mail address)
edgar = EdgarClient(user_agent="Sanjay Srinivasa (ssrin054@ucr.edu)")

# Small hard-coded sample of (ticker, zero-padded CIK, company name) tuples.
# NOTE: assigning to `top_500_companies` here replaces any list already stored
# under that name (e.g. one returned by get_top_500_companies()).
top_500_companies = [
    ('AAPL', '0000320193', 'Apple Inc.'),
    ('AMZN', '0001018724', 'Amazon.com, Inc.'),  # 0001067983 is Berkshire Hathaway's CIK
    ('MSFT', '0000789019', 'Microsoft Corporation'),
    # ... add up to 500 companies
]

def extract_text_from_html(file_path):
    """
    Read an HTML filing from disk and return its text content.

    <script> and <style> elements are removed before the text is collected,
    and the text fragments are joined with newlines. If anything fails along
    the way (missing file, decode error, parse error) the error is logged and
    an empty string is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            markup = handle.read()
        parsed = BeautifulSoup(markup, 'html.parser')
        # Drop non-content tags so their bodies don't end up in the text
        for tag in parsed(["script", "style"]):
            tag.decompose()
        # Further cleaning could be applied here before returning
        return parsed.get_text(separator='\n')
    except Exception as e:
        logging.error(f"Error extracting text from {file_path}: {e}")
        return ""

def download_filing_html(cik, accession_number, primary_doc, timeout=30):
    """
    Downloads the primary HTML document of a filing.

    The document is stored as
    filings/<cik>/<cik>_<accession_number>_<primary_doc>; a file already
    present at that path is reused instead of being downloaded again.

    Args:
        cik: CIK string, used in both the URL and the local directory name.
        accession_number: Accession number with dashes; the dashes are
            removed to build the folder part of the URL.
        primary_doc: Name of the filing's primary document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file path if the document is available locally (already present
        or freshly downloaded), else None.
    """
    if not primary_doc:
        # Without a document name the URL would address the filing folder
        # itself, and whatever came back would be cached as if it were the filing.
        print(f"Error: No primary document listed for filing {accession_number} (CIK {cik})")
        logging.error(f"No primary document listed for filing {accession_number} (CIK {cik})")
        return None

    # Remove dashes from the accession number
    folder = accession_number.replace("-", "")
    # Keep path separators in the document name out of the local file name,
    # otherwise the save would target a sub-directory that does not exist.
    safe_doc = primary_doc.replace("/", "_").replace("\\", "_")
    filename = f"{cik}_{accession_number}_{safe_doc}"

    # Define a structured directory to save filings
    directory = os.path.join("filings", cik)
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)

    if os.path.exists(filepath):
        print(f"Already downloaded {filepath}, skipping.")
        logging.info(f"Already downloaded {filepath}, skipping.")
        return filepath

    url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{folder}/{primary_doc}"
    headers = {
        "User-Agent": "Sanjay Srinivasa (ssrin054@ucr.edu)",
    }
    # Write to a side file first and rename when complete, so an interrupted
    # write can never be mistaken for a finished download on the next run.
    # A leftover .part file is simply overwritten by the next attempt.
    partial_path = filepath + ".part"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Could not download from {url}, status code {response.status_code}")
            logging.error(f"Failed to download {url}, status code {response.status_code}")
            return None

        with open(partial_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        os.replace(partial_path, filepath)

        print(f"Saved {filepath}")
        logging.info(f"Successfully downloaded {filepath}")
        return filepath
    except Exception as e:
        # Deliberately best-effort: one failed filing must not stop the batch.
        print(f"Exception while downloading {url}: {e}")
        logging.error(f"Exception while downloading {url}: {e}")
        return None

def process_filing(file_path, metadata):
    """
    Extract a downloaded filing's text, split it into word chunks, and
    print/log a short preview so the result can be checked by eye.

    The 'ticker', 'form' and 'date' keys of `metadata` are read for the
    preview header. Nothing is returned.
    """
    text = extract_text_from_html(file_path)
    if not text:
        print(f"No text extracted from {file_path}")
        logging.warning(f"No text extracted from {file_path}")
        return

    chunk_size = 1000  # You can adjust this as needed
    words = text.split()
    # Consecutive runs of `chunk_size` words, each re-joined with single spaces
    chunks = [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    print(f"Extracted {len(chunks)} chunks from {file_path}")
    logging.info(f"Extracted {len(chunks)} chunks from {file_path}")

    # Whitespace-only text produces no chunks, so there is nothing to preview
    if not chunks:
        return

    # For verification, show the first 200 characters of the first chunk
    print(f"First chunk preview for {metadata['ticker']} {metadata['form']} ({metadata['date']}):")
    print(f"{chunks[0][:200]}...\n")
    logging.info(f"First chunk preview for {file_path}:\n{chunks[0][:200]}...\n")

def download_10k_10q_filings(cik, ticker, company_name, start_year="2023", end_year="2024", forms=DESIRED_FORMS):
    """
    Downloads specified filings for a given CIK within the date range.

    The company's submissions are fetched through the module-level `edgar`
    client and only the entries under filings -> recent are examined. Every
    entry whose form type is in `forms` and whose filing year lies between
    `start_year` and `end_year` (both inclusive) is downloaded and processed.

    Args:
        cik: CIK string identifying the company.
        ticker: Ticker symbol, carried into the preview metadata.
        company_name: Company name (accepted for symmetry with the caller;
            not used here).
        start_year: First filing year to include, as a string or int.
        end_year: Last filing year to include, as a string or int.
        forms: Collection of form type strings to download.

    Raises:
        ValueError: If `start_year` or `end_year` is not a valid integer.
    """
    # Honour the requested range rather than a module-level year list.
    first_year, last_year = int(start_year), int(end_year)

    try:
        data = edgar.get_submissions(cik=cik)
    except Exception as e:
        print(f"Error fetching submissions for CIK {cik}: {e}")
        logging.error(f"Error fetching submissions for CIK {cik}: {e}")
        return

    filings = data.get("filings", {}).get("recent", {})

    # Parallel lists: position i in each one describes the same filing.
    # zip() stops at the shortest list, so a ragged response cannot raise
    # an IndexError.
    filing_rows = zip(
        filings.get("form", []),
        filings.get("accessionNumber", []),
        filings.get("primaryDocument", []),
        filings.get("filingDate", []),
    )

    for form_type, acc_num, pdoc, filing_date in filing_rows:
        if form_type not in forms:
            continue

        year = filing_date.split("-")[0]  # Extract year from date string
        if not year.isdigit() or not first_year <= int(year) <= last_year:
            continue

        file_path = download_filing_html(cik, acc_num, pdoc)
        if file_path:
            metadata = {
                "ticker": ticker,
                "form": form_type,
                "date": filing_date
            }
            process_filing(file_path, metadata)

        # Respect SEC's rate limiting. Pause after failed attempts as well:
        # they also hit the server, and backing off matters most when
        # requests are being rejected.
        time.sleep(0.2)  # 200ms delay


In [32]:
def download_filings_for_multiple_companies(companies, start_year="2023", end_year="2024", forms=DESIRED_FORMS):
    """
    Run the per-company filing download for every entry in `companies`.

    `companies` is a sequence of (ticker, cik, company_name) tuples. A
    progress line is printed and logged for each company before its filings
    are fetched; the year range and form list are passed through unchanged.
    """
    for idx, company in enumerate(companies, start=1):
        ticker, cik, company_name = company

        # Announce which company is being handled (1-based position / total)
        print(f"\n[{idx}/{len(companies)}] Fetching filings for {ticker} ({company_name}) with CIK: {cik}")
        logging.info(f"Fetching filings for {ticker} ({company_name}) with CIK: {cik}")

        download_10k_10q_filings(
            cik,
            ticker,
            company_name,
            start_year=start_year,
            end_year=end_year,
            forms=forms,
        )


In [33]:
# Run the download-and-process pipeline for only the first company in the
# list (the [:1] slice); widen or drop the slice to cover more companies.
download_filings_for_multiple_companies(top_500_companies[:1], start_year="2023", end_year="2024", forms=DESIRED_FORMS)



[1/1] Fetching filings for AAPL (Apple Inc.) with CIK: 0000320193
Already downloaded filings\0000320193\0000320193_0000320193-24-000123_aapl-20240928.htm, skipping.
Extracted 32 chunks from filings\0000320193\0000320193_0000320193-24-000123_aapl-20240928.htm
First chunk preview for AAPL 10-K (2024-11-01):
aapl-20240928 false 2024 FY 0000320193 P1Y P1Y P1Y P1Y http://fasb.org/us-gaap/2024#MarketableSecuritiesCurrent http://fasb.org/us-gaap/2024#MarketableSecuritiesNoncurrent http://fasb.org/us-gaap/2024...

Already downloaded filings\0000320193\0000320193_0000320193-24-000081_aapl-20240629.htm, skipping.
Extracted 11 chunks from filings\0000320193\0000320193_0000320193-24-000081_aapl-20240629.htm
First chunk preview for AAPL 10-Q (2024-08-02):
aapl-20240629 false 2024 Q3 0000320193 --09-28 P1Y P1Y P1Y P1Y http://fasb.org/us-gaap/2023#MarketableSecuritiesCurrent http://fasb.org/us-gaap/2023#MarketableSecuritiesNoncurrent http://fasb.org/us-g...

Already downloaded filings\0000320193\000