In [1]:
import csv
import time
import requests
import json
import os
import re
import random
import xml.etree.ElementTree as ET
from datetime import datetime
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("sec_downloader.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Create output directory if it doesn't exist
output_dir = "8k_filings_json"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Configuration
TARGET_YEAR = "2024"       # Only download filings from this year
MIN_DELAY = 1.0            # Minimum delay between requests in seconds
MAX_DELAY = 3.0            # Maximum delay between requests in seconds
MAX_RETRIES = 3            # Maximum number of retries for a request

# ATOM namespace for parsing XML response
ATOM_NS = {'atom': 'http://www.w3.org/2005/Atom'}

# Configure request settings with user-agent
def get_headers(host="www.sec.gov"):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    email = "he431641@ucf.edu"  # REPLACE WITH YOUR ACTUAL EMAIL
    
    return {
        "User-Agent": f"{email} {user_agent}",
        "Accept": "application/xml, text/html, application/xhtml+xml, application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Host": host,
        "Connection": "keep-alive"
    }

def download_sp500_list():
    """Download current S&P 500 companies list from a reliable source"""
    logger.info("Downloading S&P 500 companies list...")
    
    try:
        # Using Wikipedia as a source for S&P 500 constituents
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        response = requests.get(url, headers=get_headers(host="en.wikipedia.org"))
        
        if response.status_code != 200:
            logger.error(f"Failed to download S&P 500 list: HTTP {response.status_code}")
            return set()
        
        # Extract tickers from the Wikipedia table
        # Look for the first table with S&P 500 components
        table_pattern = r'<table[^>]*>.*?<th[^>]*>Symbol</th>.*?</table>'
        table_match = re.search(table_pattern, response.text, re.DOTALL)
        
        if not table_match:
            logger.error("Could not find S&P 500 table in Wikipedia page")
            return set()
        
        table_html = table_match.group(0)
        
        # Extract tickers from table rows
        ticker_pattern = r'<tr[^>]*>.*?<td[^>]*>(.*?)</td>'
        tickers = re.findall(ticker_pattern, table_html)
        
        # Clean up tickers (they will be in the first column of each row)
        sp500_tickers = set()
        for i, ticker in enumerate(tickers):
            # Skip header row
            if i == 0:
                continue
                
            # Clean ticker (remove HTML tags and whitespace)
            clean_ticker = re.sub(r'<[^>]*>', '', ticker).strip().upper()
            if clean_ticker:
                sp500_tickers.add(clean_ticker)
        
        logger.info(f"Downloaded {len(sp500_tickers)} S&P 500 tickers")
        return sp500_tickers
        
    except Exception as e:
        logger.error(f"Error downloading S&P 500 list: {str(e)}")
        return set()

def load_companies():
    """Load company information from sec_companies2.csv"""
    companies = []
    
    try:
        with open('sec_companies2.csv', 'r') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Extract CIK and other information
                cik = row.get('CIK', '')
                ticker = row.get('Ticker', '').upper()
                name = row.get('Company Name', '')
                
                # Get the count of 8-Ks to download, default to 4 if not specified
                try:
                    count = int(row.get('Count', '4'))
                except (ValueError, TypeError):
                    count = 4
                
                if cik and (ticker or name):  # Ensure we have at least CIK and either ticker or name
                    # Clean up CIK (remove leading zeros for the URL)
                    clean_cik = str(cik).lstrip('0')
                    
                    companies.append({
                        'cik': clean_cik, 
                        'ticker': ticker,
                        'name': name,
                        'count': count,
                        'is_sp500': False  # Will be updated later
                    })
        
        logger.info(f"Loaded {len(companies)} companies from sec_companies2.csv")
        return companies
    
    except FileNotFoundError:
        logger.error(f"File not found: sec_companies2.csv")
        # Create a sample CSV file format to help the user
        with open('sec_companies2_sample.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['CIK', 'Ticker', 'Company Name', 'Count'])
            writer.writerow(['1318605', 'TSLA', 'Tesla Inc', '4'])
            writer.writerow(['320193', 'AAPL', 'Apple Inc', '4'])
        logger.info(f"Created a sample file 'sec_companies2_sample.csv' for reference")
        return []
    except Exception as e:
        logger.error(f"Error loading companies: {str(e)}")
        return []

def sort_companies_with_sp500_priority(companies):
    """Sort companies list so S&P 500 companies come first"""
    # Download S&P 500 tickers
    sp500_tickers = download_sp500_list()
    
    # If we couldn't get the S&P 500 list, use a fallback list of major companies
    if not sp500_tickers:
        logger.warning("Using fallback S&P 500 list (top companies only)")
        fallback_sp500 = {
            "AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "GOOG", "META", "BRK.B", 
            "LLY", "TSLA", "V", "UNH", "JPM", "XOM", "AVGO", "MA", "PG", "HD", 
            "COST", "MRK", "CVX", "ABBV", "KO", "PEP", "WMT", "ACN", "ADBE", 
            "MCD", "BAC", "CRM", "TXN", "LIN", "AMD", "TMO", "CSCO", "ABT", 
            "DHR", "CMCSA", "NKE", "INTC", "NFLX", "VZ", "PM", "WFC", "DIS", 
            "COP", "INTU", "IBM", "ORCL", "QCOM", "SPGI", "CAT", "GE", "HON", 
            "AMGN", "LOW", "AXP", "BA", "DE", "UPS", "RTX", "MS", "TJX", "GS", 
            "BLK", "PLD", "AMAT", "SBUX", "MDT", "CVS", "GILD", "SYK", "AMT", 
            "MDLZ", "ELV", "C", "ADI", "BKNG", "ADP", "ISRG", "MMC", "LRCX", 
            "TGT", "REGN", "VRTX", "CI", "SO", "PGR", "ZTS", "BSX", "CB", "MO", 
            "BDX", "DUK", "ETN", "SCHW", "SLB"
        }
        sp500_tickers = fallback_sp500
    
    # Mark companies that are in the S&P 500
    for company in companies:
        if company['ticker'] in sp500_tickers:
            company['is_sp500'] = True
    
    # Sort companies - S&P 500 first, then others
    sp500_companies = [c for c in companies if c['is_sp500']]
    other_companies = [c for c in companies if not c['is_sp500']]
    
    # Log the results
    logger.info(f"Found {len(sp500_companies)} S&P 500 companies in the input file")
    logger.info(f"Found {len(other_companies)} non-S&P 500 companies in the input file")
    
    # Return the combined sorted list
    return sp500_companies + other_companies

def get_processed_companies():
    """Get already processed companies to avoid duplicates"""
    processed = set()
    if os.path.exists('processed_companies.txt'):
        with open('processed_companies.txt', 'r') as f:
            for line in f:
                processed.add(line.strip())
    return processed

def save_processed_company(cik):
    """Save a company as processed"""
    with open('processed_companies.txt', 'a') as f:
        f.write(f"{cik}\n")

def extract_accession_from_url(url):
    """Extract accession number from URL patterns common in SEC filings"""
    # Pattern 1: /data/CIK/ACCESSION/
    pattern1 = r'/data/\d+/(\d{10}-\d{2}-\d{6})/'
    match = re.search(pattern1, url)
    if match:
        return match.group(1)
    
    # Pattern 2: /ACCESSION-index.htm
    pattern2 = r'/(\d{10}-\d{2}-\d{6})-index\.html?'
    match = re.search(pattern2, url)
    if match:
        return match.group(1)
    
    # Pattern 3: Look for /XXXXXXXXXX/XXXXXXXXXX/XXXXXXXXXX/ pattern
    # This is typically /CIK/ACCESSION/ but without dashes
    pattern3 = r'/(\d{10})/(\d{10})/'
    match = re.search(pattern3, url)
    if match:
        # Format as standard accession with dashes
        acc = match.group(2)
        if len(acc) == 10:
            # Probably not an accession
            return None
            
        # Try to format as accession XXXXXXXXXX-XX-XXXXXX
        if len(acc) >= 12:
            return f"{acc[0:10]}-{acc[10:12]}-{acc[12:]}"
    
    # Pattern 4: /ACCESSION-index in the URL path
    pattern4 = r'/(\d{10}\d{2}\d{6})-index'
    match = re.search(pattern4, url)
    if match:
        acc = match.group(1)
        if len(acc) >= 18:
            return f"{acc[0:10]}-{acc[10:12]}-{acc[12:18]}"
    
    return None

def get_filings_atom(company):
    """Get 8-K filings for a company using the SEC's Atom feed"""
    cik = company['cik']
    ticker = company['ticker']
    company_name = company['name']
    count = company['count']
    is_sp500 = company['is_sp500']
    
    search_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=8-K&count={count}&output=atom"
    
    sp500_marker = "[S&P 500]" if is_sp500 else ""
    logger.info(f"Requesting ATOM feed for {ticker or company_name} {sp500_marker} (CIK {cik})...")
    
    # Random delay before request
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
    
    try:
        response = requests.get(search_url, headers=get_headers())
        
        # Handle HTTP status codes
        if response.status_code != 200:
            logger.error(f"Error: HTTP {response.status_code} for {ticker or company_name}")
            return []
        
        # Save raw response for debugging if needed
        debug_dir = "debug_xml"
        if not os.path.exists(debug_dir):
            os.makedirs(debug_dir)
        with open(f"{debug_dir}/{cik}_atom_response.xml", "w", encoding="utf-8") as f:
            f.write(response.text)
        
        # Check if the response is XML
        if "<?xml" not in response.text[:100]:
            logger.warning(f"Response for {ticker or company_name} does not appear to be XML")
            return []
        
        # Parse XML
        try:
            root = ET.fromstring(response.content)
            
            # Find all entries (filings)
            entries = root.findall('./atom:entry', ATOM_NS)
            
            # Process entries
            filings = []
            for i, entry in enumerate(entries):
                # Extract filing information using proper namespace
                title_elem = entry.find('./atom:title', ATOM_NS)
                updated_elem = entry.find('./atom:updated', ATOM_NS)
                link_elem = entry.find('./atom:link', ATOM_NS)
                summary_elem = entry.find('./atom:summary', ATOM_NS)
                
                # Extract text from elements
                title_text = title_elem.text if title_elem is not None else "No Title"
                updated_text = updated_elem.text if updated_elem is not None else ""
                
                # Format the date (remove time part)
                filing_date = updated_text.split('T')[0] if updated_text else "No Date"
                
                # Skip filings that are not from the target year
                if not filing_date.startswith(TARGET_YEAR):
                    continue
                
                # Get link from attributes
                link_href = link_elem.get('href') if link_elem is not None else ""
                
                # Extract description from summary
                summary_text = summary_elem.text if summary_elem is not None else ""
                
                # Parse the filing date from summary if not available from updated
                if filing_date == "No Date" and summary_text:
                    date_match = re.search(r'<b>Filed:</b>\s+(\d{4}-\d{2}-\d{2})', summary_text)
                    if date_match:
                        filing_date = date_match.group(1)
                        # Skip if not from target year
                        if not filing_date.startswith(TARGET_YEAR):
                            continue
                
                # Extract the accession number from several possible sources
                accession_number = None
                
                # Method 1: From the summary text
                if summary_text:
                    accession_match = re.search(r'Accession Number:\s+(\d{10}-\d{2}-\d{6})', summary_text)
                    if accession_match:
                        accession_number = accession_match.group(1)
                
                # Method 2: From the title text
                if not accession_number and title_text:
                    title_match = re.search(r'(\d{10}-\d{2}-\d{6})', title_text)
                    if title_match:
                        accession_number = title_match.group(1)
                
                # Method 3: From the link URL
                if not accession_number and link_href:
                    accession_number = extract_accession_from_url(link_href)
                
                # Create a filing ID even if we don't have an accession number
                filing_id = accession_number if accession_number else f"filing_{i}"
                
                if link_href and filing_date != "No Date":
                    filings.append({
                        'title': title_text,
                        'date': filing_date,
                        'url': f"https://www.sec.gov{link_href}" if link_href.startswith('/') else link_href,
                        'summary': summary_text,
                        'accession_number': accession_number,
                        'filing_id': filing_id,
                        'cik': cik,
                        'ticker': ticker,
                        'company_name': company_name,
                        'is_sp500': is_sp500
                    })
            
            # Sort filings by date (newer first)
            filings.sort(key=lambda x: x['date'], reverse=True)
            
            sp500_marker = "[S&P 500]" if is_sp500 else ""
            logger.info(f"Found {len(filings)} 8-K filings from {TARGET_YEAR} for {ticker or company_name} {sp500_marker}")
            return filings
            
        except ET.ParseError as pe:
            logger.error(f"XML parsing error for {ticker or company_name}: {str(pe)}")
            return []
            
    except Exception as e:
        logger.error(f"Error with {ticker or company_name}: {str(e)}")
        return []

def get_filing_details(filing):
    """Get the filing details from the SEC website"""
    filing_url = filing['url']
    cik = filing['cik']
    ticker = filing['ticker']
    company_name = filing['company_name']
    filing_date = filing['date']
    filing_id = filing.get('filing_id', 'unknown')
    accession_number = filing.get('accession_number')
    is_sp500 = filing.get('is_sp500', False)
    
    # Debug information
    sp500_marker = "[S&P 500]" if is_sp500 else ""
    logger.info(f"Processing filing data for {filing_date} - {ticker or company_name} {sp500_marker}")
    logger.info(f"  URL: {filing_url}")
    logger.info(f"  Accession Number: {accession_number}")
    
    # Try to extract accession number from URL if not already available
    if not accession_number:
        accession_number = extract_accession_from_url(filing_url)
        if accession_number:
            logger.info(f"  Extracted accession number from URL: {accession_number}")
    
    # Generate a unique, clean identifier for this filing (for the filename)
    if accession_number:
        # Use accession number if available, removing dashes
        clean_id = accession_number.replace('-', '')
    else:
        # Use a generated ID if no accession number
        clean_id = filing_id.replace(' ', '_').replace('-', '_')
    
    # Random delay before request
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
    
    try:
        # First get the filing detail page
        detail_response = requests.get(filing_url, headers=get_headers())
        
        if detail_response.status_code != 200:
            logger.error(f"Failed to access filing detail page: HTTP {detail_response.status_code}")
            return None, None
        
        # Save the detail page for debugging
        debug_dir = "debug_html"
        if not os.path.exists(debug_dir):
            os.makedirs(debug_dir)
        with open(f"{debug_dir}/{ticker or cik}_{clean_id}_detail.html", "w", encoding="utf-8") as f:
            f.write(detail_response.text)
        
        # Try to extract accession number from detail page if not already available
        if not accession_number:
            detail_text = detail_response.text
            
            # Method 1: Look for accession number in the text
            acc_match = re.search(r'Accession Number:\s*(\d{10}-\d{2}-\d{6})', detail_text)
            if acc_match:
                accession_number = acc_match.group(1)
                clean_id = accession_number.replace('-', '')
                logger.info(f"  Found accession number from detail page: {accession_number}")
            
            # Method 2: Look for it in the URL of document links
            if not accession_number:
                doc_links = re.findall(r'href="([^"]+/\d{10}-\d{2}-\d{6}[^"]+)"', detail_text)
                if doc_links:
                    for link in doc_links:
                        extracted = extract_accession_from_url(link)
                        if extracted:
                            accession_number = extracted
                            clean_id = accession_number.replace('-', '')
                            logger.info(f"  Found accession number from document link: {accession_number}")
                            break
            
            # Method 3: Look for accession number pattern in the page content
            if not accession_number:
                # Look for any 10-2-6 digit pattern that might be an accession number
                acc_pattern = re.findall(r'(\d{10}-\d{2}-\d{6})', detail_text)
                if acc_pattern:
                    accession_number = acc_pattern[0]
                    clean_id = accession_number.replace('-', '')
                    logger.info(f"  Found accession number pattern in detail page: {accession_number}")
        
        # Try to get the filing text in different ways
        submissions = []
        
        # Method 1: Try the standard .txt submission file if we have an accession number
        if accession_number:
            clean_accession = accession_number.replace('-', '')
            txt_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{clean_accession}/{accession_number}.txt"
            
            time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
            
            txt_response = requests.get(txt_url, headers=get_headers())
            
            if txt_response.status_code == 200:
                logger.info(f"  Successfully retrieved full text submission")
                submissions.append({
                    "type": "full_submission",
                    "url": txt_url,
                    "text": txt_response.text
                })
        
        # Method 2: Try to get individual document HTML files from the detail page
        document_links = extract_document_links(detail_response.text, cik)
        
        for doc in document_links:
            doc_url = doc['url']
            doc_type = doc['text']
            
            if '8-K' in doc_type or 'EX-' in doc_type.upper():
                time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
                
                try:
                    doc_response = requests.get(doc_url, headers=get_headers())
                    
                    if doc_response.status_code == 200:
                        logger.info(f"  Retrieved document: {doc_type}")
                        submissions.append({
                            "type": doc_type,
                            "url": doc_url,
                            "text": doc_response.text
                        })
                except Exception as e:
                    logger.warning(f"  Error getting document {doc_type}: {str(e)}")
        
        # Create our response structure
        if submissions:
            # Create a structured JSON with the filing data and all documents
            filing_data = {
                "companyName": company_name,
                "cik": cik,
                "ticker": ticker,
                "filingDate": filing_date,
                "formType": "8-K",
                "accessionNumber": accession_number if accession_number else "unknown",
                "filingUrl": filing_url,
                "isSP500": is_sp500,
                "documents": []
            }
            
            # Process each submission
            for sub in submissions:
                # Extract document type
                doc_type = sub.get("type", "UNKNOWN")
                
                # If this is the full submission, extract the individual document sections
                if doc_type == "full_submission":
                    sections = extract_sections(sub.get("text", ""))
                    for section in sections:
                        filing_data["documents"].append({
                            "type": section.get("type", "UNKNOWN"),
                            "description": section.get("description", ""),
                            "text": section.get("text", ""),
                            "url": sub.get("url", "")
                        })
                else:
                    # Add this as a separate document
                    filing_data["documents"].append({
                        "type": doc_type,
                        "description": "",
                        "text": sub.get("text", ""),
                        "url": sub.get("url", "")
                    })
            
            return filing_data, clean_id
        else:
            # If we couldn't get any documents, use the detail page
            logger.warning(f"Could not retrieve any documents. Using detail page only.")
            
            filing_data = {
                "companyName": company_name,
                "cik": cik,
                "ticker": ticker,
                "filingDate": filing_date,
                "formType": "8-K",
                "accessionNumber": accession_number if accession_number else "unknown",
                "filingUrl": filing_url,
                "isSP500": is_sp500,
                "detailPage": detail_response.text,
                "documentLinks": document_links
            }
            
            return filing_data, clean_id
        
    except Exception as e:
        logger.error(f"Error getting filing details: {str(e)}")
        return None, None

def extract_document_links(detail_page, cik):
    """Extract document links from the detail page"""
    links = []
    
    # Pattern 1: Look for document table rows with links
    doc_pattern = re.compile(r'<tr[^>]*>.*?<td[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>([^<]+)</a>.*?</tr>', re.DOTALL)
    matches = doc_pattern.findall(detail_page)
    
    for href, text in matches:
        # Clean up the text and href
        text = text.strip()
        href = href.strip()
        
        # Check if this is likely a document link
        if ('8-K' in text or 'EX-' in text.upper() or 'ex-' in text) and href:
            # Make the URL absolute if it's relative
            full_url = f"https://www.sec.gov{href}" if href.startswith('/') else href
            links.append({
                "text": text,
                "url": full_url
            })
    
    # Pattern 2: Look for HTML document links in any context
    html_pattern = re.compile(r'href=["\']([^"\']*?\.htm(?:l)?)["\']', re.IGNORECASE)
    html_matches = html_pattern.findall(detail_page)
    
    for href in html_matches:
        href = href.strip()
        if href and '8-K' in href.upper() or 'EX-' in href.upper():
            # Make the URL absolute if it's relative
            full_url = f"https://www.sec.gov{href}" if href.startswith('/') else href
            # Generate a description based on the URL
            description = "8-K"
            if "ex-" in href.lower() or "exhibit" in href.lower():
                description = f"Exhibit {href.split('/')[-1]}"
            
            # Check if it's already in the links
            if not any(link["url"] == full_url for link in links):
                links.append({
                    "text": description,
                    "url": full_url
                })
    
    return links

def extract_sections(full_text):
    """Extract document sections from the full text submission"""
    sections = []
    
    # Look for <DOCUMENT> tags
    doc_pattern = re.compile(r'<DOCUMENT>(.*?)</DOCUMENT>', re.DOTALL)
    doc_matches = doc_pattern.findall(full_text)
    
    for doc_text in doc_matches:
        # Try to extract document type
        type_match = re.search(r'<TYPE>(.*?)</TYPE>', doc_text)
        doc_type = type_match.group(1) if type_match else "UNKNOWN"
        
        # Try to extract document description
        desc_match = re.search(r'<DESCRIPTION>(.*?)</DESCRIPTION>', doc_text)
        description = desc_match.group(1) if desc_match else ""
        
        # Try to extract document text
        text_match = re.search(r'<TEXT>(.*?)</TEXT>', doc_text, re.DOTALL)
        text = text_match.group(1) if text_match else doc_text
        
        sections.append({
            "type": doc_type,
            "description": description,
            "text": text
        })
    
    # If no sections found, try to look for exhibit markers
    if not sections:
        # Common exhibit markers in 8-K filings
        exhibit_markers = [
            r'EXHIBIT\s+(\d+\.\d+)',
            r'ITEM\s+(\d+\.\d+)'
        ]
        
        for marker in exhibit_markers:
            pattern = re.compile(marker, re.IGNORECASE)
            matches = list(pattern.finditer(full_text))
            
            # Process each match to extract sections
            for i, match in enumerate(matches):
                section_type = match.group(0)
                start_pos = match.start()
                
                # Determine end position (next match or end of text)
                if i < len(matches) - 1:
                    end_pos = matches[i+1].start()
                else:
                    end_pos = len(full_text)
                
                # Extract the section text
                section_text = full_text[start_pos:end_pos].strip()
                
                sections.append({
                    "type": section_type,
                    "description": "",
                    "text": section_text
                })
    
    return sections

def main():
    # Timestamp the start of the process
    start_time = datetime.now()
    logger.info(f"Process started at {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Load companies from the CSV file
    companies = load_companies()
    
    if not companies:
        logger.error("No companies loaded. Check the sec_companies2.csv file.")
        return
    
    # Sort companies so S&P 500 companies are processed first
    companies = sort_companies_with_sp500_priority(companies)
    
    # Get already processed companies
    processed_companies = get_processed_companies()
    logger.info(f"Found {len(processed_companies)} already processed companies")
    
    # Create or append to output file for tracking
    file_exists = os.path.exists('8k_filings.csv')
    mode = 'a' if file_exists else 'w'
    
    with open('8k_filings.csv', mode, newline='') as outfile:
        writer = csv.writer(outfile)
        
        # Write header if new file
        if not file_exists:
            writer.writerow(['CIK', 'Ticker', 'Company Name', 'Filing Date', 'Form Type', 'Accession Number', 'Local File Path', 'Is S&P 500'])
        
        # Track overall stats
        total_downloaded = 0
        companies_processed = 0
        sp500_processed = 0
        other_processed = 0
        
        # Process each company
        for company_idx, company in enumerate(companies):
            cik = company['cik']
            ticker = company['ticker']
            company_name = company['name']
            is_sp500 = company['is_sp500']
            
            # Skip if already processed
            if cik in processed_companies:
                logger.info(f"Skipping {ticker or company_name} (CIK {cik}) - Already processed")
                companies_processed += 1
                if is_sp500:
                    sp500_processed += 1
                else:
                    other_processed += 1
                continue
            
            sp500_marker = "[S&P 500]" if is_sp500 else ""
            logger.info(f"\nProcessing {ticker or company_name} {sp500_marker} (CIK {cik})")
            
            # Try to get filings for this company
            retry_count = 0
            filings = []
            
            while retry_count < MAX_RETRIES and not filings:
                if retry_count > 0:
                    logger.info(f"Retry {retry_count}/{MAX_RETRIES} for {ticker or company_name}")
                    time.sleep(random.uniform(5.0, 10.0))
                
                filings = get_filings_atom(company)
                retry_count += 1
            
            # If we still don't have filings after retries, skip this company
            if not filings:
                logger.warning(f"No filings found for {ticker or company_name} after {MAX_RETRIES} attempts")
                # Mark as processed anyway to avoid repeated attempts
                save_processed_company(cik)
                companies_processed += 1
                if is_sp500:
                    sp500_processed += 1
                else:
                    other_processed += 1
                continue
            
            # Download filings
            company_downloaded = 0
            
            for filing_idx, filing in enumerate(filings):
                try:
                    filing_date = filing['date']
                    
                    # Get filing details with improved error handling
                    filing_data, clean_id = get_filing_details(filing)
                    
                    if not filing_data or not clean_id:
                        logger.warning(f"Failed to get filing data/ID for {ticker or company_name} filing on {filing_date}")
                        continue
                    
                    # Create filename with clean ID
                    clean_date = filing_date.replace('-', '')
                    filename = f"{ticker or cik}_{clean_date}_{clean_id}_8K.json"
                    local_path = os.path.join(output_dir, filename)
                    
                    # Get accession number from the filing data
                    accession_number = filing_data.get("accessionNumber", "unknown")
                    
                    # Check if already downloaded
                    if os.path.exists(local_path):
                        logger.info(f"Already downloaded: {filename}")
                        writer.writerow([cik, ticker, company_name, filing_date, '8-K', accession_number, local_path, is_sp500])
                        company_downloaded += 1
                        total_downloaded += 1
                        continue
                    
                    # Save to file as JSON
                    with open(local_path, 'w', encoding='utf-8') as f:
                        json.dump(filing_data, f, indent=2)
                    
                    writer.writerow([cik, ticker, company_name, filing_date, '8-K', accession_number, local_path, is_sp500])
                    company_downloaded += 1
                    total_downloaded += 1
                    
                    logger.info(f"Downloaded {filename} ({company_downloaded}/{len(filings)})")
                    
                except Exception as e:
                    logger.error(f"Error processing filing {filing_idx+1} for {ticker or company_name}: {str(e)}")
                    import traceback
                    logger.error(traceback.format_exc())
                    continue
            
            # Mark company as processed
            save_processed_company(cik)
            companies_processed += 1
            if is_sp500:
                sp500_processed += 1
            else:
                other_processed += 1
            
            sp500_marker = "[S&P 500]" if is_sp500 else ""
            logger.info(f"Completed {ticker or company_name} {sp500_marker} - Downloaded {company_downloaded} filings")
            logger.info(f"Progress: {companies_processed}/{len(companies)} companies processed ({companies_processed/len(companies)*100:.1f}%)")
            logger.info(f"S&P 500 companies: {sp500_processed}/{len([c for c in companies if c['is_sp500']])}")
            logger.info(f"Other companies: {other_processed}/{len([c for c in companies if not c['is_sp500']])}")
            logger.info(f"Total documents downloaded: {total_downloaded}")
            
            # Calculate estimated time remaining
            if companies_processed > 0:
                elapsed_time = (datetime.now() - start_time).total_seconds()
                avg_time_per_company = elapsed_time / companies_processed
                companies_remaining = len(companies) - companies_processed
                est_time_remaining = companies_remaining * avg_time_per_company
                
                # Format as hours and minutes
                hours, remainder = divmod(est_time_remaining, 3600)
                minutes, seconds = divmod(remainder, 60)
                logger.info(f"Estimated time remaining: {int(hours)} hours, {int(minutes)} minutes")
            
            # Add longer delay between companies to avoid rate limiting
            if company_idx < len(companies) - 1:
                delay_time = random.uniform(3.0, 8.0)
                logger.info(f"Waiting {delay_time:.2f} seconds before next company...")
                time.sleep(delay_time)
        
        # Calculate total elapsed time
        end_time = datetime.now()
        elapsed_time = (end_time - start_time).total_seconds()
        hours, remainder = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        
        logger.info(f"\nCompleted processing {companies_processed} companies")
        logger.info(f"S&P 500 companies processed: {sp500_processed}")
        logger.info(f"Other companies processed: {other_processed}")
        logger.info(f"Total documents downloaded: {total_downloaded}")
        logger.info(f"Total processing time: {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")
        logger.info(f"Start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
        logger.info(f"End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        logger.info(f"Files saved to {output_dir}")
        logger.info(f"Metadata saved to 8k_filings.csv")

if __name__ == "__main__":
    main()

2025-03-20 10:33:42,598 - INFO - Process started at 2025-03-20 10:33:42
2025-03-20 10:33:42,619 - INFO - Loaded 9720 companies from sec_companies2.csv
2025-03-20 10:33:42,619 - INFO - Downloading S&P 500 companies list...
2025-03-20 10:33:42,961 - ERROR - Could not find S&P 500 table in Wikipedia page
2025-03-20 10:33:42,964 - INFO - Found 96 S&P 500 companies in the input file
2025-03-20 10:33:42,964 - INFO - Found 9624 non-S&P 500 companies in the input file
2025-03-20 10:33:42,965 - INFO - Found 1272 already processed companies
2025-03-20 10:33:42,966 - INFO - Skipping AAPL (CIK 320193) - Already processed
2025-03-20 10:33:42,966 - INFO - Skipping NVDA (CIK 1045810) - Already processed
2025-03-20 10:33:42,966 - INFO - Skipping MSFT (CIK 789019) - Already processed
2025-03-20 10:33:42,967 - INFO - Skipping AMZN (CIK 1018724) - Already processed
2025-03-20 10:33:42,967 - INFO - Skipping GOOGL (CIK 1652044) - Already processed
2025-03-20 10:33:42,968 - INFO - Skipping META (CIK 1326801

KeyboardInterrupt: 