In [1]:

from sec_edgar_downloader import Downloader
import pandas as pd
import os

In [None]:
# -*- coding: utf-8 -*-
"""
Download 10-K HTML filings for the firms listed in cik_list.xlsx
(filed 2017–2024) using sec-edgar-downloader.
"""

import pandas as pd
import os
from sec_edgar_downloader import Downloader
from time import sleep
import warnings

# Silence every Python warning for the rest of the session so that library
# notices do not flood the download log.
# NOTE(review): this is a blanket filter and will also hide warnings that may
# matter; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

# Working directory of the running kernel/script. This is NOT necessarily the
# folder containing the notebook file; the input workbook and the output
# folder are both resolved relative to it.
cwd = os.getcwd()

# Folder that receives the downloaded filings. `exist_ok=True` makes the
# creation idempotent and avoids the check-then-create race of testing
# os.path.exists() first.
output_dir = os.path.join(cwd, "10k_html_filings_2017_2024")
os.makedirs(output_dir, exist_ok=True)

# --- Load CIK data ---
# The CIK list is an Excel workbook, looked up relative to the current
# working directory. It should contain a column named 'cik'; if it does not,
# the first column whose header contains "cik" (case-insensitive) is used.
input_cik_file = "cik_list.xlsx"
try:
    cik_data = pd.read_excel(input_cik_file)
except FileNotFoundError:
    print(f"Error: The file '{input_cik_file}' was not found in the current directory: {cwd}")
    print("Please create this file with a single column named 'cik' containing the CIK numbers.")
    # Raise SystemExit rather than calling exit(): inside a Jupyter kernel
    # exit() only schedules a kernel shutdown and does not stop the cell, so
    # the statements below would still run and fail on the missing data.
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading '{input_cik_file}': {e}")
    raise SystemExit(1)

# Decide which column holds the CIK numbers.
if 'cik' in cik_data.columns:
    cik_column_name = 'cik'
else:
    # str(col) guards against non-string headers (e.g. numeric column labels).
    potential_cik_cols = [col for col in cik_data.columns if 'cik' in str(col).lower()]
    if not potential_cik_cols:
        print(f"Error: Column 'cik' not found in '{input_cik_file}'. Please ensure your file has a column named 'cik'. Found columns: {cik_data.columns.tolist()}")
        raise SystemExit(1)
    cik_column_name = potential_cik_cols[0]
    print(f"Info: Using column '{cik_column_name}' from '{input_cik_file}' for CIKs.")

# Normalise CIKs to 10-character, zero-padded strings.
# Each value is stringified, trimmed, and stripped of a trailing ".0" (left
# behind when integer cells are read as floats). Anything that is not 1-10
# plain digits afterwards -- blank cells (which stringify to "nan"), free
# text, etc. -- is dropped instead of being padded into a bogus identifier
# such as "0000000nan" and sent to EDGAR.
try:
    cik_text = (
        cik_data[cik_column_name]
        .astype(str)
        .str.strip()
        .str.replace(r'\.0$', '', regex=True)
    )
    is_valid_cik = cik_text.str.match(r'[0-9]{1,10}\Z')
    skipped_row_count = int((~is_valid_cik).sum())
    if skipped_row_count:
        print(f"Warning: skipping {skipped_row_count} row(s) in column '{cik_column_name}' with missing or non-numeric CIK values.")
    # Keep only usable rows so later code never iterates over invalid CIKs.
    cik_data = cik_data.loc[is_valid_cik].copy()
    cik_data['CIK_formatted'] = cik_text[is_valid_cik].str.zfill(10)
except AttributeError as e:
    print(f"Error processing CIK column '{cik_column_name}'. Are you sure it contains CIK numbers? Error: {e}")
    # SystemExit (not exit()) so the cell really stops, including under Jupyter.
    raise SystemExit(1)


# Report how many distinct CIKs will be requested and preview the first five.
formatted_ciks = cik_data['CIK_formatted']
distinct_cik_count = len(formatted_ciks.unique())
print(f"Total CIKs to process from '{input_cik_file}': {distinct_cik_count}")
print("Sample CIKs (first 5):")
print(formatted_ciks.head())
# --- END MODIFIED SECTION ---

# Requester identity passed to the Downloader (sec-edgar-downloader uses it to
# identify the client to EDGAR). Set the SEC_EDGAR_COMPANY_NAME and
# SEC_EDGAR_EMAIL environment variables to use your own details without
# editing this cell; when they are unset the values below are used.
dl = Downloader(
    company_name=os.environ.get("SEC_EDGAR_COMPANY_NAME", "Tejaswi Kalaga"),
    email_address=os.environ.get("SEC_EDGAR_EMAIL", "kalagat23@iimb.ac.in"),
    download_folder=output_dir
)


# Running tallies reported in the summary after the loop.
total_downloaded, failed_ciks = 0, []

# Fixed request parameters: form type and the filing-date window.
form_type = "10-K"
filing_window = dict(after="2017-01-01", before="2024-12-31")

# Request every distinct CIK once. Any error for a firm is logged and the CIK
# is recorded, so a single failure does not abort the whole batch.
for cik_value in cik_data['CIK_formatted'].unique():
    try:
        print(f"Downloading 10-K filings for CIK {cik_value} from 2017 to 2024")
        num_downloaded = dl.get(
            form_type,
            cik_value,
            download_details=True,
            **filing_window,
        )
        print(f"Download completed for CIK {cik_value}. Number of filings downloaded: {num_downloaded}")
        total_downloaded += num_downloaded
    except Exception as e:
        print(f"Error downloading 10-K for CIK {cik_value}: {e}")
        failed_ciks.append(cik_value)
    # Pause between CIKs, on success and on failure alike, to stay clear of
    # EDGAR rate limiting.
    sleep(1)


# Final report: totals first, then either the list of CIKs whose download
# raised an error or a confirmation that none did.
print(
    "\n--- Download Summary ---",
    "Finished downloading 10-K HTML filings.",
    f"Total filings downloaded: {total_downloaded}",
    f"Files are saved in {output_dir}",
    sep="\n",
)

if not failed_ciks:
    print("\nAll CIKs processed successfully (though some may not have had filings in the period).")
else:
    print(f"\nFailed to download for the following CIKs ({len(failed_ciks)}):")
    for fcik in failed_ciks:
        print(f"- {fcik}")

Total CIKs to process from 'cik_list.xlsx': 487
Sample CIKs (first 5):
0    0000001800
1    0000002488
2    0000004962
3    0000004977
4    0000005272
Name: CIK_formatted, dtype: object
Downloading 10-K filings for CIK 0000001800 from 2017 to 2024
Download completed for CIK 0000001800. Number of filings downloaded: 8
Downloading 10-K filings for CIK 0000002488 from 2017 to 2024
Download completed for CIK 0000002488. Number of filings downloaded: 8
Downloading 10-K filings for CIK 0000004962 from 2017 to 2024
Download completed for CIK 0000004962. Number of filings downloaded: 8
Downloading 10-K filings for CIK 0000004977 from 2017 to 2024
Download completed for CIK 0000004977. Number of filings downloaded: 8
Downloading 10-K filings for CIK 0000005272 from 2017 to 2024
Download completed for CIK 0000005272. Number of filings downloaded: 8
Downloading 10-K filings for CIK 0000006201 from 2017 to 2024
Download completed for CIK 0000006201. Number of filings downloaded: 8
Downloading 10-K