Reproducible steps for getting 10-K filings from [EDGAR](https://www.sec.gov/edgar).


# Example

1. Navigate to the [EDGAR company search](https://www.sec.gov/edgar/searchedgar/companysearch) page.
2. Enter "ENV" into the search box.
3. On the right-hand side, expand "10-K (annual reports) and 10-Q (quarterly reports)".
4. Open the topmost 10-K filing in that list.

As of 2024/01/03

"ENV" is:

* https://www.sec.gov/edgar/browse/?CIK=1337619
* https://www.sec.gov/ix?doc=/Archives/edgar/data/1337619/000133761923000012/env-20221231.htm

"MSFT" is:

* https://www.sec.gov/edgar/browse/?CIK=789019
* https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm


# Python

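1. Get all the `tickers` in _data/tickers.csv_, skipping any that are listed in _data/tickers.bad.csv_
2. For each `ticker` in `tickers`
   1. If _data/10-k/raw_ is missing the xhtml for any of the `ticker`'s 10-K filings (at most `limit` filings per ticker), download it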


In [1]:
import csv
import requests
import sec_downloader.types as sec_t #type: ignore
import typing as t
from sec_downloader import Downloader #type: ignore
from pathlib import Path
from tqdm.notebook import tqdm
from datetime import datetime

In [2]:
# Root folder for the 10-K data; downloaded filings are stored beneath it in raw/.
data_folder = Path('./data/10-k')
# CSV listing the tickers to process (the notebook expects GetTickers.ipynb to have generated it).
tickers_file = Path('./data/tickers.csv')
# CSV of tickers whose filing-metadata lookup failed; they are skipped on later runs.
bad_tickers_file = Path('./data/tickers.bad.csv')
# Sent as the User-Agent header on downloads and passed to the sec_downloader Downloader.
# NOTE(review): SEC's fair-access guidance asks for a contact email in the
# User-Agent — confirm this value is accepted.
user_agent = 'TextCorpusLabs/EDGAR'
# Maximum number of filings requested per ticker.
limit = 20
# SEC form type to request.
form_type = '10-K'

In [3]:
# Fail fast when the ticker list has not been generated yet.
# An exception is raised instead of calling `exit`, which is a helper injected
# for interactive shells and is not meant to be used from program code.
if not tickers_file.exists():
    raise FileNotFoundError('Run GetTickers.ipynb first to generate the tickers file')

In [4]:
# Read the ticker symbols from the second column of the tickers file.
# `newline = ''` is what the csv module asks for when reading files.
with open(tickers_file, mode = 'r', encoding = 'utf-8', newline = '') as fp:
    reader = csv.reader(fp)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    # Ignore blank/short rows instead of failing with an IndexError.
    tickers = [row[1] for row in reader if len(row) > 1]

# Read the tickers recorded as bad by a previous run (first column), if any.
if bad_tickers_file.exists():
    with open(bad_tickers_file, mode = 'r', encoding = 'utf-8', newline = '') as fp:
        reader = csv.reader(fp)
        next(reader, None)  # skip the header row; tolerate a completely empty file
        bad_tickers = [row[0] for row in reader if row]
else:
    bad_tickers = []

# `bad_tickers` stays a list because it is appended to and written back later;
# a throwaway set gives constant-time membership tests while filtering.
known_bad = set(bad_tickers)
tickers = [ticker for ticker in sorted(tickers) if ticker not in known_bad]

In [5]:
def get_filing_metadata(ticker: str, form_type: str, limit: int) -> t.Union[None, t.List[sec_t.FilingMetadata]]:
    """
    Ask sec_downloader for the filing metadata of one ticker.

    Parameters
    ----------
    ticker : str
        Passed to the request as `ticker_or_cik`.
    form_type : str
        Form type to request.
    limit : int
        Maximum number of filings to request.

    Returns
    -------
    The list returned by `Downloader.get_filing_metadatas`, or `None` when
    building or executing the request raises a `ValueError`.
    """
    client = Downloader(user_agent, '')
    try:
        request = sec_t.RequestedFilings(ticker_or_cik = ticker, form_type = form_type, limit = limit)
        found = client.get_filing_metadatas(request)
    except ValueError:
        # Callers treat None as "no usable metadata for this ticker".
        found = None
    return found

In [6]:
def get_xhtml_file_path(data_folder: Path, metadata: sec_t.FilingMetadata) -> Path:
    """
    Build the location at which a filing's xhtml is stored.

    The result is `<data_folder>/raw/<exchange>/<report year>/<symbol>.<filing date>.xhtml`,
    where exchange and symbol come from the first entry of `metadata.tickers`
    and the report year is parsed from `metadata.report_date` (`YYYY-MM-DD`).
    """
    year = datetime.strptime(metadata.report_date, '%Y-%m-%d').year
    primary = metadata.tickers[0]
    # A single formatted string keeps every component rendered exactly as text.
    relative = f'raw/{primary.exchange}/{year}/{primary.symbol}.{metadata.filing_date}.xhtml'
    return data_folder / relative

In [7]:
def get_filing_xhtml(session: requests.Session, metadata: sec_t.FilingMetadata) -> t.Union[None, str]:
    """
    Fetch the primary document of a filing.

    Issues a GET for `metadata.primary_doc_url` on `session` and returns the
    response text when the status code is 200; any other status yields `None`.
    """
    with session.get(metadata.primary_doc_url) as reply:
        if reply.status_code != 200:
            return None
        return reply.text

The following block downloads the filings into _./data/10-k/raw_.
A cached version is available in "Datasets" on the GitHub [Releases](https://github.com/TextCorpusLabs/Edgar/releases) page.
The file you want is called _10-K.raw.zip_.

In [8]:
# Download every filing that is not on disk yet, reusing one HTTP session.
with requests.Session() as session:
    session.headers['User-Agent'] = user_agent
    for ticker in tqdm(tickers):
        metadata = get_filing_metadata(ticker, form_type, limit)
        if metadata is None:
            # No usable metadata: remember the ticker so it is skipped on later runs.
            bad_tickers.append(ticker)
            continue
        for meta in metadata:
            xhtml_file = get_xhtml_file_path(data_folder, meta)
            if xhtml_file.exists():
                continue
            xhtml = get_filing_xhtml(session, meta)
            if xhtml is None:
                continue
            # Create the target folder only when there is something to store.
            xhtml_file.parent.mkdir(parents = True, exist_ok = True)
            # Explicit UTF-8, like every other file in this notebook; the platform
            # default encoding cannot represent every character found in filings.
            with open(xhtml_file, mode = 'w', encoding = 'utf-8') as fp:
                fp.write(xhtml)

  0%|          | 0/10568 [00:00<?, ?it/s]

In [9]:
# Persist the bad tickers (one quoted symbol per row under a "Ticker" header)
# so that later runs can skip them.
with bad_tickers_file.open(mode = 'w', encoding = 'utf-8', newline = '') as fp:
    writer = csv.writer(fp, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_ALL)
    writer.writerows([['Ticker'], *([symbol] for symbol in bad_tickers)])