In [None]:
import zipfile, orjson, polars as pl
from tqdm import tqdm

#Dowload zip file at https://www.sec.gov/search-filings/edgar-application-programming-interfaces, bulk data submissions.zip

zip_path = "../data/raw/submissions.zip"
rows = []

def to_str_list(x):
    if x is None:
        return []
    if isinstance(x, list):
        return [str(v) for v in x]
    # some feeds put "" instead of []:
    if x == "":
        return []
    return [str(x)]

def to_int_or_none(x):
    try:
        return int(x)
    except Exception:
        return None

with zipfile.ZipFile(zip_path, "r") as zf:
    for name in tqdm(zf.namelist(), desc="Parsing JSONs"):
        if not name.endswith(".json"):
            continue

        with zf.open(name) as f:
            data = orjson.loads(f.read())

        cik_raw = data.get("cik")
        cik_int = str(int(cik_raw)) if cik_raw else None
        company = data.get("name")
        tickers = to_str_list(data.get("tickers"))
        exchanges = to_str_list(data.get("exchanges"))
        sic = data.get("sic")
        sic_desc = data.get("sicDescription") or data.get("sic_description")

        recent = data.get("filings", {}).get("recent", {}) or {}
        forms      = recent.get("form", []) or []
        filing_dt  = recent.get("filingDate", []) or []
        accessions = recent.get("accessionNumber", []) or []
        prim_docs  = recent.get("primaryDocument", []) or []
        report_dt  = recent.get("reportDate", []) or []
        accept_ts  = recent.get("acceptanceDateTime", []) or []
        items      = recent.get("items", []) or []
        acts       = recent.get("act", []) or []
        sizes      = recent.get("size", []) or []
        file_no    = recent.get("fileNumber", []) or []
        film_no    = recent.get("filmNumber", []) or []
        prim_desc  = recent.get("primaryDocDescription", []) or []

        for i, (form, date, acc, doc) in enumerate(zip(forms, filing_dt, accessions, prim_docs)):
            if not form or not form.startswith("8-K"):
                continue

            item_val = items[i] if i < len(items) else None
            # sometimes 'items' is "", a list, or None → normalize to semicolon-joined string
            if isinstance(item_val, list):
                items_str = ";".join(map(str, item_val))
            elif item_val in (None, ""):
                items_str = None
            else:
                items_str = str(item_val)

            size_v = sizes[i] if i < len(sizes) else None
            size_int = to_int_or_none(size_v)

            acc_no_dash = acc.replace("-", "") if acc else ""
            base_dir = f"https://www.sec.gov/Archives/edgar/data/{cik_int}/{acc_no_dash}"

            rows.append({
                "cik": cik_raw,
                "cik_int": cik_int,
                "company_name": company,
                "tickers": ",".join(tickers),       # <- serialized to string
                "exchanges": ",".join(exchanges),   # <- serialized to string
                "sic": sic,
                "sic_description": sic_desc,
                "form": form,
                "filing_date": date,
                "report_date": report_dt[i] if i < len(report_dt) else None,
                "acceptance_datetime": accept_ts[i] if i < len(accept_ts) else None,
                "accession": acc,
                "primary_doc": doc,
                "primary_doc_description": prim_desc[i] if i < len(prim_desc) else None,
                "items": items_str,                 # <- serialized to string
                "act": acts[i] if i < len(acts) else None,
                "size_bytes": size_int,             # <- coerced to integer
                "file_number": file_no[i] if i < len(file_no) else None,
                "film_number": film_no[i] if i < len(film_no) else None,
                "url_html": f"{base_dir}/{doc}" if doc else None,
                "url_index": f"{base_dir}/index.html",
                "url_txt": f"{base_dir}.txt",
            })

# Build DataFrame safely
df = pl.DataFrame(rows)


df.write_parquet("sec_8k_filings_enriched.parquet", compression="zstd")
df.write_csv("sec_8k_filings_enriched.csv")
print(df.head())
print(f"Total 8-K rows: {df.height:,}")


Parsing JSONs: 100%|██████████| 941071/941071 [00:40<00:00, 23512.87it/s]


shape: (5, 22)
┌────────────┬─────────┬────────────┬─────────┬───┬────────────┬──────────┬────────────┬───────────┐
│ cik        ┆ cik_int ┆ company_na ┆ tickers ┆ … ┆ film_numbe ┆ url_html ┆ url_index  ┆ url_txt   │
│ ---        ┆ ---     ┆ me         ┆ ---     ┆   ┆ r          ┆ ---      ┆ ---        ┆ ---       │
│ str        ┆ str     ┆ ---        ┆ str     ┆   ┆ ---        ┆ str      ┆ str        ┆ str       │
│            ┆         ┆ str        ┆         ┆   ┆ str        ┆          ┆            ┆           │
╞════════════╪═════════╪════════════╪═════════╪═══╪════════════╪══════════╪════════════╪═══════════╡
│ 0000005405 ┆ 5405    ┆ AMERICAN   ┆         ┆ … ┆ 95584984   ┆ null     ┆ https://ww ┆ https://w │
│            ┆         ┆ MAIZE      ┆         ┆   ┆            ┆          ┆ w.sec.gov/ ┆ ww.sec.go │
│            ┆         ┆ PRODUCTS   ┆         ┆   ┆            ┆          ┆ Archives/e ┆ v/Archive │
│            ┆         ┆ CO         ┆         ┆   ┆            ┆          ┆ 

In [18]:
df

cik,cik_int,company_name,tickers,exchanges,sic,sic_description,form,filing_date,report_date,acceptance_datetime,accession,primary_doc,primary_doc_description,items,act,size_bytes,file_number,film_number,url_html,url_index,url_txt
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str
"""0000005405""","""5405""","""AMERICAN MAIZE PRODUCTS CO""","""""","""""","""2040""","""Grain Mill Products""","""8-K""","""1995-10-27""","""1995-10-18""","""1995-10-27T00:00:00.000Z""","""0000950131-95-002984""","""""","""FORM 8-K""","""5,7""","""""",8451,"""001-06244""","""95584984""",,"""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0000005405""","""5405""","""AMERICAN MAIZE PRODUCTS CO""","""""","""""","""2040""","""Grain Mill Products""","""8-K""","""1995-07-28""","""1995-06-30""","""1995-07-28T00:00:00.000Z""","""0000950109-95-002831""","""""","""FORM 8-K""","""5""","""""",116160,"""001-06244""","""95557157""",,"""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0000005405""","""5405""","""AMERICAN MAIZE PRODUCTS CO""","""""","""""","""2040""","""Grain Mill Products""","""8-K""","""1995-07-11""","""1995-07-07""","""1995-07-11T00:00:00.000Z""","""0000890613-95-000092""","""""","""FORM 8-K""","""5,7""","""""",27032,"""001-06244""","""95553269""",,"""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0000005405""","""5405""","""AMERICAN MAIZE PRODUCTS CO""","""""","""""","""2040""","""Grain Mill Products""","""8-K""","""1995-04-25""","""1995-04-10""","""1995-04-25T00:00:00.000Z""","""0000950123-95-001123""","""""","""FORM 8-K CURRENT REPORT""","""5,7""","""""",7025,"""001-06244""","""95531061""",,"""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0000005405""","""5405""","""AMERICAN MAIZE PRODUCTS CO""","""""","""""","""2040""","""Grain Mill Products""","""8-K""","""1995-04-13""","""1995-04-12""","""1995-04-13T00:00:00.000Z""","""0000950123-95-001008""","""""","""FORM 8-K""","""5,7""","""""",6741,"""001-06244""","""95528563""",,"""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""0002086449""","""2086449""","""Exeter Select Automobile Recei…","""""","""""","""6189""","""Asset-Backed Securities""","""8-K""","""2025-10-17""","""2025-10-15""","""2025-10-17T19:57:49.000Z""","""0000929638-25-003926""","""esart2025-3_8k.htm""","""CURRENT REPORT""","""1.01,8.01,9.01""","""34""",3466885,"""333-268757-17""","""251400710""","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0002063141""","""2063141""","""Toyota Auto Receivables 2025-D…","""""","""""","""6189""","""Asset-Backed Securities""","""8-K""","""2025-10-17""","""2025-10-15""","""2025-10-17T18:34:58.000Z""","""0000929638-25-003924""","""taot2025-d_form8k.htm""","""CURRENT REPORT""","""1.01,8.01,9.01""","""34""",2462107,"""333-281727-04""","""251400270""","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0002089777""","""2089777""","""CarMax Auto Owner Trust 2025-4""","""""","""""","""6189""","""Asset-Backed Securities""","""8-K""","""2025-10-17""","""2025-10-15""","""2025-10-17T19:46:37.000Z""","""0001193125-25-242433""","""d23889d8k.htm""","""8-K""","""1.01,8.01,9.01""","""34""",2477710,"""333-288943-02""","""251400670""","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"
"""0002086742""","""2086742""","""Bridgecrest Lending Auto Secur…","""""","""""","""6189""","""Asset-Backed Securities""","""8-K""","""2025-10-17""","""2025-10-15""","""2025-10-17T20:50:33.000Z""","""0001104659-25-100440""","""tm2528191d5_8k.htm""","""FORM 8-K""","""8.01,9.01""","""34""",117237,"""333-271899-09""","""251401351""","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…","""https://www.sec.gov/Archives/e…"


In [10]:
import zipfile, orjson



with zipfile.ZipFile(zip_path, "r") as zf:
    # Loop through JSON files until we find one that contains an 8-K
    for name in zf.namelist():
        if not name.endswith(".json"):
            continue
        with zf.open(name) as f:
            data = orjson.loads(f.read())

        filings = data.get("filings", {}).get("recent", {})
        if not filings:
            continue

        for form, date, acc, doc in zip(
            filings.get("form", []),
            filings.get("filingDate", []),
            filings.get("accessionNumber", []),
            filings.get("primaryDocument", []),
        ):
            if form.startswith("8-K"):  # first 8-K found
                cik = str(int(data["cik"]))  # remove leading zeros
                acc_clean = acc.replace("-", "")
                url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_clean}/{doc}"
                print(f"Company: {data['name']}")
                print(f"CIK: {cik}")
                print(f"Form: {form}")
                print(f"Filing Date: {date}")
                print(f"Accession: {acc}")
                print(f"Primary Document: {doc}")
                print(f"URL: {url}")
                raise SystemExit  # stop after first 8-K


Company: AMERICAN MAIZE PRODUCTS CO
CIK: 5405
Form: 8-K
Filing Date: 1995-10-27
Accession: 0000950131-95-002984
Primary Document: 
URL: https://www.sec.gov/Archives/edgar/data/5405/000095013195002984/


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
