In [1]:
!uv pip install edgartools ipywidgets

[2mAudited [1m2 packages[0m [2min 137ms[0m[0m


## Download the data


In [9]:
from os import makedirs

import pandas as pd
from edgar import Company, set_identity

In [10]:
set_identity("Noel Jacob noeljacob91@gmail.com")
df = pd.read_csv("./mining.csv")

In [11]:
ticker_list: list[str] = df["Symbol"].tolist()
form_list = ["10-K", "10-Q", "8-K", "6-K", "DEF 14A"]

In [None]:
# topic_df = pd.DataFrame(columns=["Ticker", "Form", "Date", "Topics"])
for ticker in ticker_list:
    try:
        company = Company(ticker)
        filings = company.get_filings(form=form_list)
        for filing in filings:
            form = filing.form
            date = filing.filing_date

            # filing_obj = filing.data_object()
            # if filing_obj is not None and len(filing_obj.items) > 0:
            #     topics = ",".join(filing_obj.items)
            #     print(f'so"{topics}"ok')
            #     topic_df.loc[len(topic_df)] = [ticker, form, date, topics]

            md = filing.markdown()
            parent_dir = f"./edgartools-data/{ticker}/{form}"
            makedirs(parent_dir, exist_ok=True)
            with open(f"{parent_dir}/{date}.md", mode="w+") as f:
                f.write(md)
    except Exception as e:
        print(f"Error {ticker}: {e}")

# topic_df.to_csv("./topics.csv", index=False)

## Remove older than 2009


In [4]:
import os
from datetime import datetime

In [5]:
def delete_old_files(root_dir, cutoff_year=2009):
    cutoff_date = datetime(cutoff_year, 1, 1)
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(".md"):
                try:
                    file_date = datetime.strptime(filename[:-3], "%Y-%m-%d")
                    if file_date < cutoff_date:
                        file_path = os.path.join(dirpath, filename)
                        os.remove(file_path)
                        print(f"Deleted: {file_path}")
                except ValueError:
                    pass

In [None]:
delete_old_files("./edgartools-data")

## Filter using keywords


In [63]:
ENERGY_STEMS = [
    # Power & Energy basics
    r"electr",  # electric, electrical, electricity
    r"(?<!voting )(?<!proxy )(?<!attorney )power",
    r"energ",
    r"watt",  # watt, watts, wattage, kilowatt, megawatt, gigawatt
    r"volt",  # volt, voltage, kilovolt
    r"(?<!ex)(?<!sw)(?<!cl)(?<!st)(?<!tr)(?<!d)amp(?!le|lif)",  # avoid "example", "swamp", "clamp", "stamp", "tramp", "damp", "ample", "amplify" (unless you want amplify)
    r"joule",
    r"btu",
    r"therm",
    # Units
    r"kwh",
    r"mwh",
    r"gwh",
    r"twh",
    # Mining specific
    r"(?<!re)hash",  # avoid "rehash"
    r"\basic\b",  # use word boundary instead of leading space for ASIC
    # r"mine",  # miner, miners, mining
    r"s/s"
    r"h/s",
    r"/h",
    r"/m"
    r"/h",
    r"/kh",
    r"/mh"
    r"/gh",
    r"/th",
    # Generation & Sources
    r"generat",  # generate, generation, generator
    r"solar",
    r"wind",
    r"hydro",
    r"nuclear",
    r"coal",
    r"geotherm",
    r"bio",
    r"flare",  # flared gas
    r"renewab",  # renewable, renewables
    r"fossil",
    r"fuel",
    # Infrastructure
    r"grid",
    r"substation",
    r"transformer",
    r"transmiss",  # transmission
    r"data[ \-]?center",  # consolidated pattern
    r"datacenter",
    r"mining[ \-]facilit",  # facility, facilities
    r"interconnect",
    # Cooling & Efficiency
    r"cool(?![ \-]?off)",  # cooling, cooled, cooler
    r"hvac",
    r"pue",
    r"immersion",
    r"heat",
    # Costs & Contracts
    r"utilit",  # utility, utilities
    r"tariff",
    r"\bppa\b",  # word boundary instead of leading space
    r"curtail",  # curtail, curtailment
    r"wholesale",
    # r"rate",
    # Environmental
    r"\bepa\b",  # word boundary
    r"carbon",
    r"co2",
    r"emission",
    r"greenhouse",
    r"ghg",
    r"sustainab",  # sustainable, sustainability
    r"esg",
    r"footprint",
    r"net[ \-]?zero",  # consolidated
    r"scope [123]",  # consolidated
    r"(?<!super)(?<!un)(?<!pre)natural",  # avoid "supernatural", "unnatural", "preternatural"
    r"\bnatur",  # word boundary
    r"environment",
    # Storage & Backup
    r"batter(?:y|ies)",  # more specific
    r"(?<!cold )(?<!data )storage",  # may want to exclude "cold storage" (financial term)
    r"\bups\b",  # word boundary - avoid "ups and downs"
    r"diesel",
    r"backup",
    # Regulatory
    r"\bercot\b",
    r"\bferc\b",
    r"\bpjm\b",
    r"\bnerc\b",
    # r"\biso\b",  # careful - very common word
    r"\brto\b",
    # Consumption terms
    r"consum",  # consume, consumption, consumer
    r"usage",
    r"utiliz",  # utilize, utilization
    r"efficien",  # efficient, efficiency
    r"capacity",
    r"load",
    r"demand",
    r"(?<!money )(?<!blood )(?<!food )(?<!labor )supply",  # avoid non-energy supply
    r"procure",  # procure, procurement
    # noel
]

In [64]:
import re

ENERGY_PATTERN = re.compile("|".join(ENERGY_STEMS), re.IGNORECASE)

In [65]:
from pathlib import Path


def filter_energy_files(root_dir, dry_run=True):
    """Delete files without energy mentions. Set dry_run=False to actually delete."""
    deleted = []
    kept = []

    for filepath in Path(root_dir).rglob("*.md"):
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()

        if ENERGY_PATTERN.search(content):
            kept.append(str(filepath))
        else:
            print(f"Deleted: {filepath}")
            deleted.append(str(filepath))
            if not dry_run:
                os.remove(filepath)

    print(f"Kept: {len(kept)} files")
    print(f"{'Would delete' if dry_run else 'Deleted'}: {len(deleted)} files")

    return kept, deleted


kept, to_delete = filter_energy_files("./edgartools-data")

Kept: 3231 files
Would delete: 0 files


## Find keywords in existing


In [66]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import pandas as pd


def process_file(filepath: Path) -> list[dict]:
    """Process a single file and return matching records."""
    records = []
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()

        ticker = filepath.parts[1]
        form = filepath.parts[2]
        date = filepath.stem

        for line_num, line in enumerate(lines, start=1):
            matches = ENERGY_PATTERN.findall(line)
            if matches:
                records.append(
                    {
                        "ticker": ticker,
                        "form": form,
                        "date": date,
                        "filepath": str(filepath),
                        "line_num": line_num,
                        "matched_terms": ", ".join(set(m.lower() for m in matches)),
                        "line_text": line.strip()[:500],
                    }
                )
    except Exception as e:
        print(f"Error processing {filepath}: {e}")

    return records


def extract_matching_sentences_parallel(root_dir, max_workers=8):
    """Extract sentences with energy matches using parallel processing."""
    files = list(Path(root_dir).rglob("*.md"))
    all_records = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_file, f): f for f in files}

        for i, future in enumerate(as_completed(futures)):
            records = future.result()
            all_records.extend(records)
            if (i + 1) % 300 == 0:
                print(f"Processed {i + 1}/{len(files)} files...")

    return pd.DataFrame(all_records)


# Run parallel extraction
matches_df = extract_matching_sentences_parallel("./edgartools-data", max_workers=8)
print(f"Found {len(matches_df)} matching lines")
print(f"Unique tickers: {matches_df['ticker'].nunique()}")
print(f"Unique terms matched:\n{matches_df['matched_terms'].value_counts().head(20)}")

matches_df.to_csv("./energy_matches_debug.csv", index=False)
print("Saved to energy_matches_debug.csv")

Processed 300/3231 files...
Processed 600/3231 files...
Processed 900/3231 files...
Processed 1200/3231 files...
Processed 1500/3231 files...
Processed 1800/3231 files...
Processed 2100/3231 files...
Processed 2400/3231 files...
Processed 2700/3231 files...
Processed 3000/3231 files...
Found 87819 matching lines
Unique tickers: 32
Unique terms matched:
matched_terms
power            8127
coal             6694
generat          5903
electr           3761
consum           3635
demand           2750
utiliz           2698
environment      2482
energ            2317
hash             2193
bio              2060
natur            2045
data center      1693
supply           1423
therm            1128
capacity         1113
efficien         1090
amp               753
storage           658
electr, power     639
Name: count, dtype: int64
Saved to energy_matches_debug.csv


In [67]:
matches_df["matched_terms"].value_counts().head(20)

ticker
GPUS    15749
ARLP    13165
RIOT     5727
BMNR     5626
CLSK     5381
GREE     5199
SLNH     4704
MARA     4318
WULF     3709
CORZ     3653
CIFR     3261
MIGI     2640
LMFA     2577
APLD     2161
ANY      2126
OLB      1916
ABTC     1596
ARBK     1222
IREN     1116
HUT      1041
Name: count, dtype: int64

In [None]:
# Flatten and count individual terms
matches_df["matched_terms"] = matches_df["matched_terms"].str.split(", ")
exploded_df = matches_df.explode("matched_terms")