# SCT CSV Normalization – Step-by-step Trial

This notebook previews normalization steps on extracted SEC Summary Compensation Table (SCT) CSVs, without modifying main code. It:
- Discovers `*_SCT.csv` files under `data/<TICKER>/<FORM>/extracted/`
- Reads a sample CSV robustly (handles leading numeric header row)
- Maps headers to canonical names while preserving all original columns
- Splits name/position and normalizes numeric fields
- Builds a normalized view for inspection

Run cells in order. Adjust the config block as needed.

In [27]:
# ===============================
# Block 1: Setup & File Discovery
# ===============================

from pathlib import Path
from typing import List, Optional
import re
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

DATA_ROOT = "data"
TICKERS = None
MAX_FILES = 200

# Accept ANY folder that contains the string "DEF"
FORM_KEYWORD = "DEF"  # this makes discovery flexible

def find_extracted_csvs(
    data_root: str = DATA_ROOT,
    tickers: Optional[List[str]] = None,
    max_files: Optional[int] = MAX_FILES,
) -> List[Path]:
    
    root = Path(data_root)
    out = []

    if not root.exists():
        return out

    # determine tickers
    if tickers:
        tickers_to_scan = [t.upper() for t in tickers]
    else:
        tickers_to_scan = [
            d.name for d in root.iterdir()
            if d.is_dir() and not d.name.startswith(".")
        ]

    # walk structure
    for t in tickers_to_scan:
        tdir = root / t
        if not tdir.is_dir():
            continue

        for form_dir in tdir.iterdir():
            if not form_dir.is_dir():
                continue

            # flexible matching: folder must contain "DEF"
            if FORM_KEYWORD.lower() not in form_dir.name.lower():
                continue

            extracted = form_dir / "extracted"
            if not extracted.is_dir():
                continue

            for csv_file in extracted.glob("*_SCT.csv"):
                out.append(csv_file)
                if max_files and len(out) >= max_files:
                    return out

    return out

# discover
csv_files = find_extracted_csvs(DATA_ROOT, tickers=TICKERS, max_files=MAX_FILES)

print(f"Found {len(csv_files)} SCT file(s).")
for p in csv_files[:10]:
    print(" ", p)

# Optional debugging: list tickers + form folders
print("\nDEBUG — TICKER folders under 'data/':")
for d in Path(DATA_ROOT).iterdir():
    if d.is_dir():
        print(" ", d)

print("\nDEBUG — FORM folders under first ticker:")
first_ticker = next((d for d in Path(DATA_ROOT).iterdir() if d.is_dir()), None)
if first_ticker:
    for d in first_ticker.iterdir():
        if d.is_dir():
            print(" ", d)

Found 0 SCT file(s).

DEBUG — TICKER folders under 'data/':


FileNotFoundError: [Errno 2] No such file or directory: 'data'