# Simple Deterministic SCT → JSON (No LLM)

Step-by-step: each function in its own cell, run and print output after each stage.
Works even when the notebook CWD is not the repo root by resolving the "data/" folder.


In [1]:
# 1) Imports + robust data root resolver
import re, json
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import pandas as pd
from lxml import html
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 200)

def find_data_root() -> Path:
    for p in [Path.cwd(), *Path.cwd().parents]:
        d = p / 'data'
        if d.exists():
            return d
    raise FileNotFoundError('Could not locate a data/ folder above this notebook.')

DATA_ROOT = find_data_root()
# Hard-coded default; edit these three values as needed
TICKER = 'ABMD'
FORM = 'DEF_14A'
FNAME = '2005-07-15_DEF_14A.html'
HTML_PATH = DATA_ROOT / TICKER / FORM / FNAME
if not HTML_PATH.exists():
    HTML_PATH = DATA_ROOT / TICKER / FORM.replace('_',' ') / FNAME
print('CWD =', Path.cwd())
print('DATA_ROOT =', DATA_ROOT)
print('HTML_PATH =', HTML_PATH, 'exists =', HTML_PATH.exists())
assert HTML_PATH.exists(), f'Missing file: {HTML_PATH}'
# Derive ticker/date + output dir
ticker = HTML_PATH.parent.parent.name
m = re.match(r'(\d{4}-\d{2}-\d{2})_', HTML_PATH.stem)
report_date = m.group(1) if m else 'UNKDATE'
out_dir = HTML_PATH.parent / 'extracted'
out_dir.mkdir(parents=True, exist_ok=True)
print({'ticker': ticker, 'report_date': report_date, 'out_dir': out_dir.as_posix()})


CWD = /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/scripts/Test-Trials-experiments
DATA_ROOT = /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data
HTML_PATH = /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ABMD/DEF_14A/2005-07-15_DEF_14A.html exists = True
{'ticker': 'ABMD', 'report_date': '2005-07-15', 'out_dir': '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ABMD/DEF_14A/extracted'}


In [2]:
# 2) Find SCT table via XPath
def find_sct_table_element(html_path: Path):
    content = html_path.read_text(encoding='utf-8', errors='ignore')
    tree = html.fromstring(content)
    xpath_expr = """
//tr[
  .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'name')]
  and .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'principal')]
  and .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'position')]
]
"""
    tr_nodes = tree.xpath(xpath_expr)
    if not tr_nodes:
        return None
    table = tr_nodes[0].getparent()
    while table is not None and getattr(table, 'tag', None) != 'table':
        table = table.getparent()
    return table

tbl_el = find_sct_table_element(HTML_PATH)
print('Found table via XPath:', tbl_el is not None)
if tbl_el is None:
    raise RuntimeError('SCT table not found via XPath. Edit TICKER/FORM/FNAME or add a fallback.')
df_raw = pd.read_html(html.tostring(tbl_el, encoding='unicode'))[0]
display(df_raw.head(8))


Found table via XPath: True


  df_raw = pd.read_html(html.tostring(tbl_el, encoding='unicode'))[0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,,,,,Annual Compensation,Annual Compensation,Annual Compensation,Annual Compensation,Annual Compensation,Annual Compensation,Annual Compensation,Annual Compensation,,Long-Term Compensation Awards,Long-Term Compensation Awards,Long-Term Compensation Awards,Long-Term Compensation Awards,,,,
1,Name and Principal Position,,Fiscal Year Ended 3/31,,Salary($),Salary($),,Bonus($),Bonus($),,Other Annual Compensation ($),Other Annual Compensation ($),,Restricted Stock Award(s) ($)(1),Restricted Stock Award(s) ($)(1),,Securities Under- Lying Options (#),,All Other Compensation ($)(2),All Other Compensation ($)(2),
2,"Michael R. Minogue Chairman of the Board, President and Chief Executive Officer",,2005,,$,322508,,$,327500,,$,—,,$,259200,,400000,,$,475491,
3,,,,,,,,,,,,,,,,,,,,,
4,Dr. David M. Lederman,,2005,,$,200000,,$,—,,$,—,,$,—,,50000,,$,30240,
5,"Former Chairman of the Board,",,2004,,312000,312000,,156000,156000,,—,—,,—,—,,—,,30643,30643,
6,Former President and Former Chief Executive Officer,,2003,,309000,309000,,46800,46800,,—,—,,—,—,,50000,,36170,36170,
7,,,,,,,,,,,,,,,,,,,,,


In [3]:
# 3) Detect header row
CANON_ORDER = [
    'executive_name','position','year',
    'salary','bonus','stock_awards','option_awards',
    'non_equity_incentive','pension_value','all_other_comp','total',
]
KEY_TOKENS: Dict[str, List[str]] = {
    'salary': ['salary'],
    'bonus': ['bonus'],
    'stock_awards': ['stock awards','stock-awards'],
    'option_awards': ['option awards','option-awards'],
    'non_equity_incentive': ['non-equity incentive','non equity incentive'],
    'pension_value': ['change in pension','pension value','deferred compensation earnings'],
    'all_other_comp': ['all other compensation'],
    'total': ['total'],
    'year': ['year','fiscal year'],
    'name_position': ['name and principal position','name & principal position','principal position','name'],
}
PLACEHOLDER_HEADERS = {'','$','—','–','-'}

def detect_header_row(df: pd.DataFrame, max_rows: int = 6) -> pd.DataFrame:
    for i in range(min(max_rows, len(df))):
        row_low = df.iloc[i].astype(str).str.lower()
        joined = ' '.join(list(row_low))
        hits = 0
        for toks in KEY_TOKENS.values():
            if any(tok in joined for tok in toks):
                hits += 1
        if hits >= 2:
            df2 = df.copy()
            df2.columns = df2.iloc[i].astype(str).tolist()
            df2 = df2.iloc[i+1:].reset_index(drop=True)
            return df2
    return df

df_hdr = detect_header_row(df_raw)
print('After header detect: num rows/cols =', df_hdr.shape)
display(df_hdr.head(8))


After header detect: num rows/cols = (15, 21)


Unnamed: 0,Name and Principal Position,nan,Fiscal Year Ended 3/31,nan.1,Salary($),Salary($).1,nan.2,Bonus($),Bonus($).1,nan.3,Other Annual Compensation ($),Other Annual Compensation ($).1,nan.4,Restricted Stock Award(s) ($)(1),Restricted Stock Award(s) ($)(1).1,nan.5,Securities Under- Lying Options (#),nan.6,All Other Compensation ($)(2),All Other Compensation ($)(2).1,nan.7
0,"Michael R. Minogue Chairman of the Board, President and Chief Executive Officer",,2005.0,,$,322508.0,,$,327500,,$,—,,$,259200,,400000,,$,475491.0,
1,,,,,,,,,,,,,,,,,,,,,
2,Dr. David M. Lederman,,2005.0,,$,200000.0,,$,—,,$,—,,$,—,,50000,,$,30240.0,
3,"Former Chairman of the Board,",,2004.0,,312000,312000.0,,156000,156000,,—,—,,—,—,,—,,30643,30643.0,
4,Former President and Former Chief Executive Officer,,2003.0,,309000,309000.0,,46800,46800,,—,—,,—,—,,50000,,36170,36170.0,
5,,,,,,,,,,,,,,,,,,,,,
6,Dr. Karim Benali Vice President – Product Development,,2005.0,,$,113949.0,,$,140000,,$,—,,$,—,,105000,,$,57017.0,
7,,,,,,,,,,,,,,,,,,,,,


In [4]:
# 4) Flatten columns
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if isinstance(df.columns, pd.MultiIndex):
        cols = [' '.join(str(x) for x in tup if (str(x) and 'unnamed' not in str(x).lower())).strip() for tup in df.columns.to_list()]
        df.columns = cols
    else:
        df.columns = [str(c) for c in df.columns]
    return df

df_flat = flatten_columns(df_hdr.copy())
print('After flatten: num rows/cols =', df_flat.shape)
print('Columns (first 20):', list(df_flat.columns)[:20])
display(df_flat.head(8))


After flatten: num rows/cols = (15, 21)
Columns (first 20): ['Name and Principal Position', 'nan', 'Fiscal  Year  Ended  3/31', 'nan', 'Salary($)', 'Salary($)', 'nan', 'Bonus($)', 'Bonus($)', 'nan', 'Other  Annual  Compensation  ($)', 'Other  Annual  Compensation  ($)', 'nan', 'Restricted  Stock  Award(s)  ($)(1)', 'Restricted  Stock  Award(s)  ($)(1)', 'nan', 'Securities  Under-  Lying  Options  (#)', 'nan', 'All Other  Compensation  ($)(2)', 'All Other  Compensation  ($)(2)']


Unnamed: 0,Name and Principal Position,nan,Fiscal Year Ended 3/31,nan.1,Salary($),Salary($).1,nan.2,Bonus($),Bonus($).1,nan.3,Other Annual Compensation ($),Other Annual Compensation ($).1,nan.4,Restricted Stock Award(s) ($)(1),Restricted Stock Award(s) ($)(1).1,nan.5,Securities Under- Lying Options (#),nan.6,All Other Compensation ($)(2),All Other Compensation ($)(2).1,nan.7
0,"Michael R. Minogue Chairman of the Board, President and Chief Executive Officer",,2005.0,,$,322508.0,,$,327500,,$,—,,$,259200,,400000,,$,475491.0,
1,,,,,,,,,,,,,,,,,,,,,
2,Dr. David M. Lederman,,2005.0,,$,200000.0,,$,—,,$,—,,$,—,,50000,,$,30240.0,
3,"Former Chairman of the Board,",,2004.0,,312000,312000.0,,156000,156000,,—,—,,—,—,,—,,30643,30643.0,
4,Former President and Former Chief Executive Officer,,2003.0,,309000,309000.0,,46800,46800,,—,—,,—,—,,50000,,36170,36170.0,
5,,,,,,,,,,,,,,,,,,,,,
6,Dr. Karim Benali Vice President – Product Development,,2005.0,,$,113949.0,,$,140000,,$,—,,$,—,,105000,,$,57017.0,
7,,,,,,,,,,,,,,,,,,,,,


In [5]:
# 5) Normalize headers + drop placeholder columns (robust for duplicate names)
def normalize_header(h: str) -> str:
    s = re.sub(r'\s+', ' ', str(h)).strip()
    s = re.sub(r'\([^)]*\)', '', s)
    s = s.replace(' ', ' ')
    s = re.sub(r'\s+', ' ', s).strip().lower()
    return s

def is_placeholder_col(obj) -> bool:
    import pandas as pd
    if isinstance(obj, pd.DataFrame):
        return all(is_placeholder_col(obj[c]) for c in obj.columns)
    sr = obj
    vals = sr.dropna().astype(str).str.strip().str.replace(' ',' ', regex=False)
    if vals.empty:
        return True
    return vals.str.fullmatch(r'(\$)?|—|–|-').all()

from typing import Tuple
def normalize_headers_and_drop_placeholders(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    df2 = df.dropna(how='all').reset_index(drop=True).copy()
    df2.columns = [normalize_header(c) for c in df2.columns]
    df2 = df2.loc[:, ~df2.columns.duplicated()]
    dropped: List[str] = []
    for c in list(df2.columns):
        if c in PLACEHOLDER_HEADERS:
            dropped.append(c)
            continue
        if is_placeholder_col(df2[c]):
            dropped.append(c)
    df2 = df2.drop(columns=list(set(dropped)), errors='ignore')
    return df2, dropped

df_norm, dropped_cols = normalize_headers_and_drop_placeholders(df_flat.copy())
print('Dropped placeholder columns:', dropped_cols)
print('After normalize+drop: num rows/cols =', df_norm.shape)
print('Columns (first 20):', list(df_norm.columns)[:20])
display(df_norm.head(8))


Dropped placeholder columns: ['nan', 'other annual compensation', 'restricted stock award']
After normalize+drop: num rows/cols = (10, 6)
Columns (first 20): ['name and principal position', 'fiscal year ended 3/31', 'salary', 'bonus', 'securities under- lying options', 'all other compensation']


Unnamed: 0,name and principal position,fiscal year ended 3/31,salary,bonus,securities under- lying options,all other compensation
0,"Michael R. Minogue Chairman of the Board, President and Chief Executive Officer",2005,$,$,400000,$
1,Dr. David M. Lederman,2005,$,$,50000,$
2,"Former Chairman of the Board,",2004,312000,156000,—,30643
3,Former President and Former Chief Executive Officer,2003,309000,46800,50000,36170
4,Dr. Karim Benali Vice President – Product Development,2005,$,$,105000,$
5,Javier Jimenez Vice President - Operations,2005,$,$,80000,$
6,Dr. Robert T.V. Kung,2005,$,$,20000,$
7,Senior Vice President - Chief,2004,208000,75000,16000,4602


In [6]:
# 6) Canonical mapping + best-column selection
def canonical_for(col: str) -> Optional[str]:
    c = normalize_header(col)
    if 'name' in c and 'position' in c:
        return 'name_position'
    for key, toks in KEY_TOKENS.items():
        for t in toks:
            if t in c:
                return key
    return None

def numeric_score(sr: pd.Series) -> int:
    return pd.to_numeric(
        sr.astype(str).str.replace(r'[\$,]','', regex=True)
          .str.replace(r'\s','', regex=True)
          .str.replace(r'^\((.*)\)$', r'-\1', regex=True),
        errors='coerce'
    ).notna().sum()

def select_best_columns(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str,str]]:
    groups: Dict[str, List[str]] = {}
    for c in df.columns:
        key = canonical_for(c)
        if key:
            groups.setdefault(key, []).append(c)
    keep: Dict[str, str] = {}
    for key, cols in groups.items():
        if key in ('executive_name','position'):
            continue
        best = max(cols, key=lambda x: numeric_score(df[x])) if cols else None
        if best:
            keep[key] = best
    if 'year' in groups:
        keep['year'] = max(groups['year'], key=lambda x: numeric_score(df[x]))
    if 'name_position' in groups:
        keep['name_position'] = groups['name_position'][0]
    sel = df[list(keep.values())].copy() if keep else df.copy()
    sel.columns = list(keep.keys()) if keep else sel.columns
    return sel, keep

df_sel, kept_map = select_best_columns(df_norm.copy())
print('Kept (canonical → original):')
for k,v in kept_map.items():
    print(f'  {k:22s} <- {v}')
display(df_sel.head(8))


Kept (canonical → original):
  name_position          <- name and principal position
  year                   <- fiscal year ended 3/31
  salary                 <- salary
  bonus                  <- bonus
  all_other_comp         <- all other compensation


Unnamed: 0,name_position,year,salary,bonus,all_other_comp
0,"Michael R. Minogue Chairman of the Board, President and Chief Executive Officer",2005,$,$,$
1,Dr. David M. Lederman,2005,$,$,$
2,"Former Chairman of the Board,",2004,312000,156000,30643
3,Former President and Former Chief Executive Officer,2003,309000,46800,36170
4,Dr. Karim Benali Vice President – Product Development,2005,$,$,$
5,Javier Jimenez Vice President - Operations,2005,$,$,$
6,Dr. Robert T.V. Kung,2005,$,$,$
7,Senior Vice President - Chief,2004,208000,75000,4602


In [7]:
# 7) Finalize fields (numeric coercion, split name/position)
def to_number(x):
    s = str(x).strip().replace(' ',' ')
    if s in ('','-','–','—'):
        return pd.NA
    s = re.sub(r'[,$]','', s)
    m = re.fullmatch(r'\((.*)\)', s)
    if m:
        s = '-' + m.group(1)
    try:
        v = float(s)
        return int(v) if v.is_integer() else v
    except Exception:
        return pd.NA

def finalize_fields(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    if 'name_position' in out.columns:
        parts = out['name_position'].astype(str).str.split(',', n=1)
        out['executive_name'] = parts.str[0].str.strip()
        out['position'] = parts.str[1].str.strip() if (parts.apply(len) > 1).any() else ''
        out = out.drop(columns=['name_position'])
    if 'year' in out.columns:
        out['year'] = pd.to_numeric(out['year'], errors='coerce').astype('Int64')
    for k in ['salary','bonus','stock_awards','option_awards','non_equity_incentive','pension_value','all_other_comp','total']:
        if k in out.columns:
            out[k] = out[k].map(to_number)
    cols = [c for c in CANON_ORDER if c in out.columns]
    cols += [c for c in out.columns if c not in cols]
    return out[cols]

df_final = finalize_fields(df_sel.copy())
print('Final dtypes:')
print(df_final.dtypes)
display(df_final.head(8))


Final dtypes:
executive_name    object
position          object
year               Int64
salary            object
bonus             object
all_other_comp    object
dtype: object


Unnamed: 0,executive_name,position,year,salary,bonus,all_other_comp
0,Michael R. Minogue Chairman of the Board,President and Chief Executive Officer,2005,,,
1,Dr. David M. Lederman,,2005,,,
2,Former Chairman of the Board,,2004,312000.0,156000.0,30643.0
3,Former President and Former Chief Executive Officer,,2003,309000.0,46800.0,36170.0
4,Dr. Karim Benali Vice President – Product Development,,2005,,,
5,Javier Jimenez Vice President - Operations,,2005,,,
6,Dr. Robert T.V. Kung,,2005,,,
7,Senior Vice President - Chief,,2004,208000.0,75000.0,4602.0


In [8]:
# 8) Quality flags (totals)
def add_quality_flags(df: pd.DataFrame) -> pd.DataFrame:
    req = ['salary','bonus','stock_awards','option_awards','non_equity_incentive','pension_value','all_other_comp']
    out = df.copy()
    if 'total' in out.columns:
        for k in req:
            if k not in out.columns:
                out[k] = 0
        base_num = out[req].apply(pd.to_numeric, errors='coerce').fillna(0)
        out['total_calc'] = base_num.sum(axis=1)
        out['total_diff'] = (pd.to_numeric(out.get('total'), errors='coerce') - out['total_calc']).abs()
        out['total_ok'] = (out['total_diff'] <= 5)
    return out

df_checked = add_quality_flags(df_final.copy())
mismatch = df_checked[df_checked.get('total_ok') == False]  # noqa: E712
print('Rows with total mismatch:', len(mismatch))
display(df_checked.head(10))
display(mismatch.head(10))


KeyError: False

In [None]:
# 9) Build JSON and save
def df_to_json_payload(df: pd.DataFrame, ticker: str, report_date: str) -> dict:
    execs: Dict[str, Dict[str, dict]] = {}
    def to_val(v):
        if pd.isna(v):
            return None
        if isinstance(v, (int,)):
            return int(v)
        if isinstance(v, float) and v.is_integer():
            return int(v)
        return v
    for _, row in df.iterrows():
        name = str(row.get('executive_name') or '').strip()
        pos = str(row.get('position') or '').strip()
        year = row.get('year')
        if pd.isna(year) or not name:
            continue
        yr = str(int(year))
        slot = execs.setdefault(name, {})
        slot[yr] = {k: to_val(row.get(k)) for k in ['position','salary','bonus','stock_awards','option_awards','non_equity_incentive','pension_value','all_other_comp','total'] if k in df.columns}
        if pos:
            slot[yr]['position'] = pos
    return {
        'company': {
            'ticker': ticker.upper(),
            'report_year': report_date[:4] if len(report_date) >= 4 else '',
            'reports': [{
                'report_date': report_date,
                'executives': execs,
            }],
        }
    }

payload = df_to_json_payload(df_checked, ticker=ticker, report_date=report_date)
out_json = out_dir / f'{ticker}_{report_date}_SCT.det.json'
out_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False))
print('Wrote JSON:', out_json)
payload
