In [1]:
pip install pdfplumber pandas

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pdfplumber
import pandas as pd
import re

def _parse_number(s):
    """Parse a number string like '$1,234', '(1,234)', '…', '-' into int or None."""
    if s is None:
        return None
    s = str(s).strip()
    if s in ('', '…', '-'):
        return None
    s = s.replace('$', '').replace(',', '').strip()
    if s == '':
        return None
    if s.startswith('(') and s.endswith(')'):
        return -int(s[1:-1])
    return int(s)

def parse_summary_table(text):
    """Parse summary-table rows out of a block of extracted text.

    A row is a line consisting of a 4-digit year followed by exactly four
    whitespace-separated cells made of digits, '$', ',', parentheses, '-'
    or '…'. Lines that do not have this shape are ignored.

    Parameters
    ----------
    text : str or None
        Text of the section that contains the table.

    Returns
    -------
    pandas.DataFrame
        One row per matched line with the columns 'Calendar year',
        'Total income', 'Total cost', 'Net change during year' and
        'Asset Reserves at end of year'. The columns are present even when
        ``text`` is empty or no line matches, so the result can always be
        merged on 'Calendar year'.
    """
    columns = [
        'Calendar year',
        'Total income',
        'Total cost',
        'Net change during year',
        'Asset Reserves at end of year',
    ]
    if not text:
        return pd.DataFrame(columns=columns)

    # Year, then one numeric cell per remaining column; compiled once per call.
    cell = r'\s+([$\d,()\-…]+)'
    row_re = re.compile(r'^\s*(\d{4})' + cell * (len(columns) - 1) + r'\s*$')

    rows = []
    for line in text.splitlines():
        m = row_re.match(line)
        if m is None:
            continue
        year, *cells = m.groups()
        values = [int(year)] + [_parse_number(c) for c in cells]
        rows.append(dict(zip(columns, values)))

    # Passing `columns` keeps the schema when `rows` is empty; without it an
    # empty frame has no 'Calendar year' column and a later merge fails.
    return pd.DataFrame(rows, columns=columns)

def parse_income_table(text):
    """Parse income-table rows out of a block of extracted text.

    A row is a line consisting of a 4-digit year followed by exactly five
    whitespace-separated cells made of digits, '$', ',', parentheses, '-'
    or '…'. Lines that do not have this shape are ignored.

    Parameters
    ----------
    text : str or None
        Text of the section that contains the table.

    Returns
    -------
    pandas.DataFrame
        One row per matched line with the columns 'Calendar year',
        'Total income (income table)', 'Net payroll tax contributions',
        'Income from taxation of benefits', 'General Fund Transfers' and
        'Net interest'. The columns are present even when ``text`` is empty
        or no line matches, so the result can always be merged on
        'Calendar year'.
    """
    columns = [
        'Calendar year',
        'Total income (income table)',
        'Net payroll tax contributions',
        'Income from taxation of benefits',
        'General Fund Transfers',
        'Net interest',
    ]
    if not text:
        return pd.DataFrame(columns=columns)

    # Year, then one numeric cell per remaining column; compiled once per call.
    cell = r'\s+([$\d,()\-…]+)'
    row_re = re.compile(r'^\s*(\d{4})' + cell * (len(columns) - 1) + r'\s*$')

    rows = []
    for line in text.splitlines():
        m = row_re.match(line)
        if m is None:
            continue
        year, *cells = m.groups()
        values = [int(year)] + [_parse_number(c) for c in cells]
        rows.append(dict(zip(columns, values)))

    # Passing `columns` keeps the schema when `rows` is empty; without it an
    # empty frame has no 'Calendar year' column and a later merge fails.
    return pd.DataFrame(rows, columns=columns)

def parse_cost_table(text):
    """Parse cost-table rows out of a block of extracted text.

    A row is a line consisting of a 4-digit year followed by exactly four
    whitespace-separated cells made of digits, '$', ',', parentheses, '-'
    or '…'. Lines that do not have this shape are ignored.

    Parameters
    ----------
    text : str or None
        Text of the section that contains the table.

    Returns
    -------
    pandas.DataFrame
        One row per matched line with the columns 'Calendar year',
        'Total cost (cost table)', 'Benefit payments',
        'Administrative expenses' and
        'Transfers to Railroad Retirement program'. The columns are present
        even when ``text`` is empty or no line matches, so the result can
        always be merged on 'Calendar year'.
    """
    columns = [
        'Calendar year',
        'Total cost (cost table)',
        'Benefit payments',
        'Administrative expenses',
        'Transfers to Railroad Retirement program',
    ]
    if not text:
        return pd.DataFrame(columns=columns)

    # Year, then one numeric cell per remaining column; compiled once per call.
    cell = r'\s+([$\d,()\-…]+)'
    row_re = re.compile(r'^\s*(\d{4})' + cell * (len(columns) - 1) + r'\s*$')

    rows = []
    for line in text.splitlines():
        m = row_re.match(line)
        if m is None:
            continue
        year, *cells = m.groups()
        values = [int(year)] + [_parse_number(c) for c in cells]
        rows.append(dict(zip(columns, values)))

    # Passing `columns` keeps the schema when `rows` is empty; without it an
    # empty frame has no 'Calendar year' column and a later merge fails.
    return pd.DataFrame(rows, columns=columns)

def extract_oasi_data(
    pdf_path,
    summary_title="Old-Age and Survivors Insurance Trust Fund, 1937-2024",
    income_title="Old-Age and Survivors Insurance Trust Fund Income",
    cost_title="Old-Age and Survivors Insurance Trust Fund Cost",
):
    """
    Extract and merge three tables from OASI Trust Fund PDF.

    The text of every page is concatenated and split into three sections at
    the first occurrence of each heading: summary (summary heading up to the
    income heading), income (income heading up to the cost heading) and cost
    (cost heading to the end of the text). Each section is parsed by its own
    parser and the results are outer-joined on 'Calendar year'.

    Parameters
    ----------
    pdf_path : path accepted by ``pdfplumber.open``
        Location of the PDF.
    summary_title, income_title, cost_title : str
        Heading text that marks the start of each section. Override these
        when the PDF uses different wording (for example another year range).

    Returns
    -------
    pandas.DataFrame
        Merged on 'Calendar year', sorted by that column, with a fresh
        0..n-1 index.

    Warns
    -----
    UserWarning
        When no text could be isolated for a section (a heading was not
        found, or the headings appear in an unexpected order). The columns
        of such a section come back empty.
    """
    import warnings  # local import: keeps this function self-contained

    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page; treat that as empty text.
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    full_text = "\n".join(pages_text)

    s_idx = full_text.find(summary_title)
    i_idx = full_text.find(income_title)
    c_idx = full_text.find(cost_title)

    # A section is only sliced when both of its boundary headings were found.
    summary_text = full_text[s_idx:i_idx] if (s_idx != -1 and i_idx != -1) else None
    income_text = full_text[i_idx:c_idx] if (i_idx != -1 and c_idx != -1) else None
    cost_text = full_text[c_idx:] if (c_idx != -1) else None

    # Surface section-splitting problems instead of silently returning a
    # frame whose columns are empty.
    sections = (("summary", summary_text), ("income", income_text), ("cost", cost_text))
    unresolved = [name for name, section in sections if not section]
    if unresolved:
        warnings.warn(
            "No text could be isolated for the "
            + ", ".join(unresolved)
            + " section(s); heading offsets found (-1 = missing): "
            + f"summary={s_idx}, income={i_idx}, cost={c_idx}. "
            + "Columns from those sections will be empty.",
            stacklevel=2,
        )

    df_summary = parse_summary_table(summary_text)
    df_income = parse_income_table(income_text)
    df_cost = parse_cost_table(cost_text)

    merged = df_summary.merge(df_income, on='Calendar year', how='outer')
    merged = merged.merge(df_cost, on='Calendar year', how='outer')
    merged = merged.sort_values('Calendar year').reset_index(drop=True)
    return merged

if __name__ == "__main__":
    import os
    import traceback

    # The PDF location is machine-specific. Allow it to be overridden with the
    # OASI_PDF_PATH environment variable; the original path stays the default
    # so existing runs behave exactly as before.
    pdf_file = os.environ.get(
        "OASI_PDF_PATH",
        r"C:\Users\ia_mc\OneDrive\Documentos\CSUF ECON + ACCT\ACCT 404\Social Security Project\OASI Trust Fund, a Social Security fund.pdf",
    )
    try:
        merged_data = extract_oasi_data(pdf_file)
        print("Successfully extracted and merged data!")
        print(f"Shape: {merged_data.shape}")
        print(f"Columns: {list(merged_data.columns)}")
        print(merged_data.head())
        # Written relative to the current working directory.
        output_file = "oasi_merged_data.csv"
        merged_data.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
    except Exception as e:
        # Report the failure with a full traceback rather than stopping the cell.
        print(f"Error: {e}")
        traceback.print_exc()

Successfully extracted and merged data!
Shape: (88, 14)
Columns: ['Calendar year', 'Total income', 'Total cost', 'Net change during year', 'Asset Reserves at end of year', 'Total income (income table)', 'Net payroll tax contributions', 'Income from taxation of benefits', 'General Fund Transfers', 'Net interest', 'Total cost (cost table)', 'Benefit payments', 'Administrative expenses', 'Transfers to Railroad Retirement program']
   Calendar year  Total income  Total cost  Net change during year  \
0           1937           767           1                     766   
1           1938           375          10                     366   
2           1939           607          14                     592   
3           1940           368          62                     306   
4           1941           845         114                     731   

   Asset Reserves at end of year  Total income (income table)  \
0                            766                        767.0   
1                