In [None]:
import requests
from bs4 import BeautifulSoup
import yaml

# Pages to scrape, keyed by the date string under which each page's figures
# are stored. The first three are web.archive.org snapshots of the IRS
# Schedule A instructions page; the last one is the live irs.gov page.
# NOTE(review): the live page is not a pinned snapshot and its fragment
# mentions "2023" while the key is 2024 - confirm it is the intended source.
IRS_INSTRUCTIONS_URL = "https://www.irs.gov/instructions/i1040sca"
WAYBACK_PREFIX = "https://web.archive.org/web"

urls = {
    "2021-01-01": f"{WAYBACK_PREFIX}/20220508102639/{IRS_INSTRUCTIONS_URL}",
    "2022-01-01": f"{WAYBACK_PREFIX}/20230526201403/{IRS_INSTRUCTIONS_URL}",
    "2023-01-01": f"{WAYBACK_PREFIX}/20240406111850/{IRS_INSTRUCTIONS_URL}",
    "2024-01-01": f"{IRS_INSTRUCTIONS_URL}#en_US_2023_publink10005349",
}

# Full state name (as matched against the scraped table header text)
# -> two-letter postal abbreviation. Covers the 50 states plus DC.
state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
}

# Labels "1" through "19", one per income row of a table section; they become
# the bracket keys of the YAML output.
income_brackets = [str(position) for position in range(1, 20)]

# Helper function to format numbers
def format_number_with_underscores(num_str):
    """Render a comma-grouped integer string with underscore grouping.

    Args:
        num_str: Text such as "1,234".

    Returns:
        The integer formatted with "_" as the thousands separator
        (e.g. "1_234"), or ``num_str`` itself, unchanged, when the text is
        not an integer once its commas are removed.
    """
    without_commas = num_str.replace(",", "")
    try:
        value = int(without_commas)
    except ValueError:
        # Not an integer (plain text, "$" amounts, ...): hand the input back.
        return num_str
    return format(value, "_")

In [None]:
# Scrape the sales tax table behind every entry of `urls` and collect it as
#     state_data_combined[state_abbr][family_size][year] -> [amount, ...]
# with family_size running 1..6 and one formatted amount per income row,
# in table order.

# Layout of the scraped table, as this parser reads it.
REQUEST_TIMEOUT_SECONDS = 60   # do not hang the kernel on an unresponsive server
HEADER_ROWS = 2                # leading rows dropped before sectioning
INCOME_ROWS_PER_SECTION = 19   # data rows under each state-name row
ROWS_PER_SECTION = 1 + INCOME_ROWS_PER_SECTION
HEADER_CELLS_PER_STATE = 3     # a state name sits in every 3rd cell of the state-name row
FAMILY_SIZE_COLUMNS = 6        # amount columns per state in a data row
FIRST_AMOUNT_COLUMN = 2        # data rows start with two non-amount cells (presumably the income bounds)

# Dictionary to store combined state data
state_data_combined = {}

for year, url in urls.items():
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if not response.ok:
        # Stay best-effort (a year with no data is emitted as None by the
        # formatting step) but report the failure instead of silently parsing
        # an error page.
        print(f"Warning: skipping {year}: HTTP {response.status_code} for {url}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the table containing tax data
    table = soup.find('div', {'class': 'table-contents'})
    if table is None:
        print(f"Warning: skipping {year}: no 'table-contents' div found at {url}")
        continue

    # One list of stripped cell texts per table row.
    data = [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]

    # Remove header rows
    del data[:HEADER_ROWS]

    # Only complete sections are processed; a trailing partial section is ignored.
    num_sections = len(data) // ROWS_PER_SECTION

    for section_index in range(num_sections):
        start_index = section_index * ROWS_PER_SECTION
        section = data[start_index:start_index + ROWS_PER_SECTION]
        state_name_row, income_rows = section[0], section[1:]

        # Extract state names from the first row of the section
        states = state_name_row[1::HEADER_CELLS_PER_STATE]

        for position, state in enumerate(states):
            # Unknown header text is kept verbatim as the key.
            state_abbr = state_abbreviations.get(state, state)

            if state_abbr not in state_data_combined:
                state_data_combined[state_abbr] = {
                    family_size: {} for family_size in range(1, FAMILY_SIZE_COLUMNS + 1)
                }

            # Each state is read as FAMILY_SIZE_COLUMNS consecutive amount
            # columns, so the n-th state of a section must start n * 6 columns
            # after the first one. Stepping by the header stride (3) here, as
            # the code used to, makes neighbouring states read overlapping
            # columns.
            # NOTE(review): assumes the data rows hold 6 contiguous columns per
            # state in header order - confirm against the scraped page.
            first_column = FIRST_AMOUNT_COLUMN + position * FAMILY_SIZE_COLUMNS

            # Special case kept from the original: Wyoming always reads the
            # first state's columns, whatever its position in the header.
            # NOTE(review): if Wyoming shares its section with another state
            # this picks up that state's amounts - verify against the page.
            if state_abbr == "WY":
                first_column = FIRST_AMOUNT_COLUMN

            for row in income_rows:
                for offset in range(FAMILY_SIZE_COLUMNS):
                    col_index = first_column + offset

                    # Prevent out-of-range errors on short rows; the cell is
                    # skipped, so the resulting list can hold fewer than 19 items.
                    if col_index >= len(row):
                        continue

                    state_data_combined[state_abbr][offset + 1].setdefault(year, []).append(
                        format_number_with_underscores(row[col_index])
                    )

            # Ignore whatever follows Wyoming in this section's header row.
            # (This leaves only the per-state loop; later sections, if any,
            # are still processed.)
            if state_abbr == "WY":
                break

# Alaska residents do not have a state sales tax: when the tables do not list
# Alaska, fill every family size and year with zeros.
if "AK" not in state_data_combined:
    state_data_combined["AK"] = {
        family_size: {year: ["0"] * INCOME_ROWS_PER_SECTION for year in urls}
        for family_size in range(1, FAMILY_SIZE_COLUMNS + 1)
    }



In [None]:
# Format data for YAML: reshape
#     state -> family size -> year -> [amount per income row]
# into
#     state -> family size -> income bracket -> year -> amount
formatted_state_dicts = {}
for state, family_data in state_data_combined.items():
    formatted_state_dicts[state] = {}
    for family_size, tax_data in family_data.items():
        by_bracket = {}
        for i, bracket in enumerate(income_brackets):
            by_year = {}
            for year in urls:
                amounts = tax_data.get(year, [])
                # A year can be missing entirely, or hold fewer amounts than
                # there are brackets (the scraping loop skips cells that are
                # absent from a row). Emit None for such gaps instead of
                # raising IndexError.
                by_year[year] = amounts[i] if i < len(amounts) else None
            by_bracket[bracket] = by_year
        formatted_state_dicts[state][int(family_size)] = by_bracket

# Generate YAML output, keeping insertion order and block style.
# Every single quote is then stripped from the text, presumably so that the
# quoted string keys and underscore-grouped amounts appear bare; note that
# this would also remove apostrophes inside any value.
yaml_output = yaml.dump(
    formatted_state_dicts,
    sort_keys=False,
    default_flow_style=False
).replace("'", "")

# Print YAML
print(yaml_output)