Import modules

In [None]:
import pandas as pd
import numpy as np
import os

Declare constants and import the dataset

In [None]:
# Locations used by the notebook: where the raw CSV lives, where the
# per-category output is written, and the raw file's name.
DIR = '../dataset/'
OUT_DIR = '../udataset/acs'
DATASET = 'acs_5yr_est_selected_economic_characteristics_2010-2022.csv'

# DIR already ends with a slash, so the two parts are joined directly.
df = pd.read_csv(f"{DIR}{DATASET}")

Rename columns and replace the "(X)" placeholder used for missing values with an empty string

In [None]:
# Assign short column names by position (the frame is expected to have
# exactly these eight columns, in this order).
df.columns = [
    'label',
    'category',
    'state',
    'estimate',
    'moe',
    'percent',
    'percent moe',
    'year',
]

# Cells that are exactly "(X)" become empty strings; the numeric conversions
# later in the notebook coerce those to NaN.
df = df.replace(to_replace="(X)", value="")

Convert estimate to numeric

In [None]:
# Drop thousands separators before parsing; anything that still fails to
# parse is coerced to NaN rather than raising.
df['estimate'] = pd.to_numeric(
    df['estimate'].str.replace(',', ''),
    errors='coerce',
)

Convert margin of error columns to numeric

In [None]:
# Both margin-of-error columns get the same treatment: strip the leading
# "±" sign and thousands separators, then parse, coercing failures to NaN.
for moe_column in ('moe', 'percent moe'):
    stripped = df[moe_column].str.replace('±', '').str.replace(',', '')
    df[moe_column] = pd.to_numeric(stripped, errors='coerce')

Clean the percent column by removing percentage signs and replacing any value that contains a comma but no percent sign (i.e. a non-percentage) with 100

In [None]:
def filter_percent(val):
    """Normalize a single raw value from the percent column.

    Args:
        val: One cell value. Normally a string such as ``'12.5%'``, but may
            also be a missing value or an already-numeric value.

    Returns:
        - ``val`` unchanged when it is missing (``None``/NaN).
        - ``val`` unchanged when it is not a string (e.g. already a number).
        - ``100`` when the string contains a comma but no percent sign.
        - Otherwise the string with every ``'%'`` removed; the caller is
          expected to convert it to a number afterwards.
    """
    if pd.isnull(val):
        return val
    if not isinstance(val, str):
        # The substring checks below only make sense for text; without this
        # guard a numeric cell would raise TypeError on `',' in val`.
        return val
    if ',' in val and '%' not in val:
        return 100
    return val.replace('%', '')

# Clean each cell with filter_percent, then parse the result; values that
# still cannot be parsed are coerced to NaN.
df['percent'] = pd.to_numeric(
    df['percent'].apply(filter_percent),
    errors='coerce',
)

Preview the cleaned dataset

In [None]:
# Bare expression: presumably the last statement of a notebook cell, so the
# cleaned DataFrame is rendered as the cell's output.
df

Look at the dtypes of the columns

In [None]:
# Bare expression: shows the dtype of each column, which lets the reader
# check the result of the numeric conversions above.
df.dtypes

Group the data by category of question asked, sort it chronologically, and output it

In [None]:
# Split the cleaned table into one CSV file per question group.
#
# Rows whose category is 'Header' name a group; every non-header row that
# follows is collected under the most recently seen header. Each group is
# then sorted by year and state and written to OUT_DIR as "<label>.csv".
os.makedirs(OUT_DIR, exist_ok=True)

grouping: dict[str, list] = {}
current = None

income_and_benefits = 'INCOME AND BENEFITS'
poverty_line = 'POVERTY LEVEL'

for _, row in df.iterrows():
    cat: str = row['category']
    label: str = row['label']

    # Normalize header labels so that every label starting with
    # 'INCOME AND BENEFITS' shares one group, and every label ending with
    # 'POVERTY LEVEL' shares the 'PERCENT POVERTY LINE' group.
    if label.startswith(income_and_benefits):
        label = income_and_benefits

    if label.endswith(poverty_line):
        label = 'PERCENT POVERTY LINE'

    if cat == 'Header':
        # A header may repeat; keep the rows already collected for it.
        grouping.setdefault(label, [])
        current = label
    elif current is None:
        # Without this check the failure would be an opaque KeyError: None.
        raise ValueError(
            f"Data row {row['label']!r} appears before any 'Header' row"
        )
    else:
        grouping[current].append(row)

for label, data in grouping.items():
    if not data:
        # A header with no data rows would yield a column-less DataFrame,
        # on which dropping 'category' raises; there is nothing to write.
        continue
    df_label = pd.DataFrame(data)
    df_label = df_label.drop(columns=['category'])
    df_label = df_label.sort_values(by=["year", "state"], ignore_index=True)
    # Write into OUT_DIR (the directory created above) rather than a
    # separately hard-coded copy of the same path.
    df_label.to_csv(os.path.join(OUT_DIR, f"{label}.csv"), index=False)