In [1]:
import pandas as pd
import numpy as np

In [2]:
# Folder that holds the input CSV files.
DIR = '../udataset/acs/'

# File names of the three inputs, relative to DIR.
income_data = 'INCOME AND BENEFITS.csv'
poverty_data = 'PERCENT POVERTY LINE.csv'
unemployment_data = 'unemployment.csv'

# Read each input into its own DataFrame.
income_df = pd.read_csv(f'{DIR}{income_data}')
poverty_df = pd.read_csv(f'{DIR}{poverty_data}')
unemployment_df = pd.read_csv(f'{DIR}{unemployment_data}')

def strip_columns(df):
    """Return a cleaned copy of ``df`` with tidy headers and key text columns.

    The column names are stripped of surrounding whitespace, and the values
    of the ``state`` column (and of the ``label`` column, when present) are
    stripped and lower-cased so they can be compared and merged reliably.

    The input frame is left untouched: the cleaning is done on a copy, so
    calling this function never alters a frame the caller still holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose stripped column names include ``state``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned headers and values.

    Raises
    ------
    KeyError
        If there is no ``state`` column after the headers are stripped.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()

    cleaned['state'] = cleaned['state'].str.strip().str.lower()

    # 'label' is optional; only normalise it when the frame has one.
    if 'label' in cleaned.columns:
        cleaned['label'] = cleaned['label'].str.strip().str.lower()

    return cleaned

In [3]:

# Normalise headers and the state/label text of every input frame.
unemployment_df = strip_columns(unemployment_df)
poverty_df = strip_columns(poverty_df)
income_df = strip_columns(income_df)

# These two columns are not carried into the combined table.
unemployment_df = unemployment_df.drop(columns=['estimate', 'labour_force_percent'])

keep_cols = ['state', 'percent', 'year']
keep_cols_income = ['state', 'estimate', 'year']

# Poverty rows for the 'all people' label only.
# reset_index() (without drop=True) keeps the old index as an 'index' column;
# the merge cell further down removes it again as index_x / index_y.
is_all_people = poverty_df['label'] == 'all people'
filtered_poverty_df = poverty_df.loc[is_all_people, keep_cols].reset_index()

# Income rows for the median household income label only (original index kept).
is_median_income = income_df['label'] == 'median household income (dollars)'
filtered_income_df = income_df.loc[is_median_income, keep_cols_income]

unemployment_df = unemployment_df.reset_index()
income_df = income_df.reset_index()

filtered_poverty_df.dtypes

index        int64
state       object
percent    float64
year         int64
dtype: object

In [4]:
# Display the median-household-income rows selected from income_df.
filtered_income_df

Unnamed: 0,state,estimate,year
11,alabama,42081.0,2010
55,alaska,66521.0,2010
99,arizona,50448.0,2010
143,arkansas,39267.0,2010
187,california,60883.0,2010
...,...,...,...
29535,virginia,87249.0,2022
29579,washington,90325.0,2022
29623,west virginia,55217.0,2022
29667,wisconsin,72458.0,2022


In [5]:
# Join the poverty percentages onto the base frame by state and year
# (default inner join: only state/year pairs present in both frames survive).
unemployment_df = unemployment_df.merge(filtered_poverty_df, on=['state', 'year'])
# Both inputs carry an 'index' column from reset_index(); the merge suffixes
# them to index_x / index_y, and neither is needed afterwards.
unemployment_df = unemployment_df.drop(["index_x", "index_y"], axis=1)

In [6]:
# Give the merged poverty column a self-describing name.
unemployment_df = unemployment_df.rename({'percent': 'below_poverty_line_percent'}, axis='columns')

In [7]:
# Join the median household income estimates by state and year (inner join).
unemployment_df = unemployment_df.merge(filtered_income_df, on=['state', 'year'])

In [8]:
# Give the merged income column a self-describing name.
unemployment_df = unemployment_df.rename({'estimate': 'median_household_income'}, axis='columns')

In [9]:
def get_label_function(desired_attr, df, desired_col):
    """Build a row-wise lookup suitable for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    desired_attr :
        Value matched against ``df['label']``.
    df : pandas.DataFrame
        Frame with ``label``, ``year`` and ``state`` columns plus
        ``desired_col``.
    desired_col : str
        Name of the column in ``df`` whose value the lookup returns.

    Returns
    -------
    callable
        ``get_attr_column(row)`` takes anything indexable by ``'year'`` and
        ``'state'`` (a Series from ``apply`` or a plain dict) and returns the
        ``desired_col`` value of the first row of ``df`` that has the
        requested label, year and state. When no such row exists it returns
        ``numpy.nan`` instead of failing with an ``IndexError``.

    Notes
    -----
    The label filter is evaluated once, when this factory is called, rather
    than once per looked-up row; later changes to ``df`` are therefore not
    seen by a lookup that was already built.
    """
    # Loop-invariant: the label condition is the same for every row looked up.
    candidates = df[df['label'] == desired_attr]

    def get_attr_column(row):
        same_key = (candidates['year'] == row['year']) & (candidates['state'] == row['state'])
        hits = candidates.loc[same_key, desired_col]

        # No matching row: report a missing value rather than crashing.
        if hits.empty:
            return np.nan

        # Several matches are possible; the first one wins.
        return hits.iloc[0]

    return get_attr_column

In [10]:
# Label of the income row to attach; it is also used as the new column name.
attr = 'mean cash public assistance income (dollars)'
# For each (state, year) row, look up the matching 'estimate' in income_df.
unemployment_df[attr] = unemployment_df.apply(get_label_function(attr, income_df, 'estimate'), axis=1)
# index=False keeps the meaningless row index out of the file, so it does not
# come back as an "Unnamed: 0" column on re-read; this matches how the
# normalised export is written.
unemployment_df.to_csv('../udataset/wealth_data.csv', index=False)
unemployment_df

Unnamed: 0,state,year,below_poverty_line_percent,median_household_income,mean cash public assistance income (dollars)
0,alabama,2010,17.1,42081.0,2462.0
1,alabama,2011,17.6,42934.0,2582.0
2,alabama,2012,18.1,43160.0,2603.0
3,alabama,2013,18.6,43253.0,2611.0
4,alabama,2014,18.9,43511.0,2632.0
...,...,...,...,...,...
671,wyoming,2018,11.1,62268.0,3229.0
672,wyoming,2019,11.0,64049.0,3256.0
673,wyoming,2020,10.8,65304.0,3232.0
674,wyoming,2021,10.7,68002.0,3461.0


In [11]:
# Min-max scale every column after the first two to the [0, 1] range.
# NOTE(review): the positional slice assumes the first two columns are the
# non-numeric/key columns (state, year) - confirm if the column order changes.
for col in unemployment_df.columns[2:]:
    col_min = unemployment_df[col].min()
    col_range = unemployment_df[col].max() - col_min
    # A constant column has a zero range; dividing by it would turn the whole
    # column into NaN (0/0). Divide by 1 instead so such a column becomes 0.0
    # while genuinely missing values stay NaN.
    scale = col_range if col_range != 0 else 1
    unemployment_df[col] = (unemployment_df[col] - col_min) / scale

In [12]:
# File name of the min-max-scaled export.
NEWNAME = 'wealth_data_normalised.csv'
output_path = '../udataset/' + NEWNAME
# Written without the row index.
unemployment_df.to_csv(output_path, index=False)