In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the ACS extracts read below.
DIR = '../udataset/acs/'

# File names of the individual source tables.
income_data = 'INCOME AND BENEFITS.csv'
poverty_data = 'PERCENT_POVERTY_LINE.csv'
industry_data = 'INDUSTRY.csv'
health_data = 'HEALTH INSURANCE COVERAGE.csv'
worker_class_data = 'CLASS OF WORKER.csv'
# NOTE(review): this path sits under '../dataset/' rather than '../udataset/'
# like every other path in the notebook — confirm that is intended.
unemployment_data = '../dataset/unemployment.csv'

income_df = pd.read_csv(f'{DIR}{income_data}')
poverty_df = pd.read_csv(f'{DIR}{poverty_data}')

# Quick look at the schema of the poverty table.
print(poverty_df.columns)

industry_df = pd.read_csv(f'{DIR}{industry_data}')
# NOTE(review): health_df is not referenced by any later visible cell.
health_df = pd.read_csv(f'{DIR}{health_data}')
unemployment_df = pd.read_csv(unemployment_data)
worker_class_df = pd.read_csv(f'{DIR}{worker_class_data}')

def strip_columns(df):
    """Return a normalized copy of ``df``.

    Column names are stripped of surrounding whitespace, the ``state``
    column is stripped and lower-cased, and the ``label`` column receives the
    same treatment when it is present.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes the result and the raw frame stays available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``state`` column (after header stripping) holding strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with cleaned headers and cleaned key columns.

    Raises
    ------
    KeyError
        If no ``state`` column exists after the headers are stripped.
    """
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()

    # `state` is mandatory: a missing column signals a wrong input file.
    cleaned['state'] = cleaned['state'].str.strip().str.lower()

    # `label` only exists in the ACS tables, not in the unemployment table.
    if 'label' in cleaned.columns:
        cleaned['label'] = cleaned['label'].str.strip().str.lower()

    return cleaned

Index(['label', 'state', 'estimate', 'moe', 'percent', 'percent moe', 'year'], dtype='object')


In [3]:
# Normalize headers and the state/label text of every table used below.
unemployment_df, poverty_df, industry_df, income_df, worker_class_df = (
    strip_columns(frame)
    for frame in (unemployment_df, poverty_df, industry_df, income_df, worker_class_df)
)

print(unemployment_df.columns)

keep_cols = ['state', 'percent', 'year']
keep_cols_income = ['state', 'estimate', 'year']

# Poverty rate rows for the whole population.
is_all_people = poverty_df['label'] == 'all people'
filtered_poverty_df = poverty_df.loc[is_all_people, keep_cols].reset_index()

# Median household income rows.
is_median_income = income_df['label'] == 'median household income (dollars)'
filtered_income_df = income_df.loc[is_median_income, keep_cols_income]

# reset_index() keeps the old index as an `index` column; a later cell drops
# the resulting `index_x` / `index_y` columns after merging.
unemployment_df = unemployment_df.drop(columns=['estimate']).reset_index()
income_df = income_df.reset_index()

filtered_poverty_df.dtypes

Index(['state', 'estimate', 'year', 'labour_force_percent'], dtype='object')


index       int64
state      object
percent    object
year        int64
dtype: object

In [4]:
# Display the rows of the income table whose label is
# 'median household income (dollars)' (state, estimate, year).
filtered_income_df

Unnamed: 0,state,estimate,year
11,alabama,42081,2010
55,alaska,66521,2010
99,arizona,50448,2010
143,arkansas,39267,2010
187,california,60883,2010
...,...,...,...
29535,virginia,87249,2022
29579,washington,90325,2022
29623,west virginia,55217,2022
29667,wisconsin,72458,2022


In [5]:
# Attach the poverty percentage to each (state, year) row, then discard the
# two leftover index columns that the earlier reset_index() calls produced.
unemployment_df = (
    unemployment_df
    .merge(filtered_poverty_df, on=['state', 'year'])
    .drop(["index_x", "index_y"], axis=1)
)

In [6]:
# Turn strings such as "17.1%" into floats and give the column its final name.
poverty_percent = unemployment_df['percent'].str.replace("%", "").astype(float)
unemployment_df['percent'] = poverty_percent
unemployment_df = unemployment_df.rename(columns={'percent': 'below_poverty_line_percent'})

In [7]:
# Attach the median household income estimate to each (state, year) row.
unemployment_df = (
    unemployment_df
    .merge(filtered_income_df, on=['state', 'year'])
    .rename(columns={'estimate': 'median_household_income'})
)
unemployment_df

Unnamed: 0,state,year,labour_force_percent,below_poverty_line_percent,median_household_income
0,alabama,2010,8.7,17.1,42081
1,alabama,2011,9.6,17.6,42934
2,alabama,2012,10.3,18.1,43160
3,alabama,2013,10.8,18.6,43253
4,alabama,2014,10.2,18.9,43511
...,...,...,...,...,...
671,wyoming,2018,4.5,11.1,62268
672,wyoming,2019,4.5,11.0,64049
673,wyoming,2020,4.4,10.8,65304
674,wyoming,2021,4.2,10.7,68002


In [8]:
def get_label_function(desired_attr, df, is_string, desired_col):
    """Build a row-wise lookup into a long-format table.

    The returned function is meant for ``DataFrame.apply(..., axis=1)``: for a
    row carrying ``state`` and ``year`` it returns the value of
    ``desired_col`` from the first record of ``df`` whose ``label`` equals
    ``desired_attr`` and whose ``state`` and ``year`` match the row.

    Parameters
    ----------
    desired_attr : str
        Value of the ``label`` column to select.
    df : pandas.DataFrame
        Table with ``label``, ``state``, ``year`` and ``desired_col`` columns.
        The label filter is applied once, when this factory is called, so
        later changes to ``df`` are not seen by the returned function.
    is_string : bool
        When true, the value is converted to ``float`` after removing
        surrounding whitespace, ``%`` signs and thousands separators.
    desired_col : str
        Column of ``df`` to read the value from.

    Returns
    -------
    callable
        ``get_attr_column(row)`` returning the looked-up value.

    Raises
    ------
    IndexError
        From the returned function, when no record matches the row.
    """
    # The label filter does not depend on the row, so do it a single time
    # instead of once per applied row.
    label_rows = df[df['label'] == desired_attr]

    def get_attr_column(row):
        same_key = (label_rows['year'] == row['year']) & (label_rows['state'] == row['state'])
        matches = label_rows.loc[same_key, desired_col]

        if matches.empty:
            raise IndexError(
                "no '{}' record for state={!r}, year={!r}".format(
                    desired_attr, row['state'], row['year']
                )
            )

        # Duplicated records resolve to the first one.
        to_return = matches.iloc[0]

        if is_string:
            # str() lets already-numeric or missing (NaN) cells pass through.
            to_return = float(str(to_return).strip().replace('%', '').replace(',', ''))

        return to_return

    return get_attr_column

In [9]:
# Pull one extra income-table label into its own column, keyed by state/year.
attr = 'mean cash public assistance income (dollars)'
lookup_assistance_income = get_label_function(attr, income_df, False, 'estimate')
unemployment_df[attr] = unemployment_df.apply(lookup_assistance_income, axis=1)
unemployment_df

Unnamed: 0,state,year,labour_force_percent,below_poverty_line_percent,median_household_income,mean cash public assistance income (dollars)
0,alabama,2010,8.7,17.1,42081,2462
1,alabama,2011,9.6,17.6,42934,2582
2,alabama,2012,10.3,18.1,43160,2603
3,alabama,2013,10.8,18.6,43253,2611
4,alabama,2014,10.2,18.9,43511,2632
...,...,...,...,...,...,...
671,wyoming,2018,4.5,11.1,62268,3229
672,wyoming,2019,4.5,11.0,64049,3256
673,wyoming,2020,4.4,10.8,65304,3232
674,wyoming,2021,4.2,10.7,68002,3461


In [10]:
desired_attributes = industry_df['label'].unique()

# One percentage column per industry label. The first unique label is skipped
# (presumably the total-population row — confirm against the source table).
for attr in desired_attributes[1:]:
    lookup_industry_percent = get_label_function(attr, industry_df, True, 'percent')
    unemployment_df[attr] = unemployment_df.apply(lookup_industry_percent, axis=1)

In [11]:
desired_attributes = worker_class_df['label'].unique()
print(desired_attributes)

# One percentage column per class-of-worker label; the first unique label
# is skipped.
for attr in desired_attributes[1:]:
    lookup_worker_class_percent = get_label_function(attr, worker_class_df, True, 'percent')
    unemployment_df[attr] = unemployment_df.apply(lookup_worker_class_percent, axis=1)

['civilian employed population 16 years and over'
 'private wage and salary workers' 'government workers'
 'self-employed in own not incorporated business workers'
 'unpaid family workers']


In [12]:
# Remove the labour-force percentage column from the feature table.
unemployment_df = unemployment_df.drop('labour_force_percent', axis=1)
unemployment_df

Unnamed: 0,state,year,below_poverty_line_percent,median_household_income,mean cash public assistance income (dollars),"agriculture, forestry, fishing and hunting, and mining",construction,manufacturing,wholesale trade,retail trade,...,"finance and insurance, and real estate and rental and leasing","professional, scientific, and management, and administrative and waste management services","educational services, and health care and social assistance","arts, entertainment, and recreation, and accommodation and food services","other services, except public administration",public administration,private wage and salary workers,government workers,self-employed in own not incorporated business workers,unpaid family workers
0,alabama,2010,17.1,42081,2462,1.9,7.9,14.5,3.1,12.0,...,5.8,8.7,20.8,7.6,5.1,5.4,77.5,16.3,6.0,0.2
1,alabama,2011,17.6,42934,2582,1.9,7.5,14.2,2.9,12.0,...,5.7,8.9,21.4,7.7,5.2,5.6,77.4,16.6,5.8,0.2
2,alabama,2012,18.1,43160,2603,1.8,7.2,13.8,2.8,12.1,...,5.6,9.0,21.7,7.9,5.2,5.7,77.5,16.8,5.6,0.1
3,alabama,2013,18.6,43253,2611,1.8,6.9,13.7,2.6,12.0,...,5.6,9.1,22.2,8.1,5.2,5.7,77.4,16.9,5.5,0.2
4,alabama,2014,18.9,43511,2632,1.8,6.6,13.8,2.6,12.1,...,5.6,9.1,22.3,8.3,5.2,5.8,77.7,16.7,5.4,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,wyoming,2018,11.1,62268,3229,11.5,8.0,4.1,2.0,11.0,...,4.0,6.7,23.6,10.7,4.4,6.1,73.1,20.3,6.3,0.3
672,wyoming,2019,11.0,64049,3256,11.0,8.2,4.1,1.9,11.0,...,4.1,6.8,24.3,10.5,4.6,5.8,72.7,20.4,6.6,0.3
673,wyoming,2020,10.8,65304,3232,10.5,8.2,4.3,1.9,10.8,...,4.5,6.5,25.1,9.8,4.6,5.9,72.3,20.6,6.6,0.4
674,wyoming,2021,10.7,68002,3461,10.1,8.3,4.2,1.8,11.3,...,4.6,6.6,24.8,9.6,4.7,6.3,71.9,21.1,6.7,0.4


In [13]:
print(unemployment_df.dtypes)

# Min-max scale every column after `state` and `year` to the range [0, 1].
for col in unemployment_df.columns[2:]:
    values = unemployment_df[col]

    if not pd.api.types.is_numeric_dtype(values):
        # Text-encoded numbers may carry padding and thousands separators.
        values = values.str.strip().str.replace(',', '')
    # Integer columns are cast directly; they have no `.str` accessor.
    values = values.astype(float)

    col_min = values.min()
    span = values.max() - col_min

    scaled = values - col_min
    # A constant column has zero range: leave it at 0.0 rather than dividing
    # by zero and filling the column with NaN.
    if span != 0:
        scaled = scaled / span
    unemployment_df[col] = scaled

state                                                                                          object
year                                                                                            int64
below_poverty_line_percent                                                                    float64
median_household_income                                                                        object
mean cash public assistance income (dollars)                                                   object
agriculture, forestry, fishing and hunting, and mining                                        float64
construction                                                                                  float64
manufacturing                                                                                 float64
wholesale trade                                                                               float64
retail trade                                                                      

In [14]:
# Each positional group of columns is divided by a fixed weight that equals
# the number of columns in the group (3, 13 and 4), presumably so that every
# group contributes the same total weight — confirm with the consumer of the
# exported file.
# (start, stop, divisor) per group, as positions in unemployment_df.columns.
column_groups = (
    (2, 5, 3),
    (5, 18, 13),
    (18, 22, 4),
)

for start, stop, divisor in column_groups:
    for col in unemployment_df.columns[start:stop]:
        unemployment_df[col] = unemployment_df[col] / divisor

In [15]:
# Write the final feature table next to the source data, without the index.
NEWNAME = 'State_Data.csv'
output_path = '../udataset/' + NEWNAME
unemployment_df.to_csv(output_path, index=False)