# Join the unemployment data, which is at the county level

In [1]:
import pandas as pd

In [2]:
# Load the unemployment workbook; the first 4 rows are irrelevant and are skipped.
unemployment_data = pd.read_excel(
    '../Datasets/Unemployment.xlsx',
    skiprows=4,
)

## The FIPS_code in the unemployment data is the 5-digit county FIPS code
It needs to be reformatted because FIPS codes with leading 0s were truncated when the column was read as a number

In [3]:
# Peek at the identifying columns of the first five rows.
preview_columns = ['FIPS_code', 'State', 'Area_name']
unemployment_data[preview_columns].head(n=5)

Unnamed: 0,FIPS_code,State,Area_name
0,0,US,United States
1,1000,AL,Alabama
2,1001,AL,"Autauga County, AL"
3,1003,AL,"Baldwin County, AL"
4,1005,AL,"Barbour County, AL"


In [4]:
# Remove the national ("United States") row, which carries index label 0.
# NOTE: state-level rows (e.g. FIPS 1000, Alabama) are still present after this step.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for the missing label.
unemployment_data = unemployment_data.drop(index=0, errors='ignore')
unemployment_data[['FIPS_code', 'State', 'Area_name']].head(5)

Unnamed: 0,FIPS_code,State,Area_name
1,1000,AL,Alabama
2,1001,AL,"Autauga County, AL"
3,1003,AL,"Baldwin County, AL"
4,1005,AL,"Barbour County, AL"
5,1007,AL,"Bibb County, AL"


# Reformat FIPS_code and rename as "CountyFIPS"

In [5]:
# Convert the codes to text, then left-fill with '0' up to a width of 5 characters.
fips_as_text = unemployment_data['FIPS_code'].astype(str)
unemployment_data['FIPS_code'] = fips_as_text.str.rjust(5, fillchar='0')

In [6]:
# Show the identifying columns again after the FIPS_code reformatting.
id_columns = ['FIPS_code', 'State', 'Area_name']
unemployment_data[id_columns].head(n=5)

Unnamed: 0,FIPS_code,State,Area_name
1,1000,AL,Alabama
2,1001,AL,"Autauga County, AL"
3,1003,AL,"Baldwin County, AL"
4,1005,AL,"Barbour County, AL"
5,1007,AL,"Bibb County, AL"


In [7]:
# Rename the key column to 'CountyFIPS'.
unemployment_data = unemployment_data.rename(
    columns={'FIPS_code': 'CountyFIPS'},
)

# Check the county overlap with the training data

In [8]:
# Force the FIPS identifier columns to be parsed as strings when reading the CSV.
fips_columns = ['CountyFIPS', 'CityFIPS', 'TractFIPS']
dtypes_map = {column: 'str' for column in fips_columns}
training_data = pd.read_csv(
    '../Datasets/joined_health_data.csv',
    dtype=dtypes_map,
)

In [9]:
# Collect the distinct CountyFIPS values present in each dataset.
unemployment_data_counties, training_data_counties = (
    set(frame['CountyFIPS']) for frame in (unemployment_data, training_data)
)

In [10]:
# Every county in the training data must also appear in the unemployment data
# (a subset test is equivalent to comparing the intersection size with the set size).
assert training_data_counties <= unemployment_data_counties
print('All of the counties in the training data are accounted for in the unemployment data')

All of the counties in the training data are accounted for in the unemployment data


# Engineer some columns into the unemployment dataset, and then join to the training dataset

In [11]:
# Keep only the join key and the two yearly unemployment rates.
# .copy() makes the result an independent frame, so adding a column to it in a
# later cell does not trigger pandas' SettingWithCopyWarning.
unemployment_data = unemployment_data[[
    'CountyFIPS',
    'Unemployment_rate_2016',
    'Unemployment_rate_2017'
]].copy()

In [12]:
# Relative change in the unemployment rate from 2016 to 2017: (2017 - 2016) / 2016.
rate_2016 = unemployment_data['Unemployment_rate_2016']
rate_2017 = unemployment_data['Unemployment_rate_2017']
unemployment_data['Unemployment_trend'] = (rate_2017 - rate_2016) / rate_2016

In [13]:
# Index by CountyFIPS (the join key) and discard the 2016 rate, which is no
# longer needed once the trend has been computed from it.
unemployment_data = (
    unemployment_data
    .set_index('CountyFIPS')
    .drop(columns=['Unemployment_rate_2016'])
)
unemployment_data.head(3)

Unnamed: 0_level_0,Unemployment_rate_2017,Unemployment_trend
CountyFIPS,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,4.5,-0.237288
1001,4.0,-0.215686
1003,4.2,-0.222222


In [14]:
# Number of training rows before the join.
len(training_data)

27119

In [15]:
# Join the county-level unemployment features onto the training rows, matching
# the training 'CountyFIPS' column against the unemployment frame's index.
rows_before_join = training_data.shape[0]
training_data = training_data.join(unemployment_data, on='CountyFIPS')
# A duplicated CountyFIPS in the unemployment index would silently multiply
# training rows, so verify that the row count is unchanged.
assert training_data.shape[0] == rows_before_join, (
    f'join changed the row count from {rows_before_join} to {training_data.shape[0]}; '
    'check the unemployment data for duplicated CountyFIPS values'
)

In [16]:
# Move each joined column to column position 6 of the training frame.
# Because every column is inserted at the same position, the columns end up in
# the reverse of their order in unemployment_data.
new_column_names = unemployment_data.columns
for joined_name in new_column_names:
    training_data.insert(6, joined_name, training_data.pop(joined_name))

In [17]:
# Inspect the first three rows of the joined training data.
training_data.head(n=3)

Unnamed: 0,StateAbbr,StateDesc,CityName,CountyFIPS,TractFIPS,BPMED_2017,Unemployment_trend,Unemployment_rate_2017,BPMED_trend,CHOLSCREEN_2017,...,BPHIGH_trend,STROKE_2017,STROKE_trend,KIDNEY_2017,KIDNEY_trend,MHLTH_2017,MHLTH_trend,COPD_2017,COPD_trend,diabetes_change_rate_2020
0,AL,Alabama,Birmingham,1073,1073000100,77.8,-0.22807,4.4,-0.0275,78.6,...,0.032468,5.5,-0.017857,4.3,0.02381,20.5,0.102151,11.3,0.036697,0.097701
1,AL,Alabama,Birmingham,1073,1073000300,78.5,-0.22807,4.4,-0.022416,76.0,...,0.057018,6.1,0.033898,4.6,0.069767,21.2,0.152174,11.9,0.101852,0.10101
2,AL,Alabama,Birmingham,1073,1073000400,79.4,-0.22807,4.4,-0.031707,80.2,...,-0.01996,5.6,-0.017544,4.2,0.0,18.8,0.074286,10.5,0.019417,0.027174


In [18]:
# Write the joined training data to disk without the row index.
training_data.to_csv(
    '../Datasets/training_data_full.csv',
    index=False,
)