# Filter down the raw dataset to create the training dataset, with labels

In [1]:
import pandas as pd
import swifter
import numpy as np

In [2]:
# Load both raw yearly extracts: 2017 supplies the features, 2020 supplies the label
y2017_data, y2020_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../Datasets/2017_data.csv", "../Datasets/2020_data.csv")
)

### Aggregate 2017 data

The raw data has one row per census tract per measurement (diabetes rate, obesity rate, etc.).\
These rows should be aggregated into a single row per census tract that contains all of that tract's measurements.

In [3]:
# Distinct measure identifiers, in order of first appearance in the 2017 data
measure_ids = y2017_data['MeasureId'].unique().tolist()

## MeasureId's with their associated descriptions, in the 2017 data

In [4]:
# Map each MeasureId to its human-readable description (last occurrence wins)
y2017_data[['MeasureId', 'Measure']].set_index('MeasureId')['Measure'].to_dict()

{'ARTHRITIS': 'Arthritis among adults aged >=18 Years',
 'CSMOKING': 'Current smoking among adults aged >=18 Years',
 'CHD': 'Coronary heart disease among adults aged >=18 Years',
 'OBESITY': 'Obesity among adults aged >=18 Years',
 'CHOLSCREEN': 'Cholesterol screening among adults aged >=18 Years',
 'BINGE': 'Binge drinking among adults aged >=18 Years',
 'COPD': 'Chronic obstructive pulmonary disease among adults aged >=18 Years',
 'DIABETES': 'Diagnosed diabetes among adults aged >=18 Years',
 'CASTHMA': 'Current asthma among adults aged >=18 Years',
 'ACCESS2': 'Current lack of health insurance among adults aged 18–64 Years',
 'KIDNEY': 'Chronic kidney disease among adults aged >=18 Years',
 'STROKE': 'Stroke among adults aged >=18 Years',
 'LPA': 'No leisure-time physical activity among adults aged >=18 Years',
 'BPHIGH': 'High blood pressure among adults aged >=18 Years',
 'CANCER': 'Cancer (excluding skin cancer) among adults aged >=18 Years',
 'CHECKUP': 'Visits to doctor for r

#### General location information. These should be the same across all rows of a single census tract

In [5]:
# Columns that describe where a census tract is. aggregate_census_tract_data
# copies each of them from the first row of a tract's group into the single
# aggregated row for that tract.
location_columns = [
    'StateAbbr',
    'StateDesc',
    'CityName',
    'CityFIPS',
    'TractFIPS',
]

In [6]:
# takes the result of the groupby, and combine into a single row
def aggregate_census_tract_data(census_tract_data, location_cols=None):
    """Collapse all measurement rows of one census tract into a single record.

    Parameters
    ----------
    census_tract_data : pd.DataFrame
        The rows belonging to one census tract (one group of a groupby on
        ``TractFIPS``); each row holds one measurement for that tract.
    location_cols : list of str, optional
        Columns whose value is copied from the first row of the group.
        Defaults to the notebook-level ``location_columns`` list.

    Returns
    -------
    dict
        The location columns, ``Population_2017`` (taken from the first row's
        ``PopulationCount``) and one ``<MeasureId>_2017`` entry per
        measurement row. If a MeasureId occurs more than once, the last row
        wins.
    """
    if location_cols is None:
        location_cols = location_columns

    # location information is taken from the first row of the tract
    new_row = {
        column: census_tract_data[column].values[0]
        for column in location_cols
    }

    # also keep the 2017 population; this can be used later for weighted averaging
    new_row['Population_2017'] = census_tract_data['PopulationCount'].values[0]

    # each row represents a specific measurement from this census tract; walk
    # the two needed columns directly instead of building a Series per row
    measurements = zip(census_tract_data['MeasureId'], census_tract_data['Data_Value'])
    new_row.update(
        (f'{measure_id}_2017', measure_value)
        for measure_id, measure_value in measurements
    )

    return new_row

In [7]:
# Build one aggregated record per census tract, in sorted TractFIPS order.
# The groups are iterated directly rather than passed through GroupBy.apply,
# so each group still carries every column, including the TractFIPS grouping
# key that aggregate_census_tract_data reads.
row_list = [
    aggregate_census_tract_data(tract_rows)
    for _tract_fips, tract_rows in y2017_data.groupby('TractFIPS')
]

In [8]:
# One row per census tract; each dict key in row_list becomes a column
training_data = pd.DataFrame(data=row_list)

# Now bring in the 2020 data, mainly looking at the diabetes rate in each census tract

NOTE: In the 2020 dataset, LocationID is used instead of TractFIPS

In [9]:
# Keep only the 2020 rows that report the diabetes measure
is_diabetes_row = y2020_data['MeasureId'] == "DIABETES"
diabetes_data_2020 = y2020_data[is_diabetes_row]

# Lookup table: LocationID (synonymous with TractFIPS) -> 2020 diabetes rate
location_ids = diabetes_data_2020['LocationID']
rates_2020 = diabetes_data_2020['Data_Value']
diabetes_rate_map = dict(zip(location_ids, rates_2020))

# Print the first three entries of the map as a sanity check
for position, entry in enumerate(diabetes_rate_map.items()):
    if position >= 3:
        break
    print(entry)

(1055000300, 24.2)
(1073000100, 19.1)
(1073000500, 23.7)


In [10]:
# Look up each tract's 2020 diabetes rate by its TractFIPS. Series.map performs
# the dictionary lookup as one built-in operation (no row-by-row apply), and
# tracts that have no entry in diabetes_rate_map end up as NaN.
training_data['DIABETES_2020'] = training_data['TractFIPS'].map(diabetes_rate_map)

Pandas Apply:   0%|          | 0/27648 [00:00<?, ?it/s]

### Some of the census tracts don't have the 2020 diabetes information, so we will drop them
(we don't want to impute the target prediction)

In [11]:
# Count the tracts for which no 2020 diabetes rate was found
missing_2020 = training_data['DIABETES_2020'].isnull()
null_counts = missing_2020.sum()
print(f'{null_counts} of the census tracts do not have 2020 diabetes rates')

111 of the census tracts do not have 2020 diabetes rates


In [12]:
# Keep only the tracts that have a 2020 diabetes rate (the label is never imputed)
training_data = training_data.dropna(subset=['DIABETES_2020']).copy()

## Rearrange the columns (want 2017/2020 diabetes both on the far right)

In [13]:
# Move both diabetes columns to the far right: 2017 first, then 2020
columns = training_data.columns.tolist()
for diabetes_column in ('DIABETES_2017', 'DIABETES_2020'):
    columns.remove(diabetes_column)
columns.extend(['DIABETES_2017', 'DIABETES_2020'])

In [14]:
# Apply the new column order (all rows, columns in the order listed in `columns`)
training_data = training_data.loc[:, columns]

## Finally add the diabetes_change_rate

IE if a census tract in 2017 had 10% diabetes rate, and then had 15% diabetes rate in 2020\
then the diabetes_change_rate would be 0.5

In [15]:
# Relative change of the diabetes rate from 2017 to 2020: (new - old) / old
rate_2017 = training_data['DIABETES_2017']
rate_2020 = training_data['DIABETES_2020']
training_data['diabetes_change_rate'] = rate_2020.sub(rate_2017).div(rate_2017)

# FIPS code re-formatting
FIPS codes are how various government entities label areas. The data in raw csv format loses some information.\
IE 01125012502 becomes 1125012502.0\
We need to convert these back to string format, and then parse out the other codes.

In [16]:
# original format: both FIPS columns are floats at this point, so any leading zero is gone
training_data[['CityFIPS', 'TractFIPS']].head(3)

Unnamed: 0,CityFIPS,TractFIPS
0,107000.0,1073000000.0
1,107000.0,1073000000.0
2,107000.0,1073000000.0


In [17]:
# reformat the TractFIPS, adding back '0' at the front if necessary
def reformat_tract_fips(tract_fips, width=11):
    """Return a tract FIPS code as a zero-padded string.

    Parameters
    ----------
    tract_fips : float, int or str
        The code as read from the csv, e.g. ``1125012502.0``.
    width : int, optional
        Length the code is left-padded to with '0' (11 by default).

    Returns
    -------
    str
        Everything before the first '.', left-padded with zeros to ``width``,
        e.g. ``'01125012502'``. A missing value (NaN/None) is returned
        unchanged so that it stays detectable as missing.
    """
    if pd.isna(tract_fips):
        # without this guard NaN would be turned into the bogus code '00000000nan'
        return tract_fips
    str_form = str(tract_fips).split('.')[0]  # drop the '.0' float suffix
    return str_form.zfill(width)  # zfill fills the left with 0's

# Convert every tract code to its string form, then overwrite the column
tract_fips_strings = training_data['TractFIPS'].swifter.apply(reformat_tract_fips)
training_data['TractFIPS'] = tract_fips_strings

Pandas Apply:   0%|          | 0/27537 [00:00<?, ?it/s]

In [18]:
def reformat_city_fips(city_fips, width=7):
    """Return a city FIPS code as a zero-padded string.

    Parameters
    ----------
    city_fips : float, int or str
        The code as read from the csv, e.g. ``107000.0``.
    width : int, optional
        Length the code is left-padded to with '0' (7 by default).

    Returns
    -------
    str
        Everything before the first '.', left-padded with zeros to ``width``,
        e.g. ``'0107000'``. A missing value (NaN/None) is returned unchanged
        so that it stays detectable as missing.
    """
    if pd.isna(city_fips):
        # without this guard NaN would be turned into the bogus code '0000nan'
        return city_fips
    str_form = str(city_fips).split('.')[0]  # drop the '.0' float suffix
    return str_form.zfill(width)  # zfill fills the left with 0's
# Convert every city code to its string form, then overwrite the column
city_fips_strings = training_data['CityFIPS'].swifter.apply(reformat_city_fips)
training_data['CityFIPS'] = city_fips_strings

Pandas Apply:   0%|          | 0/27537 [00:00<?, ?it/s]

In [19]:
# reformatted: both FIPS columns now hold zero-padded strings instead of floats
training_data[['CityFIPS', 'TractFIPS']].head(3)

Unnamed: 0,CityFIPS,TractFIPS
0,107000,1073000100
1,107000,1073000300
2,107000,1073000400


# FIPS code parsing

Now that the FIPS codes are back in their normal string form, we can parse out some information.\
According to www.census.gov, the TractFIPS has 3 parts, each appended to the previous one:
- State Code (+2 digits)
- County Code (+3 digits)
- Tract Code (+6 digits)

IE 01073000100 would become
- State Code "01"
- County Code "01073" (state digits + county digits; this will be used later to join with the poverty data)
- Tract Code "01073000100"

In [20]:
def get_county_fips(tract_fips):
    """Return the county FIPS, i.e. the leading five characters of ``tract_fips``."""
    county_span = slice(5)  # state code (2 characters) + county code (3 characters)
    return tract_fips[county_span]

In [21]:
# Derive the county code of every tract and store it as a new column
county_fips_strings = training_data['TractFIPS'].swifter.apply(get_county_fips)
training_data['CountyFIPS'] = county_fips_strings

Pandas Apply:   0%|          | 0/27537 [00:00<?, ?it/s]

In [22]:
# spot-check: CountyFIPS should be the first five characters of TractFIPS
training_data[['TractFIPS', 'CountyFIPS']].head(3)

Unnamed: 0,TractFIPS,CountyFIPS
0,1073000100,1073
1,1073000300,1073
2,1073000400,1073


In [23]:
# Move CountyFIPS from the far right to column position 3 (0-based).
# pop() removes the column first, because insert() would otherwise be asked to
# add a column name that already exists in the frame.
county_fips_col = training_data.pop('CountyFIPS')
training_data.insert(3, 'CountyFIPS', county_fips_col)

In [24]:
# preview the first rows of the assembled training frame
training_data.head(3)

Unnamed: 0,StateAbbr,StateDesc,CityName,CountyFIPS,CityFIPS,TractFIPS,Population_2017,COPD_2017,STROKE_2017,LPA_2017,...,HIGHCHOL_2017,CSMOKING_2017,BINGE_2017,ARTHRITIS_2017,ACCESS2_2017,BPHIGH_2017,PHLTH_2017,DIABETES_2017,DIABETES_2020,diabetes_change_rate
0,AL,Alabama,Birmingham,1073,107000,1073000100,3042,11.3,5.5,44.0,...,35.3,27.9,10.3,31.2,24.4,47.7,20.5,17.4,19.1,0.097701
1,AL,Alabama,Birmingham,1073,107000,1073000300,2735,11.9,6.1,47.5,...,36.8,29.3,10.5,30.5,30.4,48.2,22.2,19.8,21.8,0.10101
2,AL,Alabama,Birmingham,1073,107000,1073000400,3338,10.5,5.6,43.0,...,35.6,25.9,10.4,31.5,24.7,49.1,19.4,18.4,18.9,0.027174


# Check for nulls in the target variable (diabetes_change_rate)

In [25]:
# number of rows before the rows containing nulls are dropped
training_data.shape[0] 

27537

In [26]:
# drop every row that contains a null, then show the remaining row count
# NOTE(review): dropna() without `subset` removes rows with a null in ANY
# column, not only in the target variable (diabetes_change_rate) - confirm that
# dropping rows with missing feature values is intended here
training_data = training_data.dropna()
training_data.shape[0]

26878

In [27]:
# lost about 700 rows after dropping nulls, should be fine
# index=False keeps the DataFrame index out of the saved file
# NOTE(review): the FIPS columns are saved as zero-padded strings; presumably
# they must be read back as strings to keep the leading zeros - confirm in the
# notebooks that load this file
training_data.to_csv('../Datasets/training_data.csv', index=False)