# Week 2 Exercise 4
## Create a County (Community) Dataset
- Each row is one county
- Features (columns) are added from the following sources: AHRF, CHR, SVI
- Features are engineered for: 
    - A couple of ratios (from the AHRF data)
    - Community Health Needs Score (sum of z-scores for a number of features)

In [1]:
# Import needed modules
import pandas as pd
import sqlite3 as sql

In [2]:
# Connect to database (need the current working directory for systems that treat the local path as virtual)
import os
con = sql.connect(os.path.join(os.getcwd(), "synthea_and_county_ga.db"))

## Dataframe: Each County as a Row

In [3]:
# Start the data frame with the counties (for the selected state)
df = pd.read_sql_query("""\
    select fips, state as state_abbr, name as county_name
    from us_counties_fips
    where state = 'GA'
""", con)

In [4]:
# Quick settings and clean-up
pd.options.display.max_columns = None   # Display all columns instead of only a selected set
pd.options.display.max_rows = None      # Display all rows
df = df.rename(columns=str.lower)       # Convert column names to lowercase
df = df.set_index('fips')               # Set the index to the id, rather than having pandas create an autocount index

In [5]:
# Check the dataframe
df.shape

(159, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159 entries, 13001 to 13321
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   state_abbr   159 non-null    object
 1   county_name  159 non-null    object
dtypes: object(2)
memory usage: 3.7+ KB


In [7]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name
fips,Unnamed: 1_level_1,Unnamed: 2_level_1
13001,GA,Appling County
13003,GA,Atkinson County
13005,GA,Bacon County
13007,GA,Baker County
13009,GA,Baldwin County


## Add the AHRF data to our dataframe

In [8]:
# AHRF
df_query = pd.read_sql_query("""\
    select fips, 
        pop_est_2020,
        pop_den_2010,
        phys_pricare,
        phys_total,
        hosp_total,
        hosp_cah,
        hosp_with_ed,
        hosp_with_offcampus_ed,
        hosp_with_pedsed,
        hosp_with_traumactr,
        ed_visits
    from us_counties_ahrf_2021
""", con)

df_query = df_query.rename(columns=str.lower)
df_query = df_query.add_prefix('ahrf_')
df_query = df_query.set_index('ahrf_fips')

In [9]:
df = pd.merge(df, df_query, left_index=True, right_index=True, how='left')

In [10]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13001,GA,Appling County,18325,36.0,6,9,1,0,0,0,0,0,7740
13003,GA,Atkinson County,8393,24.7,0,0,0,0,0,0,0,0,0
13005,GA,Bacon County,11036,42.9,7,10,1,1,0,0,0,0,3759
13007,GA,Baker County,2971,10.1,0,0,0,0,0,0,0,0,0
13009,GA,Baldwin County,45099,177.3,20,63,2,0,1,0,0,0,30686


## Add calcuated features (that are based on features from the AHRF data)

In [11]:
# Generate ratios

# ED Visits per captia
df['ahrf_ed_visits_per_capita'] = df['ahrf_ed_visits'] / df['ahrf_pop_est_2020']

In [12]:
# Primary care per 2000 people (should be about 1 primary care physician per 2000 people: see https://www.jabfm.org/content/jabfp/29/4/496.full.pdf)
df['ahrf_phys_pricare_per_2000'] = df['ahrf_phys_pricare'] / (df['ahrf_pop_est_2020'] / 2000)

In [13]:
# Rounding
df['ahrf_ed_visits_per_capita'] = df['ahrf_ed_visits_per_capita'].astype(float).round(2)
df['ahrf_phys_pricare_per_2000'] = df['ahrf_phys_pricare_per_2000'].astype(float).round(2)

In [14]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13001,GA,Appling County,18325,36.0,6,9,1,0,0,0,0,0,7740,0.42,0.65
13003,GA,Atkinson County,8393,24.7,0,0,0,0,0,0,0,0,0,0.0,0.0
13005,GA,Bacon County,11036,42.9,7,10,1,1,0,0,0,0,3759,0.34,1.27
13007,GA,Baker County,2971,10.1,0,0,0,0,0,0,0,0,0,0.0,0.0
13009,GA,Baldwin County,45099,177.3,20,63,2,0,1,0,0,0,30686,0.68,0.89


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159 entries, 13001 to 13321
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   state_abbr                   159 non-null    object 
 1   county_name                  159 non-null    object 
 2   ahrf_pop_est_2020            159 non-null    object 
 3   ahrf_pop_den_2010            159 non-null    object 
 4   ahrf_phys_pricare            159 non-null    object 
 5   ahrf_phys_total              159 non-null    object 
 6   ahrf_hosp_total              159 non-null    object 
 7   ahrf_hosp_cah                159 non-null    object 
 8   ahrf_hosp_with_ed            159 non-null    object 
 9   ahrf_hosp_with_offcampus_ed  159 non-null    object 
 10  ahrf_hosp_with_pedsed        159 non-null    object 
 11  ahrf_hosp_with_traumactr     159 non-null    object 
 12  ahrf_ed_visits               159 non-null    object 
 13  ahrf_ed_visits

## Add the CHR data to our dataframe

In [17]:
# County health rankings 
# Note: 1 is the healthiest quartile, 1 is the healthiest rank
# Source: https://www.countyhealthrankings.org/explore-health-rankings/our-methods/calculating-ranks
df_query = pd.read_sql_query("""\
    select fips, 
        healthoutcomes_rank, healthoutcomes_quartile,
        healthfactors_rank, healthfactors_quartile, 
        lengthoflife_rank, lengthoflife_quartile,
        qualityoflife_rank, qualityoflife_quartile,
        healthbehaviors_rank, healthbehaviors_quartile,
        clinicalcare_rank, clinicalcare_quartile,
        sesfactors_rank, sesfactors_quartile,
        physenv_rank, physenv_quartile
       from us_counties_chr_2022
""", con)

df_query = df_query.rename(columns=str.lower)
df_query = df_query.add_prefix('chr_')
df_query = df_query.set_index('chr_fips')

In [18]:
df = pd.merge(df, df_query, left_index=True, right_index=True, how='left')

In [19]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000,chr_healthoutcomes_rank,chr_healthoutcomes_quartile,chr_healthfactors_rank,chr_healthfactors_quartile,chr_lengthoflife_rank,chr_lengthoflife_quartile,chr_qualityoflife_rank,chr_qualityoflife_quartile,chr_healthbehaviors_rank,chr_healthbehaviors_quartile,chr_clinicalcare_rank,chr_clinicalcare_quartile,chr_sesfactors_rank,chr_sesfactors_quartile,chr_physenv_rank,chr_physenv_quartile
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
13001,GA,Appling County,18325,36.0,6,9,1,0,0,0,0,0,7740,0.42,0.65,113,3,118,3,108,3,116,3,131,4,152,4,79,2,14,1
13003,GA,Atkinson County,8393,24.7,0,0,0,0,0,0,0,0,0,0.0,0.0,112,3,137,4,122,4,108,3,130,4,154,4,115,3,15,1
13005,GA,Bacon County,11036,42.9,7,10,1,1,0,0,0,0,3759,0.34,1.27,139,4,123,4,144,4,115,3,118,3,159,4,92,3,22,1
13007,GA,Baker County,2971,10.1,0,0,0,0,0,0,0,0,0,0.0,0.0,77,2,128,4,74,2,82,3,104,3,109,3,141,4,104,3
13009,GA,Baldwin County,45099,177.3,20,63,2,0,1,0,0,0,30686,0.68,0.89,91,3,95,3,98,3,84,3,71,2,59,2,110,3,159,4


## Add the SVI data to our dataframe

In [20]:
# Social Vulnerability Index (SVI)
df_query = pd.read_sql_query("""\
    select fips,
        svi_perc,
        svi_ses_perc,
        svi_hcd_perc,
        svi_msl_perc,
        svi_htt_perc
    from us_counties_svi_2018
""", con)

df_query = df_query.rename(columns=str.lower)
df_query = df_query.set_index('fips')

In [21]:
df = pd.merge(df, df_query, left_index=True, right_index=True, how='left')

In [22]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000,chr_healthoutcomes_rank,chr_healthoutcomes_quartile,chr_healthfactors_rank,chr_healthfactors_quartile,chr_lengthoflife_rank,chr_lengthoflife_quartile,chr_qualityoflife_rank,chr_qualityoflife_quartile,chr_healthbehaviors_rank,chr_healthbehaviors_quartile,chr_clinicalcare_rank,chr_clinicalcare_quartile,chr_sesfactors_rank,chr_sesfactors_quartile,chr_physenv_rank,chr_physenv_quartile,svi_perc,svi_ses_perc,svi_hcd_perc,svi_msl_perc,svi_htt_perc
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
13001,GA,Appling County,18325,36.0,6,9,1,0,0,0,0,0,7740,0.42,0.65,113,3,118,3,108,3,116,3,131,4,152,4,79,2,14,1,0.9669,0.8818,0.9605,0.8153,0.894
13003,GA,Atkinson County,8393,24.7,0,0,0,0,0,0,0,0,0,0.0,0.0,112,3,137,4,122,4,108,3,130,4,154,4,115,3,15,1,0.9615,0.979,0.7803,0.9395,0.7042
13005,GA,Bacon County,11036,42.9,7,10,1,1,0,0,0,0,3759,0.34,1.27,139,4,123,4,144,4,115,3,118,3,159,4,92,3,22,1,0.9389,0.8408,0.9535,0.7978,0.8204
13007,GA,Baker County,2971,10.1,0,0,0,0,0,0,0,0,0,0.0,0.0,77,2,128,4,74,2,82,3,104,3,109,3,141,4,104,3,0.7424,0.729,0.5782,0.6759,0.6068
13009,GA,Baldwin County,45099,177.3,20,63,2,0,1,0,0,0,30686,0.68,0.89,91,3,95,3,98,3,84,3,71,2,59,2,110,3,159,4,0.8666,0.9131,0.1576,0.7084,0.9494


## Add some provider data from Synthea
- Note: We don't need to do "organizations" from Synthea as Synthea only has one provider per organization, which makes the counts and utilizations averages per county the same between organizations and providers
- Note: Did not include 'utilization' as this is for all time and is relatively confusing

In [23]:
df_query = pd.read_sql_query("""\
    select uscc.county_fips, COUNT(p.Id) as provider_count
    from providers p
    inner join us_states_cities_counties uscc on p.state = uscc.state_id COLLATE NOCASE and p.city = uscc.city COLLATE NOCASE
    group by uscc.county_fips
""", con)

df_query = df_query.rename(columns=str.lower)
df_query = df_query.add_prefix('syn_')
df_query = df_query.set_index('syn_county_fips')

In [24]:
df = pd.merge(df, df_query, left_index=True, right_index=True, how='left')

In [25]:
df['syn_provider_count'] = df['syn_provider_count'].fillna(0)
df['syn_provider_count'] = df['syn_provider_count'].astype('int64')

In [26]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000,chr_healthoutcomes_rank,chr_healthoutcomes_quartile,chr_healthfactors_rank,chr_healthfactors_quartile,chr_lengthoflife_rank,chr_lengthoflife_quartile,chr_qualityoflife_rank,chr_qualityoflife_quartile,chr_healthbehaviors_rank,chr_healthbehaviors_quartile,chr_clinicalcare_rank,chr_clinicalcare_quartile,chr_sesfactors_rank,chr_sesfactors_quartile,chr_physenv_rank,chr_physenv_quartile,svi_perc,svi_ses_perc,svi_hcd_perc,svi_msl_perc,svi_htt_perc,syn_provider_count
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
13001,GA,Appling County,18325,36.0,6,9,1,0,0,0,0,0,7740,0.42,0.65,113,3,118,3,108,3,116,3,131,4,152,4,79,2,14,1,0.9669,0.8818,0.9605,0.8153,0.894,11
13003,GA,Atkinson County,8393,24.7,0,0,0,0,0,0,0,0,0,0.0,0.0,112,3,137,4,122,4,108,3,130,4,154,4,115,3,15,1,0.9615,0.979,0.7803,0.9395,0.7042,3
13005,GA,Bacon County,11036,42.9,7,10,1,1,0,0,0,0,3759,0.34,1.27,139,4,123,4,144,4,115,3,118,3,159,4,92,3,22,1,0.9389,0.8408,0.9535,0.7978,0.8204,7
13007,GA,Baker County,2971,10.1,0,0,0,0,0,0,0,0,0,0.0,0.0,77,2,128,4,74,2,82,3,104,3,109,3,141,4,104,3,0.7424,0.729,0.5782,0.6759,0.6068,1
13009,GA,Baldwin County,45099,177.3,20,63,2,0,1,0,0,0,30686,0.68,0.89,91,3,95,3,98,3,84,3,71,2,59,2,110,3,159,4,0.8666,0.9131,0.1576,0.7084,0.9494,18


## Generate a "Community Health Needs Score" (from some of the existing features)

In [27]:
# Convert all <object> columns that are numeric to a numeric datatype
cols = df.columns.drop(['state_abbr', 'county_name']) # don't want string cols
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [28]:
# Change all numeric columns to their respective z-scores
import scipy.stats as stats
df_zscore = df.select_dtypes(include='number').apply(stats.zscore)

In [29]:
# For comparison, what is the mean and std of one of the columns?
temp_mean = df['ahrf_pop_est_2020'].mean()
temp_std = df['ahrf_pop_est_2020'].std()
print(f'The mean of ahrf_pop_est_2020 is {temp_mean} and the std is {temp_std}')

The mean of ahrf_pop_est_2020 is 67358.59748427673 and the std is 146050.71327830883


In [30]:
df_zscore.head(5)

Unnamed: 0_level_0,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000,chr_healthoutcomes_rank,chr_healthoutcomes_quartile,chr_healthfactors_rank,chr_healthfactors_quartile,chr_lengthoflife_rank,chr_lengthoflife_quartile,chr_qualityoflife_rank,chr_qualityoflife_quartile,chr_healthbehaviors_rank,chr_healthbehaviors_quartile,chr_clinicalcare_rank,chr_clinicalcare_quartile,chr_sesfactors_rank,chr_sesfactors_quartile,chr_physenv_rank,chr_physenv_quartile,svi_perc,svi_ses_perc,svi_hcd_perc,svi_msl_perc,svi_htt_perc,syn_provider_count
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
13001,-0.336791,-0.417278,-0.286879,-0.24578,-0.049265,-0.492125,-0.556094,-0.079556,-0.217786,-0.280009,-0.334818,0.10522,-0.308396,0.718979,0.4489,0.827915,0.4489,0.611985,0.4489,0.78434,0.4489,1.111149,1.341089,1.568681,1.341089,-0.021787,-0.443289,-1.437957,-1.335478,1.011171,0.669981,1.351231,0.770054,0.945364,-0.122098
13003,-0.405009,-0.447199,-0.331293,-0.260038,-0.651814,-0.492125,-0.556094,-0.079556,-0.217786,-0.280009,-0.456792,-0.869338,-1.39583,0.697191,0.4489,1.241872,1.341089,0.91695,1.341089,0.610043,0.4489,1.089362,1.341089,1.612255,1.341089,0.762553,0.4489,-1.41617,-1.335478,0.990082,1.092072,0.718766,1.402941,0.34473,-0.417925
13005,-0.386856,-0.399007,-0.279476,-0.244196,-0.049265,2.032002,-0.556094,-0.079556,-0.217786,-0.280009,-0.397555,-0.080411,0.728849,1.285447,1.341089,0.936851,1.341089,1.396181,1.341089,0.762553,0.4489,0.827915,0.4489,1.721191,1.341089,0.261447,0.4489,-1.26366,-1.335478,0.90182,0.491938,1.326662,0.680879,0.712452,-0.270011
13007,-0.442251,-0.485859,-0.331293,-0.260038,-0.651814,-0.492125,-0.556094,-0.079556,-0.217786,-0.280009,-0.456792,-0.869338,-1.39583,-0.065362,-0.443289,1.045787,1.341089,-0.128644,-0.443289,0.043574,0.4489,0.522894,0.4489,0.63183,0.4489,1.329021,1.341089,0.522894,0.4489,0.134407,0.006445,0.009437,0.059713,0.036502,-0.491881
13009,-0.152892,-0.043126,-0.183245,-0.160233,0.553284,-0.492125,0.577482,-0.079556,-0.217786,-0.280009,0.026785,0.708517,0.093118,0.23966,0.4489,0.326809,0.4489,0.394153,0.4489,0.087149,0.4489,-0.196085,-0.443289,-0.457532,-0.443289,0.653617,0.4489,1.721191,1.341089,0.619459,0.805901,-1.466783,0.225323,1.12068,0.13675


In [31]:
# Community needs score (higher score = more needs)
# Variables where positive is better are reversed coded
df['comm_health_needs_score'] = (-1 * df_zscore['ahrf_phys_pricare_per_2000']) + df_zscore['ahrf_ed_visits_per_capita'] + df_zscore['chr_healthoutcomes_rank'] + df_zscore['chr_healthfactors_rank'] + df_zscore['svi_perc']

In [32]:
df['comm_health_needs_score'] = df['comm_health_needs_score'].round(2)

In [33]:
df.head(5)

Unnamed: 0_level_0,state_abbr,county_name,ahrf_pop_est_2020,ahrf_pop_den_2010,ahrf_phys_pricare,ahrf_phys_total,ahrf_hosp_total,ahrf_hosp_cah,ahrf_hosp_with_ed,ahrf_hosp_with_offcampus_ed,ahrf_hosp_with_pedsed,ahrf_hosp_with_traumactr,ahrf_ed_visits,ahrf_ed_visits_per_capita,ahrf_phys_pricare_per_2000,chr_healthoutcomes_rank,chr_healthoutcomes_quartile,chr_healthfactors_rank,chr_healthfactors_quartile,chr_lengthoflife_rank,chr_lengthoflife_quartile,chr_qualityoflife_rank,chr_qualityoflife_quartile,chr_healthbehaviors_rank,chr_healthbehaviors_quartile,chr_clinicalcare_rank,chr_clinicalcare_quartile,chr_sesfactors_rank,chr_sesfactors_quartile,chr_physenv_rank,chr_physenv_quartile,svi_perc,svi_ses_perc,svi_hcd_perc,svi_msl_perc,svi_htt_perc,syn_provider_count,comm_health_needs_score
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
13001,GA,Appling County,18325.0,36.0,6.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,7740.0,0.42,0.65,113.0,3.0,118.0,3.0,108.0,3.0,116.0,3.0,131.0,4.0,152.0,4.0,79.0,2.0,14.0,1.0,0.9669,0.8818,0.9605,0.8153,0.894,11.0,2.97
13003,GA,Atkinson County,8393.0,24.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,3.0,137.0,4.0,122.0,4.0,108.0,3.0,130.0,4.0,154.0,4.0,115.0,3.0,15.0,1.0,0.9615,0.979,0.7803,0.9395,0.7042,3.0,3.46
13005,GA,Bacon County,11036.0,42.9,7.0,10.0,1.0,1.0,0.0,0.0,0.0,0.0,3759.0,0.34,1.27,139.0,4.0,123.0,4.0,144.0,4.0,115.0,3.0,118.0,3.0,159.0,4.0,92.0,3.0,22.0,1.0,0.9389,0.8408,0.9535,0.7978,0.8204,7.0,2.31
13007,GA,Baker County,2971.0,10.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,2.0,128.0,4.0,74.0,2.0,82.0,3.0,104.0,3.0,109.0,3.0,141.0,4.0,104.0,3.0,0.7424,0.729,0.5782,0.6759,0.6068,1.0,1.64
13009,GA,Baldwin County,45099.0,177.3,20.0,63.0,2.0,0.0,1.0,0.0,0.0,0.0,30686.0,0.68,0.89,91.0,3.0,95.0,3.0,98.0,3.0,84.0,3.0,71.0,2.0,59.0,2.0,110.0,3.0,159.0,4.0,0.8666,0.9131,0.1576,0.7084,0.9494,18.0,1.8


## Output to CSV (for use in Tableau)

In [33]:
df.to_csv('w2e4_county_dataset.csv')