In [1]:
# Import libraries
#pip install sodapy

import requests as res
import pandas as pd
from sodapy import Socrata

### Step 1: Problem statement

PROBLEM: To identify emerging health problems in US amoung adults and to inform development and implementation of effective, targeted public health prevention activities.

SOLUTION: To provide quality health estimates for 500 cities in the US by providing data estimates for 27 measures of chronic disease related to unhealthy behaviors (5), health outcomes (13), and use of preventive services (9) and focus their efforts to improve Health.

DATASOURCE: Data sources used to generate these measures include:

. Behavioral Risk Factor Surveillance System (BRFSS) data (2017, 2016), \n
. Census Bureau 2010 census population data, and
. American Community Survey (ACS) 2013-2017, 2012-2016 estimates
We are using structured dataset we have obtained from CDC website

PREDICTING:Emerging Health Problems in 500 cities in the US.

FEATURES:State, CityName, GeoLocation, Population Count,caterory,measure,Crude Prevalance

TARGET: datavalue

We are using a supervised model and trying to predict using regression model.

we are using multiple features such as the city/census tract, measures of cronic illness,population count to define the target.

### Step 2: Data Acquisition

DataSource : https://chronicdata.cdc.gov/500-Cities-Places/500-Cities-Local-Data-for-Better-Health-2019-relea/6vp6-wxuq

In [2]:
#https://dev.socrata.com/foundry/chronicdata.cdc.gov/6vp6-wxuq
#API to download data
client = Socrata("chronicdata.cdc.gov", None)
results = client.get("6vp6-wxuq", limit = 810200)
chronicdata = pd.DataFrame.from_records(results)
chronicdata.to_csv('500_Cities__Local_Data_for_Better_Health.csv')



In [3]:
#read chronic data
chronicdata_df = pd.read_csv('500_Cities__Local_Data_for_Better_Health.csv')   
chronicdata_df.head()

Unnamed: 0.1,Unnamed: 0,year,stateabbr,statedesc,cityname,geographiclevel,datasource,category,uniqueid,measure,...,geolocation,categoryid,measureid,cityfips,tractfips,short_question_text,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8,data_value_footnote_symbol,data_value_footnote
0,0,2017,CA,California,Hawthorne,Census Tract,BRFSS,Health Outcomes,0632548-06037602504,Arthritis among adults aged >=18 Years,...,"{'latitude': '33.905547923', 'longitude': '-11...",HLTHOUT,ARTHRITIS,632548.0,6037603000.0,Arthritis,8.0,1195.0,,
1,1,2017,CA,California,Hawthorne,City,BRFSS,Unhealthy Behaviors,0632548,Current smoking among adults aged >=18 Years,...,"{'latitude': '33.914667701', 'longitude': '-11...",UNHBEH,CSMOKING,632548.0,,Current Smoking,8.0,1195.0,,
2,2,2017,CA,California,Hayward,City,BRFSS,Health Outcomes,0633000,Coronary heart disease among adults aged >=18 ...,...,"{'latitude': '37.6329591551', 'longitude': '-1...",HLTHOUT,CHD,633000.0,,Coronary Heart Disease,8.0,1141.0,,
3,3,2017,CA,California,Hayward,City,BRFSS,Unhealthy Behaviors,0633000,Obesity among adults aged >=18 Years,...,"{'latitude': '37.6329591551', 'longitude': '-1...",UNHBEH,OBESITY,633000.0,,Obesity,8.0,1141.0,,
4,4,2017,CA,California,Hemet,City,BRFSS,Prevention,0633182,Cholesterol screening among adults aged >=18 Y...,...,"{'latitude': '33.7352277311', 'longitude': '-1...",PREVENT,CHOLSCREEN,633182.0,,Cholesterol Screening,8.0,1177.0,,


### Step 3: Data Dictionary

| original column name       | column name           | description                       | datatype   |
|----------------------------|-----------------------|-----------------------------------|------------|
| Year                       | year                  | year                              | int        |
| StateAbbr                  | state                 | state abbrevation                 | string     |
| StateDesc                  | state_name            | state name                        | string     |
| CityName                   | city_name             | city name                         | string     |
| GeographicalLevel          | geographical_level    | US/City/Census Tract              | string     |
| DataSource                 | data_source           | data_source                       | string     |
| Category                   | category              | topic                             | string     |
| UniqueId                   | unique_id             | cityFIPS/tractFIPS                | string     |
| Measure                    | measure               | measure full name                 | string     |
| Data_Value_Unit            | unit                  | data value unit % for percent     | string     |
| DataValueTypeID            | data_type_id          | identifier for data value type    | string     |
| Data_Value_Type            | data_type             | data type                         | string     |
| Data_Value                 | data                  | data value                        | int        |
| Low_Confidence_Limit       | low_confidence_limit  | low confidence limit              | int        |
| High_Confidence_Limit      | high_confidence_limit | high confidence limit             | int        |
| Data_Value_Footnote_Symbol | footnote_symbol       | footnote symbol                   | string     |
| Data_Value_Footnote        | footnote_text         | footnote text                     | string     |
| PopulationCount            | population_count      | population count from census 2020 | int        |
| GeoLocation                | geolocation           | latitude and longitude            | dictionary |
| CategoryID                 | category              | identifier for topic              | string     |
| MeasureID                  | measure               | measure identifier                | string     |
| cityFIPS                   | city_fips             | FIPS code                         | string     |
| TractFIPS                  | tract_fips            | FIPS code                         | string     |
| Short_Question_Text        | measure_name          | measure short name                | string     |

Step 4: Feature Extraction
Structured Dataset

Step 5: Data cleaning

In [12]:
#dropping columns not needed
df1=chronicdata.drop(columns=['year', 'statedesc', 'datasource', 'measure','data_value_unit', 'data_value_footnote', 'data_value_type', 'low_confidence_limit', 'high_confidence_limit', 'data_value_footnote_symbol', 'categoryid', 'short_question_text'])
df1
# dropping all rows with any Null/NaN/NaT Values
df2=df1.dropna()
df2


Unnamed: 0,stateabbr,cityname,geographiclevel,category,uniqueid,datavaluetypeid,data_value,populationcount,geolocation,measureid,cityfips,tractfips,:@computed_region_bxsw_vy29,:@computed_region_he4y_prf8
0,CA,Hawthorne,Census Tract,Health Outcomes,0632548-06037602504,CrdPrv,14.6,4407,"{'latitude': '33.905547923', 'longitude': '-11...",ARTHRITIS,0632548,06037602504,8,1195
5,CA,Indio,Census Tract,Health Outcomes,0636448-06065045213,CrdPrv,22.0,5006,"{'latitude': '33.7144617083', 'longitude': '-1...",ARTHRITIS,0636448,06065045213,8,1177
8,CA,Inglewood,Census Tract,Health Outcomes,0636546-06037601801,CrdPrv,12.7,2472,"{'latitude': '33.9439711273', 'longitude': '-1...",DIABETES,0636546,06037601801,8,1195
15,AL,Hoover,Census Tract,Prevention,0135896-01073014302,CrdPrv,81.9,1636,"{'latitude': '33.3923792867', 'longitude': '-8...",MAMMOUSE,0135896,01073014302,29,1583
18,AL,Huntsville,Census Tract,Health Outcomes,0137000-01089002922,CrdPrv,9.3,4387,"{'latitude': '34.612755588', 'longitude': '-86...",DIABETES,0137000,01089002922,29,1588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810096,WA,Vancouver,Census Tract,Health Outcomes,5374060-53011041011,CrdPrv,24.2,2994,"{'latitude': '45.658182142', 'longitude': '-12...",ARTHRITIS,5374060,53011041011,6,2977
810098,WI,Madison,Census Tract,Health Outcomes,5548000-55025000202,CrdPrv,3.8,3016,"{'latitude': '43.0683437287', 'longitude': '-8...",COPD,5548000,55025000202,41,867
810099,WA,Tacoma,Census Tract,Prevention,5370000-53053061900,CrdPrv,74.2,1961,"{'latitude': '47.2276735186', 'longitude': '-1...",CHOLSCREEN,5370000,53053061900,6,3210
810100,WA,Vancouver,Census Tract,Health Outcomes,5374060-53011041331,CrdPrv,28.1,2565,"{'latitude': '45.6495869118', 'longitude': '-1...",HIGHCHOL,5374060,53011041331,6,2977
