## Why 500city_synthesize Workbook?
The purpose of this workbook is to extract the 500 Cities data from the CDC, clean it into a tidy format, and save it to csv files so it can be explored in other workbooks.

In [1]:
#Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Pull data from the CDC using their Socrata API
To use, run `pip install sodapy` before running this notebook.

In [2]:
# Socrata client used to pull the data from the CDC open-data portal
import os

from sodapy import Socrata

# Get Data From CDC Socrata App
# The app token is read from the SOCRATA_APP_TOKEN environment variable so
# that no credential is stored in the notebook. If the variable is unset the
# client is created with app_token=None.
# NOTE(review): sodapy is expected to still serve the request without a token,
# only with stricter throttling - confirm against the sodapy documentation.
app_token = os.environ.get('SOCRATA_APP_TOKEN')
client = Socrata("chronicdata.cdc.gov", app_token=app_token)
try:
    # "csmm-fdhi" is the dataset id; the limit is presumably chosen to exceed
    # the dataset's row count so everything arrives in one request.
    results = client.get("csmm-fdhi", limit=900000)
finally:
    # Release the client's HTTP session even when the request fails
    client.close()

# Convert data pull results to pandas DataFrame
df = pd.DataFrame.from_records(results)

# Convert the value columns to numeric and divide by 100, i.e. a value that
# was a percentage (e.g. 12.5) becomes a fraction (0.125).
for value_column in ('data_value', 'high_confidence_limit', 'low_confidence_limit'):
    df[value_column] = pd.to_numeric(df[value_column]) / 100.0

# Convert population column to numeric
df['populationcount'] = pd.to_numeric(df['populationcount'])

print('Done!')

Done!


## Create lists of columns for 3 data frames: locations data, data definitions, and actual data

In [3]:
# Columns that describe where an observation was taken
locations_columns = [
    'uniqueid',
    'stateabbr',
    'cityname',
    'geographiclevel',
    'tractfips',
    'cityfips',
    'geolocation',
    'statedesc',
]

# Columns that describe what each measure / category means
data_def_columns = [
    'categoryid',
    'category',
    'measureid',
    'short_question_text',
    'measure',
    'data_value_type',
]

# Columns that carry the observed values plus the keys needed to join
# them back to the locations and definitions frames
data_columns = [
    'categoryid',
    'data_value',
    'data_value_footnote',
    'data_value_footnote_symbol',
    'datavaluetypeid',
    'geolocation',
    'high_confidence_limit',
    'low_confidence_limit',
    'populationcount',
    'stateabbr',
    'uniqueid',
    'measureid',
    'year',
]
print('Done!')

Done!


## Build locations Dataframe and save to csv

In [4]:
def coordinates(x,pos):
    '''Return one element of a geolocation value's 'coordinates' list.

    Intended for element-wise use via Series.apply, so x is a single cell
    value (not a whole Series).

    x   -- a geolocation value; expected to be a dict with a 'coordinates'
           list. Anything else (e.g. NaN for a missing location) is treated
           as "no coordinate available".
    pos -- index into the 'coordinates' list.
           NOTE(review): if the dict is a GeoJSON Point, index 0 is the
           longitude and index 1 the latitude - confirm against callers.

    Returns the coordinate at pos, or np.nan when x is not a dict or does
    not hold a usable 'coordinates' entry at that position.
    '''
    # isinstance also accepts dict subclasses, unlike an exact type() match
    if not isinstance(x, dict):
        return np.nan
    try:
        return x['coordinates'][pos]
    except (KeyError, IndexError, TypeError):
        # Missing, too short or non-indexable 'coordinates': report it the
        # same way as a missing geolocation instead of aborting the apply.
        return np.nan

# Build the locations DataFrame (one row per distinct location) and save to csv
locations_df = df[locations_columns].copy()

# The geolocation dicts carry a GeoJSON-style 'coordinates' list, and GeoJSON
# orders a position as [longitude, latitude]. Latitude is therefore element 1
# and longitude element 0 (the reverse assignment stores them swapped).
locations_df['latitude'] = locations_df['geolocation'].apply(coordinates,pos=1)
locations_df['longitude'] = locations_df['geolocation'].apply(coordinates,pos=0)

# Drop the raw column once latitude/longitude have been extracted from it,
# before de-duplicating
del locations_df['geolocation']
locations_df = locations_df.drop_duplicates().sort_values('uniqueid')
locations_df.to_csv('../data/cleansed/locations.csv')
print('Done!')

Done!


## Build Data Definitions Dataframe and Save To Csv

In [5]:
# Keep one row per distinct combination of the definition columns and
# write the lookup table out for the other workbooks.
df_data_def = df.loc[:, data_def_columns].drop_duplicates()
df_data_def.to_csv('../data/cleansed/data_def.csv')
print('Done!')

Done!


## Build Data Dataframe, pivot to tidy standards, and save to csv

In [7]:
df_data = df[data_columns].copy()

# Pivot to one row per (uniqueid, datavaluetypeid, populationcount) with one
# column per measureid.
# aggfunc='mean' is used instead of np.sum for two reasons:
#   * summing a group whose data_value is entirely missing yields 0.0 on
#     current pandas, which would store a missing value as 0 prevalence;
#     the mean of such a group stays NaN.
#   * if a combination ever appears more than once, adding prevalences
#     together is meaningless, whereas the mean of identical values is
#     unchanged.
# For a group with a single non-missing value both give the same result.
df_data_piv = df_data.pivot_table(values='data_value',
                                  index=['uniqueid','datavaluetypeid','populationcount'],
                                  columns='measureid',aggfunc='mean')
df_data_piv = df_data_piv.reset_index()

# CDC website withholds data for population counts < 50, so keep every row
# with a population count of 50 or more (a strict "> 50" would also discard
# rows with exactly 50).
df_data_piv[df_data_piv['populationcount']>=50].to_csv('../data/cleansed/data.csv')

## Finished synthesizing data - see 500city_tidy to explore the data