In [1]:
import pandas as pd

# Links to original source data

CDC's SVI index:      
https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html
- Most recent year for our research was 2018
   
San Antonio COVID-19 Dashboard:     
https://cosacovid-cosagis.hub.arcgis.com/datasets/bexar-county-covid-19-data-by-zip-code/data?geometry=-100.416%2C29.018%2C-96.502%2C29.855&showData=true
- San Antonio data is only available by zip code for the current day; our data was downloaded on December 8th, 2020

Dallas COVID-19 Dashboard:      
https://www.dallascounty.org/covid-19/
- Dallas does not allow download by zip code, and only shows range of case counts per zip code.
- On December 8th, 2020 we took the median of the range for each zip code and created a list of case counts by zip code

HUD Crosswalk:     
https://www.huduser.gov/portal/datasets/usps_crosswalk.html
- Used the tract-to-zip file to find the percentage of addresses within a particular zip code that a specific census tract accounts for


# Filter SVI for Bexar and Dallas 

- write to separate .csv files

# Create Acquire scripts

- Read San Antonio case data and return dataframe (get_san_antonio_data)
- Read Dallas case data and return dataframe (get_dallas_data)
- Read San Antonio SVI info and return dataframe (get_sa_svi_data)
- Read Dallas SVI info and return dataframe (get_dallas_svi_data)
- Send city case dataframe in and read HUD file. Get zip codes for only those present in the city dataframe. Group by the tract and get the maximum total addresses percentage for each tract within a zip code. Return dataframe with zip code and maximum address percentage by census tract (get_HUD)
- Compile the dataset merging on tract and zip (compile_sa_data and compile_dallas_data)

# Create Prepare scripts

# Creating SVI_history.csv

- get 2018, 2016, 2014 historical SVI data .csv
    - note: 2010 is not available sorted by state
    - need to watch for possible issues with tract/FIPS changes creating null values
- get the rank/raw score for each theme and total by census tract for each year available
- merge these back together on FIPS and watch for nulls
- change up or down from previous year
- stddev
- %/amount change year over year?


Groupings/Themes
- theme1 = socioeconomic status
- theme2 = household composition and disability
- theme3 = minority and language status
- theme4 = housing type and transportation


- NOTE: excluding 2010 because the shape and columns in its .csv are substantially different; we will only go back to 2014

In [None]:
# Load the 2018 SVI file by census tract. The file name indicates US-wide data;
# a later cell narrows it to Texas rows.
SVI2018 = pd.read_csv('data_csv_files/SVI2018_US.csv')

In [None]:
SVI2018.head()

In [None]:
# Keep only the rows whose STATE column equals 'TEXAS'.
is_texas = SVI2018['STATE'].eq('TEXAS')
SVI2018 = SVI2018.loc[is_texas]

In [None]:
SVI2018.head()

In [None]:
# Collect, in file order, every 2018 column whose name begins with 'R'
# (the raw/rank SVI score columns that a later cell renames).
rpl_list18 = list(filter(lambda name: name.startswith('R'), SVI2018.columns))

In [None]:
rpl_list18

In [None]:
# Create a dataframe with just the raw/rank score columns.
# rpl_list18 was built from SVI2018.columns, so indexing by it directly selects
# the same columns in the same order. .copy() detaches the result from the
# Texas-filtered SVI2018 selection so later cells that rename and add columns
# work on an independent frame instead of triggering SettingWithCopyWarning.
df2018 = SVI2018[rpl_list18].copy()

In [None]:
# Rename the score columns to year-suffixed names.
# The result is reassigned rather than renamed in place: df2018 originates from
# a column selection, and rebinding to the frame returned by rename() leaves an
# independent object for the column additions that follow.
df2018 = df2018.rename(columns={
    'RPL_THEMES': 'raw_svi2018',
    "RPL_THEME1": "r_soci_total2018",
    "RPL_THEME2": "r_comp_total2018",
    "RPL_THEME3": "r_status_total2018",
    "RPL_THEME4": "r_trans_total2018",
})

In [None]:
# Add back county, FIPS (stored as 'tract') and estimated population from SVI2018.
# assign() returns a new frame instead of setting columns on a selection of
# another frame, which avoids SettingWithCopyWarning. Values align on the
# shared row index, exactly as the column assignments did.
df2018 = df2018.assign(
    county=SVI2018.COUNTY,
    tract=SVI2018.FIPS,
    est_population2018=SVI2018.E_TOTPOP,
)

In [None]:
df2018.head()

In [None]:
df2018.shape

In [None]:
# Load the 2016 SVI file. Note: this file is already filtered down to TX only,
# so no STATE filter is applied for this year.
SVI2016 = pd.read_csv('data_csv_files/TX_SVI_census2016.csv')

In [None]:
SVI2016.head()

In [None]:
# Collect, in file order, every 2016 column whose name begins with 'R'.
rpl_list16 = list(filter(lambda name: name.startswith('R'), SVI2016.columns))

In [None]:
rpl_list16

In [None]:
# Create a dataframe with just the raw/rank score columns.
# rpl_list16 was built from SVI2016.columns, so indexing by it directly selects
# the same columns in the same order. .copy() makes the result independent of
# SVI2016 so later cells that rename and add columns do not trigger
# SettingWithCopyWarning.
df2016 = SVI2016[rpl_list16].copy()

In [None]:
# Rename the score columns to year-suffixed names.
# The result is reassigned rather than renamed in place: df2016 originates from
# a column selection, and rebinding to the frame returned by rename() leaves an
# independent object for the column additions that follow.
df2016 = df2016.rename(columns={
    'RPL_THEMES': 'raw_svi2016',
    "RPL_THEME1": "r_soci_total2016",
    "RPL_THEME2": "r_comp_total2016",
    "RPL_THEME3": "r_status_total2016",
    "RPL_THEME4": "r_trans_total2016",
})

In [None]:
# Add back FIPS (stored as 'tract') and estimated population from SVI2016.
# County is deliberately not re-added for this year (the original assignment
# was commented out; presumably the 2018 frame already supplies it).
# assign() returns a new frame instead of setting columns on a selection of
# another frame, which avoids SettingWithCopyWarning; values align on the
# shared row index.
df2016 = df2016.assign(
    tract=SVI2016.FIPS,
    est_population2016=SVI2016.E_TOTPOP,
)

In [None]:
df2016.head()


In [None]:
df2016.shape

In [None]:
# Load the 2014 SVI file. Note: this file is already filtered down to TX only.
# NOTE(review): unlike the 2018/2016 inputs, this path has no 'data_csv_files/'
# prefix, so it resolves against the notebook's working directory — confirm the
# file location.
SVI2014 = pd.read_csv('TX_SVI_census2014.csv')

In [None]:
# Collect, in file order, every 2014 column whose name begins with 'R',
# then show the list as the cell output.
rpl_list14 = list(filter(lambda name: name.startswith('R'), SVI2014.columns))
rpl_list14

In [None]:
# Create a dataframe with just the raw/rank score columns.
# rpl_list14 was built from SVI2014.columns, so indexing by it directly selects
# the same columns in the same order. .copy() makes the result independent of
# SVI2014 so later cells that rename and add columns do not trigger
# SettingWithCopyWarning.
df2014 = SVI2014[rpl_list14].copy()

In [None]:
# Rename the score columns to year-suffixed names.
# The result is reassigned rather than renamed in place: df2014 originates from
# a column selection, and rebinding to the frame returned by rename() leaves an
# independent object for the column additions that follow.
df2014 = df2014.rename(columns={
    'RPL_THEMES': 'raw_svi2014',
    "RPL_THEME1": "r_soci_total2014",
    "RPL_THEME2": "r_comp_total2014",
    "RPL_THEME3": "r_status_total2014",
    "RPL_THEME4": "r_trans_total2014",
})

In [None]:
# Add back FIPS (stored as 'tract') and estimated population from SVI2014.
# County is deliberately not re-added for this year (the original assignment
# was commented out; presumably the 2018 frame already supplies it).
# assign() returns a new frame instead of setting columns on a selection of
# another frame, which avoids SettingWithCopyWarning; values align on the
# shared row index.
df2014 = df2014.assign(
    tract=SVI2014.FIPS,
    est_population2014=SVI2014.E_TOTPOP,
)

In [None]:
df2014.head()


In [None]:
df2014.shape

In [None]:
# get 2010 by census tract
# NOTE(review): SVI2010 is only previewed in the next cell and is not used in
# the history merge below; the path also has no 'data_csv_files/' prefix —
# confirm whether this load is still needed.
SVI2010 = pd.read_csv('SVI2010_US.csv')

In [None]:
# DO NOT USE the 2010 data - not worth the time to sort (see the markdown note
# on 2010 being excluded). Preview only.
SVI2010.head()

In [None]:
# Merge the 2018 and 2016 dataframes on the census tract key ('tract').
# A left merge keeps every 2018 row even when 2016 has no matching tract.
# The original note recorded all inputs as shape (5254, 8); TODO confirm, since
# df2016/df2014 do not carry the county column that df2018 has.
SVI_hist = df2018.merge(df2016, how='left', on='tract')

In [None]:
SVI_hist.shape

In [None]:
SVI_hist.columns

In [None]:
# Left-merge the 2014 columns onto the combined 2018/2016 frame by tract.
SVI_hist_full = SVI_hist.merge(df2014, how='left', on='tract')

In [None]:
SVI_hist_full.shape

In [None]:
SVI_hist_full.head()

In [None]:
# Drop rows whose 2018 overall score is the -999.0 sentinel
# (per CDC these were removed during ranking).
# NOTE(review): only raw_svi2018 is checked; any -999.0 values in the 2016/2014
# or per-theme columns are retained — confirm this is intended.
has_2018_score = SVI_hist_full['raw_svi2018'].ne(-999.0)
SVI_hist_full = SVI_hist_full.loc[has_2018_score]

In [None]:
SVI_hist_full.head()

In [None]:
SVI_hist_full.shape

In [None]:
SVI_hist_full.columns

In [None]:
# Column order for the exported history table: identifiers first, then the
# overall score per year, the four theme scores per year, and the population
# estimates per year.
history_columns = [
    'county', 'tract',
    'raw_svi2018', 'raw_svi2016', 'raw_svi2014',
    'r_soci_total2018', 'r_comp_total2018', 'r_status_total2018', 'r_trans_total2018',
    'r_soci_total2016', 'r_comp_total2016', 'r_status_total2016', 'r_trans_total2016',
    'r_soci_total2014', 'r_comp_total2014', 'r_status_total2014', 'r_trans_total2014',
    'est_population2018', 'est_population2016', 'est_population2014',
]
svi_histdf = SVI_hist_full[history_columns]

In [None]:
svi_histdf.head()

In [None]:
# Write the combined 2018/2016/2014 history table to csv in the working directory.
# to_csv is called with defaults, so the row index is written as the first column.
svi_histdf.to_csv('SVI_history.csv')

# create San Antonio .csv for Tableau mapping

In [None]:

import pandas as pd
import seaborn as sns

from scripts_python import wrangle
from scripts_python import explore
from scripts_python import model_MAE, model_classification

import matplotlib.pyplot as plt
import numpy as np


from math import sqrt
from scipy import stats

In [None]:
# Run the project's wrangle step for the San Antonio section.
# NOTE(review): the meaning of the six returned objects is inferred from the
# variable names only (full frame, exploration train set, scaled X / y splits)
# — confirm against scripts_python/wrangle.
df, train_exp, X_train_scaled, y_train, X_test_scaled, y_test = wrangle.wrangle_data()

In [None]:
# Column names handed to the clustering helpers below for the 'poverty_cluster' grouping.
cluster_vars = ['spl_theme1_scaled', 'ep_pov_scaled', 'e_pov_scaled']
# Call the project's elbow_plot helper on the scaled training features for these columns.
explore.elbow_plot(X_train_scaled, cluster_vars)

In [None]:
# Cluster the training data with k=3 via the project's run_kmeans helper, then
# pass the returned kmeans object to kmeans_transform for the test features.
# NOTE(review): presumably fit-on-train / predict-on-test — confirm in scripts_python/explore.
train_clusters, kmeans = explore.run_kmeans(train_exp, X_train_scaled, k=3, cluster_vars=cluster_vars, cluster_col_name = 'poverty_cluster')
test_clusters = explore.kmeans_transform(X_test_scaled, kmeans, cluster_vars, cluster_col_name = 'poverty_cluster')

In [None]:

# NOTE(review): this cell repeats the previous cell's two calls verbatim and
# rebinds train_clusters, kmeans and test_clusters; it looks redundant —
# confirm the helpers have no side effects that require a second run, then remove.
train_clusters, kmeans = explore.run_kmeans(train_exp, X_train_scaled, k=3, cluster_vars=cluster_vars, cluster_col_name = 'poverty_cluster')
test_clusters = explore.kmeans_transform(X_test_scaled, kmeans, cluster_vars, cluster_col_name = 'poverty_cluster')

In [None]:

# Ask the project helper for the centroids of the kmeans object returned by run_kmeans above.
centroids = explore.get_centroids(cluster_vars, cluster_col_name='poverty_cluster', kmeans= kmeans)

In [None]:

# Rebind train_exp to the helper's result built from the cluster labels and centroids.
# NOTE(review): presumably adds the 'poverty_cluster' columns to train_exp — confirm in scripts_python/explore.
train_exp = explore.add_to_train(train_clusters, centroids, train_exp, cluster_col_name = 'poverty_cluster')

In [None]:
# export train_exp for Tableau
#train_exp.to_csv('train_exp_mapping.csv')

In [None]:
train_exp.head()

In [None]:
# Lookup table that is joined to the training frames on its GEOID column below.
# NOTE(review): the path is relative to the notebook's working directory — confirm the file location.
long_lat = pd.read_csv('FIPS_long_lat.csv')

In [None]:
long_lat

In [None]:
# Attach the long_lat lookup to each training row: train_exp.tract is matched
# against long_lat.GEOID, keeping every train_exp row (left join).
merge_ll_train = pd.merge(
    left=train_exp,
    right=long_lat,
    how='left',
    left_on='tract',
    right_on='GEOID',
)

In [None]:
merge_ll_train.head()

In [None]:
# Export train_exp joined with the long_lat lookup for Tableau.
# to_csv is called with defaults, so the row index is written as the first column.
merge_ll_train.to_csv('train_exp_wll.csv')

## create Dallas .csv for Tableau

In [None]:
# Run the project's wrangle step for Dallas; names mirror the San Antonio cell with a 'd' prefix.
# NOTE(review): 'yd_test' breaks the 'd' prefix pattern used by the other names
# (e.g. dy_train) and is not referenced again in this section — confirm before renaming.
ddf, dtrain_exp, dX_train_scaled, dy_train, dX_test_scaled, yd_test = wrangle.wrangle_dallas_data()

In [None]:
# Column names handed to the Dallas clustering helpers below.
dcluster_vars = [
    'spl_theme1_scaled',
    'ep_pov_scaled',
    'e_pov_scaled',
]
# Elbow plot left disabled by the author (note it references cluster_vars, not dcluster_vars):
#explore.elbow_plot(dX_train_scaled, cluster_vars)

In [None]:

# Cluster the Dallas training data via run_kmeans. Dallas uses k=4, whereas the
# San Antonio cells use k=3. No kmeans_transform call for a Dallas test set
# appears in this section.
dtrain_clusters, dkmeans = explore.run_kmeans(dtrain_exp, dX_train_scaled, k=4, cluster_vars=dcluster_vars, cluster_col_name = 'dpoverty_cluster')


In [None]:
# Ask the project helper for the centroids of the Dallas kmeans object (dkmeans).
dcentroids = explore.get_centroids(dcluster_vars, cluster_col_name='dpoverty_cluster', kmeans= dkmeans)

In [None]:
# Rebind dtrain_exp to the helper's result built from the Dallas cluster labels and centroids.
# NOTE(review): presumably adds the 'dpoverty_cluster' columns to dtrain_exp — confirm in scripts_python/explore.
dtrain_exp = explore.add_to_train(dtrain_clusters, dcentroids, dtrain_exp, cluster_col_name = 'dpoverty_cluster')


In [None]:
# Attach the long_lat lookup to each Dallas training row: dtrain_exp.tract is
# matched against long_lat.GEOID, keeping every dtrain_exp row (left join).
dmerge_ll_train = pd.merge(
    left=dtrain_exp,
    right=long_lat,
    how='left',
    left_on='tract',
    right_on='GEOID',
)

In [None]:
# Export the Dallas dtrain_exp joined with the long_lat lookup for Tableau.
# to_csv is called with defaults, so the row index is written as the first column.
dmerge_ll_train.to_csv('Dallas_train_exp_wll.csv')