# County-level COVID-19 analysis

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from census import Census
from collections import OrderedDict

In [None]:
# Required environment variables:
#   os.environ['DB_URL']  -- database connection URL; it is passed to psql and to SQLAlchemy's create_engine below

### Download county shape files and load into postgres database

In [None]:
# Download the 2019 TIGER/Line county shapefile archive and unpack it next to the download.
!mkdir -p big-data/geo-county/
!wget -O big-data/geo-county/tl_2019_us_county.zip https://www2.census.gov/geo/tiger/TIGER2019/COUNTY/tl_2019_us_county.zip
!unzip big-data/geo-county/tl_2019_us_county.zip -d big-data/geo-county/
# Convert the shapefile into SQL for a table named geo_us_county, with SRID 4269 (-s).
!shp2pgsql -s 4269 big-data/geo-county/tl_2019_us_county.shp geo_us_county > big-data/geo-county/geo_us_county.sql
# Remove the archive and the unpacked shapefile parts; the generated geo_us_county.sql is kept.
!rm big-data/geo-county/tl_2019_us_county*

In [None]:
# Load the generated county SQL into the database named by DB_URL.
#
# psql is started directly (no shell pipeline), so the connection URL needs no
# shell quoting. ON_ERROR_STOP makes psql exit non-zero on the first SQL error,
# and check=True turns that exit status into an exception instead of letting a
# failed load pass silently.
import subprocess

subprocess.run(
    [
        "psql",
        "-v", "ON_ERROR_STOP=1",
        "-f", "big-data/geo-county/geo_us_county.sql",
        os.environ['DB_URL'],
    ],
    check=True,
)

### Download census variables/fields and load into a database

In [None]:
# Download the variable catalogue (variables.json) of the 2018 ACS 5-year dataset.
!mkdir -p big-data/us-census/
!wget -O big-data/us-census/acs5-variables.json https://api.census.gov/data/2018/acs/acs5/variables.json

In [None]:
# Turn the downloaded ACS5 variable catalogue into one record per variable and
# store it as the us_census_field table (replacing any previous version).
with open('big-data/us-census/acs5-variables.json') as variables_file:
    acs5_data = json.load(variables_file)

# Each record is a copy of the variable's metadata with its code stored under 'name'.
data = [
    {**metadata, 'name': code}
    for code, metadata in acs5_data['variables'].items()
]
df = pd.DataFrame(data)

engine = create_engine(os.environ['DB_URL'], echo=False)
df.to_sql(
    'us_census_field',
    con=engine,
    method='multi',
    index=False,
    if_exists='replace',
)

### Load county-level US census data into postgres DB

In [None]:
# Fetch county-level ACS 5-year (2018) estimates for every state/county and
# store them as the us_census_county_stats table (replacing any previous version).
#
# The API key can be supplied through the CENSUS_API_KEY environment variable;
# the previously hard-coded key remains the fallback so existing runs are unchanged.
# NOTE(review): a credential committed to the notebook should be rotated and
# removed once CENSUS_API_KEY is set everywhere this runs.
c = Census(
    os.environ.get('CENSUS_API_KEY', "5fdf56abf43997adf0d8533a71dea339e4ac5974"),
    year=2018,
)

fields = (
    ('NAME',)
    # B01001_001E .. B01001_049E, leaving out 026 exactly as the original list did.
    + tuple('B01001_%03dE' % i for i in range(1, 50) if i != 26)
    # B15003_001E .. B15003_025E.
    + tuple('B15003_%03dE' % i for i in range(1, 26))
    # B19101_001E .. B19101_017E, kept in the original request order.
    + (
        'B19101_014E', 'B19101_013E', 'B19101_012E', 'B19101_011E',
        'B19101_017E', 'B19101_016E', 'B19101_015E', 'B19101_010E',
        'B19101_002E', 'B19101_001E', 'B19101_006E', 'B19101_005E',
        'B19101_004E', 'B19101_003E', 'B19101_009E', 'B19101_008E',
        'B19101_007E',
    )
)
d = c.acs5.state_county(fields, Census.ALL, Census.ALL)

data = []
for row in d:
    add_row = OrderedDict()
    # County FIPS = state code followed by county code.
    add_row['fips'] = row['state'] + row['county']
    # Stable sort: keys not starting with 'B' keep their incoming order and come
    # first; the B* estimate columns follow in alphabetical order.
    ordered_items = sorted(
        row.items(),
        key=lambda item: '1' + item[0] if item[0][0] == 'B' else '0',
    )
    for k, v in ordered_items:
        if k[0] == 'B':
            # Estimates are stored as integers; a missing value becomes 0.
            add_row[k.lower()] = 0 if v is None else int(v)
        else:
            add_row[k.lower()] = v
    data.append(add_row)

df = pd.DataFrame(data)
engine = create_engine(os.environ['DB_URL'], echo=False)
df.to_sql('us_census_county_stats', con=engine, method='multi', index=False, if_exists='replace')

#### Summarize and add percentages to census data

In [None]:
# Build us_census_county_summary1 from us_census_county_stats: grouped age,
# education and income counts per county plus each group's share of its total.
#
# Changes relative to the earlier version of this cell:
#  * every share is guarded against a zero total (the income shares already
#    were); an unguarded division by zero would abort the whole CREATE TABLE.
#  * the share of income_gt_200000 was exposed only as income_gt_100000_perc.
#    The correctly named income_gt_200000_perc is added as the last column and
#    the old name is kept in place so existing queries keep working.
#  * psql is run without a shell and with check=True, so a failed statement
#    raises instead of being followed by a misleading success message.
import subprocess

sql = """
DROP TABLE IF EXISTS us_census_county_summary1;
CREATE TABLE us_census_county_summary1 AS
SELECT fips
, age_total
, age_lt_20
, age_total - (age_lt_20+age_gt_70) age_20_to_70
, age_gt_70

, edu_total
, edu_hs
, edu_some_college
, edu_college
, edu_grad

, income_total
, income_lt_30000
, income_lt_60000
, income_lt_100000
, income_lt_200000
, income_gt_200000

, CASE WHEN age_total > 0 THEN age_lt_20 / age_total::FLOAT END age_lt_20_perc
, CASE WHEN age_total > 0 THEN (age_total - (age_lt_20+age_gt_70)) / age_total::FLOAT END age_20_to_70_perc
, CASE WHEN age_total > 0 THEN age_gt_70 / age_total::FLOAT END age_gt_70_perc

, CASE WHEN edu_total > 0 THEN edu_hs / edu_total::FLOAT END edu_hs_perc
, CASE WHEN edu_total > 0 THEN edu_some_college / edu_total::FLOAT END edu_some_college_perc
, CASE WHEN edu_total > 0 THEN edu_college / edu_total::FLOAT END edu_college_perc
, CASE WHEN edu_total > 0 THEN edu_grad / edu_total::FLOAT END edu_grad_perc

, CASE WHEN income_total > 0 THEN income_lt_30000 / income_total::FLOAT END income_lt_30000_perc
, CASE WHEN income_total > 0 THEN income_lt_60000 / income_total::FLOAT END income_lt_60000_perc
, CASE WHEN income_total > 0 THEN income_lt_100000 / income_total::FLOAT END income_lt_100000_perc
, CASE WHEN income_total > 0 THEN income_lt_200000 / income_total::FLOAT END income_lt_200000_perc
, CASE WHEN income_total > 0 THEN income_gt_200000 / income_total::FLOAT END income_gt_100000_perc
, CASE WHEN income_total > 0 THEN income_gt_200000 / income_total::FLOAT END income_gt_200000_perc

FROM
(
SELECT fips
, B01001_001E age_total
, B01001_022E+B01001_023E+B01001_024E+B01001_025E+B01001_046E+B01001_047E+B01001_048E+B01001_049E age_gt_70
, B01001_003E+B01001_004E+B01001_005E+B01001_006E+B01001_007E+B01001_027E+B01001_028E+B01001_029E+B01001_030E+B01001_031E age_lt_20
, B15003_001E edu_total
, B15003_017E+B15003_018E edu_hs
, B15003_019E+B15003_020E+B15003_021E edu_some_college
, B15003_022E edu_college
, B15003_023E+B15003_024E+B15003_025E edu_grad
, B19101_001E income_total
, B19101_002E+B19101_003E+B19101_004E+B19101_005E+B19101_006E income_lt_30000
, B19101_007E+B19101_008E+B19101_009E+B19101_010E+B19101_011E income_lt_60000
, B19101_012E+B19101_013E income_lt_100000
, B19101_014E+B19101_015E+B19101_016E income_lt_200000
, B19101_017E income_gt_200000
FROM us_census_county_stats
) t
"""

# Both statements go to psql as one -c command string; a non-zero exit raises.
subprocess.run(["psql", "-c", sql, os.environ['DB_URL']], check=True)
print("Created table: us_census_county_summary1")

### County-level, daily COVID data from NYTimes

In [None]:
# Download the NYTimes county-level CSV (us-counties.csv) into big-data/.
# NOTE: wget -O does not create the big-data/ directory; it must already exist.
!wget -O big-data/us-county-covid.csv https://github.com/nytimes/covid-19-data/raw/master/us-counties.csv

In [None]:
# Load the downloaded NYTimes CSV and store it as us_county_daily_covid_stats
# (replacing any previous version), keyed by county FIPS only.
df = pd.read_csv('big-data/us-county-covid.csv')

# Missing FIPS values become 0; every code is then rendered as a zero-padded
# 5-character string.
fips_filled = df['fips'].fillna(0)
df['fips'] = fips_filled.map(lambda code: str(int(code)).zfill(5))

# The county and state columns are dropped before loading.
df = df.drop(columns=['county', 'state'])

engine = create_engine(os.environ['DB_URL'], echo=False)
df.to_sql(
    'us_county_daily_covid_stats',
    con=engine,
    method='multi',
    chunksize=10000,
    index=False,
    if_exists='replace',
)

### Census tract geometry/shapes

In [None]:
# Download the 2019 TIGER/Line tract shapefile for each state FIPS code 01-57
# and load them all into a single us_tract table.
#
# Codes whose archive cannot be downloaded are reported and skipped. The first
# archive that loads successfully is converted with shp2pgsql -d and every
# later one with -a, so the table is still set up correctly when the very
# first code is one of the skipped ones. A failing conversion or load is
# reported and the loop moves on; downloaded and generated files are removed
# either way.
import subprocess
import zipfile
from pathlib import Path
from urllib.request import urlretrieve

table_created = False
for n in range(1, 58):
    st_fips = str(n).zfill(2)
    fn = "tl_2019_%s_tract" % st_fips
    url = "https://www2.census.gov/geo/tiger/TIGER2019/TRACT/tl_2019_%s_tract.zip" % st_fips
    try:
        urlretrieve(url, fn + ".zip")
    except OSError as err:
        # urllib's URLError/HTTPError are OSError subclasses.
        print("FAILED: %s (%s)" % (url, err))
        continue
    try:
        with zipfile.ZipFile(fn + ".zip", 'r') as z:
            z.extractall()
        mode = "-a" if table_created else "-d"
        with open(fn + ".sql", "w") as sql_file:
            subprocess.run(
                ["shp2pgsql", "-D", mode, "-s", "4269", fn + ".shp", "us_tract"],
                stdout=sql_file,
                check=True,
            )
        # ON_ERROR_STOP makes psql exit non-zero when the script hits an SQL error.
        subprocess.run(
            ["psql", "-v", "ON_ERROR_STOP=1", "-f", fn + ".sql", os.environ['DB_URL']],
            check=True,
        )
        table_created = True
        print("loaded: %s.sql" % fn)
    except (subprocess.CalledProcessError, zipfile.BadZipFile) as err:
        print("FAILED: %s (%s)" % (fn, err))
    finally:
        # Same files the previous `rm <fn>.*` removed: archive, shapefile parts, SQL.
        for leftover in Path('.').glob(fn + ".*"):
            leftover.unlink()


### Census tract populations and household counts

In [None]:
# Fetch 2010 SF1 values for every tract of each state FIPS code 01-57 and write
# them to tract-data.csv as tab-separated rows:
#   <state><county> TAB <tract> TAB one integer per entry of `fields`.
#
# The API key can be supplied through the CENSUS_API_KEY environment variable;
# the previously hard-coded key remains the fallback so existing runs are unchanged.
# NOTE(review): a credential committed to the notebook should be rotated and
# removed once CENSUS_API_KEY is set everywhere this runs.
c = Census(
    os.environ.get('CENSUS_API_KEY', "5fdf56abf43997adf0d8533a71dea339e4ac5974"),
    year=2010,
)
fields = ('P001001', 'H001001', 'H002001', 'H002002', 'H002003', 'H002004', 'H002005', 'H002006')

# `with` closes the file even if a request or a row conversion raises.
with open('tract-data.csv', 'w') as tract_file:
    for n in range(1, 58):
        st_fips = str(n).zfill(2)
        try:
            d = c.sf1.state_county_tract(fields, st_fips, Census.ALL, Census.ALL)
        except Exception as err:
            # Still best effort per state code, but the skip is now reported and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("skipped %s: %s" % (st_fips, err))
            continue
        print("%s" % st_fips)
        for row in d:
            columns = ['%(state)s%(county)s\t%(tract)s' % row]
            columns.extend("%s" % int(row[field]) for field in fields)
            tract_file.write("\t".join(columns) + "\n")


In [62]:
# (Re)create us_tract_pop and bulk-load tract-data.csv into it with COPY.
#
# The table is dropped first so the cell can be re-run: previously a second
# run failed on CREATE TABLE and then appended the same rows again. psql is run
# without a shell and with check=True, so a failed statement raises instead of
# being followed by the success message.
import subprocess

sql = """DROP TABLE IF EXISTS us_tract_pop;
         CREATE TABLE us_tract_pop(fips CHAR(5), tract VARCHAR(12),
         P001001 INT,
         H001001 INT,
         H002001 INT,
         H002002 INT,
         H002003 INT,
         H002004 INT,
         H002005 INT,
         H002006 INT
         )"""
subprocess.run(["psql", "-c", sql, os.environ['DB_URL']], check=True)

# COPY ... FROM STDIN reads the tab-separated rows from psql's standard input.
with open('tract-data.csv', 'rb') as tract_data:
    subprocess.run(
        ["psql", "-c", "COPY us_tract_pop FROM STDIN", os.environ['DB_URL']],
        stdin=tract_data,
        check=True,
    )
print("Created table us_tract_pop")

0