In [216]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import scipy.sparse as sp
import dask.dataframe as dd
from tqdm import tqdm, trange
from cenpy.products import ACS
from download_lodes import download_lodes

acs = ACS()

states = ['al', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'id', 'il', 'in', 'ia', 
          'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 
          'nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 
          'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy']
statestrs = ['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 
             'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 
             'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 
             'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 
             'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# create lambda function for aggregation 
bg_to_cty = lambda geoid : geoid[:5]  # converts census block group FIPS code to county FIPS code

In [3]:
# Get all geoids from county_adjacency.txt
cties = pd.read_csv('county_adjacency.txt', sep=r'\t', header=None)
geoids = pd.unique(cties[1])

In [4]:
# Create data matrices -- DO NOT RUN THIS CELL AGAIN
OD = pd.DataFrame(columns=geoids, index=geoids).fillna(0)
OD.to_csv('total_od.csv')

attrs = pd.DataFrame(columns=['o_attr', 'd_attr'], index=geoids).fillna(0)
attrs.to_csv('total_attr.csv')

In [None]:
# Read data matrices
OD = pd.read_csv('total_od.csv')
attrs = pd.read_csv('total_attr.csv')

In [7]:
# Get all intra-state flows TODO speed up any of these loops??
for state in tqdm(states):
    filenames = download_lodes(state, contained=True)  # download LODES data within a state

    # Aggregate census block group data to counties
    converters = {'w_geocode' : str, 'h_geocode' : str}
    odfile = dd.read_csv(filenames['od'], compression='gzip', converters=converters)
    for _, row in odfile.iterrows():
        OD.loc[bg_to_cty(row['w_geocode']), bg_to_cty(row['h_geocode'])] += row['S000']

    # Scrape origin attributes
    oattrfile = dd.read_csv(filenames['rac'], compression='gzip', converters={'h_geocode' : str})
    for _, row in oattrfile.iterrows(): attrs.loc[bg_to_cty(row['h_geocode'])] += row['C000']
    
    # Scrape destination attributes
    dattrfile = dd.read_csv(filenames['wac'], compression='gzip', converters={'w_geocode' : str})
    for _, row in dattrfile.iterrows(): attrs.loc[bg_to_cty(row['w_geocode'])] += row['C000']

    for v in filenames.values(): os.remove(v)  # delete old archives

    # Save your work!
    OD.to_csv('total_od.csv')
    attrs.to_csv('total_attr.csv')

100%|██████████| 49/49 [14:45:36<00:00, 1084.42s/it]


In [5]:
# Get all extra-state flows (attributes have already been recorded)
for state in tqdm(states):
    filenames = download_lodes(state, contained=False)  # download extra-state LODES data

    # Aggregate census block group data to counties
    converters = {'w_geocode' : str, 'h_geocode' : str}
    odfile = dd.read_csv(filenames['od'], compression='gzip', converters=converters)
    for _, row in odfile.iterrows():
        OD.loc[bg_to_cty(row['w_geocode']), bg_to_cty(row['h_geocode'])] += row['S000']

    os.remove(filenames['od'])  # delete old archives

    # Save your work!
    OD.to_csv('total_od.csv')

100%|██████████| 49/49 [49:33<00:00, 60.68s/it]


## Reformat data

In [59]:
# Read data matrices
OD = pd.read_csv('total_od.csv')       # i forgot to read these in with the index column but one should do that normally!!
geoids = np.delete(OD.columns.values, 0)

In [157]:
cty_shapes = gpd.read_file('tl_2018_us_county.shp')

In [130]:
# Create new flow dataframe
od = sp.coo_matrix(OD.values[:, 1:].astype(int))
newOD = pd.DataFrame(np.array([od.row, od.col, od.data]).T, columns=['origin', 'dest', 'count'])
mapping = {i : v for i, v in enumerate(geoids)}
newOD = newOD.replace({'origin' : mapping, 'dest' : mapping})

In [132]:
newOD.to_csv('lodes-flows.csv')

In [189]:
newloc = pd.read_csv('attrs.csv', converters={'name' : str})

# Drop all non CONUS locs
name_check = lambda series : pd.Series([int(n[:2]) for n in series])
loc = newloc.drop((newloc[name_check(newloc['name']) > 56]).index)
loc

Unnamed: 0,id,name,o_attr,d_attr,lat,lon
0,0,01001,35115,35115,-86.642749,32.534920
1,1,01021,27140,27140,-86.718814,32.847853
2,2,01047,26388,26388,-87.106476,32.325974
3,3,01051,53498,53498,-86.149147,32.596648
4,4,01085,5335,5335,-86.650108,32.154750
...,...,...,...,...,...,...
3109,3109,56043,7216,7216,-107.682861,43.904997
3110,3110,56013,30700,30700,-108.630418,43.040528
3111,3111,56025,79809,79809,-106.798494,42.962240
3112,3112,56017,4207,4207,-108.442097,43.718929


In [224]:
# Get pop data and add to attributes
pops = {}
for state in tqdm(statestrs):
    df = acs.from_state(state, variables=['B00001_001E'], level='county')
    tempdict = {id : df[df['GEOID'] == id]['B00001_001E'].values[0] for id in df['GEOID'].values}  # get all (geoid, value pairs)
    pops.update(tempdict)  # add all (geoid, value) pairs to pops

loc['pop'] = loc['name'].map(pops)  # add this as a column to the dataframe
loc.to_csv('attrs.csv')

100%|██████████| 49/49 [13:36<00:00, 16.67s/it]


In [200]:
# Add lat/lon to attributes
lat = np.zeros((loc.shape[0], 1))
lon = np.zeros((loc.shape[0], 1))
for i in trange(loc.shape[0]):
    name = loc['name'].iloc[i]
    if name == '46113' or name == '51515': continue  # not sure why these are in here but skip them
    lat[i] = cty_shapes[cty_shapes['GEOID'] == name].centroid.values.x
    lon[i] = cty_shapes[cty_shapes['GEOID'] == name].centroid.values.y

loc['lat'] = lat
loc['lon'] = lon
loc.to_csv('attrs.csv')

100%|██████████| 3114/3114 [01:44<00:00, 29.82it/s]
