In [None]:
import io
import logging
import os
import time
import zipfile
from datetime import datetime
from pathlib import Path

import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from distributed import LocalCluster, Client
from shapely.geometry import Polygon, box


### Download zip code shapefile

In [None]:
# create title and url for zip code data
title = 'US Census Data - Zip Code Boundaries'
url = 'https://www2.census.gov/geo/tiger/TIGER2019/ZCTA5/tl_2019_us_zcta510.zip'

# construct an output directory for the data, named after the archive
# (the url's file name without ".zip"), relative to the working directory
zip_dir = Path(url).stem

print(f"Downloading {title} \n    From: {url}\n    To: {zip_dir}")

# get the remote data
r = requests.get(url)
# fail loudly on an HTTP error instead of handing an error page to zipfile,
# which would otherwise surface as a confusing BadZipFile
r.raise_for_status()
# open the in-memory payload as a zip archive and extract its contents;
# the context manager closes the archive once extraction is done
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall(zip_dir)

# construct path to the shapefile
# (assumes the archive contains a .shp named after the archive itself)
shapefile = f"{zip_dir}/{zip_dir}.shp"
# load the shapefile into geopandas and reproject to EPSG:4326
zips_all = gpd.read_file(shapefile, driver="shapefile").to_crs('epsg:4326')
# view the head
zips_all.head()

### Filter zip codes to contiguous US

In [None]:
# directory that receives the files this notebook writes; create it up front
# so the later writes do not fail when the directory is missing
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# create bounding box for contiguous US (min lon, min lat, max lon, max lat)
contiguous_us_bounding_box = box(-124.848974, 24.396308, -66.885444, 49.384358)
# create geodataframe with contiguous bounding box
contiguous_us_bbox_gdf = gpd.GeoDataFrame(geometry=[contiguous_us_bounding_box], crs='epsg:4326')

# quick and dirty filter of multipolygons (spatialpandas can't handle them yet):
# keep only rows whose geometry is a plain Polygon
# gpdf.geometry = gpdf.geometry.apply(lambda x: x if type(x) == shapely.geometry.Polygon else x[0])
zips_subset = zips_all[zips_all.geometry.geom_type == 'Polygon']

# extract only the zip code number and geometry columns
zips_subset = zips_subset[['ZCTA5CE10','geometry']].copy(deep=True)

# Filter out Zip Codes outside of Contiguous US
# `predicate` is the current name of the sjoin keyword; releases that predate it
# only accept `op` and reject `predicate` with a TypeError, so fall back for those
try:
    zips_subset = gpd.sjoin(zips_subset, contiguous_us_bbox_gdf, predicate='within')
except TypeError:
    zips_subset = gpd.sjoin(zips_subset, contiguous_us_bbox_gdf, op='within')

### Create subsampled zip code datasets

In [None]:
# make sure the output directory exists before writing into it
data_dir.mkdir(parents=True, exist_ok=True)

# candidate sample sizes; the last candidate is the full filtered dataset
n_zips = len(zips_subset)
# drop sizes larger than the dataset (sampling without replacement would raise)
# and de-duplicate in case the dataset size equals one of the fixed sizes
subsample_sizes = sorted({size for size in (1, 10, 100, 1000, 10000, n_zips) if 0 < size <= n_zips})
# Save various size subsets of the zip code data
for sample_size in subsample_sizes:
    # fixed random_state keeps the written subsets reproducible across runs
    zips_sample = zips_subset.sample(sample_size, random_state=42)
    zips_sample.to_file(data_dir.joinpath(f'zips_{sample_size}.geojson'), driver='GeoJSON')

### Download OpenStreetMap point data

In [None]:
# set the path to the openstreetmap data
# todo replace this with either a download of raw data and/or processing to parquet
# The default is a machine-specific absolute path; set the OSM_POINTS_PARQUET
# environment variable to point at the data elsewhere without editing this cell.
raw_path = os.environ.get('OSM_POINTS_PARQUET', '/work/spd-scipy2020/data/simple-gps-points.parquet')

### Extract the contiguous US and save as parquet

In [None]:
%%time 
t0 = time.time()
# read the osm raw data
ddf = dd.read_parquet(raw_path)
# reduce osm data to contiguous us, reusing the bounding box built for the
# zip code filter so the two filters cannot drift apart
min_lon, min_lat, max_lon, max_lat = contiguous_us_bounding_box.bounds
usdf = ddf[ddf.latitude.between(min_lat, max_lat) & ddf.longitude.between(min_lon, max_lon)]
# write intermediate file next to the other outputs in data_dir
usdf.to_parquet(str(data_dir.joinpath('contiguous_us.parquet')), engine='pyarrow', compression='snappy')
# elapsed wall-clock time in hours (a previous run was noted as 6min 25s)
dt_hr = (time.time() - t0)/60/60