In [None]:
import geopandas as gpd
import hvplot.pandas
import io
from pathlib import Path
import requests
from shapely.geometry import Polygon, box
import zipfile

In [None]:
# Directory layout: <parent of the current working directory>/data/zip_codes
base_path = Path.cwd().parent
data_dir = base_path / 'data'
zip_dir = data_dir / 'zip_codes'

### Download the zip code polygons

In [None]:
# create title and url for zip code data
title = 'US Census Data - Zip Code Boundaries'
url = 'https://www2.census.gov/geo/tiger/TIGER2019/ZCTA5/tl_2019_us_zcta510.zip'

# A shapefile is only readable when its .shp, .shx and .dbf parts are all present,
# so those three files together mark a completed earlier download.
shapefile_parts = [zip_dir.joinpath(f'tl_2019_us_zcta510{ext}') for ext in ('.shp', '.shx', '.dbf')]

if all(part.exists() for part in shapefile_parts):
    # Skip the slow download so a full re-run of the notebook stays cheap
    print(f"{title} already present in {zip_dir}; skipping download")
else:
    print(f"Downloading {title} \n    From: {url}\n    To: {zip_dir}")

    # get the remote data (may take up to 30 minutes);
    # timeout=(connect, read) bounds the wait on a stalled server, not the total transfer time
    r = requests.get(url, timeout=(30, 300))
    # fail here with a clear HTTP error instead of a confusing BadZipFile further down
    r.raise_for_status()
    # wrap the downloaded bytes as an in-memory zip archive and extract its contents;
    # the context manager closes the archive afterwards
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(zip_dir)

### View the polygons

In [None]:
# construct path to the shapefile (kept as a plain string, as before)
shapefile = str(zip_dir.joinpath("tl_2019_us_zcta510.shp"))
# load the shapefile into geopandas; the format is detected from the file itself,
# so no driver is passed ("shapefile" is not a GDAL driver name -- the driver is "ESRI Shapefile")
gdf = gpd.read_file(shapefile)
# view the head as a table, then plot the same rows over OpenStreetMap tiles
display(gdf.head())
gdf.head().hvplot(geo=True, tiles='OSM')

### Filter out Multipolygons (for simplicity) and zip codes outside the contiguous US

In [None]:
# Bounding box used for the contiguous US, built as box(minx, miny, maxx, maxy)
contiguous_us_bounding_box = box(-124.848974, 24.396308, -66.885444, 49.384358)
contiguous_us_bbox_gdf = gpd.GeoDataFrame(geometry=[contiguous_us_bounding_box], crs='epsg:4326')

# Keep only the id and geometry columns, reprojected to the same CRS as the bounding box
zips_all = gdf.loc[:, ['GEOID10', 'geometry']].to_crs('epsg:4326')

# Keep zip codes that lie within the bounding box.
# `predicate=` replaces the `op=` keyword, which was deprecated in GeoPandas 0.10 and later removed.
zips_contiguous = gpd.sjoin(zips_all, contiguous_us_bbox_gdf, predicate='within')

# Keep plain Polygons only (MultiPolygons and any other geometry types are dropped);
# geom_type is a vectorised replacement for the per-row type() check
is_polygon = zips_contiguous.geometry.geom_type == 'Polygon'
zips_subset = zips_contiguous[is_polygon]

### Create subsets

In [None]:
# Save various size subsets of the zip code data
n_zips = len(zips_subset)
requested_sizes = [1, 10, 100, 1000, 10000, n_zips]
# sample() without replacement raises ValueError when asked for more rows than exist,
# so drop sizes larger than the filtered data; the set removes a duplicate when
# n_zips happens to equal one of the fixed sizes
sample_sizes = sorted({size for size in requested_sizes if 0 < size <= n_zips})

for sample_size in sample_sizes:
    out_path = zip_dir.joinpath(f'zips_{sample_size}.geojson')
    # fixed random_state keeps each subset reproducible across runs
    zips_subset.sample(sample_size, random_state=42).to_file(out_path, driver='GeoJSON')