# Create Zip Code Subsets

In [None]:
import logging
import time
from datetime import datetime
from pathlib import Path

from shapely.geometry import Polygon, box
import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [None]:
# Parent of the current working directory; the data paths in this notebook
# are built relative to it.
base_path = Path('../')

In [None]:
# Rectangle (lon/lat, EPSG:4326) used as the extent of the contiguous US.
contiguous_us_bounding_box = box(
    minx=-124.848974,
    miny=24.396308,
    maxx=-66.885444,
    maxy=49.384358,
)

# Wrap the rectangle in a one-row GeoDataFrame so it can take part in a
# spatial join.
contiguous_us_bbox_gdf = gpd.GeoDataFrame(
    geometry=[contiguous_us_bounding_box],
    crs='epsg:4326',
)

# Load the zip code layer, keep only the identifier and geometry columns,
# and reproject to the same CRS as the bounding box.
zips_all = gpd.read_file(base_path / 'data/zip_codes/all_zip')
zips_all = zips_all.loc[:, ['GEOID10', 'geometry']]
zips_all = zips_all.to_crs('epsg:4326')

In [None]:
# Filter out Multipolygons and Zip Codes outside of Contiguous US.
#
# Step 1: keep only zip codes lying entirely within the bounding box.
# `predicate` replaces the `op` keyword, which was deprecated in
# geopandas 0.10 and is not accepted by newer releases.
# NOTE: the join adds an `index_right` column that is kept in the result.
zips_subset = gpd.sjoin(zips_all, contiguous_us_bbox_gdf, predicate='within')

# Step 2: keep only rows whose geometry is a single Polygon. `geom_type`
# is evaluated for the whole column at once, so no per-row Python type
# check is needed.
keep_indices = zips_subset.geom_type == 'Polygon'
zips_subset = zips_subset[keep_indices]

In [None]:
# Save various size subsets of the zip code data.
#
# Sampling without replacement raises ValueError when more rows are requested
# than exist, so every requested size is capped at the number of available
# zip codes. The set removes duplicates (e.g. when the full size equals one
# of the fixed sizes) so that no file is written twice.
total_zips = len(zips_subset)
sample_sizes = sorted(
    {min(size, total_zips) for size in (1, 10, 100, 1000, 10000, total_zips)}
)

for sample_size in sample_sizes:
    output_path = base_path / f'data/zip_codes/zips_{sample_size}.geojson'
    # Fixed seed keeps the subsets reproducible between runs.
    subset = zips_subset.sample(sample_size, random_state=42)
    subset.to_file(output_path, driver='GeoJSON')