In [3]:
import geopandas as gpd

# Cluster Sampling

In [4]:
from cluster_sampling import ClusterSampler

In [None]:
# Cluster-sampling sweep: for each label, draw cluster samples over a grid of
# (points per cluster, total sample size, seed), saving the sampled ids and a
# plot for every combination.

# Column name -> 'state' / 'county' tag; handed to ClusterSampler as-is.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Settings that do not depend on the label, defined once instead of on every
# iteration of the label loop.
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

strata_col = 'STATEFP'
cluster_col = ['COUNTY_NAME', 'COUNTYFP']
n_strata = 10

SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/cluster_sampling'

    sampler = ClusterSampler(gdf, id_col='id', strata_col=strata_col, cluster_col=cluster_col, ADMIN_IDS=ADMIN_IDS)

    for points_per_cluster in [2, 5, 10, 25]:
        # Re-assign the cluster column(s) before each sweep.
        # NOTE(review): presumably the sampler rewrites this attribute while
        # sampling -- confirm against ClusterSampler.
        sampler.cluster_col = cluster_col
        for total_sample_size in range(100, 1100, 100):
            for seed in SEEDS:
                try:
                    sampler.sample(total_sample_size, points_per_cluster, seed, n_strata=n_strata)
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)
                except Exception as e:
                    # Best-effort sweep: keep going, but say which
                    # configuration failed so it can be rerun or investigated.
                    print(
                        f"[Skip] label={label}, points_per_cluster={points_per_cluster}, "
                        f"total_sample_size={total_sample_size}, seed={seed}: "
                        f"{type(e).__name__}: {e}"
                    )
                finally:
                    # Always clear the sampler state so a failed or interrupted
                    # draw cannot leak into the next configuration.
                    sampler.reset_sample()

[Init] Initializing ClusterSampler...
[Stratify] Stratifying points by column: STATEFP
[Stratify] Unique strata found: ['05' '39' '12' '13' '21' '29' '46' '17' '18' '16' '32' '38' '22' '48'
 '26' '55' '04' '37' '27' '19' '20' '49' '30' '23' '08' '51' '36' '28'
 '31' '40' '41' '01' '50' '53' '06' '42' '45' '47' '56' '35' '33' '25'
 '54' '09' '34' '24' '44' '10' '11']
[Init] Found 49 strata.
[Sample] Starting sampling process...
[Determine Clusters] Total desired sample size: 100
[Determine Clusters] Cluster allocation per stratum:
{np.str_('33'): 5, np.str_('40'): 5, np.str_('46'): 5, np.str_('56'): 5, np.str_('05'): 5, np.str_('06'): 5, np.str_('49'): 5, np.str_('35'): 5, np.str_('53'): 5, np.str_('36'): 5}
[Sample] Processing stratum: 33
[Sample Clusters] Sampling 5 clusters
[Sample Clusters] Ignoring clusters with < 2 points. Remaining: 11
[Sample Clusters] Sampled cluster IDs: ['015', '003', '009', '007', '011']
[Sample Points] Sampling up to 2 points from each of 5 clusters...
  [S

# Convenience Sampling

In [None]:
from convenience_sampling import UrbanConvenienceSampler

In [None]:
# Convenience-sampling sweep: for each label, sample size and seed, draw one
# urban-proximity sample and one single-region sample, saving ids and a plot
# for each.

# NOTE(review): RegionConvenienceSampler was used below without ever being
# imported (NameError on a fresh kernel). It is assumed to live next to
# UrbanConvenienceSampler in convenience_sampling -- confirm the module.
from convenience_sampling import RegionConvenienceSampler

id_col = 'id'
# Column name -> 'state' / 'county' tag; handed to RegionConvenienceSampler as-is.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Settings and inputs that do not depend on the label: defined / loaded once
# rather than on every iteration of the label loop.
n_urban = 10
pop_col = 'POP'
gdf_urban_path = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'
gdf_urban = gpd.read_file(gdf_urban_path)

country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

method = 'probabilistic'
SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

for label in ['population', 'treecover']:
    gdf_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"

    distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/distance_to_top{n_urban}_urban.pkl'

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/convenience_sampling'

    # The points file is read once per label (it used to be read twice).
    print("Reading GeoDataFrame...")
    gdf = gpd.read_file(gdf_path)

    # Label-specific `temp` value forwarded to UrbanConvenienceSampler.sample().
    temp = 0.025 if label == "population" else 0.001

    for desired_sample_size in range(100, 1100, 100):
        for seed in SEEDS:

            print("Initializing UrbanConvenienceSampler...")
            sampler = UrbanConvenienceSampler(
                id_col=id_col,
                gdf_points=gdf,
                gdf_urban=gdf_urban,
                n_urban=n_urban,
                pop_col=pop_col,
                distances_dir=distances_dir
            )

            sampler.sample(n_samples=desired_sample_size, method=method, temp=temp, seed=seed)
            sampler.save_sampled_ids(out_path)
            sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

            print("Initializing RegionConvenienceSampler...")
            sampler = RegionConvenienceSampler(
                gdf_points=gdf,
                id_col=id_col,
                region_col='STATEFP',
                ADMIN_IDS=ADMIN_IDS
            )

            # NOTE(review): both samplers write to the same out_path; verify
            # that save_sampled_ids() produces distinct file names per sampler.
            sampler.sample(total_sample_size=desired_sample_size, region_val='06', seed=seed)  # California
            sampler.save_sampled_ids(out_path)
            sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

# Random Sampling

In [None]:
from random_sampling import RandomSampler

In [None]:
# Random-sampling sweep: for each label, draw simple random samples over a
# grid of (total sample size, seed), saving the sampled ids and a plot for
# every combination.

# Settings that do not depend on the label.
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/random_sampling'

    sampler = RandomSampler(gdf, id_col="id")

    # Sizes 100, 200, ..., 1000 -- the same grid the cluster and convenience
    # sweeps use. The previous range(10, 1000, 100) produced 10, 110, ..., 910,
    # which made the random baseline incomparable with the other methods.
    for total_sample_size in range(100, 1100, 100):
        for seed in SEEDS:
            sampler.sample(total_sample_size=total_sample_size, seed=seed)
            sampler.save_sampled_ids(out_path)
            sampler.plot(country_shape_file, exclude_names=exclude_names)
            # Clear the sampler state before the next configuration.
            sampler.reset_sample()