In [1]:
import geopandas as gpd

# Cluster Sampling

In [2]:
from cluster_sampling import ClusterSampler

In [3]:
# Cluster-sampling sweep: for each label, build one ClusterSampler and draw,
# save, and plot a sample for every (points_per_cluster, total_sample_size, seed)
# combination.

# Column name -> administrative level ('state' or 'county'); handed to ClusterSampler.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/cluster_sampling'

    country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
    exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

    strata_col = 'STATEFP'
    cluster_col = ['COUNTY_NAME', 'COUNTYFP']

    sampler = ClusterSampler(gdf, id_col='id', strata_col=strata_col, cluster_col=cluster_col, ADMIN_IDS=ADMIN_IDS)

    n_strata = 10
    for points_per_cluster in [2, 5, 10, 25]:
        # Re-assign the cluster columns before each sweep.
        # NOTE(review): presumably sample() can change sampler.cluster_col — confirm in ClusterSampler.
        sampler.cluster_col = cluster_col
        for total_sample_size in range(100, 1100, 100):

            for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
                try:
                    sampler.sample(total_sample_size, points_per_cluster, seed, n_strata=n_strata)
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)
                except Exception as e:
                    # Best-effort sweep: keep going after a failure, but say which
                    # configuration failed so it can be reproduced on its own.
                    print(
                        f"[cluster_sampling] failed for label={label}, "
                        f"points_per_cluster={points_per_cluster}, "
                        f"total_sample_size={total_sample_size}, seed={seed}: "
                        f"{type(e).__name__}: {e}"
                    )
                finally:
                    # Always clear the sampler's sample state before the next configuration.
                    sampler.reset_sample()

[Init] Initializing ClusterSampler...



KeyboardInterrupt



# Convenience Sampling

In [9]:
from infrastructure_convenience_sampling import UrbanConvenienceSampler

In [10]:
# Urban convenience-sampling sweep: for each label and each n_urban setting,
# build an UrbanConvenienceSampler per (sample size, seed) and draw, save, and
# plot a sample.

id_col = 'id'
# Column name -> administrative level ('state' or 'county').
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

for label in ['population', 'treecover']:
    # The points frame is read inside the n_urban loop below. A second read at
    # this level was always overwritten before use, so it has been removed.
    gdf_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"

    for n_urban in [10, 20, 50]:
        pop_col = 'POP'
        gdf_urban_path = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'
        # NOTE(review): both frames are re-read for every n_urban. Kept as-is because
        # it is not visible here whether the sampler modifies the frames it is given.
        gdf_urban = gpd.read_file(gdf_urban_path)

        # Despite the name, this is the path of a single .pkl file, keyed by label and n_urban.
        distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/distance_to_top{n_urban}_urban.pkl'

        country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
        exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

        out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/convenience_sampling'

        print("Reading GeoDataFrame...")
        gdf = gpd.read_file(gdf_path)

        method = 'probabilistic'
        # `temp` is forwarded to sampler.sample(); its value depends only on the label.
        temp = 0.025 if label == "population" else 0.001
        for desired_sample_size in range(100, 1100, 100):
            for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:

                print("Initializing UrbanConvenienceSampler...")
                sampler = UrbanConvenienceSampler(
                    id_col=id_col,
                    gdf_points=gdf,
                    gdf_urban=gdf_urban,
                    n_urban=n_urban,
                    pop_col=pop_col,
                    distances_dir=distances_dir
                )

                sampler.sample(n_samples=desired_sample_size, method=method, temp=temp, seed=seed)
                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

Reading GeoDataFrame...
Initializing UrbanConvenienceSampler...
Computing distances using spatial index...
Saved distances for 54343 points to /home/libe2152/optimizedsampling/0_data/distances/usavars/population/distance_to_top10_urban.pkl
Running probabilistic sampling...
Saved 100 sampled IDs to /home/libe2152/optimizedsampling/0_data/initial_samples/usavars/population/convenience_sampling/urban_based/IDS_top10_urban_100_points_probabilistic_100_size_seed_1.pkl
Saved plot to /home/libe2152/optimizedsampling/0_data/initial_samples/usavars/population/convenience_sampling/urban_based/plots/top10_urban_areas_100_points_probabilistic_100_size_seed_1.png
Initializing UrbanConvenienceSampler...
Loading precomputed distances from /home/libe2152/optimizedsampling/0_data/distances/usavars/population/distance_to_top10_urban.pkl...
Running probabilistic sampling...
Saved 100 sampled IDs to /home/libe2152/optimizedsampling/0_data/initial_samples/usavars/population/convenience_sampling/urban_based

## Cluster Convenience Sampling

In [11]:
# Cluster-level convenience-sampling sweep: for each label, points-per-cluster
# and n_urban setting, build an UrbanConvenienceSampler in cluster mode per
# (sample size, seed) and draw, save, and plot a sample via sample_by_clusters().

id_col = "id"
# or another appropriate cluster ID
cluster_col = ['COUNTY_NAME', 'COUNTYFP']

# Column name -> administrative level ('state' or 'county'); passed as admin_ids.
ADMIN_IDS = dict(
    STATEFP='state',
    STATE_NAME='state',
    COUNTYFP='county',
    COUNTY_NAME='county',
)

# Settings that do not depend on any loop variable.
pop_col = 'POP'
method = 'probabilistic'
gdf_urban_path = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']
seeds = (1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415)

for label in ['treecover']:
    # Settings that depend on the label only.
    gdf_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/convenience_sampling'
    if label == "population":
        temp = 0.025
    else:
        temp = 0.001

    for points_per_cluster in [5, 10]:
        # Points frame is read fresh for every points_per_cluster value.
        gdf = gpd.read_file(gdf_path)

        for n_urban in [20, 50]:
            # Urban-area frame is read fresh for every n_urban value.
            gdf_urban = gpd.read_file(gdf_urban_path)

            # Despite the names, both are paths of single .pkl files keyed by label and n_urban.
            distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/distance_to_top{n_urban}_urban.pkl'
            cluster_distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/cluster_distance_to_top{n_urban}_urban.pkl'

            for desired_sample_size in range(100, 1100, 100):
                for seed in seeds:
                    print("Initializing ClusterConvenienceSampler...")
                    # Cluster mode is provided by UrbanConvenienceSampler itself.
                    sampler_kwargs = dict(
                        gdf_points=gdf,
                        id_col=id_col,
                        pop_col=pop_col,
                        cluster_col=cluster_col,
                        points_per_cluster=points_per_cluster,
                        gdf_urban=gdf_urban,
                        n_urban=n_urban,
                        distances_dir=distances_dir,
                        cluster_distances_dir=cluster_distances_dir,
                        admin_ids=ADMIN_IDS,
                    )
                    sampler = UrbanConvenienceSampler(**sampler_kwargs)

                    sampler.sample_by_clusters(
                        total_sample_size=desired_sample_size,
                        method=method,
                        temp=temp,
                        seed=seed,
                    )
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

Initializing ClusterConvenienceSampler...
Computing cluster distances using spatial index...
Saved cluster distances for 2939 clusters to /home/libe2152/optimizedsampling/0_data/distances/usavars/treecover/cluster_distance_to_top20_urban.pkl
combined_cluster_id
Bergen_003         8
Bristol_005       14
Cecil_015          6
Chester_029       16
Delaware_045       6
Fulton_121         9
Gloucester_015    16
Gwinnett_135      12
Harris_201        34
Hennepin_053      16
Kendall_093        7
Mercer_021         7
Middlesex_017     21
Middlesex_023      8
Monmouth_025       6
Montgomery_091    18
Montgomery_339    31
Ramsey_123         8
Spalding_255       6
Tarrant_439       19
dtype: int64
Saved 100 sampled IDs to /home/libe2152/optimizedsampling/0_data/initial_samples/usavars/treecover/convenience_sampling/cluster_based/IDS_top20_urban_cluster_COUNTY_NAME_5_ppc_20_clusters_100_size_probabilistic_seed_1.pkl
Saved plot to /home/libe2152/optimizedsampling/0_data/initial_samples/usavars/treec

# Random Sampling

In [None]:
from random_sampling import RandomSampler

In [None]:
# Random-sampling sweep: for each label, build one RandomSampler and draw,
# save, and plot a sample for every (total_sample_size, seed) combination.
for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/random_sampling'

    country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
    exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

    sampler = RandomSampler(gdf, id_col="id")

    # Sample sizes 100, 200, ..., 1000: the same grid the cluster and convenience
    # sweeps in this notebook use. The previous range(10, 1000, 100) produced
    # 10, 110, ..., 910, which does not line up with those sweeps.
    for total_sample_size in range(100, 1100, 100):
        for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
            sampler.sample(total_sample_size=total_sample_size, seed=seed)
            sampler.save_sampled_ids(out_path)
            # Reuse the exclude_names list defined above instead of repeating the literal.
            sampler.plot(country_shape_file, exclude_names=exclude_names)
            # Clear the sampler's sample state before the next configuration.
            sampler.reset_sample()