In [1]:
import geopandas as gpd
import numpy as np

# Cluster Sampling

In [2]:
from cluster_sampling import ClusterSampler

In [3]:
# Admin-column mapping handed to ClusterSampler via its ADMIN_IDS argument.
# NOTE(review): keys look like raw census column names and values like admin
# levels -- confirm against cluster_sampling.ClusterSampler.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Settings that do not depend on the label being processed.
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

strata_col = 'state'
cluster_col = 'combined_county_id'
n_strata = 5

for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    # Fields 2 and 3 of combined_county_id form the state identifier
    # (e.g. 'Arkansas_05').
    # NOTE(review): 'county' used to be built from the same two fields as
    # 'state' (copy-paste). It now uses fields 0 and 1, assuming the id is laid
    # out as "<county name>_<county code>_<state name>_<state code>" -- confirm.
    split_cols = gdf['combined_county_id'].str.split('_', expand=True)
    gdf['state'] = split_cols[2] + "_" + split_cols[3]
    gdf['county'] = split_cols[0] + "_" + split_cols[1]

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/cluster_sampling'

    sampler = ClusterSampler(gdf, id_col='id', strata_col=strata_col, cluster_col=cluster_col, ADMIN_IDS=ADMIN_IDS)

    # Candidate strata: every distinct state value except the District of Columbia.
    all_strata = gdf[strata_col].astype(str).unique()
    all_strata = [s for s in all_strata if s != 'District of Columbia_11']

    # Seed the global NumPy RNG so the same n_strata strata are drawn on every run.
    np.random.seed(78910)
    fixed_strata = np.random.choice(all_strata, size=n_strata, replace=False)

    for points_per_cluster in [5, 10]:
        # Re-assert the cluster column before each sweep (kept from the original;
        # whether sample() modifies it is not visible here).
        sampler.cluster_col = cluster_col
        for total_sample_size in range(1100, 1200, 100):

            for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
                try:
                    sampler.sample(total_sample_size, points_per_cluster, seed, fixed_strata=fixed_strata)
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)
                except Exception as e:
                    # Best effort: a failing configuration must not stop the sweep,
                    # but report exactly which configuration failed and how.
                    print(
                        f"[Error] label={label}, points_per_cluster={points_per_cluster}, "
                        f"total_sample_size={total_sample_size}, seed={seed}: "
                        f"{type(e).__name__}: {e}"
                    )
                finally:
                    # Always clear the sampler state before the next configuration.
                    sampler.reset_sample()

[Init] Initializing ClusterSampler...
[Stratify] Stratifying points by column: state
[Stratify] Unique strata found: ['Arkansas_05' 'Ohio_39' 'Florida_12' 'Georgia_13' 'Kentucky_21'
 'Missouri_29' 'South Dakota_46' 'Illinois_17' 'Indiana_18' 'Idaho_16'
 'Nevada_32' 'North Dakota_38' 'Louisiana_22' 'Texas_48' 'Michigan_26'
 'Wisconsin_55' 'Arizona_04' 'North Carolina_37' 'Minnesota_27' 'Iowa_19'
 'Kansas_20' 'Utah_49' 'Montana_30' 'Maine_23' 'Colorado_08' 'Virginia_51'
 'New York_36' 'Mississippi_28' 'Nebraska_31' 'Oklahoma_40' 'Oregon_41'
 'Alabama_01' 'Vermont_50' 'Washington_53' 'California_06'
 'Pennsylvania_42' 'South Carolina_45' 'Tennessee_47' 'Wyoming_56'
 'New Mexico_35' 'New Hampshire_33' 'Massachusetts_25' 'West Virginia_54'
 'Connecticut_09' 'New Jersey_34' 'Maryland_24' 'Rhode Island_44'
 'Delaware_10' 'District of Columbia_11']
[Init] Found 49 strata.
[Sample] Starting sampling process...
[Determine Sample Sizes] Total desired sample size: 1100
[Determine Sample Sizes] Sel

# Convenience Sampling

In [None]:
from infrastructure_convenience_sampling import UrbanConvenienceSampler

In [None]:
id_col = 'id'
# Not referenced by the sampler calls in this cell; kept because other cells
# in the notebook define and use the same mapping.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Settings that do not depend on the label or on n_urban.
pop_col = 'POP'
gdf_urban_path = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']
method = 'probabilistic'

for label in ['population', 'treecover']:
    gdf_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/convenience_sampling'

    # The temperature passed to sample() is label-specific.
    temp = 0.025 if label == "population" else 0.001

    for n_urban in [10, 20, 50]:
        gdf_urban = gpd.read_file(gdf_urban_path)

        distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/distance_to_top{n_urban}_urban.pkl'

        # The points frame is read once per n_urban. The extra read that used to
        # sit at the top of the label loop was overwritten here before any use,
        # so it has been dropped.
        # NOTE(review): reading fresh per n_urban is kept in case the sampler
        # modifies its input frame -- not visible from this cell.
        print("Reading GeoDataFrame...")
        gdf = gpd.read_file(gdf_path)

        for desired_sample_size in range(1100, 1200, 100):
            for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:

                print("Initializing UrbanConvenienceSampler...")
                sampler = UrbanConvenienceSampler(
                    id_col=id_col,
                    gdf_points=gdf,
                    gdf_urban=gdf_urban,
                    n_urban=n_urban,
                    pop_col=pop_col,
                    distances_dir=distances_dir
                )

                sampler.sample(n_samples=desired_sample_size, method=method, temp=temp, seed=seed)
                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

### Cluster convenience sampling

In [None]:
# Admin-column mapping forwarded to the sampler as `admin_ids`.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Columns that together identify a cluster; forwarded as `cluster_col`.
cluster_col = ['COUNTY_NAME', 'COUNTYFP']
id_col = "id"

# Values shared by every configuration in the sweep below.
pop_col = 'POP'
gdf_urban_path = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']
method = 'probabilistic'

for label in ['treecover']:
    # Everything here depends on the label only.
    gdf_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/convenience_sampling'
    if label == "population":
        temp = 0.025
    else:
        temp = 0.001

    for points_per_cluster in [5, 10]:
        # The points frame is re-read for every points_per_cluster value.
        gdf = gpd.read_file(gdf_path)

        for n_urban in [20, 50]:
            # The urban-area frame is re-read for every n_urban value.
            gdf_urban = gpd.read_file(gdf_urban_path)

            distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/distance_to_top{n_urban}_urban.pkl'
            cluster_distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/usavars/{label}/cluster_distance_to_top{n_urban}_urban.pkl'

            # Constructor arguments are identical for every size/seed of this
            # (points_per_cluster, n_urban) pair.
            sampler_kwargs = dict(
                gdf_points=gdf,
                id_col=id_col,
                pop_col=pop_col,
                cluster_col=cluster_col,
                points_per_cluster=points_per_cluster,
                gdf_urban=gdf_urban,
                n_urban=n_urban,
                distances_dir=distances_dir,
                cluster_distances_dir=cluster_distances_dir,
                admin_ids=ADMIN_IDS,
            )

            for desired_sample_size in range(100, 1100, 100):
                for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
                    # NOTE(review): the message says "ClusterConvenienceSampler",
                    # but the class constructed is UrbanConvenienceSampler, which
                    # an earlier cell of this notebook imports.
                    print("Initializing ClusterConvenienceSampler...")
                    sampler = UrbanConvenienceSampler(**sampler_kwargs)

                    # Cluster-based draw, then save the ids and plot them.
                    sampler.sample_by_clusters(
                        total_sample_size=desired_sample_size,
                        method=method,
                        temp=temp,
                        seed=seed,
                    )
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

# Random Sampling

In [None]:
from random_sampling import RandomSampler

In [None]:
# Settings shared by every label.
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

for label in ['population', 'treecover']:

    data_path = f"/home/libe2152/optimizedsampling/0_data/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/usavars/{label}/random_sampling'

    # One sampler per label, reset after every draw.
    sampler = RandomSampler(gdf, id_col="id")

    # range(50, 1000, 100) yields sample sizes 50, 150, ..., 950.
    for total_sample_size in range(50, 1000, 100):
        for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
            sampler.sample(total_sample_size=total_sample_size, seed=seed)
            sampler.save_sampled_ids(out_path)
            # Pass the shared exclude_names list rather than a duplicated literal.
            sampler.plot(country_shape_file, exclude_names=exclude_names)
            sampler.reset_sample()