In [None]:
import geopandas as gpd

# Cluster Sample

In [1]:
from cluster_sampling import ClusterSampler

In [None]:
# Maps administrative identifier columns to the admin level they name.
# Not read anywhere in this cell; kept in case other cells rely on it.
ADMIN_IDS = {
    'STATEFP': 'state',
    'STATE_NAME': 'state',
    'COUNTYFP': 'county',
    'COUNTY_NAME': 'county'
}

# Loop-invariant configuration, defined once so it is easy to find and tune.
DATA_ROOT = '/home/libe2152/optimizedsampling/0_data'
SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

country_shape_file = f'{DATA_ROOT}/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

strata_col = 'STATEFP'
cluster_col = 'COUNTYFP'

# Sweep: label x points_per_cluster x total_sample_size x seed.
# Every combination draws a sample, saves its ids under out_path and plots it.
for label in ['population', 'treecover']:

    data_path = f"{DATA_ROOT}/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'{DATA_ROOT}/initial_samples/usavars/{label}/cluster_sampling'

    sampler = ClusterSampler(gdf, id_col='id', strata_col=strata_col, cluster_col=cluster_col)

    for points_per_cluster in [2, 5, 10, 25]:
        # Point the sampler back at the original cluster column before merging.
        # NOTE(review): the same sampler instance is reused for every
        # points_per_cluster, so the merge calls are applied to it repeatedly;
        # presumably merge_small_clusters rewrites cluster_col -- confirm that
        # merges from a previous iteration do not leak into this one.
        sampler.cluster_col = cluster_col
        sampler.merge_small_strata(points_per_cluster)
        sampler.merge_small_clusters(points_per_cluster)

        for total_sample_size in range(100, 5000, 100):
            for seed in SEEDS:
                try:
                    sampler.sample(total_sample_size, points_per_cluster, seed=seed)
                    sampler.save_sampled_ids(out_path)
                    sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)
                except Exception as e:
                    # Best-effort sweep: one failing configuration must not stop
                    # the rest, but report which one failed so it can be rerun.
                    print(
                        f"[cluster_sampling] label={label} "
                        f"points_per_cluster={points_per_cluster} "
                        f"total_sample_size={total_sample_size} seed={seed} "
                        f"failed: {type(e).__name__}: {e}"
                    )
                finally:
                    # Always clear the drawn sample before the next configuration.
                    sampler.reset_sample()

# Convenience Sample

In [2]:
from convenience_sampling import ConvenienceSampler

In [None]:
id_col = 'id'

# Loop-invariant configuration, defined once so it is easy to find and tune.
DATA_ROOT = '/home/libe2152/optimizedsampling/0_data'
SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

n_urban = 50
pop_col = 'POP'
gdf_urban_path = f'{DATA_ROOT}/boundaries/us/us_urban_area_census_2020/tl_2020_us_uac20_with_pop.shp'

country_shape_file = f'{DATA_ROOT}/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

# (method, seeds) pairs, run in this order for every label.
sampling_plans = [
    ('deterministic', [1]),  # seed needed to break ties
    ('probabilistic', SEEDS),
]

# Sweep: label x method x desired_sample_size x seed.
# Every combination draws a sample, saves its ids under out_path and plots it.
for label in ['population', 'treecover']:
    gdf_path = f"{DATA_ROOT}/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"

    # Despite the name, this is the path of a .pkl file; the name mirrors the
    # ConvenienceSampler keyword argument it is passed to.
    distances_dir = f'{DATA_ROOT}/distances/usavars/{label}/distance_to_top50_urban.pkl'

    out_path = f'{DATA_ROOT}/initial_samples/usavars/{label}/convenience_sampling'

    # Read the points file once per label (it was previously read twice).
    print("Reading GeoDataFrame...")
    gdf = gpd.read_file(gdf_path)
    gdf_urban = gpd.read_file(gdf_urban_path)

    for method, seeds in sampling_plans:
        for desired_sample_size in range(100, 5000, 100):
            for seed in seeds:
                # A fresh sampler is built for every draw, as in the original loops.
                print("Initializing ConvenienceSampler...")
                sampler = ConvenienceSampler(
                    id_col=id_col,
                    gdf_points=gdf,
                    gdf_urban=gdf_urban,
                    n_urban=n_urban,
                    pop_col=pop_col,
                    distances_dir=distances_dir
                )

                sampler.sample(n_samples=desired_sample_size, method=method, seed=seed)
                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file, exclude_names=exclude_names)

# Random Sampling

In [None]:
from random_sampling import RandomSampler

In [None]:
# Loop-invariant configuration, defined once so it is easy to find and tune.
DATA_ROOT = '/home/libe2152/optimizedsampling/0_data'
SEEDS = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]

country_shape_file = f'{DATA_ROOT}/boundaries/us/us_states_provinces/ne_110m_admin_1_states_provinces.shp'
exclude_names = ['Alaska', 'Hawaii', 'Puerto Rico']

# Sweep: label x total_sample_size x seed.
# Every combination draws a sample, saves its ids under out_path and plots it.
for label in ['population', 'treecover']:

    data_path = f"{DATA_ROOT}/admin_gdfs/usavars/{label}/gdf_counties_2015.geojson"
    gdf = gpd.read_file(data_path)

    out_path = f'{DATA_ROOT}/initial_samples/usavars/{label}/random_sampling'

    sampler = RandomSampler(gdf, id_col="id")

    # 1100, 1200, ..., 3000 -- the same sizes as the former hand-written list.
    for total_sample_size in range(1100, 3100, 100):
        for seed in SEEDS:
            sampler.sample(total_sample_size=total_sample_size, seed=seed)
            sampler.save_sampled_ids(out_path)
            # Reuse exclude_names rather than repeating the literal list.
            sampler.plot(country_shape_file, exclude_names=exclude_names)
            # Clear the drawn sample before the next configuration.
            sampler.reset_sample()