In [1]:
import geopandas as gpd

In [3]:
import numpy as np

# Cluster Sampling

In [4]:
from cluster_sampling import ClusterSampler

In [None]:
# Administrative-level ID columns of the GeoDataFrame, keyed by column name,
# with the level each column identifies as the value.
ADMIN_IDS = {
    'pc11_s_id': 'state',
    'pc11_d_id': 'district',
    'pc11_sd_id': 'subdistrict'
}

data_path = "/share/india_secc/MOSAIKS/train_shrugs_with_admins.geojson"
gdf = gpd.read_file(data_path)

country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/world/ne_10m_admin_0_countries.shp'
country_name = 'India'

# Strata are states; clusters are districts (see ADMIN_IDS).
strata_col = 'pc11_s_id'
cluster_col = 'pc11_d_id'

out_path = '/home/libe2152/optimizedsampling/0_data/initial_samples/india_secc/cluster_sampling'

sampler = ClusterSampler(gdf, id_col='condensed_shrug_id', strata_col=strata_col, cluster_col=cluster_col, ADMIN_IDS=ADMIN_IDS)

n_strata = 5
all_strata = gdf[strata_col].astype(str).unique()

# Draw the strata once, with a dedicated seed, so every (sample size, seed)
# run below is restricted to the same n_strata strata.
# The legacy global-seed API is kept on purpose: switching generators would
# change which strata are selected.
np.random.seed(78910)
fixed_strata = np.random.choice(all_strata, size=n_strata, replace=False)

# Configurations that raised, collected so a batch run can finish unattended
# and still report what went wrong.
failed_runs = []

for points_per_cluster in [50]:
    sampler.cluster_col = cluster_col
    for total_sample_size in range(1000, 6000, 1000):

        for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
            try:
                sampler.sample(total_sample_size, points_per_cluster, seed, fixed_strata=fixed_strata)
                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file, country_name=country_name)
            except Exception as e:
                # Best-effort: log the failing configuration and move on to the
                # next one instead of opening an interactive shell, which would
                # stall a Restart-and-Run-All execution.
                print(
                    f"Cluster sampling failed "
                    f"(points_per_cluster={points_per_cluster}, "
                    f"total_sample_size={total_sample_size}, seed={seed}): {e!r}"
                )
                failed_runs.append((points_per_cluster, total_sample_size, seed, repr(e)))
            finally:
                # Always clear the sampler state before the next configuration,
                # whether or not this one succeeded.
                sampler.reset_sample()

if failed_runs:
    print(f"{len(failed_runs)} cluster sampling run(s) failed:")
    for failed_run in failed_runs:
        print("  ", failed_run)

KeyboardInterrupt: 

# Convenience Sampling

In [None]:
from infrastructure_convenience_sampling import UrbanConvenienceSampler

In [None]:
# Administrative-level ID columns, keyed by column name, with the level each
# column identifies as the value.
ADMIN_IDS = dict(
    pc11_s_id='state',
    pc11_d_id='district',
    pc11_sd_id='subdistrict',
)

# --- Input data -------------------------------------------------------------
gdf_path = '/share/india_secc/MOSAIKS/train_shrugs_with_admins.geojson'
id_col = 'condensed_shrug_id'
pop_col = 'pc11_pca_tot_p_combined'

# --- Plotting and output ----------------------------------------------------
country_name = 'India'
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/world/ne_10m_admin_0_countries.shp'
out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/india_secc/convenience_sampling'

print("Reading GeoDataFrame...")
gdf = gpd.read_file(gdf_path)

# --- Sampling settings passed straight through to sampler.sample() ----------
temp = 0.025
method = 'probabilistic'

# Every (n_urban, sample size, seed) combination, in the same order a triple
# nested loop would visit them.
run_grid = (
    (urban_count, sample_size, run_seed)
    for urban_count in [20, 50]
    for sample_size in range(1000, 6000, 1000)
    for run_seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]
)

for n_urban, desired_sample_size, seed in run_grid:
    # Despite the variable name, this is the path of one pickle file per n_urban.
    distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/india_secc/distance_to_top{n_urban}_urban.pkl'

    # A fresh sampler is built for every run.
    print("Initializing UrbanConvenienceSampler...")
    sampler = UrbanConvenienceSampler(
        id_col=id_col,
        gdf_points=gdf,
        gdf_urban=gdf,
        n_urban=n_urban,
        pop_col=pop_col,
        distances_dir=distances_dir,
    )

    sampler.sample(
        n_samples=desired_sample_size,
        method=method,
        temp=temp,
        seed=seed,
    )
    sampler.save_sampled_ids(out_path)
    sampler.plot(
        country_shape_file=country_shape_file,
        country_name=country_name,
    )

In [None]:
# Administrative-level ID columns, keyed by column name, with the level each
# column identifies as the value.
ADMIN_IDS = {
    'pc11_s_id': 'state',
    'pc11_d_id': 'district',
    'pc11_sd_id': 'subdistrict'
}

cluster_col = 'pc11_d_id'  # or another appropriate cluster ID
id_col = 'condensed_shrug_id'

# Settings that do not vary across runs are defined once, ahead of the loops.
gdf_path = '/share/india_secc/MOSAIKS/train_shrugs_with_admins.geojson'
pop_col = 'pc11_pca_tot_p_combined'

country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/world/ne_10m_admin_0_countries.shp'
country_name = 'India'

# NOTE(review): this is the same directory the non-clustered convenience
# sampling cell writes to — confirm save_sampled_ids() names its files so the
# two sets of outputs cannot collide.
out_path = '/home/libe2152/optimizedsampling/0_data/initial_samples/india_secc/convenience_sampling'

method = 'probabilistic'
temp = 0.025

for points_per_cluster in [20, 30, 50]:
    # The frame is re-read for each points_per_cluster value, as before.
    # NOTE(review): if the sampler never mutates gdf, this read can be hoisted
    # above the loop — confirm before changing.
    gdf = gpd.read_file(gdf_path)

    for n_urban in [10, 20, 50]:
        # Both pickles live under 0_data/distances/india_secc/; the cluster
        # path must keep the '/' between 'distances' and 'india_secc'.
        distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/india_secc/distance_to_top{n_urban}_urban.pkl'
        cluster_distances_dir = f'/home/libe2152/optimizedsampling/0_data/distances/india_secc/cluster_distance_to_top{n_urban}_urban.pkl'

        for desired_sample_size in range(1000, 6000, 1000):
            for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
                # Cluster-mode convenience sampling: UrbanConvenienceSampler is
                # given the cluster arguments and driven via sample_by_clusters().
                print("Initializing ClusterConvenienceSampler...")
                sampler = UrbanConvenienceSampler(
                    gdf_points=gdf,
                    id_col=id_col,
                    pop_col=pop_col,
                    cluster_col=cluster_col,
                    points_per_cluster=points_per_cluster,
                    gdf_urban=gdf,
                    n_urban=n_urban,
                    distances_dir=distances_dir,
                    cluster_distances_dir=cluster_distances_dir,
                    admin_ids=ADMIN_IDS
                )

                sampler.sample_by_clusters(
                    total_sample_size=desired_sample_size,
                    method=method,
                    temp=temp,
                    seed=seed
                )
                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file, country_name=country_name)

# Random Sampling

In [None]:
from random_sampling import RandomSampler

In [None]:
# --- Plotting and output ----------------------------------------------------
country_name = 'India'
country_shape_file = '/home/libe2152/optimizedsampling/0_data/boundaries/world/ne_10m_admin_0_countries.shp'
out_path = f'/home/libe2152/optimizedsampling/0_data/initial_samples/india_secc/random_sampling'

# --- Input data -------------------------------------------------------------
data_path = "/share/india_secc/MOSAIKS/train_shrugs_with_admins.geojson"
gdf = gpd.read_file(data_path)

# One sampler is reused for all runs and reset after each of them.
sampler = RandomSampler(gdf, id_col="condensed_shrug_id")

# Keyword arguments shared by every plot call.
plot_kwargs = dict(
    country_shape_file=country_shape_file,
    country_name=country_name,
)

# Every (sample size, seed) pair, sizes outermost, in nested-loop order.
size_seed_pairs = (
    (sample_size, run_seed)
    for sample_size in range(1000, 6000, 1000)
    for run_seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]
)

for total_sample_size, seed in size_seed_pairs:
    sampler.sample(total_sample_size=total_sample_size, seed=seed)
    sampler.save_sampled_ids(out_path)
    sampler.plot(**plot_kwargs)
    sampler.reset_sample()