# Generating Togo initial sample

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np

## Clustered Initial Sample

In [2]:
from cluster_sampling import ClusterSampler

In [3]:
# Passed to ClusterSampler below as its ADMIN_IDS argument.
# NOTE(review): presumably maps an admin level to the column holding its identifier — confirm against cluster_sampling.
ADMIN_IDS = dict(canton='canton')

# Read the admin-3 GeoJSON table and the list of training-split IDs (compared as strings).
data_path = "/home/libe2152/optimizedsampling/0_data/admin_gdfs/togo/gdf_adm3.geojson"
gdf = gpd.read_file(data_path)
train_ids = pd.read_csv('/share/togo/splits/train_ids.csv').astype(str)
train_id_set = set(train_ids.iloc[:, 0])

# Keep only the rows whose id appears in the first column of the training-split file.
in_train = gdf['id'].astype(str).isin(train_id_set)
gdf = gdf.loc[in_train].copy()

# Expose the admin_* columns under the names used for stratification and clustering.
for alias, source in (('canton', 'admin_3'), ('prefecture', 'admin_2'), ('region', 'admin_1')):
    gdf[alias] = gdf[source]

In [4]:
# Inspect the training-split GeoDataFrame, including the canton / prefecture / region alias columns.
gdf

Unnamed: 0,id,admin_1,admin_2,admin_3,geometry,canton,prefecture,region
0,lat_11--135__lon_-0--125,savanes,cinkasse,Gouloungoussi__cinkasse,POINT (-0.125 11.135),Gouloungoussi__cinkasse,cinkasse,savanes
1,lat_11--135__lon_-0--115,savanes,cinkasse,nan__cinkasse,POINT (-0.115 11.135),nan__cinkasse,cinkasse,savanes
2,lat_11--125__lon_-0--135,savanes,cinkasse,Gouloungoussi__cinkasse,POINT (-0.135 11.125),Gouloungoussi__cinkasse,cinkasse,savanes
3,lat_11--125__lon_-0--125,savanes,cinkasse,Gouloungoussi__cinkasse,POINT (-0.125 11.125),Gouloungoussi__cinkasse,cinkasse,savanes
4,lat_11--125__lon_-0--115,savanes,cinkasse,Gouloungoussi__cinkasse,POINT (-0.115 11.125),Gouloungoussi__cinkasse,cinkasse,savanes
...,...,...,...,...,...,...,...,...
18936,lat_6--205__lon_1--075,maritime,ave,Akepe__ave,POINT (1.075 6.205),Akepe__ave,ave,maritime
18937,lat_6--205__lon_1--415,maritime,lacs,Agbodrafo__lacs,POINT (1.415 6.205),Agbodrafo__lacs,lacs,maritime
18938,lat_6--205__lon_1--425,maritime,lacs,Agbodrafo__lacs,POINT (1.425 6.205),Agbodrafo__lacs,lacs,maritime
18939,lat_6--205__lon_1--435,maritime,lacs,Agbodrafo__lacs,POINT (1.435 6.205),Agbodrafo__lacs,lacs,maritime


In [5]:
# Directory handed to sampler.save_sampled_ids, and the country outline handed to sampler.plot.
out_path = '/home/libe2152/optimizedsampling/0_data/initial_samples/togo/cluster_sampling'

country_shape_file = '/home/libe2152/togo/data/shapefiles/openAfrica/Shapefiles/tgo_admbnda_adm0_inseed_itos_20210107.shp'

strata_col = 'region'
cluster_col = 'canton'

all_strata = gdf[strata_col].astype(str).unique()
n_strata = 2
print(all_strata)

# Choose the strata once, with a fixed seed, so every run below samples from the same strata.
# NOTE(review): the original line carried a trailing "#78910" — presumably an alternative seed; confirm.
np.random.seed(123456789)
fixed_strata = np.random.choice(all_strata, size=n_strata, replace=False)

# (points_per_cluster, total_sample_size, seed, error) for every run that failed.
failed_runs = []

for points_per_cluster in [25]:
    sampler = ClusterSampler(gdf, id_col='id', strata_col=strata_col, cluster_col=cluster_col, ADMIN_IDS=ADMIN_IDS)
    for total_sample_size in range(3000, 3100, 100):

        for seed in [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]:
            try:
                sampler.sample(total_sample_size, points_per_cluster, seed, fixed_strata=fixed_strata)

                # Validate BEFORE saving, so a sample that leaks outside the
                # training split is never written to disk.
                sampled_ids_set = set(map(str, sampler.sampled_ids))
                leaked_ids = sampled_ids_set - train_id_set
                if leaked_ids:
                    raise ValueError(
                        "Error: Some sampled IDs are not in train_ids! "
                        f"({len(leaked_ids)} offending IDs)"
                    )

                sampler.save_sampled_ids(out_path)
                sampler.plot(country_shape_file=country_shape_file)
            except Exception as e:
                # Record the failure and move on to the next seed; the summary
                # after the loops makes sure failures are not silently ignored.
                failed_runs.append((points_per_cluster, total_sample_size, seed, repr(e)))
                print(
                    f"[Error] points_per_cluster={points_per_cluster}, "
                    f"total_sample_size={total_sample_size}, seed={seed}: {e!r}"
                )
            finally:
                # Always clear the sampler's state before the next seed.
                sampler.reset_sample()

# Fail loudly once every run has been attempted, so the notebook cannot finish
# "green" while some samples were skipped or invalid.
if failed_runs:
    raise RuntimeError(f"{len(failed_runs)} sampling run(s) failed: {failed_runs}")

['savanes' 'kara' 'centrale' 'plateaux' 'maritime']
[Init] Initializing ClusterSampler...
[Stratify] Stratifying points by column: region
[Stratify] Unique strata found: ['savanes' 'kara' 'centrale' 'plateaux' 'maritime']
[Init] Found 5 strata.
[Sample] Starting sampling process...
[Determine Sample Sizes] Total desired sample size: 3000
[Determine Sample Sizes] Selected strata: ['plateaux' 'kara']
[Determine Sample Sizes] Final sample size per stratum:
{'plateaux': 1200, 'kara': 1800}
[Sample] Sample sizes per stratum: {'plateaux': 1200, 'kara': 1800}
[Sample] Processing stratum: plateaux
[Sample] Processing stratum: kara
[Sample] Sampling complete. Total points sampled: 2803
Clusters: canton
['lat_6--985__lon_1--555', 'lat_6--955__lon_1--345', 'lat_6--995__lon_1--455', 'lat_6--995__lon_1--465', 'lat_6--955__lon_1--515', 'lat_6--915__lon_1--395', 'lat_6--985__lon_1--545', 'lat_7--045__lon_1--545', 'lat_7--015__lon_1--435', 'lat_7--045__lon_1--515', 'lat_7--045__lon_1--525', 'lat_7--06