In [12]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/kathmandu_household.csv")

df.shape
df.head(5)


Unnamed: 0,household_id,lat,lon,district,season,household_size,uses_piped_water,uses_groundwater_well,uses_tanker,aquifer_id,water_insecurity_index
0,HH00001,27.643945,85.436554,Kathmandu,wet,5,0,1,0,AQ0498,47.35
1,HH00002,27.687734,85.426304,Kathmandu,dry,5,1,1,1,AQ0395,42.88
2,HH00003,27.7564,85.34514,Lalitpur,wet,4,0,1,1,AQ0444,32.72
3,HH00004,27.756791,85.284448,Kathmandu,wet,8,0,1,1,AQ0231,16.89
4,HH00005,27.690652,85.276281,Kathmandu,wet,4,1,1,1,AQ0449,4.65


In [20]:
# Choose target variable
target = "water_insecurity_index"
group_col = "district"

print("Population Size:", len(df))
print("Population Mean:", df[target].mean())
SAMPLE_N = 300

Population Size: 2000
Population Mean: 36.759175000000006


Simple Random Sampling

In [22]:
def simple_random_sampling(df, n, seed=42):
    return df.sample(n=n, random_state=seed)
srs = simple_random_sampling(df, SAMPLE_N)
srs

Unnamed: 0,household_id,lat,lon,district,season,household_size,uses_piped_water,uses_groundwater_well,uses_tanker,aquifer_id,water_insecurity_index
1860,HH01861,27.742049,85.374372,Kathmandu,wet,4,0,1,0,AQ0447,63.21
353,HH00354,27.788193,85.265015,Kathmandu,dry,4,1,0,0,AQ0033,53.20
1333,HH01334,27.761004,85.356546,Kathmandu,wet,5,0,0,1,AQ0372,11.01
905,HH00906,27.662195,85.413000,Bhaktapur,dry,4,0,1,0,AQ0450,15.54
1289,HH01290,27.732741,85.346480,Kathmandu,wet,5,1,1,0,AQ0059,32.39
...,...,...,...,...,...,...,...,...,...,...,...
316,HH00317,27.765706,85.419863,Kathmandu,dry,5,1,1,0,AQ0217,45.10
45,HH00046,27.704887,85.370014,Kathmandu,wet,4,1,1,1,AQ0108,30.45
1719,HH01720,27.743127,85.227721,Lalitpur,wet,4,1,1,0,AQ0325,36.71
1831,HH01832,27.791421,85.262539,Lalitpur,wet,5,1,1,0,AQ0167,22.09


Systematic Sampling


In [23]:
def systematic_sampling(df, n, seed=42):
    np.random.seed(seed)
    k = len(df) // n
    start = np.random.randint(0, k)
    indices = np.arange(start, len(df), k)
    return df.iloc[indices[:n]]
sys = systematic_sampling(df, SAMPLE_N)

Stratified Sampling

In [31]:
def stratified_sampling(df, group_col, n, seed=42):
    return (
        df.groupby(group_col, group_keys=False)
          .apply(lambda x: x.sample(
              max(1, int(len(x)/len(df)*n)),
              random_state=seed
          ))
    )
strat = stratified_sampling(df, group_col, SAMPLE_N)
strat

  .apply(lambda x: x.sample(


Unnamed: 0,household_id,lat,lon,district,season,household_size,uses_piped_water,uses_groundwater_well,uses_tanker,aquifer_id,water_insecurity_index
1439,HH01440,27.716600,85.444895,Bhaktapur,wet,4,0,0,1,AQ0037,21.00
1431,HH01432,27.693560,85.308138,Bhaktapur,dry,3,1,1,0,AQ0453,34.98
140,HH00141,27.730988,85.386818,Bhaktapur,wet,4,1,0,0,AQ0014,40.18
1067,HH01068,27.763821,85.206002,Bhaktapur,dry,5,0,1,1,AQ0259,31.06
464,HH00465,27.771734,85.376041,Bhaktapur,wet,7,0,1,0,AQ0067,64.36
...,...,...,...,...,...,...,...,...,...,...,...
653,HH00654,27.760073,85.315426,Lalitpur,wet,6,0,0,1,AQ0051,44.43
1070,HH01071,27.663002,85.408595,Lalitpur,dry,3,1,1,0,AQ0287,25.17
269,HH00270,27.602134,85.210527,Lalitpur,wet,3,1,1,0,AQ0492,13.95
758,HH00759,27.683752,85.402062,Lalitpur,dry,7,1,0,0,AQ0418,36.77


Cluster Sampling

In [30]:
def cluster_sampling(df, group_col, n_clusters=3, seed=42):
    np.random.seed(seed)
    clusters = df[group_col].unique()
    selected = np.random.choice(clusters, n_clusters, replace=False)
    return df[df[group_col].isin(selected)]
cluster = cluster_sampling(df, group_col, n_clusters=3)
cluster

Unnamed: 0,household_id,lat,lon,district,season,household_size,uses_piped_water,uses_groundwater_well,uses_tanker,aquifer_id,water_insecurity_index
0,HH00001,27.643945,85.436554,Kathmandu,wet,5,0,1,0,AQ0498,47.35
1,HH00002,27.687734,85.426304,Kathmandu,dry,5,1,1,1,AQ0395,42.88
2,HH00003,27.756400,85.345140,Lalitpur,wet,4,0,1,1,AQ0444,32.72
3,HH00004,27.756791,85.284448,Kathmandu,wet,8,0,1,1,AQ0231,16.89
4,HH00005,27.690652,85.276281,Kathmandu,wet,4,1,1,1,AQ0449,4.65
...,...,...,...,...,...,...,...,...,...,...,...
1995,HH01996,27.790122,85.321646,Kathmandu,wet,4,1,1,0,AQ0164,35.29
1996,HH01997,27.726851,85.271510,Kathmandu,wet,6,1,1,0,AQ0268,37.40
1997,HH01998,27.670781,85.236875,Kathmandu,dry,5,1,1,1,AQ0440,43.16
1998,HH01999,27.720665,85.200974,Bhaktapur,dry,3,0,1,0,AQ0003,48.84


Convenience Sampling

In [28]:
def convenience_sampling(df, n):
    return df.head(n)
conv = convenience_sampling(df, SAMPLE_N)
conv

Unnamed: 0,household_id,lat,lon,district,season,household_size,uses_piped_water,uses_groundwater_well,uses_tanker,aquifer_id,water_insecurity_index
0,HH00001,27.643945,85.436554,Kathmandu,wet,5,0,1,0,AQ0498,47.35
1,HH00002,27.687734,85.426304,Kathmandu,dry,5,1,1,1,AQ0395,42.88
2,HH00003,27.756400,85.345140,Lalitpur,wet,4,0,1,1,AQ0444,32.72
3,HH00004,27.756791,85.284448,Kathmandu,wet,8,0,1,1,AQ0231,16.89
4,HH00005,27.690652,85.276281,Kathmandu,wet,4,1,1,1,AQ0449,4.65
...,...,...,...,...,...,...,...,...,...,...,...
295,HH00296,27.679617,85.293720,Lalitpur,dry,4,1,0,1,AQ0324,43.96
296,HH00297,27.605289,85.311910,Kathmandu,dry,5,0,1,0,AQ0313,38.07
297,HH00298,27.667111,85.306475,Kathmandu,dry,4,1,0,0,AQ0370,32.12
298,HH00299,27.795204,85.297430,Bhaktapur,dry,6,0,0,1,AQ0460,15.03


In [32]:
def compare_methods(df, samples, target):
    pop_mean = df[target].mean()
    results = []

    for name, sample_df in samples.items():
        sample_mean = sample_df[target].mean()
        error = sample_mean - pop_mean

        results.append({
            "Method": name,
            "Sample Size": len(sample_df),
            "Sample Mean": round(sample_mean, 2),
            "Population Mean": round(pop_mean, 2),
            "Mean Error": round(error, 2)
        })

    return pd.DataFrame(results).sort_values("Mean Error")

samples = {
    "Simple Random": srs,
    "Systematic": sys,
    "Stratified": strat,
    "Cluster": cluster,
    "Convenience": conv
}

comparison_table = compare_methods(df, samples, target)
comparison_table


Unnamed: 0,Method,Sample Size,Sample Mean,Population Mean,Mean Error
4,Convenience,300,36.06,36.76,-0.7
3,Cluster,2000,36.76,36.76,0.0
0,Simple Random,300,36.79,36.76,0.03
1,Systematic,300,37.22,36.76,0.46
2,Stratified,299,37.84,36.76,1.08


Cluster and Simple Random sampling were closest to the population mean, so they were the most accurate methods here. Convenience and Systematic had moderate error, while Stratified showed the highest deviation. Lower mean error indicates better representation of the population.