In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
# Load the dataset
df = pd.read_csv(r"C:\Users\KIIT\Downloads\petrol_consumption.csv")

In [16]:
# 1. Simple Random Sampling
simple_random_sample = df.sample(n=10, random_state=42)
simple_random_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
27,7.5,3846.0,9061.0,0.579,631.0,2.0,1.0
40,7.0,4449.0,4639.0,0.626,587.0,2.0,1.0
26,8.0,3448.0,5399.0,0.548,577.0,2.0,1.0
43,7.0,3745.0,2611.0,0.508,591.0,2.0,1.0
24,8.5,4574.0,2619.0,0.551,460.0,0.0,0.0
37,7.0,3897.0,6385.0,0.586,704.0,3.0,1.0
12,7.0,4817.0,6930.0,0.574,525.0,1.0,0.0
19,8.5,4341.0,6010.0,0.677,640.0,3.0,1.0
4,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
25,9.0,3721.0,4746.0,0.544,566.0,1.0,1.0


In [17]:
# 2. Systematic Sampling
step = len(df) // 10
systematic_sample = df.iloc[::step][:10]
systematic_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,3571.0,1976.0,0.525,541.0,1.0,0.0
4,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
8,8.0,4447.0,8577.0,0.529,464.0,0.0,0.0
12,7.0,4817.0,6930.0,0.574,525.0,1.0,0.0
16,7.0,4206.0,,0.572,603.0,2.0,1.0
20,7.0,,7834.0,0.663,649.0,3.0,1.0
24,8.5,4574.0,2619.0,0.551,460.0,0.0,0.0
28,8.0,4188.0,5975.0,0.563,574.0,2.0,1.0
32,8.0,3063.0,6524.0,0.578,577.0,2.0,1.0
36,5.0,4045.0,,0.566,640.0,3.0,1.0


In [18]:
# 3. Stratified Sampling (assuming 'Petrol_Consumption' is a categorical or can be binned)
df['consumption_bin'] = pd.qcut(df['Petrol_Consumption'], q=4, labels=False)
df_strat = df.dropna(subset=['consumption_bin'])
stratified_sample = train_test_split(df_strat, test_size=0.9, stratify=df_strat['consumption_bin'], random_state=42)
stratified_sample

[    Petrol_tax  Average_income  Paved_Highways  Population_Driver_licence(%)  \
 0          9.0          3571.0          1976.0                         0.525   
 17         7.0          3718.0          4725.0                         0.540   
 45         9.0          4476.0          3942.0                         0.571   
 33         7.5          3357.0          4121.0                         0.547   
 
     Petrol_Consumption  consumption_bin  cluster  
 0                541.0              1.0      0.0  
 17               714.0              3.0      1.0  
 45               510.0              0.0      0.0  
 33               628.0              2.0      1.0  ,
     Petrol_tax  Average_income  Paved_Highways  Population_Driver_licence(%)  \
 5        10.00          5342.0          1333.0                         0.571   
 37        7.00          3897.0          6385.0                         0.586   
 6         8.00          5319.0         11868.0                         0.451   
 24     

In [20]:
# 4. Cluster Sampling (assuming 'Petrol_Consumption' can be used to form clusters)
df['cluster'] = pd.cut(df['Petrol_Consumption'], bins=3, labels=False)
clusters = df['cluster'].unique()
chosen_cluster = np.random.choice(clusters)
cluster_sample = df[df['cluster'] == chosen_cluster]
cluster_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,3571.0,1976.0,0.525,541.0,1.0,0.0
1,9.0,4092.0,1250.0,0.572,524.0,0.0,0.0
4,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
5,10.0,5342.0,1333.0,0.571,457.0,0.0,0.0
6,8.0,5319.0,11868.0,0.451,344.0,0.0,0.0
7,8.0,5126.0,2138.0,0.553,467.0,0.0,0.0
8,8.0,4447.0,8577.0,0.529,464.0,0.0,0.0
9,7.0,4512.0,8507.0,0.552,498.0,0.0,0.0
12,7.0,4817.0,6930.0,0.574,525.0,1.0,0.0
13,7.0,4207.0,6580.0,0.545,508.0,0.0,0.0


In [21]:
# 5. Multi-stage Sampling (randomly select clusters, then random sample within them)
selected_clusters = np.random.choice(clusters, size=2, replace=False)
multi_stage_sample = df[df['cluster'].isin(selected_clusters)].sample(n=10, random_state=42)
multi_stage_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
19,8.5,4341.0,6010.0,0.677,640.0,3.0,1.0
28,8.0,4188.0,5975.0,0.563,574.0,2.0,1.0
20,7.0,,7834.0,0.663,649.0,3.0,1.0
38,8.5,3635.0,3274.0,0.663,648.0,3.0,1.0
2,9.0,,,0.58,561.0,1.0,1.0
26,8.0,3448.0,5399.0,0.548,577.0,2.0,1.0
32,8.0,3063.0,6524.0,0.578,577.0,2.0,1.0
33,7.5,3357.0,4121.0,0.547,628.0,2.0,1.0
27,7.5,3846.0,9061.0,0.579,631.0,2.0,1.0
42,7.0,4300.0,3635.0,0.603,632.0,2.0,1.0


In [23]:
# 6. Convenience Sampling (first 10 rows)
convenience_sample = df.head(10)
convenience_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,3571.0,1976.0,0.525,541.0,1.0,0.0
1,9.0,4092.0,1250.0,0.572,524.0,0.0,0.0
2,9.0,,,0.58,561.0,1.0,1.0
3,7.5,4870.0,2351.0,0.529,,,
4,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
5,10.0,5342.0,1333.0,0.571,457.0,0.0,0.0
6,8.0,5319.0,11868.0,0.451,344.0,0.0,0.0
7,8.0,5126.0,2138.0,0.553,467.0,0.0,0.0
8,8.0,4447.0,8577.0,0.529,464.0,0.0,0.0
9,7.0,4512.0,8507.0,0.552,498.0,0.0,0.0


In [24]:
# 7. Snowball Sampling (simulate by expanding from a random row)
seed = df.sample(n=1, random_state=42)
neighbors = df[(df['Petrol_Consumption'] >= seed['Petrol_Consumption'].values[0] - 10) &
               (df['Petrol_Consumption'] <= seed['Petrol_Consumption'].values[0] + 10)]
snowball_sample = pd.concat([seed, neighbors]).drop_duplicates().head(10)
snowball_sample 

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
27,7.5,3846.0,9061.0,0.579,631.0,2.0,1.0
15,7.0,4318.0,10340.0,0.586,635.0,3.0,1.0
19,8.5,4341.0,6010.0,0.677,640.0,3.0,1.0
33,7.5,3357.0,4121.0,0.547,628.0,2.0,1.0
36,5.0,4045.0,,0.566,640.0,3.0,1.0
42,7.0,4300.0,3635.0,0.603,632.0,2.0,1.0


In [25]:
# 8. Quota Sampling (select a fixed number from each bin)
quota_sample = df.groupby('consumption_bin').apply(lambda x: x.sample(n=2, random_state=42)).reset_index(drop=True)
quota_sample

  quota_sample = df.groupby('consumption_bin').apply(lambda x: x.sample(n=2, random_state=42)).reset_index(drop=True)


Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,4476.0,3942.0,0.571,510.0,0.0,0.0
1,8.5,4574.0,2619.0,0.551,460.0,0.0,0.0
2,7.0,3640.0,6905.0,0.518,571.0,1.0,1.0
3,9.0,,,0.58,561.0,1.0,1.0
4,8.0,3063.0,6524.0,0.578,577.0,2.0,1.0
5,8.0,4391.0,5939.0,0.53,580.0,2.0,1.0
6,7.0,,3985.0,0.563,699.0,3.0,1.0
7,7.0,4345.0,3905.0,0.672,968.0,3.0,2.0


In [26]:
# 9. Judgmental/Purposive Sampling (select rows based on a condition)
judgmental_sample = df[df['Petrol_Consumption'] > df['Petrol_Consumption'].mean()].sample(n=10, random_state=42)
judgmental_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
15,7.0,4318.0,10340.0,0.586,635.0,3.0,1.0
20,7.0,,7834.0,0.663,649.0,3.0,1.0
38,8.5,3635.0,3274.0,0.663,648.0,3.0,1.0
16,7.0,4206.0,,0.572,603.0,2.0,1.0
35,6.58,3802.0,7834.0,0.629,644.0,3.0,1.0
43,7.0,3745.0,2611.0,0.508,591.0,2.0,1.0
18,7.0,,5915.0,0.724,865.0,3.0,2.0
40,7.0,4449.0,4639.0,0.626,587.0,2.0,1.0
42,7.0,4300.0,3635.0,0.603,632.0,2.0,1.0
44,6.0,5215.0,2302.0,0.672,782.0,3.0,2.0


In [27]:

# 10. Self-selection Sampling (simulate by filtering a condition, e.g., high consumption)
self_selection_sample = df[df['Petrol_Consumption'] > df['Petrol_Consumption'].quantile(0.75)].sample(n=10, random_state=42)
self_selection_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
41,7.0,,3985.0,0.563,699.0,3.0,1.0
39,7.0,4345.0,3905.0,0.672,968.0,3.0,2.0
15,7.0,4318.0,10340.0,0.586,635.0,3.0,1.0
38,8.5,3635.0,3274.0,0.663,648.0,3.0,1.0
35,6.58,3802.0,7834.0,0.629,644.0,3.0,1.0
18,7.0,,5915.0,0.724,865.0,3.0,2.0
17,7.0,3718.0,4725.0,0.54,714.0,3.0,1.0
44,6.0,5215.0,2302.0,0.672,782.0,3.0,2.0
20,7.0,,7834.0,0.663,649.0,3.0,1.0
37,7.0,3897.0,6385.0,0.586,704.0,3.0,1.0


In [28]:
# 11. Balanced Sampling (sample to match the distribution of a variable)
min_count = df['consumption_bin'].value_counts().min()
balanced_sample = df.groupby('consumption_bin').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)
balanced_sample

  balanced_sample = df.groupby('consumption_bin').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)


Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,4476.0,3942.0,0.571,510.0,0.0,0.0
1,8.5,4574.0,2619.0,0.551,460.0,0.0,0.0
2,9.0,4092.0,1250.0,0.572,524.0,0.0,0.0
3,9.0,4897.0,2449.0,0.511,464.0,0.0,0.0
4,8.0,4447.0,8577.0,0.529,464.0,0.0,0.0
5,10.0,5342.0,1333.0,0.571,457.0,0.0,0.0
6,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
7,7.0,5002.0,9794.0,0.593,524.0,0.0,0.0
8,8.0,5126.0,2138.0,0.553,467.0,0.0,0.0
9,7.0,4207.0,6580.0,0.545,508.0,0.0,0.0


In [29]:
# 12. Time-based Sampling (if data has a time column, here simulated by index)
time_based_sample = df.iloc[::len(df)//10][:10]
time_based_sample

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption,consumption_bin,cluster
0,9.0,3571.0,1976.0,0.525,541.0,1.0,0.0
4,8.0,4399.0,431.0,0.544,410.0,0.0,0.0
8,8.0,4447.0,8577.0,0.529,464.0,0.0,0.0
12,7.0,4817.0,6930.0,0.574,525.0,1.0,0.0
16,7.0,4206.0,,0.572,603.0,2.0,1.0
20,7.0,,7834.0,0.663,649.0,3.0,1.0
24,8.5,4574.0,2619.0,0.551,460.0,0.0,0.0
28,8.0,4188.0,5975.0,0.563,574.0,2.0,1.0
32,8.0,3063.0,6524.0,0.578,577.0,2.0,1.0
36,5.0,4045.0,,0.566,640.0,3.0,1.0
