In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [24]:
wh_data=pd.read_csv("weight-height.csv")
wh_data

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.042470
4,Male,69.881796,206.349801
...,...,...,...
9995,Female,66.172652,136.777454
9996,Female,67.067155,170.867906
9997,Female,63.867992,128.475319
9998,Female,69.034243,163.852461


In [25]:
mean_population = round(wh_data['Height'].mean(),3)
mean_population

66.368

In [26]:
sample_random = wh_data.sample(n=1000).sort_index()
sample_random

Unnamed: 0,Gender,Height,Weight
5,Male,67.253016,152.212156
10,Male,71.195382,186.604926
18,Male,69.640060,185.983958
26,Male,69.089631,184.435174
45,Male,70.104786,188.922303
...,...,...,...
9950,Female,61.110725,123.386296
9955,Female,65.291384,141.801085
9956,Female,62.869566,140.298043
9976,Female,61.475904,121.387236


In [27]:
mean_sample=round(sample_random['Height'].mean(),3)
mean_sample

66.375

In [28]:
output = {'mean_sample':[mean_sample],'mean_population':mean_population}

# Transform dictionary into a data frame
output = pd.DataFrame(output, index=['Simple Random Sampling'])

# Add a value corresponding to the absolute error
output['abs_error'] = abs(output['mean_population'] - output['mean_sample'])

# Sort data frame by absolute error
output.sort_values(by='abs_error')

Unnamed: 0,mean_sample,mean_population,abs_error
Simple Random Sampling,66.375,66.368,0.007


In [29]:
def systematic_sampling(df, step):
    
    indexes = np.arange(0,1000,step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample
    
# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(wh_data, 1)

# Save the sample mean in a separate variable
mean_systematic = round(systematic_sample['Height'].mean(),3)

# View sampled data frame
systematic_sample

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.042470
4,Male,69.881796,206.349801
...,...,...,...
995,Male,68.642628,178.816795
996,Male,70.026842,186.358702
997,Male,74.325677,220.845727
998,Male,72.231636,202.137160


In [30]:
outcomes = {'mean_systematic':[mean_systematic],
           'mean_population':mean_population}

# Transform dictionary into a data frame
outcomes = pd.DataFrame(outcomes, index=[' Systematic Sampling'])

# Add a value corresponding to the absolute error
outcomes['abs_error'] = abs(outcomes['mean_population'] - outcomes['mean_systematic'])

# Sort data frame by absolute error
outcomes.sort_values(by='abs_error')

Unnamed: 0,mean_systematic,mean_population,abs_error
Systematic Sampling,68.96,66.368,2.592


In [31]:
# Set the split criteria
split = StratifiedShuffleSplit(n_splits=1, test_size=1000)
print(split)

# Perform data frame split
for x, y in split.split(wh_data, wh_data['Gender']):
    stratified_random_sample = wh_data.iloc[y]

# View sampled data frame

stratified_random_sample_mean = round(stratified_random_sample['Height'].mean(),3)


# Obtain the sample mean for each group
# stratified_random_sample.groupby("Gender").mean(),axis=1)

stratified_random_sample

StratifiedShuffleSplit(n_splits=1, random_state=None, test_size=1000,
            train_size=None)


Unnamed: 0,Gender,Height,Weight
2482,Male,64.602807,149.937713
2668,Male,72.979451,202.176784
3113,Male,68.128991,177.897336
5848,Female,64.956345,144.759089
7294,Female,54.873728,78.606670
...,...,...,...
8011,Female,62.821454,138.402907
453,Male,68.457894,192.761886
9370,Female,60.944727,113.537589
7887,Female,63.465882,119.351599


In [32]:
outcomes = {'mean_stratified':[stratified_random_sample_mean],
           'mean_population':mean_population}

# Transform dictionary into a data frame
outcomes = pd.DataFrame(outcomes, index=[' Stratified Sampling'])

# Add a value corresponding to the absolute error
outcomes['abs_error'] = abs(outcomes['mean_population'] - outcomes['mean_stratified'])

# Sort data frame by absolute error
outcomes.sort_values(by='abs_error')

Unnamed: 0,mean_stratified,mean_population,abs_error
Stratified Sampling,66.281,66.368,0.087
