In [1]:
import numpy as np
import pandas as pd

In [6]:
# Set random seed
np.random.seed(42) #42 - answer to everything

# Define total number of products
number_of_products = 10

# Create data dictionary
data = {'product_id':np.arange(1, number_of_products+1).tolist(), #values are from 1 to 9+1, as a list
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)} #10 values with 3 numbers after ,

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# View data frame
df

Unnamed: 0,product_id,measure
0,1,10.248
1,2,9.931
2,3,10.324
3,4,10.762
4,5,9.883
5,6,9.883
6,7,10.79
7,8,10.384
8,9,9.765
9,10,10.271


In [3]:
simple_random_sample = df.sample(n=4).sort_values(by='product_id') #chose randomly 4 numbers from the dataset sorted by values
#all the elements have the same chance to be in the sample

In [4]:
simple_random_sample

Unnamed: 0,product_id,measure
2,3,10.324
6,7,10.79
7,8,10.384
8,9,9.765


In [7]:
#SYSTEMATIC SAMPLING

In [8]:
def systematic_sampling(df, step):
    
    indexes = np.arange(0,len(df),step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample
    
# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(df, 3)

In [9]:
systematic_sample 

Unnamed: 0,product_id,measure
0,1,10.248
3,4,10.762
6,7,10.79
9,10,10.271


In [10]:
#CLUSTER SAMPLING

In [18]:
def cluster_sampling(df, number_of_clusters):
    
    try:
        # Divide the units into cluster of equal size
        df['cluster_id'] = np.repeat([range(1,number_of_clusters+1)],len(df)/number_of_clusters)

        # Create an empty list
        indices = []

        # Append the indexes from the clusters that meet the criteria
        # For this formula, clusters id must be an even number
        for i in range(0,len(df)):
            if df['cluster_id'].iloc[i]%2 == 0:
                indices.append(i)
        cluster_sample = df.iloc[indices]
        return(cluster_sample)
    
    except:
        print("The population cannot be divided into clusters of equal size!")
        
# Obtain a cluster sample and save it in a new variable
cluster_sample = cluster_sampling(df,5)

In [19]:
cluster_sample

Unnamed: 0,product_id,product_strata,measure,cluster_id
2,3,1,10.121,2
3,4,1,9.043,2
6,7,2,9.494,4
7,8,2,10.157,4


In [13]:
# STRATIFIED SAMPLE
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
       'product_strata':np.repeat([1,2], number_of_products/2).tolist(),
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# View data frame
df

Unnamed: 0,product_id,product_strata,measure
0,1,1,9.768
1,2,1,9.767
2,3,1,10.121
3,4,1,9.043
4,5,1,9.138
5,6,2,9.719
6,7,2,9.494
7,8,2,10.157
8,9,2,9.546
9,10,2,9.294
