Importing

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit

Load dataset credit_data.csv

In [2]:
dataset = pd.read_csv('credit_data.csv')
dataset.shape
dataset.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.9251,59.017015,8106.532131,0
1,2,34415.15397,48.117153,6564.745018,0
2,3,57317.17006,63.108049,8020.953296,0
3,4,42709.5342,45.751972,6103.64226,0
4,5,66952.68885,18.584336,8770.099235,1


Function using simple random sampling

In [3]:
def simple_random_sampling(dataset, amount):
    return dataset.sample(amount)

df = simple_random_sampling(dataset, 100)
df.shape
df.head()

Unnamed: 0,clientid,income,age,loan,default
225,226,58121.66858,27.3018,7531.101249,1
400,401,51625.31323,44.808841,4592.24555,0
158,159,65632.60458,47.105767,12498.04045,0
1865,1866,27045.39957,50.221201,2503.7884,0
1383,1384,26643.80899,19.289629,1413.783224,0


Function using systematic sampling

In [4]:
def systematic_sampling(dataset, amount):
    dataset_length = len(dataset)
    step = dataset_length // amount
    
    random.seed(1)
    first_step = random.randint(0, step)
    indexes = np.arange(first_step, dataset_length, step=step)

    return dataset.iloc[indexes]

df = systematic_sampling(dataset, 100)
df.shape
df.head()

Unnamed: 0,clientid,income,age,loan,default
4,5,66952.68885,18.584336,8770.099235,1
24,25,65301.98403,48.840922,5465.267886,0
44,45,67852.10587,47.321899,5730.588251,0
64,65,69078.60481,25.107524,4076.583914,0
84,85,30483.29553,33.656441,4514.00978,1


Function using group sampling

In [5]:
def group_sampling(dataset, group_amount):
    group = []
    group_id = 0
    count = 0
    group_size = len(dataset) / group_amount

    for _ in dataset.iterrows():
        group.append(group_id)
        count += 1
        if count > group_size:
            count = 0
            group_id += 1

    dataset['group'] = group

    selected_group_id = random.randint(0, group_amount)

    return dataset[dataset['group'] == selected_group_id]

df = group_sampling(dataset, 5)
df.shape
df.head()

Unnamed: 0,clientid,income,age,loan,default,group
1604,1605,33261.64602,18.229629,586.651096,0,4
1605,1606,53113.0361,59.436892,10080.52438,0,4
1606,1607,42749.99032,56.419095,4626.538637,0,4
1607,1608,42108.19992,26.991351,1020.978164,0,4
1608,1609,61344.53221,20.175532,7172.654332,0,4


Function using stratified sampling


In [6]:
def stratified_sampling(dataset, percent, column):
    split = StratifiedShuffleSplit(test_size = percent)

    for _, y in split.split(dataset, dataset[column]):
        df_y = dataset.iloc[y]

    return df_y

income_label = []

# Define income_label column
for row in dataset.iterrows():
    if row[1]['income'] > 50000:
        income_label.append('> 50')
    else:
        income_label.append('< 50')
dataset['income_label'] = income_label

df = stratified_sampling(dataset, 0.1, 'income_label')
df.shape
df.head()

Unnamed: 0,clientid,income,age,loan,default,group,income_label
1185,1186,41273.7715,32.090395,3299.885072,0,2,< 50
450,451,60675.81216,39.963906,11617.74891,0,1,> 50
1293,1294,52593.51506,19.534982,811.907862,0,3,> 50
111,112,41631.6663,53.047655,106.090747,0,0,< 50
233,234,34615.54217,25.51439,6476.760852,1,0,< 50


Function using reservoir sampling

In [8]:
def reservoir_sampling(dataset, amount):
    stream = []
    i = 0
    size = len(dataset)
    reservoir = [0] * amount

    # creates an array with indexes for all dataset objects
    for i in range(size):
        stream.append(i)

    # choose samples according amount
    for i in range(amount):
        reservoir[i] = stream[i]

    while i < size:
        j = random.randrange(i + 1)
        if j < amount:
            reservoir[j] = stream[i]
        i += 1

    return dataset.iloc[reservoir] 

df = reservoir_sampling(dataset, 100)
df.shape


(100, 7)