In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from collections import Counter

In [2]:
# Toy feature/label pair: five consecutive integers each, aligned by position
# (1 <-> 6, 2 <-> 7, ...), so it is easy to see which pairs a sample picked.
X_toy = list(range(1, 6))
y_toy = list(range(6, 11))

In [3]:
# Draw 7 items from the 5-element toy data with replacement (bootstrap style).
# More draws than items means duplicates are unavoidable. Both sequences are
# passed in one call so the X/y pairs in the output can be compared.
X_toy_sample, y_toy_sample = resample(
    X_toy,
    y_toy,
    replace=True,  # True is also the default
    n_samples=7,
    random_state=0,
)
print(X_toy_sample, y_toy_sample)

[5, 1, 4, 4, 4, 2, 4] [10, 6, 9, 9, 9, 7, 9]


In [6]:
# Draw all 5 items without replacement: no element can repeat, so the result
# is simply a shuffled copy of the toy data.
X_toy_sample, y_toy_sample = resample(
    X_toy,
    y_toy,
    replace=False,
    n_samples=5,
    random_state=0,
)
print(X_toy_sample, y_toy_sample)

[3, 1, 2, 4, 5] [8, 6, 7, 9, 10]


In [7]:
# Load the glass dataset and preview its first five rows.
df = pd.read_csv('data/glass.csv')
df.head(5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0.0,0.0,'build wind float'
1,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.0,0.0,'vehic wind float'
2,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0,'build wind float'
3,1.51299,14.4,1.74,1.54,74.55,0.0,7.59,0.0,0.0,tableware
4,1.53393,12.3,0.0,1.0,70.16,0.12,16.19,0.0,0.24,'build wind non-float'


In [9]:
# Split the frame into class labels (y) and numeric features (X) by column
# name instead of by position.
# 'Unnamed: 0' is the row index that was written into the CSV; it is not a
# measurement, so it is excluded from the feature matrix. errors='ignore'
# keeps this cell working if the frame was loaded without that column.
# Taking y first means a missing 'Type' column fails loudly with a KeyError.
y = df['Type'].to_numpy()
X = df.drop(columns=['Unnamed: 0', 'Type'], errors='ignore').to_numpy()
# Total number of rows; later cells size their samples relative to it.
N = len(X)

In [10]:
# Sample with replacement, drawing only 20% of the full dataset size.
X_sample, y_sample = resample(
    X,
    y,
    replace=True,
    n_samples=int(0.2 * N),
    random_state=0,
)

In [11]:
# Tally how many times each class label appears in the sample.
c = Counter()
c.update(y_sample)
print(c)

Counter({"'build wind float'": 16, "'build wind non-float'": 13, 'headlamps': 5, 'tableware': 4, "'vehic wind float'": 4})


In [13]:
# Per-class counts only, in the Counter's insertion order.
[count for count in c.values()]

[16, 5, 4, 4, 13]

In [14]:
# Sample WITHOUT replacement, drawing only 20% of the full dataset size,
# then tally the class labels that ended up in the sample.
X_sample, y_sample = resample(
    X,
    y,
    replace=False,
    n_samples=int(0.2 * N),
    random_state=0,
)
c = Counter()
c.update(y_sample)
print(c)

Counter({"'build wind non-float'": 19, "'build wind float'": 13, 'headlamps': 4, 'containers': 3, "'vehic wind float'": 2, 'tableware': 1})


Changing Random Seed

In [15]:
# Repeat the 20% with-replacement sample under five different seeds to show
# how much the class counts move from draw to draw.
for i in range(5):
    X_sample, y_sample = resample(
        X,
        y,
        replace=True,
        n_samples=int(0.2 * N),
        random_state=i,  # the loop index doubles as the seed
    )
    c = Counter()
    c.update(y_sample)
    # Sort by label so the rows line up across seeds.
    print(sorted(c.items()))

[("'build wind float'", 16), ("'build wind non-float'", 13), ("'vehic wind float'", 4), ('headlamps', 5), ('tableware', 4)]
[("'build wind float'", 15), ("'build wind non-float'", 15), ("'vehic wind float'", 2), ('containers', 3), ('headlamps', 5), ('tableware', 2)]
[("'build wind float'", 9), ("'build wind non-float'", 20), ("'vehic wind float'", 3), ('containers', 3), ('headlamps', 5), ('tableware', 2)]
[("'build wind float'", 15), ("'build wind non-float'", 18), ("'vehic wind float'", 1), ('containers', 1), ('headlamps', 5), ('tableware', 2)]
[("'build wind float'", 15), ("'build wind non-float'", 15), ("'vehic wind float'", 2), ('containers', 2), ('headlamps', 7), ('tableware', 1)]


In [16]:
# Same seed sweep as before, but sampling without replacement.
for i in range(5):
    X_sample, y_sample = resample(
        X,
        y,
        replace=False,
        n_samples=int(0.2 * N),
        random_state=i,  # the loop index doubles as the seed
    )
    c = Counter()
    c.update(y_sample)
    # Sort by label so the rows line up across seeds.
    print(sorted(c.items()))

[("'build wind float'", 13), ("'build wind non-float'", 19), ("'vehic wind float'", 2), ('containers', 3), ('headlamps', 4), ('tableware', 1)]
[("'build wind float'", 10), ("'build wind non-float'", 21), ("'vehic wind float'", 3), ('containers', 2), ('headlamps', 5), ('tableware', 1)]
[("'build wind float'", 14), ("'build wind non-float'", 15), ("'vehic wind float'", 5), ('containers', 4), ('headlamps', 2), ('tableware', 2)]
[("'build wind float'", 16), ("'build wind non-float'", 12), ("'vehic wind float'", 2), ('containers', 5), ('headlamps', 5), ('tableware', 2)]
[("'build wind float'", 18), ("'build wind non-float'", 11), ("'vehic wind float'", 4), ('containers', 4), ('headlamps', 5)]


Stratified Sampling

In [17]:
# Stratified 20% sample: passing the labels as `stratify` asks resample to
# draw per class rather than uniformly over all rows.
X_sample, y_sample = resample(
    X,
    y,
    replace=True,  # the default, spelled out for clarity
    n_samples=int(0.2 * N),
    random_state=0,
    stratify=y,
)
c = Counter()
c.update(y_sample)
# Sort by label for a stable, comparable listing.
print(sorted(c.items()))

[("'build wind float'", 14), ("'build wind non-float'", 15), ("'vehic wind float'", 3), ('containers', 2), ('headlamps', 6), ('tableware', 2)]
