# 재표본 추출 방법 - resampling

In [2]:
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.utils import resample

from sklearn.utils import shuffle

# 데이터셋
from sklearn.datasets import load_iris

* 간단한 예제

In [6]:
# Toy dataset: four labelled items.
x = ['a', 'b', 'c', 'd']

# Build the splitter: two folds over the four items.
kfold = KFold(n_splits=2)

# Each iteration provides the index arrays of the training and test items.
for train, test in kfold.split(x):
    print(f'{train} {test}')
    # Look up the two items behind each index array, training pair first.
    for fold_indices in (train, test):
        first, second = fold_indices[0], fold_indices[1]
        print(x[first], x[second])

[2 3] [0 1]
c d
a b
[0 1] [2 3]
a b
c d


In [5]:
# Reuses `x`, `train` and `test` as left behind by a previously run cell.
for fold_indices in (train, test):
    print(x[fold_indices[0]], x[fold_indices[1]])

a b
c d


* 실데이터 활용

## 데이터 준비

In [7]:
# Load the iris dataset directly as a feature array X and a label array y.
X, y = load_iris(return_X_y=True)

## 데이터 모델링

In [12]:
# 5-fold cross-validation of LDA on X and y.
# NOTE: no shuffling is requested here; the recorded output shows validation
# folds made of contiguous label runs, i.e. folds follow the row order.
kfold = KFold(n_splits=5)

# enumerate(..., start=1) supplies the 1-based fold number, replacing a
# hand-maintained counter.
for idx, (train, val) in enumerate(kfold.split(X), start=1):
    # Which fold is this?
    print(f'========== Fold # {idx} ===========')

    # Fit LDA on this fold's training rows only.
    lda = LinearDiscriminantAnalysis().fit(X[train], y[train])

    # Show the held-out labels next to the model's predictions.
    print('True Label :', y[val])
    print('Predicted Label :', lda.predict(X[val]))

    # Report the score on the held-out rows as a percentage.
    print('Accuracy :', lda.score(X[val], y[val]) * 100, '%')

========== Fold # 1 ===========
True Label : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Predicted Label : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy : 100.0 %
========== Fold # 2 ===========
True Label : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Predicted Label : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Accuracy : 100.0 %
========== Fold # 3 ===========
True Label : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Predicted Label : [1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1]
Accuracy : 90.0 %
========== Fold # 4 ===========
True Label : [1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Predicted Label : [1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Accuracy : 100.0 %
========== Fold # 5 ===========
True Label : [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Predicted Label : [2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Accuracy : 90.0 %


# Bootstrap

* 간단한 예제

In [13]:
# Toy data: three 2-feature samples, each with its own label.
X = np.array([
    [1, 0],
    [2, 1],
    [0, 0],
])
y = np.array([0, 1, 2])

# Show the features and labels side by side.
print(X, y)

[[1 0]
 [2 1]
 [0 0]] [0 1 2]


In [14]:
# Resample X and y together in one call.
# NOTE: this rebinds X and y, so the toy arrays defined earlier are replaced
# by the resampled ones. No random_state is passed, so the draw is presumably
# different on every run -- confirm against the scikit-learn docs.
X, y = resample(X, y)
# Bare expression: shown as the cell's output in the notebook.
X, y

(array([[2, 1],
        [2, 1],
        [2, 1]]),
 array([1, 1, 1]))

* iris data

In [15]:
# Reload iris so X and y hold the dataset's features and labels again.
X, y = load_iris(return_X_y=True)

# Resample features and labels together, stored under new names so that
# X and y themselves are left untouched.
X_re, y_re = resample(X, y)
# Bare expression: shown as the cell's output in the notebook.
X_re, y_re

(array([[4.7, 3.2, 1.3, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.8, 3. , 1.4, 0.1],
        [6.5, 2.8, 4.6, 1.5],
        [6.3, 2.5, 4.9, 1.5],
        [6.4, 2.9, 4.3, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [6.6, 3. , 4.4, 1.4],
        [6.5, 3.2, 5.1, 2. ],
        [6.8, 2.8, 4.8, 1.4],
        [5.5, 3.5, 1.3, 0.2],
        [5.5, 2.5, 4. , 1.3],
        [5.6, 2.7, 4.2, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [4.9, 3.1, 1.5, 0.2],
        [6. , 2.7, 5.1, 1.6],
        [5.5, 2.4, 3.8, 1.1],
        [6.7, 3.1, 4.4, 1.4],
        [5.1, 3.8, 1.5, 0.3],
        [6.3, 2.5, 4.9, 1.5],
        [6.4, 2.8, 5.6, 2.2],
        [6.1, 3. , 4.6, 1.4],
        [4.8, 3.4, 1.9, 0.2],
        [6.4, 2.7, 5.3, 1.9],
        [7.7, 2.8, 6.7, 2. ],
        [6.4, 3.1, 5.5, 1.8],
        [6.5, 3. , 5.5, 1.8],
        [5.6, 3. , 4.5, 1.5],
        [6.5, 3. , 5.8, 2.2],
        [5. , 3.5, 1.3, 0.3],
        [6.9, 3.1, 5.4, 2.1],
        [4.6, 3.1, 1.5, 0.2],
        [5.8, 2.7, 4.1, 1. ],
        [6

In [19]:
# Get the distinct values in the resampled labels and how often each occurs.
unique, counts = np.unique(y_re, return_counts=True)
# Bare expression: shown as the cell's output in the notebook.
unique, counts

(array([0, 1, 2]), array([42, 60, 48], dtype=int64))