In [20]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import numpy as np
import pandas as pd

## Model Selection 모듈 소개

In [5]:
iris = load_iris()
train_data = iris.data
train_label = iris.target

In [6]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(train_data, train_label)
pred = dt_clf.predict(train_data)
print("accuracy:", accuracy_score(pred, train_label))

accuracy: 1.0


In [10]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.3)
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
pred = dt_clf.predict(x_test)
print("accuracy:", accuracy_score(pred, y_test))

accuracy: 0.9555555555555556


In [12]:
features = iris.data
label = iris.target
print("세트 크기: ", features.shape[0])

세트 크기:  150


In [13]:
dt_clf = DecisionTreeClassifier()
kfold = KFold(n_splits=5)
cv_accuracy = []
n_iter = 0

In [14]:
for train_ind, test_ind in kfold.split(features):
    x_train, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train) # learning
    pred = dt_clf.predict(x_test) # 예측
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)
print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  1.0
2 회 교차 검증 accuracy:  0.967
3 회 교차 검증 accuracy:  0.867
4 회 교차 검증 accuracy:  0.933
5 회 교차 검증 accuracy:  0.833
평균 accuracy:  0.9200000000000002


In [16]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df['label'].value_counts()

label
0    50
1    50
2    50
Name: count, dtype: int64

In [18]:
kfold = KFold(n_splits=3)
n_iter = 0
for train_ind, test_ind in kfold.split(iris_df):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_ind]
    label_test = iris_df['label'].iloc[test_ind]
    print("##", n_iter, "회 교차 검증")
    print("학습 데이터 분포\n", label_train.value_counts())
    print("평가 데이터 분포\n", label_test.value_counts())

## 1 회 교차 검증
학습 데이터 분포
 label
1    50
2    50
Name: count, dtype: int64
평가 데이터 분포
 label
0    50
Name: count, dtype: int64
## 2 회 교차 검증
학습 데이터 분포
 label
0    50
2    50
Name: count, dtype: int64
평가 데이터 분포
 label
1    50
Name: count, dtype: int64
## 3 회 교차 검증
학습 데이터 분포
 label
0    50
1    50
Name: count, dtype: int64
평가 데이터 분포
 label
2    50
Name: count, dtype: int64


In [19]:
dt_clf = DecisionTreeClassifier()
kfold = KFold(n_splits=3)
cv_accuracy = []
n_iter = 0

for train_ind, test_ind in kfold.split(features):
    x_train, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train)
    pred = dt_clf.predict(x_test)
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)

print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  0.0
2 회 교차 검증 accuracy:  0.0
3 회 교차 검증 accuracy:  0.0
평균 accuracy:  0.0


In [21]:
dt_clf = DecisionTreeClassifier()
skf = StratifiedKFold(n_splits=3)
cv_accuracy = []
n_iter = 0
for train_ind, test_ind in skf.split(features, label):
    x_trian, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train)
    pred = dt_clf.predict(x_test)
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)
    
print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  0.8
2 회 교차 검증 accuracy:  0.54
3 회 교차 검증 accuracy:  0.22
평균 accuracy:  0.52
