#### 붓꽃 품종 분류
- 목표 : 붓꽃의 3개 품종을 분류하기
- 데이터셋 : 내장 데이터셋 사용
- 피쳐 : 4개
- 타겟 : 품종 1개
- 학습 : 지도학습 > 분류

[1] 데이터 준비

In [16]:
# 모듈로딩
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# 내장 데이터셋 로딩
data=load_iris(as_frame=True)

In [18]:
# Bunch 인스턴스 => dict와 유사한 형태
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [19]:
featureDF=data['data']
targetSR=data['target']

In [20]:
featureDF.shape, targetSR.shape

((150, 4), (150,))

In [21]:
featureDF.head(1), targetSR.head(1)

(   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                5.1               3.5                1.4               0.2,
 0    0
 Name: target, dtype: int32)

[2] 학습을 위한 데이터셋 준비 => 학습용, 검증용, 테스트용

In [22]:
# 학습용 & 테스트용 분리
x_train, x_test, y_train, y_test = train_test_split(featureDF, targetSR,
                                                    stratify=targetSR)

In [23]:
# 학습용 & 검증용 분리
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                    stratify=y_train)

In [24]:
print(f'Train DS: {x_train.shape[0]} {x_train.shape[0]/featureDF.shape[0]:.2f}%')
print(f'val DS: {x_val.shape[0]} {x_val.shape[0]/featureDF.shape[0]:.2f}%')
print(f'Test DS: {x_test.shape[0]} {x_test.shape[0]/featureDF.shape[0]:.2f}%')

Train DS: 84 0.56%
val DS: 28 0.19%
Test DS: 38 0.25%


[3] 교차검증 방식

In [25]:
# 모듈 로딩
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [26]:
# 모델 인스턴스 생성
dtc_model = DecisionTreeClassifier()

# [3-1] KFold 기반 --------------------------------------------------
# 정확도 저장 리스트
accuracys = []

# KFold 인스턴스 생성
kfold = KFold()

In [30]:
# k번 만큼 k개 데이터셋으로 학습 진행
# -> k등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스
for idx, (train_index, val_index) in enumerate(kfold.split(featureDF), 1):
    
    print(f'train_index: {train_index.tolist()}')

    # x_train, x_val 데이터셋 설정
    x_train, y_train = featureDF.iloc[train_index.tolist()], targetSR[train_index.tolist()]
    x_val, y_val = featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습 진행
    dtc_model.fit(x_train, y_train)

    # 평가 -> 분류일 경우 score() 메서드는 정확도를 반환해준다.
    accuracy=dtc_model.score(x_train, y_train)
    val_acc=dtc_model.score(x_val, y_val)
    
    # accuracys.append(accuracy)
    accuracys.append([accuracy, val_acc])
    print(f'[{idx}번째] train 정확도 : {accuracy} Val 정확도: {val_acc}')

train_index: [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] 정확도 : 1.0 val 정확도: 1.0
train_index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128

In [31]:
# 평균 계산
train_mean=sum([value[0] for value in accuracys])/kfold.n_splits
test_mean=sum([value[1] for value in accuracys])/kfold.n_splits

print(f'Train 정확도:{train_mean}, Val 정확도:{test_mean:.2f}')

Train 정확도:1.0 Val 정확도:0.89


In [34]:
### ==> [3-2] StratifiedKfold : 정답/레이블/타겟의 비율을 고려해서 데이터를 나누어줌

skfold = StratifiedKFold()

# k번 만큼 k개 데이터셋으로 학습 진행
# -> k등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스
for idx, (train_index, val_index) in enumerate(skfold.split(featureDF, targetSR), 1):
    
    print(f'train_index: {train_index.tolist()}')

    # x_train, x_val 데이터셋 설정
    x_train, y_train = featureDF.iloc[train_index.tolist()], targetSR[train_index.tolist()]
    x_val, y_val = featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습 진행
    dtc_model.fit(x_train, y_train)

    # 평가 -> 분류일 경우 score() 메서드는 정확도를 반환해준다.
    accuracy=dtc_model.score(x_train, y_train)
    val_acc=dtc_model.score(x_val, y_val)
    
    # accuracys.append(accuracy)
    accuracys.append([accuracy, val_acc])
    print(f'[{idx}번째] Train 정확도: {accuracy}, Val 정확도: {val_acc}')

train_index: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] 정확도 : 1.0 val 정확도: 0.9666666666666667
train_index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129

In [35]:
# 평균 계산
train_mean=sum([value[0] for value in accuracys])/skfold.n_splits
test_mean=sum([value[1] for value in accuracys])/skfold.n_splits

print(f'Train 정확도:{train_mean} Val 정확도:{test_mean:.2f}')

Train 정확도:2.0 Val 정확도:1.86


- 교차검증 및 성능평가 동시 진행 함수
    * => cross_val_score, cross_val_predict
    * => cross_validate

In [45]:
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

In [81]:
### [1] 전체 DS ==> 학습용과 테스트용 DS 분리
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    stratify=targetSR)

In [58]:
### cross_val_predict
predict = cross_val_predict(dtc_model, featureDF, targetSR, cv=3)

In [59]:
print(f'predict: {predict}')

predict: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [56]:
### cross_val_score
predict = cross_val_score(dtc_model, featureDF, targetSR)

In [57]:
print(f'predict: {predict}')

predict: [0.96666667 0.96666667 0.9        1.         1.        ]


In [82]:
### cross_validate
result = cross_validate(dtc_model, x_train, y_train, 
                        return_train_score=True,
                        return_estimator=True)

In [67]:
### cross_validate
result = cross_validate(dtc_model, featureDF, targetSR, 
                        return_train_score=True,
                        return_estimator=True)

In [83]:
result

{'fit_time': array([0.00226307, 0.00199842, 0.00100017, 0.        , 0.00796413]),
 'score_time': array([0.00308275, 0.00099969, 0.00105619, 0.        , 0.00112462]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_score': array([0.95652174, 0.91304348, 1.        , 1.        , 0.95454545]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [84]:
print(f'predict: {result}')

predict: {'fit_time': array([0.00226307, 0.00199842, 0.00100017, 0.        , 0.00796413]), 'score_time': array([0.00308275, 0.00099969, 0.00105619, 0.        , 0.00112462]), 'estimator': [DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier()], 'test_score': array([0.95652174, 0.91304348, 1.        , 1.        , 0.95454545]), 'train_score': array([1., 1., 1., 1., 1.])}


In [85]:
resultDF=pd.DataFrame(result).loc[:, ['test_score', 'train_score']]
resultDF

Unnamed: 0,test_score,train_score
0,0.956522,1.0
1,0.913043,1.0
2,1.0,1.0
3,1.0,1.0
4,0.954545,1.0


In [89]:
best_model=result['estimator'][3]

In [90]:
best_model.predict(x_test)

array([0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 2, 0,
       1, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 1, 2, 2, 1, 2])

In [91]:
best_model.score(x_test, y_test)

0.9210526315789473