In [23]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate
import numpy as np
import pandas as pd

## Model Selection 모듈 소개

In [5]:
iris = load_iris()
train_data = iris.data
train_label = iris.target

In [6]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(train_data, train_label)
pred = dt_clf.predict(train_data)
print("accuracy:", accuracy_score(pred, train_label))

accuracy: 1.0


In [10]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.3)
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
pred = dt_clf.predict(x_test)
print("accuracy:", accuracy_score(pred, y_test))

accuracy: 0.9555555555555556


In [12]:
features = iris.data
label = iris.target
print("세트 크기: ", features.shape[0])

세트 크기:  150


In [13]:
dt_clf = DecisionTreeClassifier()
kfold = KFold(n_splits=5)
cv_accuracy = []
n_iter = 0

In [14]:
for train_ind, test_ind in kfold.split(features):
    x_train, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train) # learning
    pred = dt_clf.predict(x_test) # 예측
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)
print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  1.0
2 회 교차 검증 accuracy:  0.967
3 회 교차 검증 accuracy:  0.867
4 회 교차 검증 accuracy:  0.933
5 회 교차 검증 accuracy:  0.833
평균 accuracy:  0.9200000000000002


In [16]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df['label'].value_counts()

label
0    50
1    50
2    50
Name: count, dtype: int64

In [18]:
kfold = KFold(n_splits=3)
n_iter = 0
for train_ind, test_ind in kfold.split(iris_df):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_ind]
    label_test = iris_df['label'].iloc[test_ind]
    print("##", n_iter, "회 교차 검증")
    print("학습 데이터 분포\n", label_train.value_counts())
    print("평가 데이터 분포\n", label_test.value_counts())

## 1 회 교차 검증
학습 데이터 분포
 label
1    50
2    50
Name: count, dtype: int64
평가 데이터 분포
 label
0    50
Name: count, dtype: int64
## 2 회 교차 검증
학습 데이터 분포
 label
0    50
2    50
Name: count, dtype: int64
평가 데이터 분포
 label
1    50
Name: count, dtype: int64
## 3 회 교차 검증
학습 데이터 분포
 label
0    50
1    50
Name: count, dtype: int64
평가 데이터 분포
 label
2    50
Name: count, dtype: int64


In [19]:
dt_clf = DecisionTreeClassifier()
kfold = KFold(n_splits=3)
cv_accuracy = []
n_iter = 0

for train_ind, test_ind in kfold.split(features):
    x_train, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train)
    pred = dt_clf.predict(x_test)
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)

print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  0.0
2 회 교차 검증 accuracy:  0.0
3 회 교차 검증 accuracy:  0.0
평균 accuracy:  0.0


In [21]:
dt_clf = DecisionTreeClassifier()
skf = StratifiedKFold(n_splits=3)
cv_accuracy = []
n_iter = 0
for train_ind, test_ind in skf.split(features, label):
    x_trian, x_test = features[train_ind], features[test_ind]
    y_train, y_test = label[train_ind], label[test_ind]
    
    dt_clf.fit(x_train, y_train)
    pred = dt_clf.predict(x_test)
    n_iter += 1
    
    acc = np.round(accuracy_score(pred, y_test),3)
    print(n_iter, "회 교차 검증 accuracy: ", acc)
    cv_accuracy.append(acc)
    
print("평균 accuracy: ", np.mean(cv_accuracy))

1 회 교차 검증 accuracy:  0.8
2 회 교차 검증 accuracy:  0.54
3 회 교차 검증 accuracy:  0.22
평균 accuracy:  0.52


In [22]:
skf = StratifiedKFold(n_splits=3)
n_iter = 0

for train_ind, test_ind in skf.split(iris_df, iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_ind]
    label_test = iris_df['label'].iloc[test_ind]
    print("##", n_iter, "회 교차 검증")
    print("학습 데이터 분포\n", label_train.value_counts())
    print("평가 데이터 분포\n", label_test.value_counts())

## 1 회 교차 검증
학습 데이터 분포
 label
2    34
0    33
1    33
Name: count, dtype: int64
평가 데이터 분포
 label
0    17
1    17
2    16
Name: count, dtype: int64
## 2 회 교차 검증
학습 데이터 분포
 label
1    34
0    33
2    33
Name: count, dtype: int64
평가 데이터 분포
 label
0    17
2    17
1    16
Name: count, dtype: int64
## 3 회 교차 검증
학습 데이터 분포
 label
0    34
1    33
2    33
Name: count, dtype: int64
평가 데이터 분포
 label
1    17
2    17
0    16
Name: count, dtype: int64


In [24]:
dt_clf = DecisionTreeClassifier()

data = iris.data
label = iris.target

scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv=3)
print("교차 검증 accuracy: ", np.round(scores, 4))
print("평균 검증 accuracy: ", np.round(np.mean(scores), 4))

교차 검증 accuracy:  [0.98 0.94 1.  ]
평균 검증 accuracy:  0.9733


In [25]:
from sklearn.model_selection import GridSearchCV

In [43]:
iris_data = load_iris()

x_train, x_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size = 0.2)
grid_parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}

In [44]:
dtree = DecisionTreeClassifier()
grid_dtree = GridSearchCV(dtree, param_grid = grid_parameters, cv = 3, refit = True)
grid_dtree.fit(x_train, y_train)

In [45]:
print("최적 파라미터: ", grid_dtree.best_params_)
print("최고 accuracy: ", grid_dtree.best_score_)

최적 파라미터:  {'max_depth': 2, 'min_samples_split': 2}
최고 accuracy:  0.9249999999999999


In [46]:
score_df = pd.DataFrame(grid_dtree.cv_results_)
score_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000684,0.000318,0.000424,0.000135,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.675,0.675,0.7,0.683333,0.011785,5
1,0.000387,3e-05,0.000305,1.3e-05,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.675,0.675,0.7,0.683333,0.011785,5
2,0.000379,8e-06,0.000292,5e-06,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.875,0.925,0.975,0.925,0.040825,1
3,0.000423,0.000106,0.000318,1.8e-05,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.875,0.925,0.975,0.925,0.040825,1
4,0.000449,6.3e-05,0.000343,6.7e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.85,0.925,0.975,0.916667,0.05137,3
5,0.000352,1.1e-05,0.000259,3e-06,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.85,0.925,0.975,0.916667,0.05137,3


In [47]:
estimator = grid_dtree.best_estimator_
pred = estimator.predict(x_test)
print("Test accuracy: ", accuracy_score(y_test, pred))

Test accuracy:  0.9333333333333333


In [48]:
from sklearn.preprocessing import LabelEncoder

In [49]:
items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']
encoder = LabelEncoder() # LabelEncoder 객체 생성 후 fit()과 transform()으로 인코딩 수행
encoder.fit(items) # 카테고리 학습
labels = encoder.transform(items) # 변환
print("after: ", labels)

after:  [0 1 4 5 3 3 2 2]


In [50]:
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자레인지', '컴퓨터'], dtype='<U5')

In [51]:
encoder.inverse_transform(labels) # 인덱스 번호를 리스트 형태로 주면 해당하는 문자열을 반환

array(['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U5')

In [52]:
from sklearn.preprocessing import OneHotEncoder

In [53]:
items = np.array(items).reshape(-1,1)
encoder = OneHotEncoder()
encoder.fit(items)
labels = encoder.transform(items)
labels.shape #OneHotEncoding 데이터 차원

(8, 6)

In [54]:
labels.toarray() # OneHotEncoding data

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [56]:
items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']
pd.get_dummies(items)

Unnamed: 0,TV,냉장고,믹서,선풍기,전자레인지,컴퓨터
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,True,False
3,False,False,False,False,False,True
4,False,False,False,True,False,False
5,False,False,False,True,False,False
6,False,False,True,False,False,False
7,False,False,True,False,False,False
