##10_교차검증&앙상블: 다수결 투표

###1. 데이터셋 불러오기

In [1]:
import pandas as pd

# Universal Bank dataset, downloaded from:
# https://www.kaggle.com/kartikmohan1999/universal-bank/data?select=UniversalBank.csv
# The relative path means the CSV must sit in the current working directory.
bank_df = pd.read_csv(filepath_or_buffer='UniversalBank.csv')
bank_df.head()  # notebook preview of the first rows

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


###2. 학습에 사용할 특성변수 선택하기

In [2]:
# Target vector: the 'Personal Loan' column.
y = bank_df['Personal Loan']
# Feature matrix: every remaining column except the 'ID' and 'ZIP Code'
# columns, which are dropped together with the target.
X = bank_df.drop(columns=['ID', 'ZIP Code', 'Personal Loan'])

###3. 데이터 분할

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set. The split is stratified on y so
# both partitions keep the class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

###5. 모델 추정: 다수결 투표 앙상블 & 교차 검증

####1. 학습에 사용할 개별 모델 정의

1. 로지스틱 회귀

In [4]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression fitted with the liblinear solver.
# C is the inverse regularisation strength, so 0.001 regularises heavily.
logistic = LogisticRegression(
    penalty='l2',
    C=0.001,
    solver='liblinear',
    random_state=1,
)

2. 결정 트리

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree split on the entropy criterion; max_depth=None sets no
# explicit depth limit. Seeded for repeatability.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=1,
)

3. kNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier. The Minkowski metric with p=2 is the
# ordinary Euclidean distance.
knn = KNeighborsClassifier(
    metric='minkowski',
    p=2,
    n_neighbors=1,
)

####2. 다수결 투표 앙상블: 로지스틱 회귀 + 결정 트리 + kNN

In [7]:
from sklearn.ensemble import VotingClassifier

# Named base learners. Each name is the prefix used to address that
# learner's parameters from the ensemble (e.g. 'logistic__C').
voting_estimators = [
    ('logistic', logistic),
    ('tree', tree),
    ('knn', knn),
]
# voting='soft' combines the learners' predicted class probabilities.
voting = VotingClassifier(voting='soft', estimators=voting_estimators)

####3. K-fold 교차 검증
로지스틱 회귀, 결정 트리, kNN, 다수결투표 앙상블 비교

In [8]:
from sklearn.model_selection import cross_val_score

# Display labels, listed in the same order as the classifiers they describe.
clf_labels = ['Logistic Regression', 'Decision Tree', 'kNN', 'Majority Voting']
all_clf = [logistic, tree, knn, voting]

# Score each classifier on the training set with 10-fold cross-validation
# (cv=10) using ROC AUC, then report the mean and standard deviation of the
# ten fold scores.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    # The template must be applied with `%`. Passing the tuple as a second
    # print() argument prints the raw template followed by the tuple.
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9276195033668649, 0.01984630447185705, 'Logistic Regression')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9499227861055811, 0.032735680131811405, 'Decision Tree')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.7120816883018249, 0.04722587272864442, 'kNN')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9724206139059355, 0.01593603549077189, 'Majority Voting')


####4. 모델 최적화: GridSearchCV로 모델의 하이퍼파라미터 튜닝
로지스틱 회귀 C값 3개, 결정 트리 깊이 3개, kNN의 이웃 k값 3개 -> 총 27가지 경우에서 최적의 파라미터 구하기

사용할 수 있는 scoring :
https://scikit-learn.org/stable/modules/model_evaluation.html

In [9]:
from sklearn.model_selection import GridSearchCV

# 3 x 3 x 3 = 27 hyperparameter combinations. Keys use the
# '<estimator name>__<parameter>' form to reach the named base learners
# inside the voting ensemble.
params = {
    'logistic__C': [0.001, 0.1, 100.0],
    'tree__max_depth': [1, 3, 5],
    'knn__n_neighbors': [1, 3, 5],
}

# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
# Current versions always behave like the former iid=False.
grid = GridSearchCV(estimator=voting, param_grid=params, cv=10, scoring='roc_auc')

grid.fit(X_train, y_train)

# One line per parameter combination: mean cross-validated ROC AUC, a spread
# figure, and the parameters themselves.
cv_results = grid.cv_results_
for mean_score, std_score, combo in zip(
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
):
    # NOTE(review): the value after "+/-" is HALF the standard deviation of
    # the fold scores (std / 2), kept as originally written. Confirm whether
    # the full standard deviation was intended.
    print("%0.3f +/- %0.3f %r" % (mean_score, std_score / 2.0, combo))

print('최적의 파라미터: %s' % grid.best_params_)
print('AUC: %0.3f' % grid.best_score_)

0.934 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 1}
0.975 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 3}
0.978 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 5}
0.952 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 1}
0.981 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 3}
0.983 +/- 0.006 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 5}
0.955 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 1}
0.983 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 3}
0.985 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 5}
0.938 +/- 0.009 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 1}
0.981 +/- 0.005 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 3}
0.984 +/- 0.005 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 5}



###6. 결과 분석

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

# `grid` was built with the default refit=True, so after grid.fit() the
# best_estimator_ is already trained on the whole training set with the best
# parameter combination. It can predict directly; fitting it again would
# only repeat that work.
best_voting = grid.best_estimator_
y_pred = best_voting.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1356
           1       0.95      0.73      0.83       144

    accuracy                           0.97      1500
   macro avg       0.96      0.86      0.91      1500
weighted avg       0.97      0.97      0.97      1500

