# 앙상블

In [3]:
# Mount Google Drive inside the Colab runtime so files under it can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [5]:
# Switch the working directory to the project folder on the mounted drive so
# that later cells can open the CSV by its bare file name.
import os

os.chdir(path='/content/drive/MyDrive/Universal_Bank')

### 과반수_투표

In [11]:
# Majority-voting cell: load the data, choose the feature columns, split into
# train/test sets, and define three base classifiers plus the voting ensemble.
import pandas as pd
from sklearn.ensemble import VotingClassifier  # voting ensemble
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbours
from sklearn.tree import DecisionTreeClassifier  # decision tree

bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head()  # not the last expression of the cell, so nothing is displayed

# Features: every column except the identifier, the ZIP code and the target.
X = bank_df.drop(columns=['ID', 'ZIP Code', 'Personal Loan'])
y = bank_df['Personal Loan']

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# The three base models used by the ensemble.
logistic = LogisticRegression(
    solver='liblinear', penalty='l2', C=0.001, random_state=1
)
tree = DecisionTreeClassifier(
    max_depth=None, criterion='entropy', random_state=1
)
knn = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Ensemble definition; 'soft' voting combines the predicted probabilities.
voting_estimators = [
    ('logistic', logistic),
    ('tree', tree),
    ('knn', knn),
]
voting = VotingClassifier(estimators=voting_estimators, voting='soft')

In [13]:
## Model evaluation with k-fold cross-validation

# Earlier material used accuracy_score / confusion_matrix instead.
# Valid strings for cross_val_score's `scoring` argument are listed at
# https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.model_selection import cross_val_score  # cross-validation helper

clf_labels = ['Logistic regression', 'Decision tree', 'KNN', 'Majority voting']
all_clf = [logistic, tree, knn, voting]

for clf, label in zip(all_clf, clf_labels):
    # cv=10 -> 10-fold cross-validation on the training split, scored by ROC AUC.
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    # Apply the format with `%`; passing the tuple as a second print() argument
    # would emit the raw format string followed by the tuple.
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9276195033668649, 0.01984630447185705, 'Logistic regression')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9499227861055811, 0.032735680131811405, 'Decision tree')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.7120816883018249, 0.04722587272864442, 'KNN')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9724206139059355, 0.01593603549077189, 'Majority voting')


##### GridSearch 방식을 이용한 모델 최적화
: 모델의 하이퍼파라미터 튜닝에 사용

In [14]:
from sklearn.model_selection import GridSearchCV  # hyper-parameter tuning

# Keys use the '<estimator name>__<parameter>' form, where the estimator names
# are the ones given to the VotingClassifier.
params = {'logistic__C': [0.001, 0.1, 100.0],
          'tree__max_depth': [1, 3, 5],
          'knn__n_neighbors': [1, 3, 5]}
# 3 x 3 x 3 = 27 combinations

# NOTE: the former `iid=False` argument is intentionally omitted; it was
# deprecated and then removed from GridSearchCV, and passing it raises a
# TypeError on current scikit-learn releases.
grid = GridSearchCV(estimator=voting,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)

# One line per parameter combination: mean test score, half of the test-score
# standard deviation, and the parameter dict itself.
cv_results = grid.cv_results_
for mean_score, std_score, combo in zip(cv_results['mean_test_score'],
                                        cv_results['std_test_score'],
                                        cv_results['params']):
    print("%0.3f +/- %0.3f %r" % (mean_score, std_score / 2.0, combo))

print('최적의 파타미터: %s' % grid.best_params_)
print('AUC: %.3f' % grid.best_score_)

0.934 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 1}
0.975 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 3}
0.978 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.001, 'tree__max_depth': 5}
0.952 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 1}
0.981 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 3}
0.983 +/- 0.006 {'knn__n_neighbors': 1, 'logistic__C': 0.1, 'tree__max_depth': 5}
0.955 +/- 0.009 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 1}
0.983 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 3}
0.985 +/- 0.005 {'knn__n_neighbors': 1, 'logistic__C': 100.0, 'tree__max_depth': 5}
0.938 +/- 0.009 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 1}
0.981 +/- 0.005 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 3}
0.984 +/- 0.005 {'knn__n_neighbors': 3, 'logistic__C': 0.001, 'tree__max_depth': 5}



### 배깅

In [16]:
# Bagging cell: load the data, choose the feature columns, split into
# train/test sets, then compare a single decision tree with a bagged ensemble.
import pandas as pd
bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head()

X = bank_df.drop(['ID', 'ZIP Code', 'Personal Loan'], axis=1)
y = bank_df['Personal Loan']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Base learner; bagging uses a single algorithm for every ensemble member.
from sklearn.tree import DecisionTreeClassifier  # decision tree
tree = DecisionTreeClassifier(max_depth=None,
                              criterion='entropy',
                              random_state=1)

# Ensemble definition.
from sklearn.ensemble import BaggingClassifier  # bagging
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional slot is the base learner in both old and new versions.
bagging = BaggingClassifier(tree,
                            n_estimators=500,  # number of trees
                            max_samples=1.0,
                            max_features=1.0,
                            bootstrap=True,  # sample the training rows with replacement
                            bootstrap_features=False,
                            n_jobs=1,
                            random_state=1)

# Model evaluation with k-fold cross-validation.
from sklearn.model_selection import cross_val_score  # cross-validation helper
clf_labels = ['Decision tree', 'Bagging']  # compare the two
all_clf = [tree, bagging]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    # Apply the format with `%`; a comma here would print the raw format
    # string followed by the tuple.
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9499227861055811, 0.032735680131811405, 'Decision tree')
ROC AUC: %0.3f (+/- %0.3f) [%s] (0.9976668161065998, 0.001775473982951, 'Bagging')


### 아다부스트

In [None]:
# AdaBoost cell: load the data, choose the feature columns, split into
# train/test sets, then compare a decision stump with a boosted ensemble.
import pandas as pd
bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head()

# Column name is 'ZIP Code' (as in the other cells); a misspelled label makes
# DataFrame.drop raise KeyError.
X = bank_df.drop(['ID', 'ZIP Code', 'Personal Loan'], axis=1)
y = bank_df['Personal Loan']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Base learner.
from sklearn.tree import DecisionTreeClassifier  # decision tree
tree = DecisionTreeClassifier(max_depth=1,  # difference from the bagging cell: depth limited to 1
                              criterion='entropy',
                              random_state=1)

# Ensemble definition.
from sklearn.ensemble import AdaBoostClassifier  # boosting
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional slot is the base learner in both old and new versions.
adaboost = AdaBoostClassifier(tree,
                              n_estimators=500,
                              learning_rate=0.1,
                              random_state=1)

## Model evaluation with k-fold cross-validation
from sklearn.model_selection import cross_val_score  # cross-validation helper
clf_labels = ['Decision tree', 'Ada boost']
all_clf = [tree, adaboost]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    # Apply the format with `%`; a comma here would print the raw format
    # string followed by the tuple.
    print("ROC AUC: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

### 파이프라인

In [None]:
# Pipeline cell: fit a baseline decision tree, score it on the test split and
# with 10-fold cross-validation, then grid-search a one-step pipeline and
# report the tuned model's test-set performance.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, cross_validate,
                                     train_test_split)
from sklearn.pipeline import make_pipeline  # pipeline construction
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier  # decision tree

# --- Data loading, feature selection, train/test split ---
bank_df = pd.read_csv('UniversalBank.csv')
bank_df.head()  # not the last expression of the cell, so nothing is displayed

X = bank_df.drop(columns=['ID', 'ZIP Code', 'Personal Loan'])
y = bank_df['Personal Loan']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# --- Baseline model ---
tree = DecisionTreeClassifier(max_depth=None, criterion='gini', random_state=1)
tree.fit(X_train, y_train)

# --- Hold-out evaluation of the baseline ---
y_pred = tree.predict(X_test)
n_misclassified = (y_test != y_pred).sum()
print('잘못 분류된 샘플 개수: %d' % n_misclassified)
for template, metric in (('정확도: %.3f', accuracy_score),
                         ('정밀도: %.3f', precision_score),
                         ('재현율: %.3f', recall_score),
                         ('F1: %.3f', f1_score)):
    print(template % metric(y_true=y_test, y_pred=y_pred))

# --- 10-fold cross-validation of the baseline (accuracy only) ---
scores = cross_validate(
    estimator=tree,
    X=X_train,
    y=y_train,
    scoring=['accuracy'],
    cv=10,
    n_jobs=-1,
    return_train_score=False,
)
fold_accuracy = scores['test_accuracy']
print('CV 정확도 점수: %s' % fold_accuracy)
print('CV 정확도: %.3f +/- %.3f' % (fold_accuracy.mean(), fold_accuracy.std()))


## Pipeline training

# Alternative kept from the original as a disabled line (annotated "98.514"):
#   make_pipeline(StandardScaler(), PCA(n_components=10), DecisionTreeClassifier())
pipe_tree = make_pipeline(DecisionTreeClassifier())

param_range1 = list(range(1, 11))       # candidate max_depth values: 1..10
param_range2 = list(range(10, 51, 10))  # candidate min_samples_leaf values: 10..50

# Keys follow make_pipeline's automatic step naming (lower-cased class name).
param_grid = [{
    'decisiontreeclassifier__max_depth': param_range1,
    'decisiontreeclassifier__min_samples_leaf': param_range2,
}]

gs = GridSearchCV(
    estimator=pipe_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# --- Evaluation of the tuned pipeline ---
best_tree = gs.best_estimator_  # model carrying the best parameter combination
y_pred = best_tree.fit(X_train, y_train).predict(X_test)  # train, then predict

print('Classification Report')
print(classification_report(y_test, y_pred))