## 10_파이프라인

### 1. 데이터셋 불러오기

In [None]:
import pandas as pd

# Dataset source:
# https://www.kaggle.com/kartikmohan1999/universal-bank/data?select=UniversalBank.csv
# Read the CSV from the current working directory into a DataFrame.
bank_df = pd.read_csv(filepath_or_buffer='UniversalBank.csv')
# Preview the first five rows (head() defaults to n=5).
bank_df.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


학습에 사용할 특성 변수(X)와 목표 변수(y) 선택하기: ID와 ZIP Code는 특성에서 제외하고, Personal Loan을 목표 변수로 사용한다.

In [None]:
# Feature matrix: every column except ID, ZIP Code and the target column.
# NOTE(review): the head() preview lists an 'Unnamed: 0' column; if that is a
# real CSV column rather than the rendered row index, it should be excluded as
# well -- confirm against the file.
X = bank_df.drop(columns=['ID', 'ZIP Code', 'Personal Loan'])
# Target vector: the 'Personal Loan' column.
y = bank_df.loc[:, 'Personal Loan']

### 3. 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

### 5. 모델 추정 및 결과 분석

#### 1. 결정 트리

결정 트리 모델 학습 및 테스트 데이터 예측

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-unlimited tree using the Gini criterion, with a fixed seed.
tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=1)

# fit() returns the estimator itself, so training and test-set prediction chain.
y_pred = tree.fit(X_train, y_train).predict(X_test)

결정 트리의 결과 분석

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Compute each test-set metric first, then print them in a fixed order.
n_misclassified = (y_test != y_pred).sum()
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('잘못 분류된 샘플의 개수: %d' % n_misclassified)
print('정확도: %.3f' % accuracy)
print('정밀도: %.3f' % precision)
print('재현율: %.3f' % recall)
print('F1: %.3f' % f1)

잘못 분류된 샘플의 개수: 28
정확도: 0.981
정밀도: 0.914
재현율: 0.889
F1: 0.901


교차 검증 및 결과 분석

In [None]:
from sklearn.model_selection import cross_validate
import numpy as np

# Cross-validate the tree on the training split with cv=10, scoring accuracy
# only; n_jobs=-1 lets scikit-learn use all available processors.
scores = cross_validate(
    tree,
    X_train,
    y_train,
    scoring=['accuracy'],
    cv=10,
    n_jobs=-1,
    return_train_score=False,
)

# Per-fold accuracies are stored under the 'test_accuracy' key.
fold_accuracy = scores['test_accuracy']
print('CV 정확도 점수: %s' % fold_accuracy)
print('CV 정확도: %.3f +/- %.3f' % (np.mean(fold_accuracy), np.std(fold_accuracy)))

CV 정확도 점수: [0.99428571 0.98       0.97714286 0.98285714 0.97714286 0.97714286
 0.99428571 0.98571429 0.96857143 0.98      ]
CV 정확도: 0.982 +/- 0.008


#### 2. 파이프라인 적용: 결정 트리

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

# Single-step pipeline wrapping a decision tree. make_pipeline names the step
# after the lowercased class name ('decisiontreeclassifier'), which is the
# prefix the grid-search parameter keys rely on.
# random_state is fixed (matching the standalone tree fitted earlier in this
# notebook) so that the grid-search scores and the selected model are
# reproducible between runs.
pipe_tree1 = make_pipeline(DecisionTreeClassifier(random_state = 1))

그리드 서치를 이용한 교차 검증 (max_depth, min_samples_leaf 탐색)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: max_depth 1..10 and min_samples_leaf 10..50 in steps of 10.
param_range1 = list(range(1, 11))
param_range2 = list(range(10, 51, 10))

# Keys follow the pipeline's '<step name>__<parameter>' convention.
param_grid = [
    {
        'decisiontreeclassifier__max_depth': param_range1,
        'decisiontreeclassifier__min_samples_leaf': param_range2,
    },
]

# Search the whole grid, scoring each combination by accuracy with cv=10.
gs = GridSearchCV(pipe_tree1, param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X_train, y_train)  # fit() returns gs itself, so rebinding is unnecessary

# Best mean cross-validated score and the parameters that produced it.
print(gs.best_score_)
print(gs.best_params_)

0.9851428571428571
{'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 10}


파이프라인 모델의 결과 분석

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# GridSearchCV is created with the default refit=True, so best_estimator_ has
# already been refit on the whole training set with the best parameters.
# Calling fit() on it again would only repeat that work (and, for an unseeded
# tree, could produce a model different from the one the search selected).
best_tree = gs.best_estimator_
y_pred = best_tree.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1356
           1       0.93      0.87      0.90       144

    accuracy                           0.98      1500
   macro avg       0.96      0.93      0.94      1500
weighted avg       0.98      0.98      0.98      1500



#### 3. 파이프라인: 표준화 + 주성분 분석 + 결정 트리

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

# Three-step pipeline: standardize the features, project them onto 10 principal
# components, then fit a decision tree on the projected data.
# random_state is fixed on the tree (matching the standalone tree fitted
# earlier in this notebook) so that the grid-search scores and the selected
# model are reproducible between runs.
pipe_tree2 = make_pipeline(
    StandardScaler(),
    PCA(n_components = 10),
    DecisionTreeClassifier(random_state = 1),
)

그리드 서치를 이용한 교차 검증 (max_depth, min_samples_leaf 탐색)

In [None]:
from sklearn.model_selection import GridSearchCV

# Same search space as for the plain tree pipeline:
# max_depth in 1..10, min_samples_leaf in 10, 20, ..., 50.
param_range1 = list(range(1, 11))
param_range2 = list(range(10, 51, 10))

# Only the tree step is tuned; keys use '<step name>__<parameter>'.
param_grid = [
    {
        'decisiontreeclassifier__max_depth': param_range1,
        'decisiontreeclassifier__min_samples_leaf': param_range2,
    },
]

# Grid search over the scaler + PCA + tree pipeline, accuracy-scored with cv=10.
gs = GridSearchCV(pipe_tree2, param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X_train, y_train)  # fit() returns gs itself, so rebinding is unnecessary

# Best mean cross-validated score and the parameters that produced it.
print(gs.best_score_)
print(gs.best_params_)

0.9465714285714284
{'decisiontreeclassifier__max_depth': 9, 'decisiontreeclassifier__min_samples_leaf': 10}


파이프라인 모델의 결과 분석

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# GridSearchCV is created with the default refit=True, so best_estimator_ has
# already been refit on the whole training set with the best parameters.
# Calling fit() on it again would only repeat that work (and, for an unseeded
# tree, could produce a model different from the one the search selected).
best_tree = gs.best_estimator_
y_pred = best_tree.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1356
           1       0.75      0.58      0.65       144

    accuracy                           0.94      1500
   macro avg       0.86      0.78      0.81      1500
weighted avg       0.94      0.94      0.94      1500

