## Basic Stacking model

### Package Load

In [4]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# 데이터 셋
from sklearn.datasets import load_breast_cancer

# 학습 및 테스트 데이터셋 분리
from sklearn.model_selection import train_test_split
# 정확도
from sklearn.metrics import accuracy_score

### Dataset load

In [6]:
cancer_data = load_breast_cancer()
display(cancer_data.keys())

X_features = cancer_data.data
y_labels = cancer_data.target

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=156)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

### 개별 Classifier와  최종 Stacking 데이터를 학습할 메타 Classifier 생성

In [7]:
# 개별 ML 모델을 위한 Classifier 생성.
knn_clf = KNeighborsClassifier(n_neighbors=10)
rf_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=True)
ada_clf = AdaBoostClassifier(n_estimators=100)
dt_clf = DecisionTreeClassifier()

# 최종 Stacking Model를 위한 Classfifier생성
lr_clf = LogisticRegression(C=100)

### 개별 Classifier 학습/예측/평가

In [8]:
# 개별 모델들을 학습.
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [9]:
# 학습된 개별 모델들이 각자 반환하는 예측 데이터 셋을 생성하고 개별 모델의 정확도 측정. 
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

print('KNN 정확도: {0:.4f}'.format(accuracy_score(y_test, knn_pred)))
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))
print('결정 트리 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))
print('에이다부스트 정확도: {0:.4f} :'.format(accuracy_score(y_test, ada_pred)))

KNN 정확도: 0.9474
랜덤 포레스트 정확도: 0.9474
결정 트리 정확도: 0.9386
에이다부스트 정확도: 0.9649 :


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


### 개별 모델의 예측 결과를 메타 모델이 학습할 수 있도록 스태킹 형태로 재 생성 

In [10]:
pred = np.array([knn_pred, rf_pred, dt_pred, ada_pred])
print(pred.shape)  # (4, 114)  : 모델갯수 - 4, 

# 전치행렬
# transpose를 이용해 행과 열의 위치 교환. 컬럼 레벨로 각 알고리즘의 예측 결과를 피처로 만듦. 
pred = np.transpose(pred)
display(pred.shape)

(4, 114)


(114, 4)

### 메타 모델 학습/예측/평가

In [11]:
lr_clf.fit(pred, y_test) # 어떻게 보면 오버피팅한것이다.보통 교차검증으로 학습한다.
final= lr_clf.predict(pred)
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test , final)))

최종 메타 모델의 예측 정확도: 0.9737


### CV 셋 기반의 Stacking

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [20]:
def get_stacking_base_datasets(model,  X_train_n, y_train_n, X_test_n, n_folds):
  """
  개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수
  인자 : X_train_n : 학습 데이터, y_train_n : 레이블 데이터셋, X_test_n : 테스트 데이터셋
  """
  # 지정된 n_folds값으로 KFold 생성.
  kf = KFold(n_splits = n_folds, shuffle=False, random_state=0)

  #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
  train_fold_pred = np.zeros((X_train_n.shape[0], 1))
  test_pred = np.zeros((X_test_n.shape[0], n_folds))

  print(model.__class__.__name__, "model 시작")

  for fold_counter, (train_idx, valid_idx) in enumerate(kf.split(X_train_n)):
    # 입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
    print("\n fold sets : ", fold_counter, '시작')
    X_tr = X_train_n[train_idx]
    y_tr = y_train_n[train_idx]
    X_te = X_train_n[valid_idx]  # 검증 데이터셋

    model.fit(X_tr, y_tr)  # 학습데이터용으로 학습
    
    #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 
    #해당 결과를 학습용 데이터로 저장.(2차원)
    train_fold_pred[valid_idx, :] = model.predict(X_te).reshape(-1, 1)

    #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장. 
    test_pred[:,fold_counter] = model.predict(X_test_n)

  # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성 
  test_pred_mean = np.mean(test_pred, axis = 1).reshape(-1, 1)
  #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
  return train_fold_pred, test_pred_mean

#### step 1 - 개별모델 학습, 테스트 데이터셋교참검증

In [21]:
knn_train, knn_test = get_stacking_base_datasets(knn_clf, X_train, y_train, X_test, 7)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, 7)
dt_train, dt_test = get_stacking_base_datasets(dt_clf, X_train, y_train, X_test, 7)
ada_train, ada_test = get_stacking_base_datasets(ada_clf, X_train, y_train, X_test, 7)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


KNeighborsClassifier model 시작

 fold sets :  0 시작

 fold sets :  1 시작

 fold sets :  2 시작

 fold sets :  3 시작

 fold sets :  4 시작

 fold sets :  5 시작

 fold sets :  6 시작
RandomForestClassifier model 시작

 fold sets :  0 시작


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished



 fold sets :  1 시작


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished



 fold sets :  2 시작


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.



 fold sets :  3 시작


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished



 fold sets :  4 시작


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.



 fold sets :  5 시작


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished



 fold sets :  6 시작


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


DecisionTreeClassifier model 시작

 fold sets :  0 시작

 fold sets :  1 시작

 fold sets :  2 시작

 fold sets :  3 시작

 fold sets :  4 시작

 fold sets :  5 시작

 fold sets :  6 시작
AdaBoostClassifier model 시작

 fold sets :  0 시작

 fold sets :  1 시작

 fold sets :  2 시작

 fold sets :  3 시작

 fold sets :  4 시작

 fold sets :  5 시작

 fold sets :  6 시작


#### step2 
* 최종 메타모델이 학습할 학습 및 테스트 데이터셋 생성
* 최종 메타모델의 학습, 예측 ,정확도 평가

In [23]:
Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test  = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)

print("원본 학습 피처 데이터세트 Shape : ", X_train.shape, "원본 테스트 피처 Shape : ", X_test.shape)
print("Stacking 학습 피처 데이터세트 Shape : ", Stack_final_X_train.shape, "Stacking 테스트 피처 Shape : ", Stack_final_X_test.shape)

원본 학습 피처 데이터세트 Shape :  (455, 30) 원본 테스트 피처 Shape :  (114, 30)
Stacking 학습 피처 데이터세트 Shape :  (455, 4) Stacking 테스트 피처 Shape :  (114, 4)


In [25]:
lr_clf.fit(Stack_final_X_train, y_train)
stack_final = lr_clf.predict(Stack_final_X_test)

print("최종 메타 모델의 예측 정확도 : {0:.4f}".format(accuracy_score(y_test,stack_final)))

최종 메타 모델의 예측 정확도 : 0.9561
