# 앙상블

## 1. Voting

In [1]:
import os
import pickle
import numpy as np
#os.chdir("현재 경로")

In [4]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [6]:
X.shape, y.shape

((889, 27), (889,))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [8]:
from warnings import filterwarnings
filterwarnings('ignore')

In [9]:
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
# 묶어줌, (이름지정, 모델), 하드보팅은 다수결, 소프트보팅은 각 모델마다 가중치를 달리

In [10]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean() #바닐라앙상블 평균 정확도 약 80

0.8009268075922046

In [11]:
cross_val_score(clf1, X, y, cv=5).mean() #로지스틱 성능

0.8290420872214816

In [12]:
cross_val_score(clf2, X, y, cv=5).mean() #의사결정트리 성능

0.7829175395162826

In [13]:
cross_val_score(clf3, X, y, cv=5).mean() #나이브베이즈 성능 (안 좋음! 제외)

0.4600139655938551

In [14]:
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)
eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='hard')
# 두 개만 넣으면 동표가 나올 경우 랜덤으로 처리함. 그러니 홀수개를 넣는 것이 좋음

In [15]:
cross_val_score(eclf, X, y, cv=5).mean() #성능 향상

0.8222687742017394

## 2. Bagging

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [4]:
clf2 = DecisionTreeClassifier(random_state=1) #하이 베리언스 모델 사용
eclf = BaggingClassifier(clf2, oob_score=True) #동일모델만 사용, 기본으로 10개의 bag

In [5]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean()

0.8178124801625086

In [6]:
# 하이퍼파라미터 서치 (DT의 파라미터 바꿔가면서 시도하기)
params ={
    "n_estimators" : [10,20,30,40,50,55], #bag 몇개 구성?
    "max_samples" : [0.5,0.6,0.7,0.8,0.9,1]} #DT에서 얼마나 샘플이 있을때 진행하느냐

In [7]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [8]:
grid.best_score_

0.8301462317210349

In [9]:
grid.best_params_

{'max_samples': 0.6, 'n_estimators': 50}

In [10]:
grid.best_estimator_.oob_score_
# 학습하지 않은 데이터에 대해서도 이정도의 성능을 보인다.

0.8278965129358831

## 3. Random Forest

In [11]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [12]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [13]:
from sklearn.ensemble import RandomForestClassifier
eclf = RandomForestClassifier(n_estimators=100, #DT100개 만든다
                              max_features=2,  #셀렉션되는 피처 수
                              n_jobs=-1, oob_score=True)

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean()

0.79871770456421

In [15]:
params ={
    "n_estimators" : [10, 20, 30, 50, 100],
    "max_features" : [1,2,3,4,5,6,7, 10, 15, 20, 25, len(X[0])]
    }

In [16]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [17]:
grid.best_score_

0.8245219347581553

In [18]:
grid.best_params_

{'max_features': 27, 'n_estimators': 100}

In [19]:
grid.best_estimator_.oob_score_

0.8245219347581553

In [20]:
grid.best_estimator_.feature_importances_

array([2.03614769e-01, 2.24030782e-01, 1.77339082e-02, 8.33351280e-02,
       1.39919809e-01, 4.29137561e-02, 1.27756638e-02, 9.19597576e-03,
       6.48661312e-03, 1.23305410e-02, 1.98488422e-03, 9.94209319e-04,
       2.86890676e-03, 8.86547444e-03, 6.51408383e-03, 1.73166777e-01,
       4.20260917e-03, 9.19529744e-03, 1.27207608e-02, 2.10983104e-03,
       4.60084202e-03, 6.50431901e-03, 5.14488150e-03, 7.35814835e-03,
       4.92488024e-04, 9.09212779e-04, 3.03281807e-05])

In [21]:
np.argsort(grid.best_estimator_.feature_importances_)[::-1] #중요도 높은 변수 index

array([ 1,  0, 15,  4,  3,  5,  2,  6, 18,  9,  7, 17, 13, 23, 14, 21,  8,
       22, 20, 16, 12, 19, 10, 11, 25, 24, 26], dtype=int64)

In [22]:
from sklearn.ensemble import RandomForestRegressor

size = 10000
np.random.seed(seed=20)
X_seed = np.random.normal(0, 1, size)
X0 = X_seed + np.random.normal(0, .1, size)
X1 = X_seed + np.random.normal(0, .1, size)
X2 = X_seed + np.random.normal(0, .1, size)
X = np.array([X0, X1, X2]).T
Y = X0 + X1 + X2
  
rf = RandomForestRegressor(n_estimators=20, max_features=2)
rf.fit(X, Y);
print("Scores for X0, X1, X2:",rf.feature_importances_) 
# 시드 다를 때마다 중요도 바뀜
# 그래서 피처 간 상관관계를 고려해주는 것이 좋다.

Scores for X0, X1, X2: [0.17054065 0.28118646 0.54827289]


## 4. AdaBoost

In [23]:
elements = ['one', 'two', 'three'] 
weights = [0.2, 0.3, 0.5]

from numpy.random import choice
print(choice(elements, size=10, replace=True, p=weights))

['three' 'two' 'two' 'two' 'three' 'three' 'three' 'two' 'three' 'two']


In [24]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

eclf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), #stump
                          n_estimators=500, 
                          learning_rate=0.1)

In [26]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean() #독립적 진행이 아니라 다소 느림

0.8166507966736495

In [27]:
from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifier()

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [28]:
AdaBoostClassifier()

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [29]:
params = {"base_estimator__criterion" : ["gini", "entropy"],
          "base_estimator__max_features" : [7,8],
          "base_estimator__max_depth" : [1,2],
          "n_estimators": [23,24, 25, 26, 27]}

In [30]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [31]:
grid.best_score_

0.8245219347581553

In [32]:
grid.best_params_

{'base_estimator__criterion': 'gini',
 'base_estimator__max_depth': 2,
 'base_estimator__max_features': 8,
 'n_estimators': 26}

In [33]:
grid.best_estimator_.feature_importances_

array([0.08823664, 0.06996377, 0.00498134, 0.11422635, 0.19776761,
       0.05139278, 0.01670012, 0.03217644, 0.        , 0.00656069,
       0.        , 0.        , 0.        , 0.01520929, 0.02009994,
       0.27719737, 0.02854368, 0.00365942, 0.03799339, 0.        ,
       0.0049063 , 0.        , 0.02123222, 0.00915265, 0.        ,
       0.        , 0.        ])

## 5. Gradient Boosting

In [34]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
eclf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
                                    # 라운드        트리의 영향력

In [37]:
from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean()

0.8245730971878371

In [38]:
params ={
    "n_estimators" : [10, 20, 30, 50, 100, 200],
    "learning_rate" : [i for i in np.linspace(0.1,1, 10)]}

In [39]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [40]:
grid.best_score_

0.8447694038245219

In [41]:
grid.best_params_

{'learning_rate': 0.4, 'n_estimators': 50}

## 6. XgBoost와 LightGBM

In [42]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import numpy as np
import xgboost as xgb

In [43]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3)

In [46]:
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=1000, max_depth=2, learning_rate=0.5, nthread=7)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [47]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7940074906367042

In [48]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [49]:
param = {'max_depth': 2, 'eta': 0.5, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 7
param['eval_metric'] = 'auc'
evallist = [(dtest, 'eval'), (dtrain, 'train')]
plst = param.items()

In [50]:
num_round = 50
bst = xgb.train(plst, dtrain, num_round, evallist)

[0]	eval-auc:0.814416	train-auc:0.852186
[1]	eval-auc:0.823913	train-auc:0.859748
[2]	eval-auc:0.831093	train-auc:0.866258
[3]	eval-auc:0.83258	train-auc:0.869812
[4]	eval-auc:0.839216	train-auc:0.882698
[5]	eval-auc:0.841762	train-auc:0.886599
[6]	eval-auc:0.845652	train-auc:0.890451
[7]	eval-auc:0.849228	train-auc:0.895908
[8]	eval-auc:0.853776	train-auc:0.898774
[9]	eval-auc:0.85083	train-auc:0.901539
[10]	eval-auc:0.84611	train-auc:0.903247
[11]	eval-auc:0.846939	train-auc:0.904282
[12]	eval-auc:0.848741	train-auc:0.905911
[13]	eval-auc:0.844966	train-auc:0.907551
[14]	eval-auc:0.841676	train-auc:0.912236
[15]	eval-auc:0.843364	train-auc:0.914145
[16]	eval-auc:0.848198	train-auc:0.916479
[17]	eval-auc:0.848455	train-auc:0.916821
[18]	eval-auc:0.851487	train-auc:0.918505
[19]	eval-auc:0.850858	train-auc:0.919457
[20]	eval-auc:0.85369	train-auc:0.920957
[21]	eval-auc:0.854462	train-auc:0.921724
[22]	eval-auc:0.853547	train-auc:0.922474
[23]	eval-auc:0.853318	train-auc:0.922245
[24]	e

In [51]:
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

In [52]:
(sum((ypred>0.5) == y_test)) / 267.0

0.797752808988764

In [53]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np

In [54]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [56]:
import lightgbm as lgb

estimator = lgb.LGBMClassifier(num_leaves=2)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.05, 0.5, 1],
    'n_estimators': [20, 40, 60, 80, 100, 120]
}

gbm = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')

gbm.fit(X, y)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.5, 'n_estimators': 80}


In [57]:
sum(gbm.best_estimator_.predict(X_test) == y_test) / (len(y_test)*1.0)

0.8426966292134831

In [58]:
sum(
    (gbm.best_estimator_.predict_proba(X_test)[:,1]> 0.51 ) == y_test) / (len(y_test)*1.0)

0.8426966292134831

In [59]:
from xgboost.sklearn import XGBClassifier
estimator = XGBClassifier()

In [60]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}

In [61]:
clf = GridSearchCV(estimator, parameters, n_jobs=5, 
                   cv=5, 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(X, y)
print('Best parameters found by grid search are:', gbm.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    6.2s remaining:    9.3s


Best parameters found by grid search are: {'learning_rate': 0.5, 'n_estimators': 80}


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    6.5s finished


In [62]:
sum(clf.best_estimator_.predict(X_test) == y_test) / (len(y_test)*1.0)

0.8164794007490637

In [63]:
sum(
    (clf.best_estimator_.predict_proba(X_test)[:,1]> 0.51 ) == y_test) / (len(y_test)*1.0)

0.8089887640449438

## 7. Stacking

In [64]:
import warnings
warnings.filterwarnings('ignore')

In [65]:
import numpy as np
import pandas as pd

In [66]:
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [67]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [68]:
estimator1 = XGBClassifier(max_depth=3, learning_rate=0.5, n_estimators=50, n_jobs=-1)
estimator2 = LGBMClassifier(max_depth=2, learning_rate=0.5, n_estimators=50, n_jobs=-1)
estimator3 = RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1)
estimator4 = SVC(probability=True)
estimator5 = MLPClassifier(hidden_layer_sizes=(512,256, 32))

In [69]:
base_estimators = [estimator1, estimator2, estimator3, estimator4, estimator5]

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((533, 27), (356, 27), (533,), (356,))

In [72]:
for estimator in base_estimators:
            estimator.fit(X_train, y_train)

In [73]:
base_estimators[0].predict_proba(X_test)

array([[0.7332586 , 0.26674142],
       [0.09970379, 0.9002962 ],
       [0.00542223, 0.99457777],
       [0.04766387, 0.95233613],
       [0.61546797, 0.38453203],
       [0.88515174, 0.11484823],
       [0.9131147 , 0.08688527],
       [0.9196682 , 0.0803318 ],
       [0.36566865, 0.63433135],
       [0.7058456 , 0.29415444],
       [0.90497655, 0.09502345],
       [0.17381036, 0.82618964],
       [0.01251376, 0.98748624],
       [0.9415602 , 0.05843977],
       [0.9306256 , 0.0693744 ],
       [0.9549551 , 0.0450449 ],
       [0.00204998, 0.99795   ],
       [0.0696857 , 0.9303143 ],
       [0.03375584, 0.96624416],
       [0.6803266 , 0.31967342],
       [0.8671905 , 0.1328095 ],
       [0.91664433, 0.0833557 ],
       [0.8277751 , 0.17222488],
       [0.7332586 , 0.26674142],
       [0.7206651 , 0.2793349 ],
       [0.12987584, 0.87012416],
       [0.05774438, 0.9422556 ],
       [0.9606347 , 0.03936527],
       [0.8714287 , 0.12857129],
       [0.97934896, 0.02065103],
       [0.

In [74]:
meta_train_set = np.array([estimator.predict(X_test) for estimator in base_estimators]).T

In [75]:
meta_train_set #레이블

array([[0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       ...,
       [1., 1., 1., 1., 1.],
       [1., 1., 0., 0., 1.],
       [1., 1., 1., 1., 1.]])

In [76]:
from sklearn.model_selection import cross_val_score

In [77]:
for estimator in base_estimators:
    result = cross_val_score(estimator, meta_train_set, y_test, scoring="accuracy" , cv=5).mean()
    print(result)

0.8259780907668232
0.8343896713615024
0.8259780907668232
0.8372065727699531
0.8259780907668232


In [78]:
meta_train_set2 = np.array([estimator.predict_proba(X_test)[:,1] for estimator in base_estimators]).T

In [79]:
meta_train_set2 #확률값

array([[0.26674142, 0.18535585, 0.14068997, 0.15268261, 0.09296822],
       [0.90029621, 0.78001366, 0.58242612, 0.72990082, 0.6966851 ],
       [0.99457777, 0.99133981, 0.75759509, 0.78628824, 0.99999751],
       ...,
       [0.97402745, 0.98608408, 0.83816468, 0.77666348, 1.        ],
       [0.93144733, 0.77312784, 0.38093526, 0.18137743, 0.99799723],
       [0.63232291, 0.74981556, 0.65182915, 0.7688062 , 0.93438148]])

In [80]:
for estimator in base_estimators:
    result = cross_val_score(estimator, meta_train_set2, y_test, scoring="accuracy" , cv=5).mean()
    print(result)

0.8062597809076684
0.8202660406885759
0.828794992175274
0.8287167449139279
0.8231220657276996


In [81]:
new_X_test = np.concatenate([X_test, meta_train_set2], axis = 1)
new_X_test.shape

(356, 32)

In [82]:
for estimator in base_estimators:
    result = cross_val_score(estimator, new_X_test, y_test, scoring="accuracy" , cv=5).mean()
    print(result)

0.8145931142410016
0.820226917057903
0.8259389671361502
0.8343114241001566
0.8034428794992176


In [83]:
## Ref
## 투빅스 10기 이준걸님 실습안