## stacking

In [31]:
# データ準備
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import numpy as np
df = sns.load_dataset('titanic')
df.dropna(inplace = True)
# X,yを作成
X = df.loc[:, (df.columns != 'survived') & (df.columns != 'alive')]
y = df['survived']

oe = OrdinalEncoder()
oe.set_output(transform='pandas')
X = oe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### stackingの実装
- StackingClassifierCVクラス
 - 2値分類にのみ対応
 - estimators引数：1層目のモデルのリスト[('model_name', model), ... ]
 - final_estimator引数：2層目のsklearnモデルインスタンス
 - cv引数：sklearnのCVオブジェクト
 - .fit, .predict_probaメソッドを実装

In [34]:
class StackingClassifierCV():
    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators # [('rf', RandomForest()),('knn',KNeighborsClassifier), ...]
        self.final_estimator = final_estimator
        self.cv = cv
        
    def fit(self, X, y ):
        # まず1層目
        pred_features = {}
        for model_name, model in self.estimators:
            # それぞれのfoldのtrain, valのindex情報
            preds = []
            new_y = []
            for train_idx, val_idx in self.cv.split(X):
                X_train = X.iloc[train_idx]
                X_val = X.iloc[val_idx]
                y_train = y.iloc[train_idx]
                y_val = y.iloc[val_idx]
                
                model.fit(X_train, y_train)
                
                pred = model.predict_proba(X_val)[:, 1].tolist()
                preds += pred
                new_y += y_val.tolist()
            
            model.fit(X, y) # 予測値算出時に1️⃣の手法でやるため
            pred_features[model_name] = preds
        
        # ２層目のモデル学習
        
        new_x = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_x, new_y)
    
    # 予測値算出方法
    # 以下のうち、今回は1️⃣を採用した。2️⃣なら各イテレーションのmodelを格納して使うように修正が必要
    # 1️⃣もう一度全てのデータで学習して予測
    # 2️⃣既に学習したモデルの平均値を予測結果とする
    def predict_proba(self, X):
        # 1層目のモデルで特徴量（予測値）を生成
        pred_features = {}
        
        for model_name, model in self.estimators:
            pred = model.predict_proba(X)[:, 1]
            pred_features[model_name] = pred
            
        
        new_X = pd.DataFrame(pred_features)
        
        final_pred = self.final_estimator.predict_proba(new_X)
        
        return final_pred
        
        

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state= 0)
final_estimator = LogisticRegression()
stacking_cv = StackingClassifierCV(estimators= [('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier()), ('logisticregression', LogisticRegression()) ],
                                  final_estimator=final_estimator,
                                  cv=cv)

stacking_cv.fit(X_train, y_train)
y_pred_stacking_cv = stacking_cv.predict_proba(X_test)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [39]:
from sklearn.metrics import roc_auc_score
print(f"stackingCV AUC: {roc_auc_score(y_test, y_pred_stacking_cv[:,1])}")

stackingCV AUC: 0.8557692307692308
