In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [5]:
# Display the installed xgboost version string.
xgb.__version__

'2.1.1'

In [4]:
# Read the baseline train and test CSV files into DataFrames.
# The paths are relative, so they resolve against the current working directory.
baseline_train = pd.read_csv('../data/baseline/train.csv')
baseline_test = pd.read_csv('../data/baseline/test.csv')

In [None]:
def ensemble_vote(train, test):
    """Fit a soft-voting RandomForest + LightGBM ensemble and predict on ``test``.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` (taken from ``test``) and the predicted
        ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    labels = train['Survived']
    features = train.drop(columns=['Survived'])
    test_features = test.drop(columns=['PassengerId'])

    # Both base learners use the same number of trees / boosting rounds.
    n_trees = 100
    lgbm_config = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }

    # Named estimators handed to the voting classifier.
    voters = [
        ('rf', RandomForestClassifier(n_estimators=n_trees, random_state=42)),
        ('lgbm', lgb.LGBMClassifier(**lgbm_config, n_estimators=n_trees)),
    ]
    soft_voter = VotingClassifier(estimators=voters, voting='soft')
    soft_voter.fit(features, labels)

    survived = soft_voter.predict(test_features)
    return pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': survived})



In [None]:
def ensemble_stakking(train, test):
    """Stack RandomForest and LightGBM class predictions under an XGBoost meta-model.

    The two base models are fit on the full training features. Their predicted
    labels form the two meta-feature columns the XGBoost classifier is trained on.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    target = train['Survived']
    train_features = train.drop(columns=['Survived'])
    test_features = test.drop(columns=['PassengerId'])

    n_trees = 100
    lgbm_config = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    booster = lgb.LGBMClassifier(**lgbm_config, n_estimators=n_trees)
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    base_learners = (forest, booster)

    # Level 0: fit every base model on the complete training set.
    for learner in base_learners:
        learner.fit(train_features, target)

    # Level-1 inputs: one column of predicted labels per base model.
    # NOTE: the training columns are in-sample predictions; the base models
    # score the same rows they were just fit on.
    meta_train = np.column_stack([learner.predict(train_features) for learner in base_learners])
    meta_test = np.column_stack([learner.predict(test_features) for learner in base_learners])

    # Level 1: XGBoost learns from the base-model outputs.
    stacker = xgb.XGBClassifier(n_estimators=100, random_state=42)
    stacker.fit(meta_train, target)

    # The meta-model output is compared against 0.5 and cast to int before export.
    survived = (stacker.predict(meta_test) >= 0.5).astype(int)
    return pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': survived})

In [None]:
def ensemble_proba_stakking(train, test):
    """Stack class-1 probabilities of three base models under an XGBoost meta-model.

    Base models are RandomForest, LightGBM and LogisticRegression. Each one is
    fit on the full training features, and its predicted probability of the
    positive class becomes one meta-feature column.

    The meta-feature matrices have always referenced ``train_preds_lr`` and
    ``test_preds_lr``, but no logistic-regression model was ever fit, so the
    function raised ``NameError``. The missing base model is now created,
    fit on standardized features, and used to fill those columns.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    x_train = train.drop(columns=['Survived'])
    y_train = train['Survived']
    x_test = test.drop(columns=['PassengerId'])

    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    num_round = 100

    # Base models.
    lgbm = lgb.LGBMClassifier(**lgbm_params, n_estimators=num_round)
    rf = RandomForestClassifier(n_estimators=num_round, random_state=42)
    lr = LogisticRegression(max_iter=1000)
    scaler = StandardScaler()

    # Fit the tree models on the raw features.
    rf.fit(x_train, y_train)
    lgbm.fit(x_train, y_train)

    # Logistic regression is fit on standardized features. The scaler's
    # statistics come from the training data only and are reused for test.
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    lr.fit(x_train_scaled, y_train)

    # Positive-class probability from each base model.
    # NOTE: the training columns are in-sample predictions; the base models
    # score the same rows they were just fit on.
    train_preds_rf = rf.predict_proba(x_train)[:, 1]
    train_preds_lgbm = lgbm.predict_proba(x_train)[:, 1]
    train_preds_lr = lr.predict_proba(x_train_scaled)[:, 1]

    test_preds_rf = rf.predict_proba(x_test)[:, 1]
    test_preds_lgbm = lgbm.predict_proba(x_test)[:, 1]
    test_preds_lr = lr.predict_proba(x_test_scaled)[:, 1]

    # Assemble the meta-features, one column per base model.
    meta_x_train = np.column_stack((train_preds_rf, train_preds_lgbm, train_preds_lr))
    meta_x_test = np.column_stack((test_preds_rf, test_preds_lgbm, test_preds_lr))

    # Train the meta-model on the base-model probabilities.
    meta_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
    meta_model.fit(meta_x_train, y_train)

    # predict() already returns class labels, so no thresholding is needed.
    # The cast only guarantees an integer column in the result.
    final_preds = meta_model.predict(meta_x_test).astype(int)
    result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': final_preds})
    return result

In [9]:
# Run the probability-stacking variant on the baseline data and preview the first rows.
result = ensemble_proba_stakking(baseline_train, baseline_test)
result.head()

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 221
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [None]:
def ensemble_stakking_withtrain(train, test):
    """Stack base-model labels together with the raw features under XGBoost.

    RandomForest and LightGBM are fit on the full training features. The
    meta-model sees their predicted labels as the first two columns, followed
    by all of the original feature columns.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    target = train['Survived']
    train_features = train.drop(columns=['Survived'])
    test_features = test.drop(columns=['PassengerId'])

    n_trees = 100
    lgbm_config = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    booster = lgb.LGBMClassifier(**lgbm_config, n_estimators=n_trees)
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    base_learners = [forest, booster]

    # Level 0: fit every base model on the complete training set.
    for learner in base_learners:
        learner.fit(train_features, target)

    # Predicted labels per base model, first on train and then on test.
    # NOTE: the train-side labels are in-sample predictions.
    train_votes = [learner.predict(train_features) for learner in base_learners]
    test_votes = [learner.predict(test_features) for learner in base_learners]

    # Meta-features are the base-model labels followed by the raw feature columns.
    meta_train = np.column_stack(train_votes + [train_features])
    meta_test = np.column_stack(test_votes + [test_features])

    # Level 1: XGBoost on labels plus raw features.
    stacker = xgb.XGBClassifier(n_estimators=100, random_state=42)
    stacker.fit(meta_train, target)

    # The meta-model output is compared against 0.5 and cast to int before export.
    survived = (stacker.predict(meta_test) >= 0.5).astype(int)
    return pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': survived})

In [8]:
# Run the label-stacking-plus-raw-features variant on the baseline data and preview the first rows.
result = ensemble_stakking_withtrain(baseline_train, baseline_test)
result.head()

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 221
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [None]:
def stakking_proba_withtrain(train, test):
    """Stack base-model probabilities together with the raw features under XGBoost.

    RandomForest and LightGBM are fit on the full training features. The
    meta-model sees their positive-class probabilities as the first two
    columns, followed by all of the original feature columns.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    def positive_proba(model, features):
        # Column 1 of predict_proba is the probability of the second class.
        return model.predict_proba(features)[:, 1]

    # Separate the label from the train features and the id from the test features.
    target = train['Survived']
    train_features = train.drop(columns=['Survived'])
    test_features = test.drop(columns=['PassengerId'])

    n_trees = 100
    lgbm_config = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    booster = lgb.LGBMClassifier(**lgbm_config, n_estimators=n_trees)
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=42)

    # Level 0: fit both base models on the complete training set.
    forest.fit(train_features, target)
    booster.fit(train_features, target)

    # Meta-features are the base-model probabilities followed by the raw feature columns.
    # NOTE: the train-side probabilities are in-sample predictions.
    meta_train = np.column_stack([
        positive_proba(forest, train_features),
        positive_proba(booster, train_features),
        train_features,
    ])
    meta_test = np.column_stack([
        positive_proba(forest, test_features),
        positive_proba(booster, test_features),
        test_features,
    ])

    # Level 1: XGBoost on probabilities plus raw features.
    stacker = xgb.XGBClassifier(n_estimators=100, random_state=42)
    stacker.fit(meta_train, target)

    # The meta-model output is compared against 0.5 and cast to int before export.
    survived = (stacker.predict(meta_test) >= 0.5).astype(int)
    return pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': survived})

In [6]:
# Run the probability-stacking-plus-raw-features variant on the baseline data and preview the first rows.
result = stakking_proba_withtrain(baseline_train, baseline_test)
result.head()

Exception in thread Thread-24 (_readerthread):
Traceback (most recent call last):
  File "c:\anaconda3\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\anaconda3\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "c:\anaconda3\Lib\subprocess.py", line 1597, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 24: invalid start byte
'NoneType' object has no attribute 'splitlines'
  File "c:\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 202, in _count_physical_cores
    cpu_info = cpu_info.stdout.splitlines()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [None]:
def stakking_proba_withtrain(train, test):
    """Train an XGBoost meta-model on base-model probabilities plus raw features.

    NOTE(review): a function with this same name is also defined in an earlier
    cell of this notebook. Whichever cell is executed last replaces the other,
    so consider keeping only one definition.

    RandomForest and LightGBM are fit on every training row. Their
    positive-class probabilities are placed in front of the original feature
    columns to build the meta-model inputs.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    y = train['Survived']
    X = train.drop(columns=['Survived'])
    X_holdout = test.drop(columns=['PassengerId'])

    rounds = 100
    gbdt_settings = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    gbdt = lgb.LGBMClassifier(**gbdt_settings, n_estimators=rounds)
    random_forest = RandomForestClassifier(n_estimators=rounds, random_state=42)
    level0 = (random_forest, gbdt)

    # Fit the base models in order on the complete training set.
    for estimator in level0:
        estimator.fit(X, y)

    # Positive-class probabilities, first on train and then on test.
    # NOTE: the train-side probabilities are in-sample predictions.
    proba_train = [estimator.predict_proba(X)[:, 1] for estimator in level0]
    proba_test = [estimator.predict_proba(X_holdout)[:, 1] for estimator in level0]

    # Meta-model inputs are [p_rf, p_lgbm, raw feature columns...].
    level1_train = np.column_stack((*proba_train, X))
    level1_test = np.column_stack((*proba_test, X_holdout))

    level1 = xgb.XGBClassifier(n_estimators=100, random_state=42)
    level1.fit(level1_train, y)

    # The meta-model output is compared against 0.5 and cast to int before export.
    raw_labels = level1.predict(level1_test)
    labels = (raw_labels >= 0.5).astype(int)
    return pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': labels})

In [None]:


def kfold_stacking(train, test, n_splits=5):
    """Stack three base models using out-of-fold probabilities and an XGBoost meta-model.

    For each K-fold split, RandomForest, LightGBM and LogisticRegression are
    fit on the training part of the fold. Their positive-class probabilities
    on the held-out part fill that fold's rows of the meta-training matrix.
    The base models are then refit on all training rows to produce the test
    meta-features. LogisticRegression always receives standardized features;
    the tree models receive the raw ones.

    Args:
        train: DataFrame containing a ``Survived`` label column; every other
            column is used as a feature.
        test: DataFrame containing a ``PassengerId`` column; every other
            column is used as a feature.
        n_splits: Number of folds passed to ``KFold`` (default 5).

    Returns:
        DataFrame with ``PassengerId`` and the predicted ``Survived`` label.
    """
    # Separate the label from the train features and the id from the test features.
    features = train.drop(columns=['Survived'])
    target = train['Survived']
    test_features = test.drop(columns=['PassengerId'])

    n_trees = 100
    lgbm_config = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'random_state': 42,
    }
    lgbm = lgb.LGBMClassifier(**lgbm_config, n_estimators=n_trees)
    rf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    lr = LogisticRegression(max_iter=1000)
    scaler = StandardScaler()

    # One meta-feature column per base model, in the order rf, lgbm, lr.
    n_base_models = 3
    oof_meta = np.zeros((features.shape[0], n_base_models))
    test_meta = np.zeros((test_features.shape[0], n_base_models))

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fit_rows, holdout_rows in splitter.split(features):
        fold_x = features.iloc[fit_rows]
        fold_y = target.iloc[fit_rows]
        holdout_x = features.iloc[holdout_rows]

        # Scaling statistics come from the fold's training part only.
        fold_x_std = scaler.fit_transform(fold_x)
        holdout_x_std = scaler.transform(holdout_x)

        # (model, data to fit on, data to score) in meta-column order.
        fold_plan = (
            (rf, fold_x, holdout_x),
            (lgbm, fold_x, holdout_x),
            (lr, fold_x_std, holdout_x_std),
        )
        for model, fit_x, _ in fold_plan:
            model.fit(fit_x, fold_y)

        # Store out-of-fold positive-class probabilities for the held-out rows.
        for column, (model, _, score_x) in enumerate(fold_plan):
            oof_meta[holdout_rows, column] = model.predict_proba(score_x)[:, 1]

    # Refit every base model on the full training set for the test-side meta-features.
    rf.fit(features, target)
    lgbm.fit(features, target)
    features_std = scaler.fit_transform(features)
    test_features_std = scaler.transform(test_features)
    lr.fit(features_std, target)

    full_plan = ((rf, test_features), (lgbm, test_features), (lr, test_features_std))
    for column, (model, score_x) in enumerate(full_plan):
        test_meta[:, column] = model.predict_proba(score_x)[:, 1]

    # Train the meta-model on the out-of-fold probabilities.
    stacker = xgb.XGBClassifier(n_estimators=100, random_state=42)
    stacker.fit(oof_meta, target)

    # The meta-model's predicted labels go straight into the result frame.
    survived = stacker.predict(test_meta)
    return pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': survived})
