# 1. Data Loader

In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [2]:
data_dir = "../result_file/preprocess_results/"
submission_dir = "../../../kaggle_data/creditcard_overdue/open/"

train_bin10 = pd.read_csv(os.path.join(data_dir, "train_income_bin10.csv"))

X_test = pd.read_csv(os.path.join(data_dir, "test_income_bin10.csv"))

submission = pd.read_csv(os.path.join(submission_dir, "sample_submission.csv"))

# 2. Data split

In [3]:
X_train = train_bin10.drop(['credit'], axis=1)
y_train = train_bin10['credit']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

# 3. Training : Stacking

## 3-1. Tuning each model's hyperparameters

In [5]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

### 3-1-1. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()

forest_param_grid = {
    "max_depth" : [21],
    "n_estimators" : [900],
    "criterion" : ["gini"],
    "random_state" : [42]
}

gs = GridSearchCV(
    forest, forest_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

best_forest = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.1s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=

#### 3-1-1-1. Random Forest (without GridSearchCV)

In [6]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=9,
    criterion="gini",
    random_state=42,
    verbose=1
)

best_forest = forest.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    5.0s finished


### 3-1-2. LightGBM

In [22]:
from lightgbm import LGBMClassifier

lgb = LGBMClassifier()

lgb_param_grid = {
    "n_estimators" : [2500],
    "learning_rate" : [0.01],
    "max_depth" : [21],
    "random_state" : [42]
}

gs = GridSearchCV(
    lgb, lgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

best_lgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : \n", gs.best_params_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.7s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.7s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.9s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.0s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.9s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   8.0s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2

#### 3-1-2-1. LightGBM (without GridSearchCV)

In [7]:
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(
    random_state=42,
    colsample_bytree=0.2557260109926193,
    learning_rate=0.001,
    max_depth=21,
    n_estimators=3000,
    num_leaves=171,
    reg_alpha=0.7115353581785044,
    reg_lambda=5.658115293998945,
    subsample=0.9262904583735796,
    subsample_freq=1,
)

best_lgb = lgb.fit(X_train, y_train, verbose=1)

### 3-1-3. XGBoost

In [12]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb_param_grid = {
    "n_estimators" : [2500],
    "eval_metric" : ["mlogloss"],
    "learning_rate" : [0.01],
    "max_depth" : [9],
    "use_label_encoder" : [False],
    "reg_lambda" : [1.7256912198205319],
    "seed" : [42]
}

gs = GridSearchCV(
    xgb, xgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

best_xgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.8min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.72569121

#### 3-1-3-1. XGBoost (without GridSearchCV)

In [8]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=10000,
    eval_metric="mlogloss",
    max_bin=788,
    tree_method="gpu_hist",
    eta=0.01,
    alpha=7.105038963844129,
    colsample_bytree=0.25505629740052566,
    gamma=0.4999381950212869,
    max_depth=9,
    use_label_encoder=False,
    reg_lambda=1.7256912198205319,
    min_child_weight=2.286836198630466,
    subsample=0.618417952155855,
    random_state=42,
    verbosity=1
)

best_xgb = xgb.fit(X_train, y_train)

### 3-1-4. CatBoost

In [10]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier()

cat_param_grid = {
    "verbose" : [0],
    "task_type" : ["GPU"],
    "l2_leaf_reg" : [9.847870133539244],
    "loss_function" : ["MultiClass"],
    "random_seed" : [42],
    "n_estimators" : [8000],
    "learning_rate" : [0.01],
    "grow_policy" : ["Depthwise"],
    "max_depth" : [7]
}

gs = GridSearchCV(
    cat, cat_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

best_cat = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : \n", gs.best_params_)

m 31s	remaining: 4.38s
7634:	learn: 0.5183615	total: 1m 31s	remaining: 4.37s
7635:	learn: 0.5183450	total: 1m 31s	remaining: 4.36s
7636:	learn: 0.5183006	total: 1m 31s	remaining: 4.34s
7637:	learn: 0.5182788	total: 1m 31s	remaining: 4.33s
7638:	learn: 0.5182641	total: 1m 31s	remaining: 4.32s
7639:	learn: 0.5182461	total: 1m 31s	remaining: 4.31s
7640:	learn: 0.5182258	total: 1m 31s	remaining: 4.3s
7641:	learn: 0.5182019	total: 1m 31s	remaining: 4.28s
7642:	learn: 0.5181807	total: 1m 31s	remaining: 4.27s
7643:	learn: 0.5181739	total: 1m 31s	remaining: 4.26s
7644:	learn: 0.5181528	total: 1m 31s	remaining: 4.25s
7645:	learn: 0.5181457	total: 1m 31s	remaining: 4.24s
7646:	learn: 0.5181137	total: 1m 31s	remaining: 4.22s
7647:	learn: 0.5180998	total: 1m 31s	remaining: 4.21s
7648:	learn: 0.5180871	total: 1m 31s	remaining: 4.2s
7649:	learn: 0.5180685	total: 1m 31s	remaining: 4.19s
7650:	learn: 0.5180411	total: 1m 31s	remaining: 4.18s
7651:	learn: 0.5180130	total: 1m 31s	remaining: 4.16s
7652:	l

#### 3-1-4-1. CatBoost (without GridSearchCV)

In [9]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    verbose=1,
    task_type="GPU",
    od_type="Iter",
    od_wait=200,
    l2_leaf_reg=9.847870133539244,
    loss_function="MultiClass",
    bagging_temperature=1.288692494969795,
    random_seed=42,
    n_estimators=8000,
    learning_rate=0.01,
    grow_policy="Depthwise",
    max_depth=7,
    min_data_in_leaf=1,
    penalties_coefficient=2.1176668909602734
)

best_cat = cat.fit(X_train, y_train)

:	learn: 0.5172287	total: 1m 32s	remaining: 4.5s
7630:	learn: 0.5172123	total: 1m 32s	remaining: 4.48s
7631:	learn: 0.5171889	total: 1m 32s	remaining: 4.47s
7632:	learn: 0.5171742	total: 1m 32s	remaining: 4.46s
7633:	learn: 0.5171549	total: 1m 32s	remaining: 4.45s
7634:	learn: 0.5171240	total: 1m 32s	remaining: 4.43s
7635:	learn: 0.5171051	total: 1m 32s	remaining: 4.42s
7636:	learn: 0.5170910	total: 1m 32s	remaining: 4.41s
7637:	learn: 0.5170694	total: 1m 32s	remaining: 4.4s
7638:	learn: 0.5170526	total: 1m 32s	remaining: 4.38s
7639:	learn: 0.5170312	total: 1m 32s	remaining: 4.37s
7640:	learn: 0.5170170	total: 1m 32s	remaining: 4.36s
7641:	learn: 0.5170046	total: 1m 32s	remaining: 4.35s
7642:	learn: 0.5169815	total: 1m 32s	remaining: 4.34s
7643:	learn: 0.5169759	total: 1m 32s	remaining: 4.32s
7644:	learn: 0.5169506	total: 1m 32s	remaining: 4.31s
7645:	learn: 0.5169429	total: 1m 32s	remaining: 4.3s
7646:	learn: 0.5169121	total: 1m 32s	remaining: 4.29s
7647:	learn: 0.5169025	total: 1m 32

### 3-1-5. Ridge

In [18]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hist = HistGradientBoostingClassifier()

hist_param_grid = {
    "loss" : ["categorical_crossentropy"],
    "learning_rate" : [0.01],
    "max_iter" : [500],
    "max_depth" : [21],
    "scoring" : ["neg_log_loss"],
    "random_state" : [42],
    "l2_regularization" : [1.766059063693552]   
}

gs = GridSearchCV(
    hist, hist_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

best_hist = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.7min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularizati

#### 3-1-5-1. Ridge (without GridSearchCV)

In [10]:
from sklearn.linear_model import RidgeClassifier
from sklearn.calibration import CalibratedClassifierCV

hist = CalibratedClassifierCV(
    RidgeClassifier(random_state=42),
    cv=5
)

best_ridge = hist.fit(X_train, y_train)

## 3-2. Stacking models

### 3-2-1. stacking with sklearn

In [37]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

estimators = [
    ('rf', RandomForestClassifier(
        n_estimators=400,
        max_depth=9,
        criterion="gini",
        random_state=42,
        verbose=0
    )),
    ('lgbm', LGBMClassifier(
        random_state=42,
        colsample_bytree=0.2557260109926193,
        learning_rate=0.001,
        max_depth=21,
        n_estimators=3000,
        num_leaves=171,
        reg_alpha=0.7115353581785044,
        reg_lambda=5.658115293998945,
        subsample=0.9262904583735796,
        subsample_freq=1,
    )),
    ('xgb', XGBClassifier(
        n_estimators=10000,
        eval_metric="mlogloss",
        max_bin=788,
        tree_method="gpu_hist",
        eta=0.01,
        alpha=7.105038963844129,
        colsample_bytree=0.25505629740052566,
        gamma=0.4999381950212869,
        max_depth=9,
        use_label_encoder=False,
        reg_lambda=1.7256912198205319,
        min_child_weight=2.286836198630466,
        subsample=0.618417952155855,
        random_state=42,
        verbosity=0
    )),
    ('ridge', CalibratedClassifierCV(
        RidgeClassifier(random_state=42),
        cv=5
    )),
    ('cat', CatBoostClassifier(
        verbose=0,
        task_type="GPU",
        od_type="Iter",
        od_wait=200,
        l2_leaf_reg=9.847870133539244,
        loss_function="MultiClass",
        bagging_temperature=1.288692494969795,
        random_seed=42,
        n_estimators=8000,
        learning_rate=0.01,
        grow_policy="Depthwise",
        max_depth=7,
        min_data_in_leaf=1,
        penalties_coefficient=2.1176668909602734
    ))
]


clf = StackingClassifier(
    estimators=estimators, final_estimator=CalibratedClassifierCV(
        base_estimator=LogisticRegression(
            C=100,
            penalty='elasticnet',
            solver='saga',
            max_iter=700,
            multi_class='multinomial',
            warm_start=True,
            l1_ratio=0.5
        ),
        cv=5
    ),
    stack_method = 'predict', verbose=2, cv=kfold
)

In [38]:
clf = clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   19.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

In [39]:
print(f"Validation Score : {clf.score(X_val, y_val)}")

Validation Score : 0.6889644746787604


In [40]:
pred_val = clf.predict_proba(X_val)

pred_val

array([[0.10812168, 0.18963874, 0.70223958],
       [0.23519922, 0.75149184, 0.01330894],
       [0.10812168, 0.18963874, 0.70223958],
       ...,
       [0.10812168, 0.18963874, 0.70223958],
       [0.25384151, 0.41394933, 0.33220916],
       [0.10812168, 0.18963874, 0.70223958]])

In [None]:
y_val

In [41]:
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

logloss = log_loss(to_categorical(y_val), pred_val)
print(logloss)

0.7955055161645241


### 3-2-2. implement Stacking models

In [16]:
from sklearn.model_selection import StratifiedKFold

def get_stacking_data(model, X_train_n, y_train_n, X_test_n, n_folds=5):
    kfold = StratifiedKFold(n_splits=n_folds)
    
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print("model : ", model.__class__.__name__)

    for cnt, (train_idx, valid_idx) in enumerate(kfold.split(X_train_n,y_train_n)):
        print(f"Fold : {cnt+1}")
        X_train_ = X_train_n.iloc[train_idx]
        y_train_ = y_train_n.iloc[train_idx]
        X_validation = X_train_n.iloc[valid_idx]

        model.fit(X_train_, y_train_)

        train_fold_pred[valid_idx, :] = model.predict(X_validation).reshape(-1, 1)

        if model.__class__.__name__ == 'CatBoostClassifier':
            test_pred[:, cnt] = model.predict(X_test_n).reshape(-1,) ###
        else:
            test_pred[:, cnt] = model.predict(X_test_n)
    
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

### 3-2-2-1. Validation step

In [17]:
cat_train, cat_val = get_stacking_data(best_cat, X_train, y_train, X_val)

s
7629:	learn: 0.5006218	total: 1m 34s	remaining: 4.58s
7630:	learn: 0.5006095	total: 1m 34s	remaining: 4.57s
7631:	learn: 0.5005829	total: 1m 34s	remaining: 4.56s
7632:	learn: 0.5005556	total: 1m 34s	remaining: 4.55s
7633:	learn: 0.5005434	total: 1m 34s	remaining: 4.54s
7634:	learn: 0.5005244	total: 1m 34s	remaining: 4.52s
7635:	learn: 0.5005052	total: 1m 34s	remaining: 4.51s
7636:	learn: 0.5004882	total: 1m 34s	remaining: 4.5s
7637:	learn: 0.5004742	total: 1m 34s	remaining: 4.49s
7638:	learn: 0.5004618	total: 1m 34s	remaining: 4.47s
7639:	learn: 0.5004355	total: 1m 34s	remaining: 4.46s
7640:	learn: 0.5004123	total: 1m 34s	remaining: 4.45s
7641:	learn: 0.5003947	total: 1m 34s	remaining: 4.44s
7642:	learn: 0.5003743	total: 1m 34s	remaining: 4.42s
7643:	learn: 0.5003506	total: 1m 34s	remaining: 4.41s
7644:	learn: 0.5003230	total: 1m 34s	remaining: 4.4s
7645:	learn: 0.5003072	total: 1m 34s	remaining: 4.39s
7646:	learn: 0.5002861	total: 1m 34s	remaining: 4.37s
7647:	learn: 0.5002762	total

In [18]:
forest_train, forest_val = get_stacking_data(best_forest, X_train, y_train, X_val)
lgb_train, lgb_val = get_stacking_data(best_lgb, X_train, y_train, X_val)
xgb_train, xgb_val = get_stacking_data(best_xgb, X_train, y_train, X_val)
# cat_train, cat_val = get_stacking_data(best_cat, X_train, y_train, X_val)
ridge_train, ridge_val = get_stacking_data(best_ridge, X_train, y_train, X_val)

model :  RandomForestClassifier
Fold : 1
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fold : 2
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend Sequenti

In [20]:
stack_X_train = np.concatenate((forest_train, lgb_train, xgb_train, cat_train, ridge_train), axis=1)
stack_X_val = np.concatenate((forest_val, lgb_val, xgb_val, cat_val, ridge_val), axis=1)

In [21]:
stack_X_train

array([[2., 2., 2., 2., 2.],
       [2., 2., 1., 1., 2.],
       [2., 2., 2., 2., 2.],
       ...,
       [2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2.]])

In [23]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()

forest_param_grid = {
    "max_depth" : [5, 9, 11],
    "n_estimators" : [500, 800, 1100],
    "criterion" : ["gini", "entropy"],
    "random_state" : [42]
}

gs = GridSearchCV(
    forest, forest_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(stack_X_train, y_train)

best_model = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

best_model.fit(stack_X_train, y_train)

stack_pred_val = best_model.predict_proba(stack_X_val)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.5s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.5s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterion=gini, max_depth=5, n_estimators=500, random_state=42; total time=   1.4s
[CV] END criterio

In [40]:
from lightgbm import LGBMClassifier

lgb = LGBMClassifier()

lgb_param_grid = {
    "n_estimators" : [700],
    "learning_rate" : [0.001, 0.01, 0.1],
    "max_depth" : [9],
    "random_state" : [42]
}

gs = GridSearchCV(
    lgb, lgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(stack_X_train, y_train)

best_model = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

best_model.fit(stack_X_train, y_train)

stack_pred_val = best_model.predict_proba(stack_X_val)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.5s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.4s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.4s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, rando

### 3-2-2-2. test step

In [None]:
forest_train, forest_test = get_stacking_data(best_forest, X_train, y_train, X_test)
ada_train, ada_test = get_stacking_data(best_ada, X_train, y_train, X_test)
xgb_train, xgb_test = get_stacking_data(best_xgb, X_train, y_train, X_test)

In [None]:
stack_X_train = np.concatenate((forest_train, ada_train, xgb_train), axis=1)
stack_X_test = np.concatenate((forest_test, ata_test, xgb_test), axis=1)

In [None]:
best_model.fit(stack_X_train, y_train)

stack_pred_test = best_model.predict_proba(stack_X_test)

In [None]:
stack_pred_test

# 4. Evaluating : logloss

In [None]:
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

logloss = log_loss(to_categorical(y_val), stack_pred_val)
print(logloss)

# 5. Submission

In [None]:
submit_file_dir = "../result_file/performace_result/"

In [None]:
stack_pred_test = pd.DataFrame(stack_pred_test)
stack_pred_test.columns = ['0', '1', '2']

submission['0'] = stack_pred_test['0']
submission['1'] = stack_pred_test['1']
submission['2'] = stack_pred_test['2']

submission.to_csv(os.path.join(submit_file_dir, "result_stacking.csv"), index=False)