# 1. Data Loader

In [1]:
import os

import numpy as np
import pandas as pd

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [2]:
# Directories (relative to the notebook) holding the preprocessed feature
# tables and the competition's sample-submission template.
data_dir = "../result_file/preprocess_results/"
submission_dir = "../../../kaggle_data/creditcard_overdue/open/"

# Labelled training table; the `credit` column is split off as the target later.
train_bin10 = pd.read_csv(os.path.join(data_dir, "train_income_bin10.csv"))

# Test table read from the same preprocessing output directory.
X_test = pd.read_csv(os.path.join(data_dir, "test_income_bin10.csv"))

# Submission template whose '0'/'1'/'2' columns are filled in at the end.
submission = pd.read_csv(os.path.join(submission_dir, "sample_submission.csv"))

# 2. Data split

In [3]:
# The `credit` column is the target; every remaining column is a feature.
y_train = train_bin10["credit"]
X_train = train_bin10.drop(columns=["credit"])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, stratified on `credit`.
# The split is derived from `train_bin10` rather than from the current
# `X_train` / `y_train`, so re-running this cell reproduces the same split
# instead of splitting an already-reduced training set a second time.
X_train, X_val, y_train, y_val = train_test_split(
    train_bin10.drop(['credit'], axis=1),
    train_bin10['credit'],
    stratify=train_bin10['credit'],
    test_size=0.2,
    random_state=42,
)

# 3. Training : Stacking

## 3-1. Tuning each model's hyperparameters

In [5]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Cross-validation splitter shared by every grid search in this notebook:
# 10 shuffled, stratified folds with a fixed seed.
k_fold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

### 3-1-1. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Candidate values for the search (a single combination as written).
forest_param_grid = dict(
    max_depth=[21],
    n_estimators=[900],
    criterion=["gini"],
    random_state=[42],
)

forest = RandomForestClassifier()

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    estimator=forest,
    param_grid=forest_param_grid,
    scoring="neg_log_loss",
    cv=k_fold,
    verbose=2,
)
gs.fit(X_train, y_train)

# Keep the estimator GridSearchCV selected for the best-scoring candidate.
best_forest = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.1s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=   9.9s
[CV] END criterion=gini, max_depth=9, n_estimators=800, random_state=42; total time=  10.0s
[CV] END criterion=

#### 3-1-1-1. Random Forest (without GridSearchCV)

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyperparameters, fitted directly on the training
# split (no cross-validated search in this cell).
forest = RandomForestClassifier(
    criterion="gini",
    n_estimators=900,
    max_depth=21,
    random_state=42,
)
forest.fit(X_train, y_train)

# `fit` returns the estimator itself, so this is the fitted `forest`.
best_forest = forest

### 3-1-2. LightGBM

In [22]:
from lightgbm import LGBMClassifier

# Candidate values for the search (a single combination as written).
lgb_param_grid = dict(
    n_estimators=[2500],
    learning_rate=[0.01],
    max_depth=[21],
    random_state=[42],
)

lgb = LGBMClassifier()

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    estimator=lgb,
    param_grid=lgb_param_grid,
    scoring="neg_log_loss",
    cv=k_fold,
    verbose=2,
)
gs.fit(X_train, y_train)

# Keep the estimator GridSearchCV selected for the best-scoring candidate.
best_lgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : \n", gs.best_params_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.7s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.7s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.9s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.0s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.9s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   8.0s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2500, random_state=42; total time=   7.1s
[CV] END learning_rate=0.01, max_depth=21, n_estimators=2

#### 3-1-2-1. LightGBM (without GridSearchCV)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with fixed hyperparameters, fitted directly on the training split
# (no cross-validated search in this cell).
lgb = LGBMClassifier(
    learning_rate=0.01,
    n_estimators=2500,
    max_depth=21,
    random_state=42,
)
lgb.fit(X_train, y_train)

# `fit` returns the estimator itself, so this is the fitted `lgb`.
best_lgb = lgb

### 3-1-3. XGBoost

In [12]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

# Candidate values for the search (a single combination as written).
# The seed is passed as `random_state`, the scikit-learn-style name that the
# direct-fit XGBoost cell also uses, instead of the `seed` alias.
xgb_param_grid = {
    "n_estimators" : [2500],
    "eval_metric" : ["mlogloss"],
    "learning_rate" : [0.01],
    "max_depth" : [9],
    "use_label_encoder" : [False],
    "reg_lambda" : [1.7256912198205319],
    "random_state" : [42]
}

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    xgb, xgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(X_train, y_train)

# Keep the estimator GridSearchCV selected for the best-scoring candidate.
best_xgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.8min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False; total time= 1.9min
[CV] END eval_metric=mlogloss, learning_rate=0.01, max_depth=9, n_estimators=2700, reg_lambda=1.72569121

#### 3-1-3-1. XGBoost (without GridSearchCV)

In [27]:
from xgboost import XGBClassifier

# XGBoost with fixed hyperparameters, fitted directly on the training split
# (no cross-validated search in this cell).
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=2500,
    max_depth=9,
    reg_lambda=1.7256912198205319,
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
)
xgb.fit(X_train, y_train)

# `fit` returns the estimator itself, so this is the fitted `xgb`.
best_xgb = xgb

### 3-1-4. CatBoost

In [10]:
from catboost import CatBoostClassifier

# Candidate values for the search (a single combination as written).
cat_param_grid = dict(
    verbose=[0],
    task_type=["GPU"],
    l2_leaf_reg=[9.847870133539244],
    loss_function=["MultiClass"],
    random_seed=[42],
    n_estimators=[8000],
    learning_rate=[0.01],
    grow_policy=["Depthwise"],
    max_depth=[7],
)

cat = CatBoostClassifier()

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    estimator=cat,
    param_grid=cat_param_grid,
    scoring="neg_log_loss",
    cv=k_fold,
    verbose=2,
)
gs.fit(X_train, y_train)

# Keep the estimator GridSearchCV selected for the best-scoring candidate.
best_cat = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : \n", gs.best_params_)

m 31s	remaining: 4.38s
7634:	learn: 0.5183615	total: 1m 31s	remaining: 4.37s
7635:	learn: 0.5183450	total: 1m 31s	remaining: 4.36s
7636:	learn: 0.5183006	total: 1m 31s	remaining: 4.34s
7637:	learn: 0.5182788	total: 1m 31s	remaining: 4.33s
7638:	learn: 0.5182641	total: 1m 31s	remaining: 4.32s
7639:	learn: 0.5182461	total: 1m 31s	remaining: 4.31s
7640:	learn: 0.5182258	total: 1m 31s	remaining: 4.3s
7641:	learn: 0.5182019	total: 1m 31s	remaining: 4.28s
7642:	learn: 0.5181807	total: 1m 31s	remaining: 4.27s
7643:	learn: 0.5181739	total: 1m 31s	remaining: 4.26s
7644:	learn: 0.5181528	total: 1m 31s	remaining: 4.25s
7645:	learn: 0.5181457	total: 1m 31s	remaining: 4.24s
7646:	learn: 0.5181137	total: 1m 31s	remaining: 4.22s
7647:	learn: 0.5180998	total: 1m 31s	remaining: 4.21s
7648:	learn: 0.5180871	total: 1m 31s	remaining: 4.2s
7649:	learn: 0.5180685	total: 1m 31s	remaining: 4.19s
7650:	learn: 0.5180411	total: 1m 31s	remaining: 4.18s
7651:	learn: 0.5180130	total: 1m 31s	remaining: 4.16s
7652:	l

#### 3-1-4-1. CatBoost (without GridSearchCV)

In [None]:
from catboost import CatBoostClassifier

# CatBoost with fixed hyperparameters, fitted directly on the training split
# (no cross-validated search in this cell).
cat = CatBoostClassifier(
    loss_function="MultiClass",
    task_type="GPU",
    grow_policy="Depthwise",
    n_estimators=8000,
    learning_rate=0.01,
    max_depth=7,
    l2_leaf_reg=9.847870133539244,
    random_seed=42,
    verbose=0,
)
cat.fit(X_train, y_train)

# `fit` returns the estimator itself, so this is the fitted `cat`.
best_cat = cat

### 3-1-5. HistGradientBoostingClassifier

In [18]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Candidate values for the search (a single combination as written).
hist_param_grid = dict(
    loss=["categorical_crossentropy"],
    learning_rate=[0.01],
    max_iter=[500],
    max_depth=[21],
    scoring=["neg_log_loss"],
    random_state=[42],
    l2_regularization=[1.766059063693552],
)

hist = HistGradientBoostingClassifier()

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    estimator=hist,
    param_grid=hist_param_grid,
    scoring="neg_log_loss",
    cv=k_fold,
    verbose=2,
)
gs.fit(X_train, y_train)

# Keep the estimator GridSearchCV selected for the best-scoring candidate.
best_hist = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.7min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularization=1.766059063693552, learning_rate=0.01, loss=categorical_crossentropy, max_depth=19, max_iter=500, random_state=42, scoring=neg_log_loss; total time= 1.8min
[CV] END l2_regularizati

#### 3-1-5-1. HistGradientBoostingClassifier (without GridSearchCV)

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram gradient boosting with fixed hyperparameters, fitted directly on
# the training split (no cross-validated search in this cell).
hist = HistGradientBoostingClassifier(
    loss="categorical_crossentropy",
    learning_rate=0.01,
    max_iter=500,
    # NOTE(review): the grid-search cell for this model lists max_depth=21,
    # while this cell uses 9 -- confirm which value is the intended one.
    max_depth=9,
    scoring="neg_log_loss",
    # The comma after this argument was missing, which made the cell a SyntaxError.
    l2_regularization=1.766059063693552,
    random_state=42,
)

# `fit` returns the estimator itself, so `best_hist` is the fitted `hist`.
best_hist = hist.fit(X_train, y_train)

## 3-2. Stacking models

In [34]:
from sklearn.model_selection import StratifiedKFold

def get_stacking_data(model, X_train_n, y_train_n, X_test_n, n_folds=5):
    """Build stacking meta-features for one base model.

    The training rows are split into ``n_folds`` stratified folds (unshuffled).
    For each fold the model is fitted on the other folds, its predictions for
    the held-out fold are stored, and it also predicts every row of
    ``X_test_n``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is refitted in place on every fold, so on return it is fitted on
        the last fold's training portion only.
    X_train_n, y_train_n : training features / labels, indexed with ``.iloc``
        (i.e. pandas objects).
    X_test_n : rows to build test-side meta-features for; only ``.shape`` and
        ``model.predict`` are applied to it.
    n_folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    train_fold_pred : ndarray of shape (n_train, 1)
        Out-of-fold ``predict`` output for every training row.
    test_pred_mean : ndarray of shape (n_test, 1)
        Mean over folds of the ``predict`` output for every test row.

    Notes
    -----
    NOTE(review): ``predict`` output is averaged as a number. If it returns
    class labels, the test-side feature is a mean of labels, which is only
    meaningful when the labels are ordinal -- confirm this is intended.
    """
    kfold = StratifiedKFold(n_splits=n_folds)

    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print("model : ", model.__class__.__name__)

    for cnt, (train_idx, valid_idx) in enumerate(kfold.split(X_train_n, y_train_n)):
        print(f"Fold : {cnt+1}")
        X_train_ = X_train_n.iloc[train_idx]
        y_train_ = y_train_n.iloc[train_idx]
        X_validation = X_train_n.iloc[valid_idx]

        model.fit(X_train_, y_train_)

        # Out-of-fold predictions fill only the rows held out in this fold.
        train_fold_pred[valid_idx, :] = np.asarray(model.predict(X_validation)).reshape(-1, 1)

        # Flatten before storing: some estimators (e.g. CatBoost with a
        # multi-class loss) return predictions shaped (n, 1), which cannot be
        # broadcast into the 1-D column slice.
        test_pred[:, cnt] = np.asarray(model.predict(X_test_n)).reshape(-1)

    # Collapse the per-fold test predictions into a single column.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

### 3-2-1. Validation step

In [35]:
# Build stacking features for each base model in turn: predictions for the
# training rows plus fold-averaged predictions for the validation rows.
# The CatBoost call stays disabled, as before:
#cat_train, cat_val = get_stacking_data(best_cat, X_train, y_train, X_val)
(
    (forest_train, forest_val),
    (lgb_train, lgb_val),
    (xgb_train, xgb_val),
    (hist_train, hist_val),
) = [
    get_stacking_data(base_model, X_train, y_train, X_val)
    for base_model in (best_forest, best_lgb, best_xgb, best_hist)
]

model :  RandomForestClassifier
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
model :  LGBMClassifier
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
model :  XGBClassifier
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
model :  HistGradientBoostingClassifier
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5


In [36]:
# Place each base model's prediction column side by side to form the
# meta-learner's training and validation inputs.
stack_X_train = np.hstack((forest_train, lgb_train, xgb_train, hist_train))
stack_X_val = np.hstack((forest_val, lgb_val, xgb_val, hist_val))

In [44]:
# Display the stacked training meta-features.
stack_X_train

array([[2., 2., 2., 2.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       ...,
       [1., 2., 2., 2.],
       [2., 2., 2., 2.],
       [2., 2., 2., 2.]])

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Meta-learner candidate: a random forest trained on the stacked base-model
# predictions.
# NOTE(review): this cell and the LightGBM meta-learner cell both assign
# `best_model` and `stack_pred_val`; whichever cell runs last decides what the
# later cells use.
forest = RandomForestClassifier()

forest_param_grid = {
    "max_depth" : [3, 5],
    "n_estimators" : [1100, 2000],
    "criterion" : ["gini", "entropy"],
    "random_state" : [42]
}

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    forest, forest_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(stack_X_train, y_train)

# GridSearchCV refits the best candidate on all of the data passed to `fit`
# (refit=True is its default), so `best_estimator_` is already trained on
# stack_X_train and the former second `best_model.fit(...)` call was redundant.
best_model = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

# Class probabilities for the validation rows, used by the log-loss evaluation.
stack_pred_val = best_model.predict_proba(stack_X_val)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.8s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.5s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END criterion=gini, max_depth=3, n_estimators=1100, random_state=42; total time=   3.6s
[CV] END 

In [40]:
from lightgbm import LGBMClassifier

# Meta-learner candidate: LightGBM trained on the stacked base-model
# predictions.
# NOTE(review): this cell and the random-forest meta-learner cell both assign
# `best_model` and `stack_pred_val`; whichever cell runs last decides what the
# later cells use.
lgb = LGBMClassifier()

lgb_param_grid = {
    "n_estimators" : [700],
    "learning_rate" : [0.001, 0.01, 0.1],
    "max_depth" : [9],
    "random_state" : [42]
}

# Cross-validate the candidates on the shared folds, scored by negative log loss.
gs = GridSearchCV(
    lgb, lgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=2
)

gs.fit(stack_X_train, y_train)

# GridSearchCV refits the best candidate on all of the data passed to `fit`
# (refit=True is its default), so `best_estimator_` is already trained on
# stack_X_train and the former second `best_model.fit(...)` call was redundant.
best_model = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

# Class probabilities for the validation rows, used by the log-loss evaluation.
stack_pred_val = best_model.predict_proba(stack_X_val)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.5s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.4s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.4s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, random_state=42; total time=   1.3s
[CV] END learning_rate=0.001, max_depth=9, n_estimators=700, rando

### 3-2-2. Test step

In [None]:
# Build the test-side stacking features with the same base models, in the same
# order, as the validation step (forest, LightGBM, XGBoost, HistGradientBoosting),
# so the meta-learner sees the same feature layout it was validated on.
# The previous version referenced `best_ada`, which is never defined in this
# notebook, and misspelled `ada_test` as `ata_test`.
forest_train, forest_test = get_stacking_data(best_forest, X_train, y_train, X_test)
lgb_train, lgb_test = get_stacking_data(best_lgb, X_train, y_train, X_test)
xgb_train, xgb_test = get_stacking_data(best_xgb, X_train, y_train, X_test)
hist_train, hist_test = get_stacking_data(best_hist, X_train, y_train, X_test)

# One column per base model, for the training rows and for the test rows.
stack_X_train = np.concatenate((forest_train, lgb_train, xgb_train, hist_train), axis=1)
stack_X_test = np.concatenate((forest_test, lgb_test, xgb_test, hist_test), axis=1)

In [None]:
# Refit the selected meta-learner on the current stacked training features,
# then predict class probabilities for the stacked test features.
# (`fit` returns the estimator itself, so the two calls can be chained.)
stack_pred_test = best_model.fit(stack_X_train, y_train).predict_proba(stack_X_test)

In [None]:
# Display the meta-learner's predicted class probabilities for the test rows.
stack_pred_test

# 4. Evaluating : logloss

In [None]:
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

# Score the meta-learner's validation probabilities with log loss; the
# validation labels are one-hot encoded before being passed as the truth.
logloss = log_loss(
    y_true=to_categorical(y_val),
    y_pred=stack_pred_val,
)
print(logloss)

# 5. Submission

In [None]:
# Directory (relative to the notebook) that the submission CSV is written to.
submit_file_dir = "../result_file/performace_result/"

In [None]:
# Wrap the predicted probabilities in a frame with one named column per class.
stack_pred_test = pd.DataFrame(stack_pred_test)
stack_pred_test.columns = ['0', '1', '2']

# Copy each class column into the submission template by position, so a row
# count mismatch raises an error instead of being index-aligned into NaNs.
for class_col in ['0', '1', '2']:
    submission[class_col] = stack_pred_test[class_col].values

# Make sure the output directory exists before writing the file.
os.makedirs(submit_file_dir, exist_ok=True)
submission.to_csv(os.path.join(submit_file_dir, "result_stacking.csv"), index=False)