# 1. Data Loader

In [53]:
import os

import numpy as np
import pandas as pd

np.random.seed(42)

In [54]:
# Input locations: preprocessed feature tables and the competition's sample submission.
data_dir = "../result_file/preprocess_results/"
submission_dir = "../../../kaggle_data/creditcard_overdue/open/"

# os.path.join relies on `import os` from the notebook's import cell.
train_bin10 = pd.read_csv(os.path.join(data_dir, "train_income_bin10.csv"))
test_bin10 = pd.read_csv(os.path.join(data_dir, "test_income_bin10.csv"))
submission = pd.read_csv(os.path.join(submission_dir, "sample_submission.csv"))

# 2. Data split

In [55]:
# Separate the 'credit' target column from the remaining feature columns.
y_train = train_bin10['credit']
X_train = train_bin10.drop(columns=['credit'])

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the target.
# Note: this rebinds X_train / y_train to the 80% training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

# 3. Training : Stacking

## 3-1. Tuning each model's hyperparameters

In [57]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Shared cross-validation splitter passed as `cv` to every grid search below.
k_fold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

### 3-1-1. Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search the random forest on the training split, scored by negative log loss.
forest = RandomForestClassifier()

forest_param_grid = {
    "max_depth" : [21, 23],
    "n_estimators" : [800, 900],
    "criterion" : ["gini", "entropy"],
    "random_state" : [42]
}

# n_jobs=-1 runs the candidate fits in parallel, matching the other grid
# searches in this notebook; a serial run means 80 fits of 800-900 trees each.
gs = GridSearchCV(
    forest, forest_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1
)

gs.fit(X_train, y_train)

# Best candidate, refit on the whole training split by GridSearchCV.
best_forest = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


KeyboardInterrupt: 

### 3-1-2. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Grid-search AdaBoost on the training split, scored by negative log loss.
adaboost = AdaBoostClassifier()

# NOTE(review): recent scikit-learn releases deprecate the "SAMME.R" option —
# confirm it is still accepted by the installed version.
adaboost_param_grid = {
    "n_estimators" : [500, 600, 700],
    "learning_rate" : [0.01, 0.05, 0.1],
    "algorithm" : ["SAMME", "SAMME.R"],
    "random_state" : [42]
}

gs = GridSearchCV(
    adaboost, adaboost_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1
)

gs.fit(X_train, y_train)

# Best candidate, refit on the whole training split by GridSearchCV.
best_ada = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best params : \n", gs.best_params_)

### 3-1-3. XGBoost

In [None]:
from xgboost import XGBClassifier

# Grid-search XGBoost on the training split, scored by negative log loss.
xgb = XGBClassifier()

# random_state is pinned like in the other base-model grids so the search
# is seeded consistently across models.
xgb_param_grid = {
    "n_estimators" : [500, 600, 700],
    "learning_rate" : [0.01, 0.05, 0.1],
    "max_depth" : [19, 21, 23],
    "random_state" : [42]
}

gs = GridSearchCV(
    xgb, xgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1
)

gs.fit(X_train, y_train)

# Best candidate, refit on the whole training split by GridSearchCV.
best_xgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

## 3-2. Stacking models

In [None]:
from sklearn.model_selection import StratifiedKFold

def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    if hasattr(data, "iloc"):
        # DataFrame/Series: plain [] would select columns or look up index labels.
        return data.iloc[positions]
    return np.asarray(data)[positions]


def get_stacking_data(model, X_train, y_train, X_test, n_folds=10):
    """Build stacking meta-features for one base model.

    The model is refit once per stratified fold. Its class predictions on the
    held-out part of each fold fill the training meta-feature column, and its
    predictions on X_test are averaged over the folds.

    Parameters
    ----------
    model : estimator with fit / predict; it is refit in place on every fold.
    X_train : training features (pandas DataFrame or array-like).
    y_train : training targets (pandas Series or array-like).
    X_test : features to predict with each fold's model.
    n_folds : number of stratified folds (default 10).

    Returns
    -------
    train_fold_predict : ndarray of shape (n_train_rows, 1), out-of-fold predictions.
    test_predict_mean : ndarray of shape (n_test_rows, 1), mean of the per-fold
        predictions on X_test.
    """
    k_fold = StratifiedKFold(n_splits=n_folds)

    train_fold_predict = np.zeros((X_train.shape[0], 1))
    test_predict = np.zeros((X_test.shape[0], n_folds))
    print("model : ", model.__class__.__name__)

    for idx, (train_index, valid_index) in enumerate(k_fold.split(X_train, y_train)):
        # Fold-local names: X_train / y_train must stay intact for the next
        # fold, and the validation rows must come from the full training set.
        X_fold_train = _take_rows(X_train, train_index)
        y_fold_train = _take_rows(y_train, train_index)
        X_fold_val = _take_rows(X_train, valid_index)

        model.fit(X_fold_train, y_fold_train)

        # valid_index holds row positions, so it addresses the output array directly.
        train_fold_predict[valid_index, :] = np.asarray(model.predict(X_fold_val)).reshape(-1, 1)
        test_predict[:, idx] = model.predict(X_test)

    test_predict_mean = np.mean(test_predict, axis=1).reshape(-1, 1)

    return train_fold_predict, test_predict_mean

### 3-2-1. Validation step

In [None]:
# Meta-features for the validation step: one (train, val) pair per base model,
# computed in the order forest -> AdaBoost -> XGBoost.
(forest_train, forest_val), (ada_train, ada_val), (xgb_train, xgb_val) = [
    get_stacking_data(base_model, X_train, y_train, X_val)
    for base_model in (best_forest, best_ada, best_xgb)
]

In [None]:
# Column-wise stack of the three base models' meta-features (forest, ada, xgb).
stack_X_train = np.concatenate((forest_train, ada_train, xgb_train), axis=1)
stack_X_val = np.concatenate((forest_val, ada_val, xgb_val), axis=1)

In [None]:
from lightgbm import LGBMClassifier

# Meta-model: grid-search LightGBM on the stacked base-model predictions.
lgb = LGBMClassifier()

# random_state is pinned like in the base-model grids so the search is seeded.
lgb_param_grid = {
    "n_estimators" : [500, 600, 700],
    "learning_rate" : [0.01, 0.05, 0.1],
    "max_depth" : [19, 21, 23],
    "random_state" : [42]
}

gs = GridSearchCV(
    lgb, lgb_param_grid,
    cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1
)

gs.fit(stack_X_train, y_train)

# GridSearchCV refits the best candidate on all of stack_X_train by default,
# so best_estimator_ is ready to predict without another fit call.
best_lgb = gs.best_estimator_

print("Best Score : ", gs.best_score_)
print("Best Params : ", gs.best_params_)

# Class probabilities on the validation meta-features, used for the log-loss check.
stack_pred_val = best_lgb.predict_proba(stack_X_val)

### 3-2-2. Test step

In [None]:
# Test features, restricted to the training feature columns in the same order.
# assumes test_bin10 contains every column of X_train — TODO confirm
X_test = test_bin10[X_train.columns]

# Meta-features for the test step: one (train, test) pair per base model.
forest_train, forest_test = get_stacking_data(best_forest, X_train, y_train, X_test)
ada_train, ada_test = get_stacking_data(best_ada, X_train, y_train, X_test)
xgb_train, xgb_test = get_stacking_data(best_xgb, X_train, y_train, X_test)

In [None]:
# Column-wise stack of the three base models' meta-features (forest, ada, xgb).
stack_X_train = np.concatenate((forest_train, ada_train, xgb_train), axis=1)
stack_X_test = np.concatenate((forest_test, ada_test, xgb_test), axis=1)

In [None]:
# Refit the meta-model on the stacked training features, then get class
# probabilities for the stacked test features (fit returns the estimator itself).
stack_pred_test = best_lgb.fit(stack_X_train, y_train).predict_proba(stack_X_test)

In [None]:
# Preview only the first rows instead of displaying the whole prediction array.
stack_pred_test[:5]

# 4. Evaluating : logloss

In [None]:
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

# One-hot encode the validation targets, then score the stacked model's
# validation probabilities with multi-class log loss.
y_val_onehot = to_categorical(y_val)
logloss = log_loss(y_val_onehot, stack_pred_val)
print(logloss)

# 5. Submission

In [None]:
# Output directory for the submission CSV written in the next cell.
# NOTE(review): folder name is spelled "performace" — presumably matches the
# directory on disk; verify before renaming.
submit_file_dir = "../result_file/performace_result/"

In [None]:
# Wrap the predicted probabilities in a DataFrame whose columns are the class labels.
stack_pred_test = pd.DataFrame(stack_pred_test)
stack_pred_test.columns = ['0', '1', '2']

# Copy each class-probability column into the sample-submission frame.
for class_col in ('0', '1', '2'):
    submission[class_col] = stack_pred_test[class_col]

# Write the finished submission without the index column.
output_path = os.path.join(submit_file_dir, "result_stack_rf_ada_xgb-lgbm.csv")
submission.to_csv(output_path, index=False)