# Data Loader

In [37]:
import numpy as np
import pandas as pd
import os

# Let pandas display up to 500 columns before truncating a wide DataFrame.
pd.options.display.max_columns = 500

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [38]:
# Directory holding the preprocessed train/test CSVs, and the directory holding
# the competition's sample submission file.
data_dir = "../result_file/preprocess_results/"
submision_dir = "../../../kaggle_data/creditcard_overdue/open/"


def _read_csv_from(directory, filename):
    """Read the CSV file `filename` located in `directory` into a DataFrame."""
    return pd.read_csv(os.path.join(directory, filename))


# Training sets, one per income binning variant (5, 7 and 10 bins).
train_bin5 = _read_csv_from(data_dir, "train_income_bin5.csv")
train_bin7 = _read_csv_from(data_dir, "train_income_bin7.csv")
train_bin10 = _read_csv_from(data_dir, "train_income_bin10.csv")

# Matching test sets for the same three binning variants.
test_bin5 = _read_csv_from(data_dir, "test_income_bin5.csv")
test_bin7 = _read_csv_from(data_dir, "test_income_bin7.csv")
test_bin10 = _read_csv_from(data_dir, "test_income_bin10.csv")

# Submission template; its '0', '1', '2' columns are overwritten later on.
submission = _read_csv_from(submision_dir, "sample_submission.csv")

# Data split

In [39]:
# For every binning variant, pull out the `credit` target and keep the
# remaining columns as the feature matrix. The source frames are not modified.
y_train_bin5 = train_bin5['credit']
X_train_bin5 = train_bin5.drop(columns=['credit'])

y_train_bin7 = train_bin7['credit']
X_train_bin7 = train_bin7.drop(columns=['credit'])

y_train_bin10 = train_bin10['credit']
X_train_bin10 = train_bin10.drop(columns=['credit'])

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of each binning variant for validation, stratified on `credit`.
#
# The inputs are taken from the untouched `train_bin*` frames instead of from
# `X_train_bin*` / `y_train_bin*`: those names are rebound by this very cell, so
# splitting from them would shrink the training set again every time the cell is
# re-executed. On a fresh top-to-bottom run the result is the same as before.
split_options = {"test_size": 0.2, "random_state": 42}

X_train_bin5, X_val_bin5, y_train_bin5, y_val_bin5 = train_test_split(
    train_bin5.drop(['credit'], axis=1),
    train_bin5['credit'],
    stratify=train_bin5['credit'],
    **split_options,
)

X_train_bin7, X_val_bin7, y_train_bin7, y_val_bin7 = train_test_split(
    train_bin7.drop(['credit'], axis=1),
    train_bin7['credit'],
    stratify=train_bin7['credit'],
    **split_options,
)

X_train_bin10, X_val_bin10, y_train_bin10, y_val_bin10 = train_test_split(
    train_bin10.drop(['credit'], axis=1),
    train_bin10['credit'],
    stratify=train_bin10['credit'],
    **split_options,
)

# Training : Random Forest

In [41]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Cross-validation splitter shared by every grid search below: 10 stratified
# folds, shuffled with a fixed seed so the fold assignment is repeatable.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

## (version 1) income_total : bins=5

In [42]:
# Feature names in column order (the order `importances` below follows).
feat_labels = X_train_bin5.columns

forest = RandomForestClassifier()

# Hyper-parameter candidates: 2 depths x 2 forest sizes x 2 criteria = 8 combinations.
# `random_state` is pinned to a single value so every candidate is reproducible.
forest_param_grid = {"max_depth" : [21, 23],
                    "n_estimators" : [700, 800],
                    "criterion" : ["gini", "entropy"],
                    "random_state" : [42]}

# Rank candidates by negative log loss over the shared `k_fold` splitter.
# refit=True is GridSearchCV's default and is spelled out on purpose: after the
# search, the best candidate is retrained on all of X_train_bin5 / y_train_bin5.
gs = GridSearchCV(forest, forest_param_grid,
                  cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1,
                  refit=True)

gs.fit(X_train_bin5, y_train_bin5)

# Already trained on the full training split by the refit step, so it is used
# as-is; fitting it again on the same data with the same seed would only repeat
# that work.
best_model = gs.best_estimator_

print('best_score : ', gs.best_score_)
print('best_params : \n', gs.best_params_)

importances = best_model.feature_importances_

# Class probabilities for the held-out validation rows and for the test set.
val_pred_bin5 = best_model.predict_proba(X_val_bin5)
predict_bin5 = best_model.predict_proba(test_bin5)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
best_score :  -0.7290458803174797
best_params : 
 {'criterion': 'gini', 'max_depth': 21, 'n_estimators': 800, 'random_state': 42}


## (version 2) income_total : bins=7

In [43]:
# Feature names in column order (the order `importances` below follows).
feat_labels = X_train_bin7.columns

forest = RandomForestClassifier()

# Hyper-parameter candidates: 2 depths x 2 forest sizes x 2 criteria = 8 combinations.
# `random_state` is pinned to a single value so every candidate is reproducible.
forest_param_grid = {"max_depth" : [21, 23],
                    "n_estimators" : [800, 900],
                    "criterion" : ["gini", "entropy"],
                    "random_state" : [42]}

# Rank candidates by negative log loss over the shared `k_fold` splitter.
# refit=True is GridSearchCV's default and is spelled out on purpose: after the
# search, the best candidate is retrained on all of X_train_bin7 / y_train_bin7.
gs = GridSearchCV(forest, forest_param_grid,
                  cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1,
                  refit=True)

gs.fit(X_train_bin7, y_train_bin7)

# Already trained on the full training split by the refit step, so it is used
# as-is; fitting it again on the same data with the same seed would only repeat
# that work.
best_model = gs.best_estimator_

print('best_score : ', gs.best_score_)
print('best_params : \n', gs.best_params_)

importances = best_model.feature_importances_

# Class probabilities for the held-out validation rows and for the test set.
val_pred_bin7 = best_model.predict_proba(X_val_bin7)
predict_bin7 = best_model.predict_proba(test_bin7)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
best_score :  -0.7288365618426148
best_params : 
 {'criterion': 'gini', 'max_depth': 21, 'n_estimators': 900, 'random_state': 42}


## (version 3) income_total : bins=10

In [44]:
# Feature names in column order (the order `importances` below follows).
feat_labels = X_train_bin10.columns

forest = RandomForestClassifier()

# Hyper-parameter candidates: 2 depths x 2 forest sizes x 2 criteria = 8 combinations.
# `random_state` is pinned to a single value so every candidate is reproducible.
forest_param_grid = {"max_depth" : [21, 23],
                    "n_estimators" : [800, 900],
                    "criterion" : ["gini", "entropy"],
                    "random_state" : [42]}

# Rank candidates by negative log loss over the shared `k_fold` splitter.
# refit=True is GridSearchCV's default and is spelled out on purpose: after the
# search, the best candidate is retrained on all of X_train_bin10 / y_train_bin10.
gs = GridSearchCV(forest, forest_param_grid,
                  cv=k_fold, scoring="neg_log_loss", verbose=1, n_jobs=-1,
                  refit=True)

gs.fit(X_train_bin10, y_train_bin10)

# Already trained on the full training split by the refit step, so it is used
# as-is; fitting it again on the same data with the same seed would only repeat
# that work.
best_model = gs.best_estimator_

print('best_score : ', gs.best_score_)
print('best_params : \n', gs.best_params_)

importances = best_model.feature_importances_

# Class probabilities for the held-out validation rows and for the test set.
val_pred_bin10 = best_model.predict_proba(X_val_bin10)
predict_bin10 = best_model.predict_proba(test_bin10)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
best_score :  -0.7274561985418982
best_params : 
 {'criterion': 'gini', 'max_depth': 21, 'n_estimators': 900, 'random_state': 42}


# Evaluation : logloss

In [45]:
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

# Validation log loss of the bins=5 model: one-hot encode the true labels, then
# compare them with the predicted class probabilities.
y_val_bin5_onehot = to_categorical(y_val_bin5)
logloss = log_loss(y_val_bin5_onehot, val_pred_bin5)
print(logloss)

0.7289340999568911


In [46]:
# Validation log loss of the bins=7 model: one-hot encode the true labels, then
# compare them with the predicted class probabilities.
y_val_bin7_onehot = to_categorical(y_val_bin7)
logloss = log_loss(y_val_bin7_onehot, val_pred_bin7)
print(logloss)

0.7265824045165541


In [47]:
# Validation log loss of the bins=10 model. The true labels must come from the
# same split as the predictions: val_pred_bin10 was produced from X_val_bin10,
# so it is scored against y_val_bin10 (not the bins=5 labels).
logloss = log_loss(to_categorical(y_val_bin10), val_pred_bin10)
print(logloss)

0.7248155437153727


# Submission

In [48]:
# Directory the submission CSVs below are written to (relative to the notebook).
submit_file_dir = "../result_file/performance_result/"

In [49]:
# Wrap the bins=5 test probabilities in a DataFrame whose columns are named
# after the submission's class columns.
class_columns = ['0', '1', '2']
predict_bin5 = pd.DataFrame(predict_bin5)
predict_bin5.columns = class_columns

# Copy each class-probability column into the submission template.
for class_column in class_columns:
    submission[class_column] = predict_bin5[class_column]

result_path = os.path.join(submit_file_dir, "result_bin5.csv")
submission.to_csv(result_path, index=False)

In [50]:
# Wrap the bins=7 test probabilities in a DataFrame whose columns are named
# after the submission's class columns.
class_columns = ['0', '1', '2']
predict_bin7 = pd.DataFrame(predict_bin7)
predict_bin7.columns = class_columns

# Copy each class-probability column into the submission template.
for class_column in class_columns:
    submission[class_column] = predict_bin7[class_column]

result_path = os.path.join(submit_file_dir, "result_bin7.csv")
submission.to_csv(result_path, index=False)

In [51]:
# Wrap the bins=10 test probabilities in a DataFrame whose columns are named
# after the submission's class columns. The source must be predict_bin10:
# building the frame from predict_bin5 would write the bins=5 predictions into
# result_bin10.csv.
predict_bin10 = pd.DataFrame(predict_bin10)
predict_bin10.columns = ['0', '1', '2']

# Copy each class-probability column into the submission template.
submission['0'] = predict_bin10['0']
submission['1'] = predict_bin10['1']
submission['2'] = predict_bin10['2']

submission.to_csv(os.path.join(submit_file_dir, "result_bin10.csv"), index=False)