# Set Environment

In [1]:
# # Install the required packages
# !pip install pytorch_tabnet     # install the TabNet model
# !pip install catboost           # install the CatBoost model
# !pip install ctgan==0.3.1.dev0  # install the GAN (CTGAN)

In [2]:
# google drive mount
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SKKU_study/01_Final_assignment/source/make_module

In [3]:
# import packages (standard library)
import os
import random
import warnings
from collections import Counter

# import packages (third-party)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ctgan import CTGANSynthesizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# import Custom packages (project-local modules)
from model_tabnet import ModelTabnet
from model_forest import ModelForest
from model_catboost import ModelCatBoost
from ensemble_hard import Hard_voting
from ensemble_soft import EnsembleSoft
import Metrics
import sampling

warnings.filterwarnings('ignore')

In [4]:
# Project directories, resolved relative to the notebook's working directory
main_dir = '../'
data_dir = os.path.join(main_dir, 'data')
processed_data_dir = os.path.join(data_dir, 'processed')

# Create both folders up front; exist_ok keeps re-runs of this cell harmless
for required_dir in (data_dir, processed_data_dir):
    os.makedirs(required_dir, exist_ok=True)

# Load Dataset (Vanilla)

In [5]:
# Read the full processed dataset together with its pre-made train/test splits
dfs, train, test = (
    pd.read_csv(os.path.join(processed_data_dir, csv_name))
    for csv_name in ('processed.csv', 'train.csv', 'test.csv')
)

# Feature names: every column of the processed dataset except the target
all_col_list = list(dfs.columns[dfs.columns != 'Credit_Score'])
train.shape, test.shape  # original shape check

((75000, 23), (25000, 23))

# Sampling
- Vanilla: {0: 21782, 1: 40214, 2: 13004}
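- SMOTENC: {0: 40214, 1: 40214, 2: 40214} (sampled with the `'auto'` strategy)
- SMOTE + Tomek links: {0: 38217, 1: 37268, 2: 39177} (sampled with the `'auto'` strategy)
- GAN: {0: 31393, 1: 40214, 2: 29886} (the class 0 count depends on the random draw of customer IDs; e.g. 31427 in the run recorded below)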

## SMOTENC

In [6]:
# Resample the training split with SMOTENC using the 'auto' strategy.
# Class counts before resampling, for reference: {0: 21782, 1: 40214, 2: 13004}
train_smtnc = sampling.train_smote_nc(train, 'auto')
smtnc_class_counts = Counter(train_smtnc['Credit_Score'])
print('sampled with SMOTENC dataset shape %s' % smtnc_class_counts)

sampled with SMOTENC dataset shape Counter({2: 40214, 1: 40214, 0: 40214})


## SMOTE + Tomeklink

In [7]:
# Resample the training split with SMOTE + Tomek links using the 'auto' strategy.
# Class counts before resampling, for reference: {0: 21782, 1: 40214, 2: 13004}
train_smttm = sampling.train_smote_tomek(train, 'auto')
smttm_class_counts = Counter(train_smttm['Credit_Score'])
print('sampled with SMOTETOMEK dataset shape %s' % smttm_class_counts)

sampled with SMOTETOMEK dataset shape Counter({2: 39177, 0: 38217, 1: 37268})


## GAN

In [8]:
# Training the GAN takes too long to repeat here, so the previously generated
# over-sampled file is loaded directly instead of calling the sampler:
# train_gan = sampling.train_gan(train)
gan_generated = pd.read_csv(
    os.path.join(data_dir, 'output_GAN_data', 'generate_gan_v4.csv')
).drop(columns='Unnamed: 0')

# Class 0 is only partially oversampled (about 9000 rows): keep the generated
# class 0 rows whose Customer_ID is among 5000 IDs drawn, with replacement,
# from 0..12499. A dedicated seeded generator makes the draw (and therefore the
# resulting class counts) reproducible across runs without touching the global
# `random` state.
GAN_SAMPLING_SEED = 42
gan_rng = random.Random(GAN_SAMPLING_SEED)
random_list = [gan_rng.randint(0, 12499) for _ in range(5000)]

train_gan_0 = gan_generated[
    (gan_generated.Credit_Score == 0)
    & (gan_generated.Customer_ID.isin(random_list))
]

# Every generated class 2 row is kept
train_gan_2 = gan_generated[gan_generated.Credit_Score == 2]

# Final GAN training set: selected synthetic rows followed by the original train split
train_gan = pd.concat([train_gan_0, train_gan_2, train], ignore_index=True)
print('sampled with GAN dataset shape %s' % Counter(train_gan['Credit_Score']))

sampled with GAN dataset shape Counter({1: 40214, 0: 31427, 2: 29886})


# Modeling
- 각 모델 단위로 4가지 Sampling (Vanilla, SMOTENC, SMOTE+Tomeklink, GAN) 방식으로 Sampling된 데이터를 Train으로 사용하여 훈련
- 훈련된 모델 중 f1 score (macro)를 기준으로 최적의 Matching (Model X Sampling Method)를 선택
- 총 3개 (best_tabnet, best_rf, best_cat) model 결과를 ensemble 단계로 전달

## TABNET

### Vanilla

In [9]:
# Baseline TabNet: uses the original training split without any resampled data.
# NOTE(review): constructing ModelTabnet appears to start pretraining/training
# right away (see the epoch log recorded below), so this cell is slow on CPU.
model_tabnet = ModelTabnet(dfs_train=train, dfs_test=test)
tabnet_proba = model_tabnet.predicted_proba

Customer_ID :  12500
Month :  8
Credit_Mix :  3
Credit_History_Age :  2
Occupation :  15
Payment_of_Min_Amount :  3
Payment_Behaviour :  6
!!!Pretrain Start!!!
Device used : cpu
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 5.59501 |  0:00:19s
epoch 1  | loss: 2.31056 |  0:00:37s
epoch 2  | loss: 1.64646 |  0:00:54s
epoch 3  | loss: 1.31633 |  0:01:10s
epoch 4  | loss: 1.16968 |  0:01:26s
epoch 5  | loss: 1.06616 |  0:01:45s
epoch 6  | loss: 1.03727 |  0:02:07s
epoch 7  | loss: 1.02319 |  0:02:24s
epoch 8  | loss: 1.0124  |  0:02:41s
epoch 9  | loss: 0.99334 |  0:02:59s
epoch 10 | loss: 0.9759  |  0:03:16s
epoch 11 | loss: 0.96596 |  0:03:34s
epoch 12 | loss: 0.95693 |  0:03:49s
epoch 13 | loss: 0.95113 |  0:04:05s
epoch 14 | loss: 0.94717 |  0:04:17s
epoch 15 | loss: 0.94465 |  0:04:29s
epoch 16 | loss: 0.94413 |  0:04:46s
epoch 17 | loss: 0.94022 |  0:04:59s
epoch 18 | loss: 0.93614 |  0:05:10s
epoch 19 | loss: 0.93025 |  0:05:20s
epoch 20 

KeyboardInterrupt: 

In [None]:
# Feature importance of TabNet, rounded to 3 decimals and printed per feature.
# NOTE(review): assumes the importances are ordered like all_col_list — confirm.
print('*** Feature Importance of TABNET ***')
tabnet_importances = model_tabnet.model_tabnet.feature_importances_.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', tabnet_importances[feature_idx])

In [None]:
# Preview the first rows of the class probabilities predicted by the baseline TabNet
tabnet_proba.head()

### SMOTENC

In [None]:
# TabNet x SMOTENC: the SMOTENC-resampled frame is passed as `dfs_sampled`
# alongside the original train/test splits.
model_tabnet_smtnc = ModelTabnet(dfs_train=train, dfs_test=test, dfs_sampled=train_smtnc)
tabnet_proba_smtnc = model_tabnet_smtnc.predicted_proba

In [None]:
# Feature importance of TabNet x SMOTENC, rounded to 3 decimals
print('*** Feature Importance of TABNET x SMOTENC ***')
tabnet_smtnc_importances = model_tabnet_smtnc.model_tabnet.feature_importances_.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', tabnet_smtnc_importances[feature_idx])

In [None]:
# Preview the first rows of the TabNet x SMOTENC predicted probabilities
tabnet_proba_smtnc.head()

### SMOTE + Tomeklink

In [None]:
# TabNet x SMOTE+Tomek: the SMOTE+Tomek-resampled frame is passed as
# `dfs_sampled` alongside the original train/test splits.
model_tabnet_smttm = ModelTabnet(dfs_train=train, dfs_test=test, dfs_sampled=train_smttm)
tabnet_proba_smttm = model_tabnet_smttm.predicted_proba

In [None]:
# Feature importance of TabNet x SMOTETOMEK, rounded to 3 decimals
print('*** Feature Importance of TABNET X SMOTETOMEK ***')
tabnet_smttm_importances = model_tabnet_smttm.model_tabnet.feature_importances_.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', tabnet_smttm_importances[feature_idx])

In [None]:
# Preview the first rows of the TabNet x SMOTE+Tomek predicted probabilities
tabnet_proba_smttm.head()

### GAN

In [None]:
# TabNet x GAN: the GAN-augmented frame is passed as `dfs_sampled` alongside
# the original train/test splits.
model_tabnet_gan = ModelTabnet(dfs_train=train, dfs_test=test, dfs_sampled=train_gan)
tabnet_proba_gan = model_tabnet_gan.predicted_proba

In [None]:
# Feature importance of TabNet x GAN, rounded to 3 decimals
print('*** Feature Importance of TABNET X GAN ***')
tabnet_gan_importances = model_tabnet_gan.model_tabnet.feature_importances_.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', tabnet_gan_importances[feature_idx])

In [None]:
# Preview the first rows of the TabNet x GAN predicted probabilities
tabnet_proba_gan.head()

### Select Best Model with TABNET

In [None]:
# Rank the four TabNet variants by macro-averaged F1 on the test split and keep
# the winner as `best_tabnet` (plain accuracy was the earlier criterion).
# NOTE: the `acc_*`, `accuracy_dict` and `best_acc` names are historical; every
# value they hold is a macro F1 score, not an accuracy.
# NOTE(review): the variant is chosen on the same test split that is later used
# for the reported metrics — confirm this is intended.
tabnet_variants = {
    'Vanilla': model_tabnet,
    'SMOTENC': model_tabnet_smtnc,
    'SMOTETOMEK': model_tabnet_smttm,
    'GAN': model_tabnet_gan,
}
accuracy_dict = {
    label: f1_score(test['Credit_Score'], variant.predict['predict'], average='macro')
    for label, variant in tabnet_variants.items()
}
acc_vanilla, acc_smtnc, acc_smttm, acc_gan = accuracy_dict.values()

# max() over the keys returns the first label with the top score, so ties are
# resolved in dictionary order
best_model = max(accuracy_dict, key=accuracy_dict.get)
best_acc = accuracy_dict[best_model]
best_model_idx = list(accuracy_dict).index(best_model)
print('Best TABNET f1 score is ', best_acc, 'with ', best_model)

best_tabnet = tabnet_variants[best_model]

## RF

### Vanilla

In [None]:
# Baseline random forest built from the original training split.
# NOTE(review): the third positional argument ("plain") is an opaque mode flag
# of ModelForest — confirm its meaning in model_forest.py.
model_forest = ModelForest(train, test, "plain")
result_forest = model_forest.predict_proba

In [None]:
# Feature importance of the baseline random forest, rounded to 3 decimals.
# NOTE(review): assumes the importances are ordered like all_col_list — confirm.
print('*** Feature Importance of RandomForest ***')
forest_importances = model_forest.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', forest_importances[feature_idx])

In [None]:
# Preview the first rows of the baseline random forest predicted probabilities
result_forest.head()

### SMOTENC

In [None]:
# Random forest x SMOTENC: the resampled frame is passed in place of the
# training split (first positional argument).
model_forest_smtnc = ModelForest(train_smtnc, test, "plain")
forest_proba_smtnc = model_forest_smtnc.predict_proba

In [None]:
# Feature importance of RandomForest x SMOTENC, rounded to 3 decimals.
# The printed header previously read 'SMOTETNC', a typo for the SMOTENC sampler.
print('*** Feature Importance of RandomForest x SMOTENC ***')
forest_smtnc_importances = model_forest_smtnc.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', forest_smtnc_importances[feature_idx])

In [None]:
# Preview the first rows of the random forest x SMOTENC predicted probabilities
forest_proba_smtnc.head()

### SMOTE + Tomeklink

In [None]:
# Random forest x SMOTE+Tomek: the resampled frame is passed in place of the
# training split (first positional argument).
model_forest_smttm = ModelForest(train_smttm, test, "plain")
forest_proba_smttm = model_forest_smttm.predict_proba

In [None]:
# Feature importance of RandomForest x SMOTETOMEK, rounded to 3 decimals
print('*** Feature Importance of RandomForest x SMOTETOMEK ***')
forest_smttm_importances = model_forest_smttm.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', forest_smttm_importances[feature_idx])

In [None]:
# Preview the first rows of the random forest x SMOTE+Tomek predicted probabilities
forest_proba_smttm.head()

### GAN

In [None]:
# Random forest x GAN: the GAN-augmented frame is passed in place of the
# training split (first positional argument).
model_forest_gan = ModelForest(train_gan, test, "plain")
forest_proba_gan = model_forest_gan.predict_proba

In [None]:
# Feature importance of RandomForest x GAN, rounded to 3 decimals
print('*** Feature Importance of RandomForest x GAN ***')
forest_gan_importances = model_forest_gan.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', forest_gan_importances[feature_idx])

In [None]:
# Preview the first rows of the random forest x GAN predicted probabilities
forest_proba_gan.head()

### Select Best Model with RF

In [None]:
# Rank the four random forest variants by macro-averaged F1 on the test split
# and keep the winner as `best_rf` (plain accuracy was the earlier criterion).
# NOTE: the `acc_*`, `accuracy_dict` and `best_acc` names are historical; every
# value they hold is a macro F1 score, not an accuracy.
# NOTE(review): the variant is chosen on the same test split that is later used
# for the reported metrics — confirm this is intended.
forest_variants = {
    'Vanilla': model_forest,
    'SMOTENC': model_forest_smtnc,
    'SMOTETOMEK': model_forest_smttm,
    'GAN': model_forest_gan,
}
accuracy_dict = {
    label: f1_score(test['Credit_Score'], variant.predict['predict'], average='macro')
    for label, variant in forest_variants.items()
}
acc_vanilla, acc_smtnc, acc_smttm, acc_gan = accuracy_dict.values()

# max() over the keys returns the first label with the top score, so ties are
# resolved in dictionary order
best_model = max(accuracy_dict, key=accuracy_dict.get)
best_acc = accuracy_dict[best_model]
best_model_idx = list(accuracy_dict).index(best_model)
print('Best RANDOM FOREST f1 score is ', best_acc, 'with ', best_model)

best_rf = forest_variants[best_model]

## CatBoost

### Vanilla

In [None]:
# Baseline CatBoost built from the original training split.
# NOTE(review): the third positional argument ("optimize") is an opaque mode
# flag of ModelCatBoost — confirm its meaning in model_catboost.py.
model_catboost = ModelCatBoost(train, test, "optimize")

In [None]:
# Feature importance of the baseline CatBoost, rounded to 3 decimals.
# NOTE(review): assumes the importances are ordered like all_col_list — confirm.
print('*** Feature Importance of CatBoost ***')
catboost_importances = model_catboost.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', catboost_importances[feature_idx])

In [None]:
# Keep the baseline CatBoost predicted probabilities for the later comparisons
# and preview their first rows
result_catboost = model_catboost.predict_proba
result_catboost.head()

### SMOTENC

In [None]:
# CatBoost x SMOTENC: the resampled frame is passed in place of the training
# split (first positional argument).
model_cat_smtnc = ModelCatBoost(train_smtnc, test, 'optimize')
cat_proba_smtnc = model_cat_smtnc.predict_proba

In [None]:
# Feature importance of CatBoost x SMOTENC, rounded to 3 decimals.
# The printed header previously read 'SMOTETNC', a typo for the SMOTENC sampler.
print('*** Feature Importance of CATBOOST x SMOTENC ***')
cat_smtnc_importances = model_cat_smtnc.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', cat_smtnc_importances[feature_idx])

In [None]:
# Preview the first rows of the CatBoost x SMOTENC predicted probabilities
cat_proba_smtnc.head()

### SMOTE + Tomeklink

In [None]:
# CatBoost x SMOTE+Tomek: the resampled frame is passed in place of the
# training split (first positional argument).
model_cat_smttm = ModelCatBoost(train_smttm, test, 'optimize')
cat_proba_smttm = model_cat_smttm.predict_proba

In [None]:
# Feature importance of CatBoost x SMOTETOMEK, rounded to 3 decimals
print('*** Feature Importance of CATBOOST x SMOTETOMEK ***')
cat_smttm_importances = model_cat_smttm.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', cat_smttm_importances[feature_idx])

In [None]:
# Preview the first rows of the CatBoost x SMOTE+Tomek predicted probabilities
cat_proba_smttm.head()

### GAN

In [None]:
# CatBoost x GAN: the GAN-augmented frame is passed in place of the training
# split (first positional argument).
model_cat_gan = ModelCatBoost(train_gan, test, 'optimize')
cat_proba_gan = model_cat_gan.predict_proba

In [None]:
# Feature importance of CatBoost x GAN, rounded to 3 decimals
print('*** Feature Importance of CATBOOST x GAN ***')
cat_gan_importances = model_cat_gan.feature_importance.round(3)
for feature_idx, feature_name in enumerate(all_col_list):
    print(feature_name, ': ', cat_gan_importances[feature_idx])

In [None]:
# Preview the first rows of the CatBoost x GAN predicted probabilities
cat_proba_gan.head()

### Select Best Model with CatBoost

In [None]:
# Rank the four CatBoost variants by macro-averaged F1 on the test split and
# keep the winner as `best_cat` (plain accuracy was the earlier criterion).
# NOTE: the `acc_*`, `accuracy_dict` and `best_acc` names are historical; every
# value they hold is a macro F1 score, not an accuracy.
# NOTE(review): the variant is chosen on the same test split that is later used
# for the reported metrics — confirm this is intended.
catboost_variants = {
    'Vanilla': model_catboost,
    'SMOTENC': model_cat_smtnc,
    'SMOTETOMEK': model_cat_smttm,
    'GAN': model_cat_gan,
}
accuracy_dict = {
    label: f1_score(test['Credit_Score'], variant.predict['predict'], average='macro')
    for label, variant in catboost_variants.items()
}
acc_vanilla, acc_smtnc, acc_smttm, acc_gan = accuracy_dict.values()

# max() over the keys returns the first label with the top score, so ties are
# resolved in dictionary order
best_model = max(accuracy_dict, key=accuracy_dict.get)
best_acc = accuracy_dict[best_model]
best_model_idx = list(accuracy_dict).index(best_model)
print('Best CATBOOST f1 score is ', best_acc, 'with ', best_model)

best_cat = catboost_variants[best_model]

# Ensemble
- Ensemble with Best 3 Models From TABNET, RF, CatBoost individually

## Hard Voting

In [None]:
# Combine the probability outputs of the three selected models for hard voting
# (original note: "proba concat (25000 * 3)").
# NOTE(review): pd.concat defaults to axis=0, so the three frames are appended
# row-wise — confirm this is the layout Hard_voting expects.
best_model_probas = [
    best_tabnet.predicted_proba,
    best_rf.predict_proba,
    best_cat.predict_proba,
]
merge_proba = pd.concat(best_model_probas)
merge_proba

In [None]:
# Run hard voting over the merged probabilities and preview the voted result
hard_voter = Hard_voting(merge_proba)
result_hv = hard_voter.predict
result_hv.head()

## Soft Voting
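soft_ratio: the weight applied to each model's class probability (one row per class; columns in the order TabNet, CatBoost, RF):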

```
[[tabnet 0 class probability, catboost 0 class probability, rf 0 class probability],
[tabnet 1 class probability, catboost 1 class probability, rf 1 class probability],
[tabnet 2 class probability, catboost 2 class probability, rf 2 class probability]]
```




In [None]:
# When soft_ratio is not given, the default weighting is 1:1:1
# ensemble_soft = EnsembleSoft(best_tabnet.predicted_proba, best_cat.predict_proba, best_rf.predict_proba)
# print('accuracy: ', accuracy_score(test['Credit_Score'], ensemble_soft.predict['predict']))

In [None]:
# Soft voting with an explicit soft_ratio. The ratio giving the best
# performance is still being tested; it will likely need thorough tuning once
# the best sampling method per model has been settled.
half_of_remainder = 0.65 / 2
soft_ratio = [
    [0.35, half_of_remainder, half_of_remainder],
    [0.3, 0.35, 0.35],
    [0.35, half_of_remainder, half_of_remainder],
]
ensemble_soft = EnsembleSoft(
    best_tabnet.predicted_proba,
    best_cat.predict_proba,
    best_rf.predict_proba,
    soft_ratio,
)
ensemble_soft.predict_proba

In [None]:
# Accuracy of the soft-voting ensemble on the test split
soft_voting_accuracy = accuracy_score(test['Credit_Score'], ensemble_soft.predict['predict'])
print('accuracy: ', soft_voting_accuracy)

# Performance
- target: 5 models
    - 3 simple models with Best matching sampled dataset
    - ensemble with hard voting model
    - ensemble with soft voting model
- 1st section: barchart
    - f1 score, accuracy, auroc(ovr), auroc(ovo), auprc(ap)
- 2nd section: roc_curve, pr_curve (w/ 수치)
- 3rd section: confusion matrix

## Metrics for Unbalanced Dataset

In [None]:
# Vanilla-trained models + hard/soft voting.
# result_hv appears only in pred_list, so there are four probability frames
# but five prediction frames.
proba_list = [
    result_catboost,
    result_forest,
    tabnet_proba,
    ensemble_soft.predict_proba,
]
pred_list = [
    model_catboost.predict,
    model_forest.predict,
    model_tabnet.predict,
    result_hv,
    ensemble_soft.predict,
]

Metrics.Compare_Model(proba_list, pred_list, test)

In [None]:
# Best model per family + hard/soft voting.
# result_hv appears only in pred_list, so there are four probability frames
# but five prediction frames.
proba_list = [
    best_cat.predict_proba,
    best_rf.predict_proba,
    best_tabnet.predicted_proba,
    ensemble_soft.predict_proba,
]
pred_list = [
    best_cat.predict,
    best_rf.predict,
    best_tabnet.predict,
    result_hv,
    ensemble_soft.predict,
]

Metrics.Compare_Model(proba_list, pred_list, test)

## Graph for Unbalanced Dataset


In [None]:
# PR curve and ROC curve for the vanilla-trained models plus soft voting
proba_list = [
    result_catboost,
    result_forest,
    tabnet_proba,
    ensemble_soft.predict_proba,
]
Metrics.PR_Curve(proba_list, test)
Metrics.ROC_Curve(proba_list, test)

In [None]:
# PR curve and ROC curve for the best model per family plus soft voting
proba_list = [
    best_cat.predict_proba,
    best_rf.predict_proba,
    best_tabnet.predicted_proba,
    ensemble_soft.predict_proba,
]
Metrics.PR_Curve(proba_list, test)
Metrics.ROC_Curve(proba_list, test)

## Confusion Matrix


In [None]:
# Confusion matrices: vanilla-trained models plus both ensembles.
# (confusion_matrix, plt and sns are imported in the notebook's import cell.)
predict_dict = {
    "CATBOOST": model_catboost.predict,
    "RF": model_forest.predict,
    "TABNET": model_tabnet.predict,
    "Hard Voting": result_hv,
    "Soft Voting": ensemble_soft.predict
}

# One panel per entry, so adding or removing a model needs no other change
fig, axes = plt.subplots(1, len(predict_dict), figsize=(30, 5))

for ax, (model_name, prediction) in zip(axes, predict_dict.items()):
    # NOTE(review): column 1 is presumably the predicted label ('predict') — confirm
    cf_matrix = confusion_matrix(test['Credit_Score'], prediction.iloc[:, 1])
    # fmt='d' prints the raw integer counts; the default '.2g' would show
    # large cells in scientific notation (e.g. 1.2e+04)
    sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(model_name)
    # confusion_matrix puts true labels on rows and predicted labels on columns
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

plt.show()

In [None]:
# Confusion matrices: best model per family plus both ensembles.
# (confusion_matrix, plt and sns are imported in the notebook's import cell.)
predict_dict = {
    "CATBOOST": best_cat.predict,
    "RF": best_rf.predict,
    "TABNET": best_tabnet.predict,
    "Hard Voting": result_hv,
    "Soft Voting": ensemble_soft.predict
}

# One panel per entry, so adding or removing a model needs no other change
fig, axes = plt.subplots(1, len(predict_dict), figsize=(30, 5))

for ax, (model_name, prediction) in zip(axes, predict_dict.items()):
    # NOTE(review): column 1 is presumably the predicted label ('predict') — confirm
    cf_matrix = confusion_matrix(test['Credit_Score'], prediction.iloc[:, 1])
    # fmt='d' prints the raw integer counts; the default '.2g' would show
    # large cells in scientific notation (e.g. 1.2e+04)
    sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(model_name)
    # confusion_matrix puts true labels on rows and predicted labels on columns
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

plt.show()