In [2]:
import imblearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv("mice_scaled.csv")

In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [5]:
from sklearn.metrics import confusion_matrix
def plot_cm(classifier, predictions):
    cm = confusion_matrix(y_test, predictions)
    
    plt.clf()
    plt.imshow(cm, interpolation='nearest', cmap='RdBu')
    classNames = ['Bankrupted','Not Bankrupted']
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)
    s = [['TN','FP'], ['FN', 'TP']]
    
    for i in range(2):
        for j in range(2):
            plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]), 
                     horizontalalignment='center', color='White')
    
    plt.show()
        
    tn, fp, fn, tp = cm.ravel()

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    F1 = 2*recall*precision/(recall+precision)

    print('Recall={0:0.3f}'.format(recall),'\nPrecision={0:0.3f}'.format(precision))
    print('F1={0:0.3f}'.format(F1))

In [6]:
from sklearn.metrics import average_precision_score, precision_recall_curve
def plot_aucprc(classifier, scores):
    precision, recall, _ = precision_recall_curve(y_test, scores, pos_label=0)
    average_precision = average_precision_score(y_test, scores)

    print('Average precision-recall score: {0:0.3f}'.format(
          average_precision))

    plt.plot(recall, precision, label='area = %0.3f' % average_precision, color="green")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.legend(loc="best")
    plt.show()

In [7]:
data_X = data.iloc[:, :-1]
data_y = data["class"]
data_X["Attr70"] = data_X["Attr70"].astype("category")

In [8]:
xgb_X = data.iloc[:, :-1]
xgb_y = data["class"]

## Decision Tree

In [9]:
model = DecisionTreeClassifier()

In [10]:
re_stf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 2)
scores = cross_val_score(model, data_X, data_y, scoring = "f1_micro", cv = re_stf)

In [11]:
print("Mean CV F1 : %.3f"% np.mean(scores))

Mean CV F1 : 0.911


In [12]:
scores = cross_val_score(model, data_X, data_y, scoring = "roc_auc", cv = re_stf)

In [13]:
print("Mean CV ROC_AUC : %.3f"% np.mean(scores))

Mean CV ROC_AUC : 0.609


## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
forest = RandomForestClassifier(n_estimators=500, random_state=24)

In [18]:
scores = cross_val_score(forest, data_X, data_y, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores))
scores = cross_val_score(forest, data_X, data_y, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores))

Mean CV F1 : 0.952
Mean CV ROC_AUC : 0.808


## XGBoost

In [19]:
from xgboost import XGBClassifier

In [20]:
xgb_model = XGBClassifier(learning_rate = 0.05, n_estimators  = 300)

In [21]:
scores = cross_val_score(xgb_model, xgb_X, xgb_y, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores))
scores = cross_val_score(xgb_model, xgb_X, xgb_y, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores))

Mean CV F1 : 0.955
Mean CV ROC_AUC : 0.846


## SMOTE

In [23]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [22]:
def smote_random_class(data, model) :
    
    bank = data[data["class"] == 1]
    not_bank = data[data["class"] == 0]
    
    idx = list(bank.index)
    idx_len = round(len(idx)/5)
    
    first = list(np.random.choice(idx, idx_len, replace = False))
    idx2 = [x for x in idx if x not in first]
    second = list(np.random.choice(idx2, idx_len, replace = False))
    idx3 = [x for x in idx2 if x not in second]
    third = list(np.random.choice(idx3, idx_len, replace = False))
    idx4 = [x for x in idx3 if x not in third]
    fourth = list(np.random.choice(list(idx4), idx_len, replace =False))
    fifth = [x for x in idx4 if x not in fourth]
    
    not_idx = list(not_bank.index)
    not_idx_len = round(len(not_idx)/5)
    
    not_first = list(np.random.choice(not_idx, not_idx_len, replace = False))
    not_idx2 = [x for x in not_idx if x not in not_first]
    not_second = list(np.random.choice(not_idx2, not_idx_len, replace = False))
    not_idx3 = [x for x in not_idx2 if x not in not_second]
    not_third = list(np.random.choice(not_idx3, not_idx_len, replace = False))
    not_idx4 = [x for x in not_idx3 if x not in not_third]
    not_fourth = list(np.random.choice(list(not_idx4), not_idx_len, replace =False))
    not_fifth = [x for x in not_idx4 if x not in not_fourth]
    
    lst = [first, second, third, fourth, fifth]
    not_lst = [not_first, not_second, not_third, not_fourth, not_fifth]
    
    for i in range(len(lst)):
        lst[i].extend(not_lst[i])
        
    f1_scores = []
    roc_scores = []

    for i in range(5):
        print("------------ ",i+1, " ------------\n")
        test_X = xgb_X.iloc[lst[i]]
        test_y = xgb_y.iloc[lst[i]]
        train_X = xgb_X.drop(lst[i])
        train_y = xgb_y.drop(lst[i])
        X_resampled, y_resampled = SMOTE().fit_sample(train_X, train_y)
        
        model.fit(X_resampled, y_resampled)
        pred_y = model.predict(test_X)
        f = f1_score(test_y, pred_y, average = 'micro')
        r = roc_auc_score(test_y, pred_y)
        
        f1_scores.append(f)
        roc_scores.append(r)
        
        print("f1 - ", f, ", roc - ", r)
        
    print("F1 : ", np.mean(f1_scores))
    print("ROC AUC : ", np.mean(roc_scores))

## Decision Tree

In [24]:
smote_random_class(data, DecisionTreeClassifier())

------------  1  ------------

f1 -  0.8519061583577714 , roc -  0.574281905744754
------------  2  ------------

f1 -  0.8519061583577714 , roc -  0.6267414860681115
------------  3  ------------

f1 -  0.8482404692082113 , roc -  0.598576711386309
------------  4  ------------

f1 -  0.8475073313782991 , roc -  0.6178620571035431
------------  5  ------------

f1 -  0.8579795021961932 , roc -  0.6364846299158509
F1 :  0.8515079238996494
ROC AUC :  0.6107893580437136


In [25]:
smote_random_class(data, RandomForestClassifier(n_estimators=500))

------------  1  ------------

f1 -  0.9222873900293255 , roc -  0.5720889232886137
------------  2  ------------

f1 -  0.9208211143695014 , roc -  0.6237745098039217
------------  3  ------------

f1 -  0.9164222873900293 , roc -  0.6476823185414518
------------  4  ------------

f1 -  0.9215542521994134 , roc -  0.5979317165462676
------------  5  ------------

f1 -  0.9158125915080527 , roc -  0.634219903829641
F1 :  0.9193795270992645
ROC AUC :  0.6151394744019791


In [29]:
smote_random_class(data, XGBClassifier(n_estimators = 600, learning_rate = 0.2))

------------  1  ------------

f1 -  0.9127565982404692 , roc -  0.6260749914000688
------------  2  ------------

f1 -  0.9318181818181818 , roc -  0.6689241486068112
------------  3  ------------

f1 -  0.907624633431085 , roc -  0.6758255933952529
------------  4  ------------

f1 -  0.9178885630498533 , roc -  0.6681286549707602
------------  5  ------------

f1 -  0.9114202049780381 , roc -  0.6581336939721794
F1 :  0.9163016363035255
ROC AUC :  0.6594174164690145


## SMOTE + UnderSampling
## Decision Tree

In [27]:
def smote_under_random_class(data, model, k) :
    
    bank = data[data["class"] == 1]
    not_bank = data[data["class"] == 0]
    
    idx = list(bank.index)
    idx_len = round(len(idx)/5)
    
    first = list(np.random.choice(idx, idx_len, replace = False))
    idx2 = [x for x in idx if x not in first]
    second = list(np.random.choice(idx2, idx_len, replace = False))
    idx3 = [x for x in idx2 if x not in second]
    third = list(np.random.choice(idx3, idx_len, replace = False))
    idx4 = [x for x in idx3 if x not in third]
    fourth = list(np.random.choice(list(idx4), idx_len, replace =False))
    fifth = [x for x in idx4 if x not in fourth]
    
    not_idx = list(not_bank.index)
    not_idx_len = round(len(not_idx)/5)
    
    not_first = list(np.random.choice(not_idx, not_idx_len, replace = False))
    not_idx2 = [x for x in not_idx if x not in not_first]
    not_second = list(np.random.choice(not_idx2, not_idx_len, replace = False))
    not_idx3 = [x for x in not_idx2 if x not in not_second]
    not_third = list(np.random.choice(not_idx3, not_idx_len, replace = False))
    not_idx4 = [x for x in not_idx3 if x not in not_third]
    not_fourth = list(np.random.choice(list(not_idx4), not_idx_len, replace =False))
    not_fifth = [x for x in not_idx4 if x not in not_fourth]
    
    lst = [first, second, third, fourth, fifth]
    not_lst = [not_first, not_second, not_third, not_fourth, not_fifth]
    
    for i in range(len(lst)):
        lst[i].extend(not_lst[i])
        
    f1_scores = []
    roc_scores = []

    for i in range(5):
        print("------------ ",i+1, " ------------\n")
        test_X = xgb_X.iloc[lst[i]]
        test_y = xgb_y.iloc[lst[i]]
        train_X = xgb_X.drop(lst[i])
        train_y = xgb_y.drop(lst[i])
        X_resampled, y_resampled = SMOTE(sampling_strategy = 0.1, k_neighbors = k).fit_sample(train_X, train_y)
        X_sample, y_sample = RandomUnderSampler(sampling_strategy = 0.5).fit_sample(X_resampled, y_resampled)
        
        model.fit(X_sample, y_sample)
        pred_y = model.predict(test_X)
        f = f1_score(test_y, pred_y, average = 'micro')
        r = roc_auc_score(test_y, pred_y)
        
        f1_scores.append(f)
        roc_scores.append(r)
        
        print("f1 - ", f, ", roc - ", r)
        
    print("F1 : ", np.mean(f1_scores))
    print("ROC AUC : ", np.mean(roc_scores))

In [30]:
smote_under_random_class(data, DecisionTreeClassifier(), 1)

------------  1  ------------

f1 -  0.8013196480938416 , roc -  0.6393833849329206
------------  2  ------------

f1 -  0.7859237536656891 , roc -  0.6181415548675611
------------  3  ------------

f1 -  0.8225806451612904 , roc -  0.6833935328517371
------------  4  ------------

f1 -  0.7851906158357771 , roc -  0.6374269005847953
------------  5  ------------

f1 -  0.8038067349926794 , roc -  0.699703760947965
F1 :  0.7997642795498555
ROC AUC :  0.6556098268369958


In [31]:
smote_under_random_class(data, RandomForestClassifier(n_estimators = 200), 4)

------------  1  ------------

f1 -  0.9120234604105572 , roc -  0.6781475748194015
------------  2  ------------

f1 -  0.907624633431085 , roc -  0.6627106983144135
------------  3  ------------

f1 -  0.9039589442815249 , roc -  0.7001203990368078
------------  4  ------------

f1 -  0.9127565982404692 , roc -  0.7047643618851049
------------  5  ------------

f1 -  0.9136163982430454 , roc -  0.7642216211574789
F1 :  0.9099960069213363
ROC AUC :  0.7019929310426413


In [32]:
smote_under_random_class(data, XGBClassifier(n_estimators = 200, learning_rate = 0.1), k=3)

------------  1  ------------

f1 -  0.9032258064516129 , roc -  0.7849802201582388
------------  2  ------------

f1 -  0.8980938416422287 , roc -  0.7101393188854488
------------  3  ------------

f1 -  0.8848973607038123 , roc -  0.7294031647746818
------------  4  ------------

f1 -  0.8929618768328446 , roc -  0.7336601307189543
------------  5  ------------

f1 -  0.8755490483162518 , roc -  0.6851064743259487
F1 :  0.8909455867893501
ROC AUC :  0.7286578617726545
