In [1]:
import imblearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("mice_scaled.csv")

In [3]:
data.head()

Unnamed: 0,Attr2,Attr3,Attr4,Attr5,Attr6,Attr9,Attr10,Attr13,Attr19,Attr21,...,Attr59,Attr61,Attr64,Attr65,Attr66,Attr67,Attr68,Attr69,Attr70,class
0,-0.564475,-0.173956,-0.124649,0.01279,-0.014341,-0.451866,0.321246,-0.004382,-0.518534,-0.021175,...,-0.018746,-0.711244,-1.053473,-0.126576,0.402104,-0.058312,0.017091,-0.205098,0,0
1,-1.698654,1.841882,1.12921,0.0119,-0.221494,-0.386067,0.779132,0.060436,-0.181104,0.029315,...,-0.047671,2.085748,2.08358,-1.260417,0.206697,0.197628,-0.035273,16.246606,1,0
2,-0.014504,0.018601,-0.329322,0.012822,-0.009368,0.074404,0.103175,-0.03077,-1.019254,-0.140062,...,-0.035084,-1.07473,-0.268523,-0.367238,0.838218,-0.364028,0.004898,-0.035524,0,0
3,-0.995975,1.018673,0.878251,0.028528,0.037467,-0.376297,0.510253,0.006106,2.226964,-0.043613,...,-0.046055,0.449361,0.294264,0.090047,-1.226899,0.011336,-0.031391,-0.170398,0,0
4,-0.035359,-0.373743,-0.607667,-0.001269,-0.009368,0.46472,0.11326,0.002117,0.293229,-0.023797,...,-0.046561,-0.076763,-0.291498,-0.010448,-0.354641,0.064521,0.561108,-0.125715,0,0


In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [5]:
from sklearn.metrics import confusion_matrix
def plot_cm(classifier, predictions):
    cm = confusion_matrix(y_test, predictions)
    
    plt.clf()
    plt.imshow(cm, interpolation='nearest', cmap='RdBu')
    classNames = ['Bankrupted','Not Bankrupted']
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)
    s = [['TN','FP'], ['FN', 'TP']]
    
    for i in range(2):
        for j in range(2):
            plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]), 
                     horizontalalignment='center', color='White')
    
    plt.show()
        
    tn, fp, fn, tp = cm.ravel()

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    F1 = 2*recall*precision/(recall+precision)

    print('Recall={0:0.3f}'.format(recall),'\nPrecision={0:0.3f}'.format(precision))
    print('F1={0:0.3f}'.format(F1))

In [6]:
from sklearn.metrics import average_precision_score, precision_recall_curve
def plot_aucprc(classifier, scores):
    precision, recall, _ = precision_recall_curve(y_test, scores, pos_label=0)
    average_precision = average_precision_score(y_test, scores)

    print('Average precision-recall score: {0:0.3f}'.format(
          average_precision))

    plt.plot(recall, precision, label='area = %0.3f' % average_precision, color="green")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.legend(loc="best")
    plt.show()

# Classification
## Decision Tree

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [8]:
data_X = data.iloc[:, :-1]
data_y = data["class"]

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(data_X, 
                                                    data_y,
                                                   test_size = 0.3,
                                                   random_state = 24)

In [10]:
model = DecisionTreeClassifier()

In [11]:
re_stf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(model, X_train, Y_train, scoring = "f1_micro", cv = re_stf)

In [12]:
print("Mean CV F1 : %.3f"% np.mean(scores))

Mean CV F1 : 0.913


In [13]:
scores = cross_val_score(model, X_train, Y_train, scoring = "roc_auc", cv = re_stf)

In [14]:
print("Mean CV ROC_AUC : %.3f"% np.mean(scores))

Mean CV ROC_AUC : 0.607


In [15]:
from sklearn.metrics import roc_auc_score

In [16]:
model.fit(X_train,Y_train)
pred_y = model.predict(X_test)

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [18]:
print("F1 : %.3f" % f1_score(Y_test, pred_y, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y))

F1 : 0.910
ROC AUC : 0.585


## Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
forest = RandomForestClassifier(n_estimators=500, random_state=24)

In [21]:
scores = cross_val_score(forest, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores))
scores = cross_val_score(forest, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores))

KeyboardInterrupt: 

In [None]:
forest.fit(X_train, Y_train)
pred_y = forest.predict(X_test)
print("F1 : %.3f" % f1_score(Y_test, pred_y, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y))

## XGBOOST

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
n_estimators = [200, 300, 400, 500, 600]
learning_rate = [0.01, 0.05, 0.1, 0.15, 0.2]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

In [None]:
xgb_greedy_model = XGBClassifier()
grid_search = GridSearchCV(xgb_greedy_model, param_grid, scoring="roc_auc", cv=re_stf)
grid_result = grid_search.fit(X_train, Y_train)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot results
scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
for i, value in enumerate(learning_rate):
    plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('ROC Score')

## SMOTE

## Decision Tree

In [None]:
steps = [("over", SMOTE()), ("model", DecisionTreeClassifier())]
pipeline = Pipeline(steps = steps)

In [None]:
scores2 = cross_val_score(pipeline, X_train, Y_train, scoring = "f1_micro", cv = re_stf)

In [None]:
print("Mean CV F1 : %.3f"% np.mean(scores2))

In [None]:
scores2 = cross_val_score(pipeline, X_train, Y_train, scoring = "roc_auc", cv = re_stf)

In [None]:
print("Mean CV ROC_AUC : %.3f"% np.mean(scores2))

In [None]:
pipeline.fit(X_train, Y_train)
pred_y2 = pipeline.predict(X_test)

In [None]:
print("F1 : %.3f" % f1_score(Y_test, pred_y2, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y2))

## Random Forest

In [None]:
steps = [("over", SMOTE()), ("model", RandomForestClassifier(n_estimators=500, random_state=24))]
pipeline2 = Pipeline(steps = steps)

In [None]:
scores2 = cross_val_score(pipeline2, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores2))

In [None]:
scores2 = cross_val_score(pipeline2, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores2))

In [None]:
pipeline2.fit(X_train, Y_train)
pred_y2 = pipeline2.predict(X_test)

In [None]:
print("F1 : %.3f" % f1_score(Y_test, pred_y2, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y2))

## XGBOOST

In [None]:
steps = [("over", SMOTE()), ("model", XGBclassi)]
pipeline2 = Pipeline(steps = steps)

In [None]:
def grid_smote_only_search(model) :
    
    steps = [("over", SMOTE()), ("model", model)]
    pipeline = Pipeline(steps = steps)

    f1_scores = cross_val_score(pipeline, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
    print("Mean F1 : %.3f" % (np.mean(f1_scores)))
    roc_scores = cross_val_score(pipeline, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
    print("Mean ROC AUC : %.3f" % (np.mean(roc_scores)))

    pipeline.fit(X_train, Y_train)
    pred_y = pipeline.predict(X_test)
    print("\nF1 : %.3f" % f1_score(Y_test, pred_y, average = 'micro'))
    print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y))

In [None]:
for n in n_estimators:
    for rate in learning_rate:
        print("\n N_estimators : ", n, " & Learning_rate : ", rate)
        grid_smote_only_search(XGBClassifier(n_estimators = n, learning_rate = rate))

In [None]:
#얘들은 나중에 돌려보기
#pred_y = xgb_model.predict(X_test)
#scores = xgb_model.predict_proba(X_test)[:,1]

#plot_cm(xgb_model, pred_y)
#plot_aucprc(xgb_model, scores)

# SMOTE combined with Undersampling
## Decision Tree

In [None]:
over2 = SMOTE(sampling_strategy = 0.1)
under2 = RandomUnderSampler(sampling_strategy = 0.5)
steps2 = [('over', over2), ('under', under2), ('model', DecisionTreeClassifier())]
pipeline2 = Pipeline(steps = steps2)

In [None]:
f1_scores2 = cross_val_score(pipeline2, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(f1_scores2))

In [None]:
roc_scores2 = cross_val_score(pipeline2, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(roc_scores2))

In [None]:
pipeline2.fit(X_train, Y_train)
pred_y2 = pipeline2.predict(X_test)

In [None]:
print("F1 : %.3f" % f1_score(Y_test, pred_y2, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y2))

## Random Forest

In [None]:
steps3 = [('over', over2), ('under', under2), ('model', RandomForestClassifier(n_estimators=500, random_state=24))]
pipeline3 = Pipeline(steps = steps3)

In [None]:
f1_scores3 = cross_val_score(pipeline3, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(f1_scores3))

In [None]:
roc_scores3 = cross_val_score(pipeline3, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(roc_scores3))

In [None]:
pipeline3.fit(X_train, Y_train)
pred_y3 = pipeline3.predict(X_test)

In [None]:
print("F1 : %.3f" % f1_score(Y_test, pred_y3, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y3))

## GRID SEARCH
## Decision Tree

In [None]:
def grid_smote_search(lst, model) :
    
    for k in lst:
        over4 = SMOTE(sampling_strategy = 0.1, k_neighbors = k)
        under4 = RandomUnderSampler(sampling_strategy = 0.5)
        steps4 = [("over", over4), ("under", under4), ("model", model)]
        pipeline4 = Pipeline(steps = steps4)

        print("\n------- k = ", k,"-------")
        f1_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
        print("Mean F1 : %.3f" % (np.mean(f1_scores4)))
        roc_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
        print("Mean ROC AUC : %.3f" % (np.mean(roc_scores4)))

        pipeline4.fit(X_train, Y_train)
        pred_y4 = pipeline4.predict(X_test)
        print("\nF1 : %.3f" % f1_score(Y_test, pred_y4, average = 'micro'))
        print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y4))

In [None]:
k_values = [1,2,3,4,5,6,7]

In [None]:
grid_smote_search(k_values, DecisionTreeClassifier())

## Random Forest

In [None]:
grid_smote_search(k_values, RandomForestClassifier(n_estimators=400))

In [None]:
grid_smote_search(k_values, RandomForestClassifier(n_estimators=500))

## XGBOOST

## 1) SMOTE만

In [None]:
n_estimators = [200, 300, 400, 500, 600]
learning_rate = [0.01, 0.05, 0.1, 0.15, 0.2]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

In [None]:
X_resampled, y_resampled = SMOTE().fit_sample(X_train,list(Y_train))

In [None]:
xgb_model = XGBClassifier()

In [None]:
grid_search2 = GridSearchCV(xgb_model, param_grid, scoring="roc_auc", cv=re_stf)
grid_result2 = grid_search2.fit(X_train, Y_train)

In [None]:
print("Best: %f using %s" % (grid_result2.best_score_, grid_result2.best_params_))
means = grid_result2.cv_results_['mean_test_score']
stds = grid_result2.cv_results_['std_test_score']
params = grid_result2.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot results
scores = np.array(means).reshape(len(learning_rate), len(n_estimators))
for i, value in enumerate(learning_rate):
    plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('ROC Score')

In [None]:
best_xgb_model = XGBClassifier(grid_result2.best_params_)

In [None]:
best_xgb_model.fit(X_resampled, y_resampled)
pred_y3 = best_xgb_model.predict(X_test)

In [None]:
print("F1 : %.3f" % f1_score(Y_test, pred_y3, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y3, average = 'macro'))

## SMOTE + Under

In [None]:
for n in n_estimators:
    for rate in learning_rate:
        print("\n N_estimators : ", n, " & Learning_rate : ", rate)
        grid_smote_search(k_values, XGBClassifier(n_estimators = n, learning_rate = rate))