In [1]:
import matplotlib.pyplot as plt
import imblearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

# AdaBoost

In [2]:
from sklearn.ensemble import AdaBoostClassifier

In [3]:
mice = pd.read_csv("final_mice.csv")
median = pd.read_csv("final_median.csv")
knn = pd.read_csv("final_knn.csv")

## 결론: mice, median, knn 3개의 데이터 모두 auc가 0.82보다 낮게 나옴

## Mice 

In [7]:
mice

Unnamed: 0,Attr2,Attr3,Attr4,Attr5,Attr6,Attr9,Attr10,Attr13,Attr19,Attr21,...,Attr59,Attr61,Attr64,Attr65,Attr66,Attr67,Attr68,Attr69,Attr70,class
0,-0.564475,-0.173956,-0.124649,0.012790,-0.014341,-0.451866,0.321246,-0.004382,-0.518534,-0.021175,...,-0.018746,-0.711244,-1.053473,-0.126576,0.402104,-0.058312,0.017091,-0.205098,0,0
1,-1.698654,1.841882,1.129210,0.011900,-0.221494,-0.386067,0.779132,0.060436,-0.181104,0.029315,...,-0.047671,2.085748,2.083580,-1.260417,0.206697,0.197628,-0.035273,16.246606,1,0
2,-0.014504,0.018601,-0.329322,0.012822,-0.009368,0.074404,0.103175,-0.030770,-1.019254,-0.140062,...,-0.035084,-1.074730,-0.268523,-0.367238,0.838218,-0.364028,0.004898,-0.035524,0,0
3,-0.995975,1.018673,0.878251,0.028528,0.037467,-0.376297,0.510253,0.006106,2.226964,-0.043613,...,-0.046055,0.449361,0.294264,0.090047,-1.226899,0.011336,-0.031391,-0.170398,0,0
4,-0.035359,-0.373743,-0.607667,-0.001269,-0.009368,0.464720,0.113260,0.002117,0.293229,-0.023797,...,-0.046561,-0.076763,-0.291498,-0.010448,-0.354641,0.064521,0.561108,-0.125715,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6817,-0.302640,0.833387,0.173968,0.007133,-0.009368,0.731705,0.238287,-0.006336,-0.335395,-0.069110,...,-0.046626,-0.274731,1.307586,-0.093395,0.275162,-0.036523,-0.018076,-0.083613,0,0
6818,0.943360,-0.275863,-0.574800,0.007010,-0.009368,0.665545,-0.416693,-0.002679,-0.336932,0.032059,...,0.035472,-0.637787,0.385932,-0.097624,0.278523,-0.004389,0.162646,-0.122447,0,0
6819,1.252417,0.372057,-0.264240,0.015432,-0.139613,0.703610,-0.610580,-0.013476,-1.135287,-0.023817,...,1.906143,-1.098679,1.604219,-0.149836,1.200535,-3.260097,-0.004700,-0.083613,0,0
6820,-0.036358,0.041477,-0.266804,0.004884,0.010881,-0.387448,0.090059,0.005660,0.411020,0.058544,...,-0.021460,-0.480416,-0.415008,-0.037036,-0.142735,0.035682,-0.009190,-0.200115,0,1


In [42]:
data_X = mice.iloc[:, :-1]
data_y = mice["class"]
X_train, X_test, Y_train, Y_test = train_test_split(data_X, 
                                                    data_y,
                                                   test_size = 0.3,
                                                   random_state = 24)
sm = SMOTE()
X_train_sm, Y_train_sm = sm.fit_sample(X_train, Y_train)

In [43]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
model = AdaBoostClassifier()
re_stf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(model, X_train, Y_train, scoring = "f1_micro", cv = re_stf)

In [49]:
def grid_smote_search(lst, model) :
    
    for k in lst:
        over4 = SMOTE(k_neighbors = k)
        under4 = RandomUnderSampler()
        steps4 = [("over", over4), ("under", under4), ("model", model)]
        pipeline4 = Pipeline(steps = steps4)

        print("\n------- k = ", k,"-------")
        f1_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
        print("Mean F1 : %.3f" % (np.mean(f1_scores4)))
        roc_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
        print("Mean ROC AUC : %.3f" % (np.mean(roc_scores4)))

        pipeline4.fit(X_train, Y_train)
        pred_y4 = pipeline4.predict(X_test)
        print("\nF1 : %.3f" % f1_score(Y_test, pred_y4, average = 'micro'))
        print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y4))

In [51]:
from sklearn.model_selection import GridSearchCV
k_values = [1,2,3,4,5,6,7,8]
n_estimators = [200]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1]
ada_greedy_model = AdaBoostClassifier()

In [36]:
for n in n_estimators:
    for rate in learning_rate:
        print("\n N_estimators : ", n, " & Learning_rate : ", rate)
        grid_smote_search(k_values, AdaBoostClassifier(n_estimators = n, learning_rate = rate))


 N_estimators :  100  & Learning_rate :  0.001

------- k =  1 -------
Mean F1 : 0.930
Mean ROC AUC : 0.706

F1 : 0.954
ROC AUC : 0.567

------- k =  2 -------
Mean F1 : 0.889
Mean ROC AUC : 0.705

F1 : 0.858
ROC AUC : 0.668

------- k =  3 -------
Mean F1 : 0.893
Mean ROC AUC : 0.706

F1 : 0.849
ROC AUC : 0.626

------- k =  4 -------
Mean F1 : 0.889
Mean ROC AUC : 0.727

F1 : 0.954
ROC AUC : 0.567

------- k =  5 -------
Mean F1 : 0.884
Mean ROC AUC : 0.706

F1 : 0.820
ROC AUC : 0.602

------- k =  6 -------
Mean F1 : 0.886
Mean ROC AUC : 0.713

F1 : 0.812
ROC AUC : 0.690

------- k =  7 -------
Mean F1 : 0.875
Mean ROC AUC : 0.717

F1 : 0.954
ROC AUC : 0.590

------- k =  8 -------
Mean F1 : 0.886
Mean ROC AUC : 0.696

F1 : 0.838
ROC AUC : 0.643

 N_estimators :  100  & Learning_rate :  0.005

------- k =  1 -------
Mean F1 : 0.923
Mean ROC AUC : 0.782

F1 : 0.954
ROC AUC : 0.590

------- k =  2 -------
Mean F1 : 0.926
Mean ROC AUC : 0.782

F1 : 0.954
ROC AUC : 0.590

------- k =  

Mean F1 : 0.919
Mean ROC AUC : 0.766

F1 : 0.954
ROC AUC : 0.576

------- k =  8 -------
Mean F1 : 0.917
Mean ROC AUC : 0.764

F1 : 0.954
ROC AUC : 0.590

 N_estimators :  300  & Learning_rate :  0.005

------- k =  1 -------
Mean F1 : 0.930
Mean ROC AUC : 0.797

F1 : 0.954
ROC AUC : 0.590

------- k =  2 -------
Mean F1 : 0.929
Mean ROC AUC : 0.805

F1 : 0.954
ROC AUC : 0.590

------- k =  3 -------
Mean F1 : 0.930
Mean ROC AUC : 0.802

F1 : 0.920
ROC AUC : 0.659

------- k =  4 -------
Mean F1 : 0.928
Mean ROC AUC : 0.803

F1 : 0.913
ROC AUC : 0.655

------- k =  5 -------
Mean F1 : 0.929
Mean ROC AUC : 0.801

F1 : 0.913
ROC AUC : 0.641

------- k =  6 -------
Mean F1 : 0.931
Mean ROC AUC : 0.798

F1 : 0.887
ROC AUC : 0.678

------- k =  7 -------
Mean F1 : 0.928
Mean ROC AUC : 0.798

F1 : 0.954
ROC AUC : 0.590

------- k =  8 -------
Mean F1 : 0.927
Mean ROC AUC : 0.805

F1 : 0.954
ROC AUC : 0.590

 N_estimators :  300  & Learning_rate :  0.01

------- k =  1 -------
Mean F1 : 0.915


F1 : 0.913
ROC AUC : 0.651

------- k =  6 -------
Mean F1 : 0.913
Mean ROC AUC : 0.807

F1 : 0.900
ROC AUC : 0.649

------- k =  7 -------
Mean F1 : 0.917
Mean ROC AUC : 0.809

F1 : 0.919
ROC AUC : 0.640

------- k =  8 -------
Mean F1 : 0.918
Mean ROC AUC : 0.808

F1 : 0.916
ROC AUC : 0.657

 N_estimators :  500  & Learning_rate :  0.01

------- k =  1 -------
Mean F1 : 0.905
Mean ROC AUC : 0.814

F1 : 0.895
ROC AUC : 0.669

------- k =  2 -------
Mean F1 : 0.910
Mean ROC AUC : 0.814

F1 : 0.907
ROC AUC : 0.657

------- k =  3 -------
Mean F1 : 0.905
Mean ROC AUC : 0.812

F1 : 0.894
ROC AUC : 0.664

------- k =  4 -------
Mean F1 : 0.904
Mean ROC AUC : 0.813

F1 : 0.889
ROC AUC : 0.647

------- k =  5 -------
Mean F1 : 0.908
Mean ROC AUC : 0.815

F1 : 0.885
ROC AUC : 0.645

------- k =  6 -------
Mean F1 : 0.905
Mean ROC AUC : 0.813

F1 : 0.883
ROC AUC : 0.654

------- k =  7 -------
Mean F1 : 0.908
Mean ROC AUC : 0.810

F1 : 0.909
ROC AUC : 0.662

------- k =  8 -------
Mean F1 : 0

## Median

In [37]:
median

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr9,Attr10,Attr12,Attr13,...,Attr59,Attr61,Attr64,Attr65,Attr66,Attr67,Attr68,Attr69,Attr70,class
0,-0.305882,-0.567830,-0.191750,-0.118907,0.009449,-0.004600,-0.452027,0.328989,-0.022936,-0.015081,...,-0.018722,-0.708407,-1.073419,-0.125311,0.413225,-0.048981,0.005588,-0.294502,0,0
1,2.511211,-1.708722,1.979607,-0.257969,0.008522,-0.287835,-0.386111,0.799984,14.966639,0.046834,...,-0.047639,2.069378,-0.216780,-0.075797,0.209906,0.128412,-0.034505,-0.144385,1,0
2,-2.072361,-0.014605,0.015663,-0.327737,0.009483,0.002200,0.075177,0.104675,-0.025949,-0.040287,...,-0.035054,-1.069397,-0.247814,-0.368230,0.866998,-0.260872,-0.003748,-0.144385,0,0
3,0.305626,-1.001884,1.092889,0.904366,0.025844,0.066235,-0.376323,0.523407,-0.019647,-0.005064,...,-0.046023,0.444228,0.344121,0.093344,-1.281735,-0.000707,-0.031532,-0.229892,0,0
4,0.445815,-0.035583,-0.406950,-0.611737,-0.005197,0.002200,0.466185,0.115048,-0.021337,-0.008874,...,-0.046528,-0.078284,-0.271979,-0.008094,-0.374159,0.036156,0.422113,-0.146693,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,-0.145983,-0.304446,0.893308,0.185775,0.003555,0.002200,0.733643,0.243655,-0.022305,-0.016948,...,-0.046581,-0.274893,1.409927,-0.091819,0.281144,-0.033878,-0.021338,-0.057104,0,0
6814,-0.131569,0.948928,-0.301519,-0.578201,0.003428,0.002200,0.667366,-0.430077,-0.022312,-0.013455,...,0.035477,-0.635454,0.440538,-0.096087,0.284641,-0.011606,0.117032,-0.139949,0,0
6815,-0.880778,1.259814,0.396387,-0.261334,0.012201,-0.175882,0.705497,-0.629514,-0.023296,-0.023768,...,1.905625,-1.093181,1.721923,-0.148789,1.243986,-2.268140,-0.011097,-0.068233,0,0
6816,0.259915,-0.036588,0.040304,-0.263949,0.001213,0.029885,-0.387494,0.091183,-0.021091,-0.005489,...,-0.021438,-0.479165,-0.401887,-0.034932,-0.153674,0.016167,-0.014534,-0.269657,0,1


In [38]:
X,y = oversample.fit_resample(median.iloc[:,:-1], median["class"])

In [39]:
data_X = median.iloc[:, :-1]
data_y = median["class"]
X_train, X_test, Y_train, Y_test = train_test_split(data_X, 
                                                    data_y,
                                                   test_size = 0.3,
                                                   random_state = 24)

In [40]:
for n in n_estimators:
    for rate in learning_rate:
        print("\n N_estimators : ", n, " & Learning_rate : ", rate)
        grid_smote_search(k_values, AdaBoostClassifier(n_estimators = n, learning_rate = rate))


 N_estimators :  100  & Learning_rate :  0.001

------- k =  1 -------
Mean F1 : 0.807
Mean ROC AUC : 0.691

F1 : 0.829
ROC AUC : 0.614

------- k =  2 -------
Mean F1 : 0.786
Mean ROC AUC : 0.697

F1 : 0.754
ROC AUC : 0.651

------- k =  3 -------
Mean F1 : 0.807
Mean ROC AUC : 0.701

F1 : 0.862
ROC AUC : 0.639

------- k =  4 -------
Mean F1 : 0.788
Mean ROC AUC : 0.699

F1 : 0.759
ROC AUC : 0.649

------- k =  5 -------
Mean F1 : 0.795
Mean ROC AUC : 0.713

F1 : 0.759
ROC AUC : 0.637

------- k =  6 -------
Mean F1 : 0.809
Mean ROC AUC : 0.706

F1 : 0.796
ROC AUC : 0.645

------- k =  7 -------
Mean F1 : 0.793
Mean ROC AUC : 0.710

F1 : 0.759
ROC AUC : 0.637

------- k =  8 -------
Mean F1 : 0.813
Mean ROC AUC : 0.695

F1 : 0.758
ROC AUC : 0.641

 N_estimators :  100  & Learning_rate :  0.005

------- k =  1 -------
Mean F1 : 0.879
Mean ROC AUC : 0.757

F1 : 0.845
ROC AUC : 0.626

------- k =  2 -------
Mean F1 : 0.870
Mean ROC AUC : 0.756

F1 : 0.855
ROC AUC : 0.632

------- k =  

Mean F1 : 0.849
Mean ROC AUC : 0.739

F1 : 0.868
ROC AUC : 0.626

------- k =  8 -------
Mean F1 : 0.852
Mean ROC AUC : 0.737

F1 : 0.857
ROC AUC : 0.636

 N_estimators :  300  & Learning_rate :  0.005

------- k =  1 -------
Mean F1 : 0.901
Mean ROC AUC : 0.798

F1 : 0.915
ROC AUC : 0.639

------- k =  2 -------
Mean F1 : 0.897
Mean ROC AUC : 0.798

F1 : 0.915
ROC AUC : 0.639

------- k =  3 -------
Mean F1 : 0.898
Mean ROC AUC : 0.800

F1 : 0.899
ROC AUC : 0.659

------- k =  4 -------
Mean F1 : 0.900
Mean ROC AUC : 0.802

F1 : 0.934
ROC AUC : 0.649

------- k =  5 -------
Mean F1 : 0.899
Mean ROC AUC : 0.795

F1 : 0.919
ROC AUC : 0.686

------- k =  6 -------
Mean F1 : 0.898
Mean ROC AUC : 0.800

F1 : 0.894
ROC AUC : 0.680

------- k =  7 -------
Mean F1 : 0.903
Mean ROC AUC : 0.799

F1 : 0.878
ROC AUC : 0.676

------- k =  8 -------
Mean F1 : 0.899
Mean ROC AUC : 0.800

F1 : 0.885
ROC AUC : 0.651

 N_estimators :  300  & Learning_rate :  0.01

------- k =  1 -------
Mean F1 : 0.907


F1 : 0.875
ROC AUC : 0.667

------- k =  6 -------
Mean F1 : 0.902
Mean ROC AUC : 0.811

F1 : 0.895
ROC AUC : 0.677

------- k =  7 -------
Mean F1 : 0.906
Mean ROC AUC : 0.810

F1 : 0.909
ROC AUC : 0.672

------- k =  8 -------
Mean F1 : 0.902
Mean ROC AUC : 0.809

F1 : 0.890
ROC AUC : 0.654

 N_estimators :  500  & Learning_rate :  0.01

------- k =  1 -------
Mean F1 : 0.908
Mean ROC AUC : 0.831

F1 : 0.913
ROC AUC : 0.707

------- k =  2 -------
Mean F1 : 0.905
Mean ROC AUC : 0.834

F1 : 0.924
ROC AUC : 0.725

------- k =  3 -------
Mean F1 : 0.909
Mean ROC AUC : 0.832

F1 : 0.920
ROC AUC : 0.747

------- k =  4 -------
Mean F1 : 0.905
Mean ROC AUC : 0.836

F1 : 0.920
ROC AUC : 0.735

------- k =  5 -------
Mean F1 : 0.904
Mean ROC AUC : 0.832

F1 : 0.918
ROC AUC : 0.722

------- k =  6 -------
Mean F1 : 0.905
Mean ROC AUC : 0.835

F1 : 0.913
ROC AUC : 0.679

------- k =  7 -------
Mean F1 : 0.906
Mean ROC AUC : 0.833

F1 : 0.921
ROC AUC : 0.707

------- k =  8 -------
Mean F1 : 0

## knn

In [46]:
knn

Unnamed: 0,Attr3,Attr4,Attr5,Attr6,Attr9,Attr10,Attr12,Attr15,Attr18,Attr19,...,Attr57,Attr58,Attr59,Attr61,Attr63,Attr64,Attr68,Attr69,Attr70,class
0,-0.177767,-0.123483,0.009524,0.009448,-0.456467,0.301698,-0.031183,0.056725,-0.370348,-0.518652,...,-0.056065,-0.030072,-0.018519,-0.714359,-0.045731,-1.073841,0.017298,-0.286284,0,0
1,1.861965,1.097655,0.008598,-0.411517,-0.390186,0.725650,19.373965,-0.041950,3.297537,-0.180426,...,0.214041,-0.447314,-0.047413,2.069638,5.610952,-0.137180,-0.034991,0.485499,1,0
2,0.017073,-0.330491,0.009558,0.019555,0.073659,0.099788,-0.035084,-0.049363,-2.347134,-1.020554,...,-0.378704,0.206205,-0.034839,-1.076156,-0.051393,-0.271095,0.005122,-0.138982,0,0
3,1.028999,0.890857,0.025911,0.114729,-0.380344,0.476698,-0.026925,-0.037176,0.422903,2.233324,...,0.017436,-0.063601,-0.045798,0.440854,-0.048111,0.304451,-0.031114,-0.229793,0,0
4,-0.379922,-0.612011,-0.005114,0.019555,0.466834,0.109125,-0.029113,-0.026674,0.440759,0.295026,...,0.073563,-0.109727,-0.046303,-0.082826,-0.050858,-0.294590,0.560525,-0.157048,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6800,0.841516,0.178540,0.003634,0.019555,0.735774,0.224887,-0.030366,0.123367,-0.214425,-0.335082,...,-0.033069,0.004701,-0.046368,-0.279874,-0.048661,1.340748,-0.017819,-0.088597,0,0
6801,-0.280881,-0.578768,0.003506,0.019555,0.669129,-0.381552,-0.030376,0.010296,-0.165937,-0.336622,...,0.000841,0.004816,0.035640,-0.641243,-0.052243,0.398199,0.162641,-0.151818,0,0
6802,0.374718,-0.264667,0.012275,-0.245123,0.707473,-0.561070,-0.031649,-0.080429,-1.027923,-1.136861,...,-3.435131,0.060763,1.904272,-1.099993,-0.051461,1.644106,-0.004462,-0.088597,0,0
6803,0.040220,-0.267259,0.001293,0.060702,-0.391577,0.087644,-0.028795,-0.026499,0.391702,0.413095,...,0.043136,-0.058849,-0.021230,-0.484603,-0.052023,-0.420901,-0.008946,-0.278261,0,1


In [47]:
data_X = knn.iloc[:, :-1]
data_y = knn["class"]
X_train, X_test, Y_train, Y_test = train_test_split(data_X, 
                                                    data_y,
                                                   test_size = 0.3,
                                                   random_state = 24)
sm = SMOTE()
X_train_sm, Y_train_sm = sm.fit_sample(X_train, Y_train)

In [52]:
for n in n_estimators:
    for rate in learning_rate:
        print("\n N_estimators : ", n, " & Learning_rate : ", rate)
        grid_smote_search(k_values, AdaBoostClassifier(n_estimators = n, learning_rate = rate))


 N_estimators :  200  & Learning_rate :  0.001

------- k =  1 -------
Mean F1 : 0.709
Mean ROC AUC : 0.694

F1 : 0.732
ROC AUC : 0.572

------- k =  2 -------
Mean F1 : 0.720
Mean ROC AUC : 0.688

F1 : 0.732
ROC AUC : 0.572

------- k =  3 -------
Mean F1 : 0.707
Mean ROC AUC : 0.693

F1 : 0.732
ROC AUC : 0.572

------- k =  4 -------
Mean F1 : 0.701
Mean ROC AUC : 0.696

F1 : 0.732
ROC AUC : 0.572

------- k =  5 -------
Mean F1 : 0.728
Mean ROC AUC : 0.691

F1 : 0.732
ROC AUC : 0.572

------- k =  6 -------
Mean F1 : 0.687
Mean ROC AUC : 0.693

F1 : 0.617
ROC AUC : 0.670

------- k =  7 -------
Mean F1 : 0.688
Mean ROC AUC : 0.694

F1 : 0.733
ROC AUC : 0.572

------- k =  8 -------
Mean F1 : 0.696
Mean ROC AUC : 0.697

F1 : 0.732
ROC AUC : 0.572

 N_estimators :  200  & Learning_rate :  0.005

------- k =  1 -------
Mean F1 : 0.722
Mean ROC AUC : 0.736

F1 : 0.732
ROC AUC : 0.666

------- k =  2 -------
Mean F1 : 0.725
Mean ROC AUC : 0.729

F1 : 0.723
ROC AUC : 0.649

------- k =  