In [1]:
import matplotlib.pyplot as plt
import imblearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [2]:
datamice= pd.read_csv("final_mice.csv")
datamedian= pd.read_csv("final_median.csv")
dataknn= pd.read_csv("final_knn.csv")

In [3]:
from sklearn.decomposition import PCA

In [4]:
X_train, X_test, y_train, y_test = train_test_split(datamice.iloc[:, 0:(datamice.shape[1]-1)], datamice['class'], test_size=0.3, random_state= 1)

In [5]:
svm = SVC(kernel='rbf', C=10, random_state=1, probability = True)

In [6]:
re_stf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(svm, X_train, y_train, scoring = "f1_micro", cv = re_stf)

In [7]:
print("Mean CV F1 : %.3f"% np.mean(scores))

Mean CV F1 : 0.954


In [8]:
from sklearn.metrics import roc_auc_score

In [9]:
svm.fit(X_train,y_train)
pred_y = svm.predict(X_test)

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [11]:
print("F1 : %.3f" % f1_score(y_test, pred_y, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y_test, pred_y))

F1 : 0.947
ROC AUC : 0.591


In [12]:
X2_train, X2_test, y2_train, y2_test = train_test_split(datamedian.iloc[:, 0:(datamedian.shape[1]-1)], datamedian['class'], test_size=0.3, random_state= 1)

In [13]:
re_stf2 = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores2 = cross_val_score(svm, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)

In [14]:
print("Mean CV F1 : %.3f"% np.mean(scores2))

Mean CV F1 : 0.951


In [15]:
svm.fit(X2_train,y2_train)
pred_y2 = svm.predict(X2_test)

In [16]:
print("F1 : %.3f" % f1_score(y2_test, pred_y2, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y2))

F1 : 0.956
ROC AUC : 0.613


median 자료로 imputated했을 때, ROC 및 F1 값이 가장 좋게 나왔다. smote, pipeline+smote를 이용한 것보다 그 자체로 이용한 것이 오히려 이 수치들이 더 높게 나온 것을 확인할 수 있다.

In [17]:
X3_train, X3_test, y3_train, y3_test = train_test_split(dataknn.iloc[:, 0:(dataknn.shape[1]-1)], dataknn['class'], test_size=0.3, random_state= 1)

In [18]:
re_stf3 = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores3 = cross_val_score(svm, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)

In [19]:
print("Mean CV F1 : %.3f"% np.mean(scores3))

Mean CV F1 : 0.954


In [20]:
svm.fit(X3_train,y3_train)
pred_y3 = svm.predict(X3_test)

In [21]:
print("F1 : %.3f" % f1_score(y3_test, pred_y3, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y3))

F1 : 0.948
ROC AUC : 0.580


In [22]:
from sklearn.datasets import load_digits
from sklearn.model_selection import validation_curve

  from collections import Mapping, defaultdict


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [24]:
pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [25]:
scores4 = cross_val_score(pipe_svc, X_train, y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores4))

Mean CV F1 : 0.957


In [26]:
scores5 = cross_val_score(pipe_svc, X_train, y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores5))

Mean CV ROC_AUC : 0.728


In [27]:
pipe_svc.fit(X_train, y_train)
pred_y4 = pipe_svc.predict(X_test)

In [28]:
print("F1 : %.3f" % f1_score(y_test, pred_y4, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y_test, pred_y4))

F1 : 0.948
ROC AUC : 0.584


In [29]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RepeatedStratifiedKFold

In [30]:
sm = SMOTE()
X_train_sm, Y_train_sm = sm.fit_sample(X_train, y_train)

In [31]:
svms= SVC(kernel='rbf',random_state=1)

In [32]:
def grid_smote_search(lst, svms) :
    
    for k in lst:
        over4 = SMOTE(k_neighbors = k)
        under4 = RandomUnderSampler()
        steps4 = [("over", over4), ("under", under4), ("model", svms)]
        pipeline4 = Pipeline(steps = steps4)

        print("\n------- k = ", k,"-------")
        f1_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "f1_micro", cv = re_stf)
        print("Mean F1 : %.3f" % (np.mean(f1_scores4)))
        roc_scores4 = cross_val_score(pipeline4, X_train, Y_train, scoring = "roc_auc", cv = re_stf)
        print("Mean ROC AUC : %.3f" % (np.mean(roc_scores4)))

        pipeline4.fit(X_train, Y_train)
        pred_y4 = pipeline4.predict(X_test)
        print("\nF1 : %.3f" % f1_score(Y_test, pred_y4, average = 'micro'))
        print("ROC AUC : %.3f" % roc_auc_score(Y_test, pred_y4))

In [33]:
from sklearn.model_selection import GridSearchCV
k_values = [1,2,3,4,5,6,7,8]
n_estimators = [200]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1]
svm_greedy_model = SVC(kernel='rbf',random_state=1)

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=4)
%time gs = gs.fit(X_train_sm, Y_train_sm)

Wall time: 1h 4min 22s


In [35]:
gs.cv_results_['mean_test_score']

array([0.62714852, 0.64345527, 0.6906126 , 0.68554429, 0.5947554 ,
       0.51267078, 0.50550903, 0.52258704, 0.62714852, 0.64345527,
       0.6906126 , 0.68554429, 0.5947554 , 0.51267078, 0.50550903,
       0.52258704, 0.62714852, 0.62703834, 0.7031732 , 0.74151609,
       0.60257823, 0.51267078, 0.50550903, 0.52258704, 0.54947113,
       0.7089026 , 0.74206699, 0.79914059, 0.83957691, 0.56346408,
       0.50550903, 0.52258704, 0.70537682, 0.73479506, 0.78746144,
       0.88651388, 0.95614808, 0.88122521, 0.74746584, 0.62769943,
       0.72983693, 0.76740855, 0.82150727, 0.93234905, 0.96386073,
       0.89136183, 0.76090789, 0.63761569, 0.76101807, 0.7979286 ,
       0.8635963 , 0.95769061, 0.96253856, 0.89103129, 0.76057735,
       0.63750551, 0.78107096, 0.82062583, 0.91527104, 0.96011459,
       0.96198766, 0.89048039, 0.76057735, 0.63739533])

In [36]:
gs.best_score_

0.9638607315998237

In [37]:
gs.best_params_

{'clf__C': 10.0, 'clf__gamma': 1.0, 'clf__kernel': 'rbf'}

In [38]:
pipe_svc2 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [39]:
scores6 = cross_val_score(pipe_svc2, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)
print("Mean CV F1 : %.3f"% np.mean(scores6))

Mean CV F1 : 0.953


In [40]:
scores7 = cross_val_score(pipe_svc2, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)
print("Mean CV F1 : %.3f"% np.mean(scores7))

Mean CV F1 : 0.953


In [41]:
pipe_svc2.fit(X2_train, y2_train)
pred_y5 = pipe_svc2.predict(X2_test)

In [42]:
print("F1 : %.3f" % f1_score(y2_test, pred_y5, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y5))

F1 : 0.957
ROC AUC : 0.609


In [43]:
sm = SMOTE()
X2_train_sm, Y2_train_sm = sm.fit_sample(X2_train, y2_train)

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
pipe_svc2 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs2 = GridSearchCV(estimator=pipe_svc2, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=4)
%time gs2 = gs2.fit(X2_train_sm, Y2_train_sm)

Wall time: 1h 36min 59s


In [48]:
gs2.cv_results_['mean_test_score']

array([0.61725664, 0.6380531 , 0.67223451, 0.66117257, 0.5994469 ,
       0.52267699, 0.50519912, 0.50132743, 0.61725664, 0.6380531 ,
       0.67223451, 0.66117257, 0.5994469 , 0.52267699, 0.50519912,
       0.50132743, 0.61725664, 0.6164823 , 0.67975664, 0.68628319,
       0.5994469 , 0.52267699, 0.50519912, 0.50132743, 0.60608407,
       0.69015487, 0.72201327, 0.78053097, 0.82068584, 0.55320796,
       0.50519912, 0.50132743, 0.68761062, 0.70884956, 0.76172566,
       0.87610619, 0.95044248, 0.85276549, 0.72754425, 0.61050885,
       0.70232301, 0.73849558, 0.79977876, 0.92986726, 0.95641593,
       0.86637168, 0.74037611, 0.61725664, 0.73030973, 0.76902655,
       0.85752212, 0.95530973, 0.95597345, 0.86570796, 0.74004425,
       0.61725664, 0.74480088, 0.80099558, 0.9164823 , 0.95973451,
       0.95486726, 0.86548673, 0.74004425, 0.61736726])

In [49]:
gs2.best_score_

0.9597345132743362

In [50]:
gs2.best_params_

{'clf__C': 1000.0, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}

In [51]:
sm = SMOTE()
X3_train_sm, Y3_train_sm = sm.fit_sample(X3_train, y3_train)

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
pipe_svc3 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs3= GridSearchCV(estimator=pipe_svc3, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=4)
%time gs3 = gs3.fit(X3_train_sm, Y3_train_sm)

Wall time: 1h 25min 9s


In [53]:
gs3.cv_results_['mean_test_score']

array([0.60072977, 0.68752764, 0.68222026, 0.68498452, 0.62925697,
       0.51735958, 0.50796108, 0.50132685, 0.60072977, 0.68752764,
       0.68222026, 0.68498452, 0.62925697, 0.51735958, 0.50796108,
       0.50132685, 0.60072977, 0.68752764, 0.69692614, 0.70654578,
       0.6208536 , 0.51735958, 0.50796108, 0.50132685, 0.68078284,
       0.70190181, 0.73562583, 0.80937638, 0.85957541, 0.55152587,
       0.50796108, 0.50132685, 0.69792127, 0.73076073, 0.78781513,
       0.89573198, 0.95643521, 0.86377709, 0.73142415, 0.61709421,
       0.72655904, 0.76149934, 0.82408226, 0.94161875, 0.9609686 ,
       0.875387  , 0.74458204, 0.6272667 , 0.74745688, 0.79245909,
       0.8802521 , 0.95964175, 0.95964175, 0.87516586, 0.74447147,
       0.6272667 , 0.77432552, 0.82607253, 0.92326404, 0.96273773,
       0.95875719, 0.87494471, 0.74447147, 0.6272667 ])

In [54]:
gs3.best_score_

0.9627377266696152

In [55]:
gs3.best_params_

{'clf__C': 1000.0, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}

In [56]:
pipe_svc3 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [57]:
scores8 = cross_val_score(pipe_svc3, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)
print("Mean CV F1 : %.3f"% np.mean(scores8))

Mean CV F1 : 0.955


In [58]:
scores9 = cross_val_score(pipe_svc3, X3_train, y3_train, scoring = "roc_auc", cv = re_stf3)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores9))

Mean CV ROC_AUC : 0.692


In [59]:
pipe_svc3.fit(X3_train, y3_train)
pred_y6 = pipe_svc3.predict(X3_test)

In [60]:
print("F1 : %.3f" % f1_score(y3_test, pred_y6, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y6))

F1 : 0.951
ROC AUC : 0.577


In [67]:
svm4 = SVC(kernel='rbf', C=1000, random_state=1, gamma=1.0, probability = True)

In [68]:
scores10 = cross_val_score(svm4, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)

In [69]:
svm4.fit(X2_train,y2_train)
pred_y7 = svm4.predict(X2_test)

In [70]:
print("F1 : %.3f" % f1_score(y2_test, pred_y7, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y7))

F1 : 0.938
ROC AUC : 0.500


In [71]:
print("Mean CV F1 : %.3f"% np.mean(scores10))

Mean CV F1 : 0.937


In [73]:
svm5 = SVC(kernel='rbf', C=1000, random_state=1, gamma=0.1, probability = True)

In [74]:
scores11 = cross_val_score(svm5, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)

In [75]:
svm5.fit(X3_train,y3_train)
pred_y8 = svm5.predict(X3_test)

In [76]:
print("F1 : %.3f" % f1_score(y3_test, pred_y8, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y8))

F1 : 0.915
ROC AUC : 0.566


In [77]:
print("Mean CV F1 : %.3f"% np.mean(scores11))

Mean CV F1 : 0.922


In [78]:
pipe_svc4 = Pipeline([('scl', StandardScaler()), ('clf',SVC(kernel='rbf', C=1000, random_state=1, gamma=1.0, probability = True))])

In [79]:
scores12 = cross_val_score(pipe_svc4, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)
print("Mean CV F1 : %.3f"% np.mean(scores12))

Mean CV F1 : 0.937


In [80]:
scores13 = cross_val_score(pipe_svc4, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)
print("Mean CV F1 : %.3f"% np.mean(scores13))

Mean CV F1 : 0.937


In [81]:
pipe_svc4.fit(X2_train, y2_train)
pred_y9 = pipe_svc4.predict(X2_test)

In [82]:
print("F1 : %.3f" % f1_score(y2_test, pred_y9, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y9))

F1 : 0.938
ROC AUC : 0.508


In [83]:
sm2 = SMOTE()
X2_train_sm, Y2_train_sm = sm2.fit_sample(X2_train, y2_train)

In [84]:
pipe_svc4.fit(X2_train_sm, Y2_train_sm)
pred_y9 = pipe_svc4.predict(X2_test)

In [85]:
print("F1 : %.3f" % f1_score(y2_test, pred_y9, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y9))

F1 : 0.925
ROC AUC : 0.545


In [86]:
pipe_svc5 = Pipeline([('scl', StandardScaler()), ('clf',SVC(kernel='rbf', C=1000, random_state=1, gamma=0.1, probability = True))])

In [87]:
scores14 = cross_val_score(pipe_svc5, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)
print("Mean CV F1 : %.3f"% np.mean(scores14))

Mean CV F1 : 0.923


In [88]:
pipe_svc5.fit(X3_train, y3_train)
pred_y10 = pipe_svc5.predict(X3_test)

In [89]:
print("F1 : %.3f" % f1_score(y3_test, pred_y10, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y10))

F1 : 0.920
ROC AUC : 0.585


In [90]:
sm3 = SMOTE()
X3_train_sm, Y3_train_sm = sm3.fit_sample(X3_train, y3_train)

In [91]:
pipe_svc5.fit(X3_train_sm, Y3_train_sm)
pred_y11 = pipe_svc5.predict(X3_test)

In [92]:
print("F1 : %.3f" % f1_score(y3_test, pred_y11, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y11))

F1 : 0.908
ROC AUC : 0.603
