In [1]:
import matplotlib.pyplot as plt
import imblearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [2]:
datamice= pd.read_csv("final_mice.csv")
datamedian= pd.read_csv("final_median.csv")
dataknn= pd.read_csv("final_knn.csv")

In [3]:
from sklearn.decomposition import PCA

In [4]:
X_train, X_test, y_train, y_test = train_test_split(datamice.iloc[:, 0:(datamice.shape[1]-1)], datamice['class'], test_size=0.3, random_state= 1)

In [5]:
svm = SVC(kernel='rbf', C=10, random_state=1, probability = True)

In [6]:
re_stf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores = cross_val_score(svm, X_train, y_train, scoring = "f1_micro", cv = re_stf)

In [7]:
print("Mean CV F1 : %.3f"% np.mean(scores))

Mean CV F1 : 0.954


In [8]:
from sklearn.metrics import roc_auc_score

In [9]:
svm.fit(X_train,y_train)
pred_y = svm.predict(X_test)

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [11]:
print("F1 : %.3f" % f1_score(y_test, pred_y, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y_test, pred_y))

F1 : 0.947
ROC AUC : 0.591


In [12]:
X2_train, X2_test, y2_train, y2_test = train_test_split(datamedian.iloc[:, 0:(datamedian.shape[1]-1)], datamedian['class'], test_size=0.3, random_state= 1)

In [13]:
re_stf2 = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores2 = cross_val_score(svm, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)

In [14]:
print("Mean CV F1 : %.3f"% np.mean(scores2))

Mean CV F1 : 0.951


In [15]:
svm.fit(X2_train,y2_train)
pred_y2 = svm.predict(X2_test)

In [16]:
print("F1 : %.3f" % f1_score(y2_test, pred_y2, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y2_test, pred_y2))

F1 : 0.956
ROC AUC : 0.613


In [17]:
X3_train, X3_test, y3_train, y3_test = train_test_split(dataknn.iloc[:, 0:(dataknn.shape[1]-1)], dataknn['class'], test_size=0.3, random_state= 1)

In [18]:
re_stf3 = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
scores3 = cross_val_score(svm, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)

In [19]:
print("Mean CV F1 : %.3f"% np.mean(scores3))

Mean CV F1 : 0.954


In [20]:
svm.fit(X3_train,y3_train)
pred_y3 = svm.predict(X3_test)

In [21]:
print("F1 : %.3f" % f1_score(y3_test, pred_y3, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y3_test, pred_y3))

F1 : 0.948
ROC AUC : 0.580


In [22]:
from sklearn.datasets import load_digits
from sklearn.model_selection import validation_curve

  from collections import Mapping, defaultdict


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [24]:
pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [25]:
scores4 = cross_val_score(pipe_svc, X_train, y_train, scoring = "f1_micro", cv = re_stf)
print("Mean CV F1 : %.3f"% np.mean(scores4))

Mean CV F1 : 0.957


In [26]:
scores5 = cross_val_score(pipe_svc, X_train, y_train, scoring = "roc_auc", cv = re_stf)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores2))

Mean CV ROC_AUC : 0.951


In [27]:
pipe_svc.fit(X_train, y_train)
pred_y4 = pipe_svc.predict(X_test)

In [28]:
print("F1 : %.3f" % f1_score(y_test, pred_y4, average = 'micro'))
print("ROC AUC : %.3f" % roc_auc_score(y_test, pred_y4))

F1 : 0.948
ROC AUC : 0.584


In [38]:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs = GridSearchCV(pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=1)

In [None]:
gs= gs.fit(X_train, y_train)

In [None]:
gs.cv_results_["params"]

In [None]:
gs.cv_results_["mean_test_score"]

In [None]:
print(gs.best_score_)
print(gs.best_params_)

In [None]:
y4_test_predicted = gs.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, gs.predict_proba(X_test)[:,1])

plt.gs()
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve-' + '-' + str(gs))
plt.show()

In [29]:
pipe_svc2 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [30]:
scores6 = cross_val_score(pipe_svc2, X2_train, y2_train, scoring = "f1_micro", cv = re_stf2)
print("Mean CV F1 : %.3f"% np.mean(scores6))

Mean CV F1 : 0.953


In [31]:
scores7 = cross_val_score(pipe_svc2, X2_train, y2_train, scoring = "roc_auc", cv = re_stf2)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores7))

Mean CV ROC_AUC : 0.687


In [33]:
pipe_svc2.fit(X2_train, y2_train)
pred_y5 = pipe_svc2.predict(X2_test)

In [34]:
pipe_svc3 = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

In [35]:
scores7 = cross_val_score(pipe_svc3, X3_train, y3_train, scoring = "f1_micro", cv = re_stf3)
print("Mean CV F1 : %.3f"% np.mean(scores7))

Mean CV F1 : 0.955


In [36]:
scores8 = cross_val_score(pipe_svc3, X3_train, y3_train, scoring = "roc_auc", cv = re_stf3)
print("Mean CV ROC_AUC : %.3f"% np.mean(scores8))

Mean CV ROC_AUC : 0.692


In [37]:
pipe_svc3.fit(X3_train, y3_train)
pred_y5 = pipe_svc3.predict(X3_test)

In [None]:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs2 = GridSearchCV(pipe_svc2, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=1)

In [None]:
gs2= gs2.fit(X2_train, y2_train)

In [None]:
gs2.cv_results_["params"]

In [None]:
gs2.cv_results_["mean_test_score"]

In [None]:
print(gs2.best_score_)
print(gs2.best_params_)

In [None]:
y5_test_predicted = gs.predict(X2_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y2_test, gs.predict_proba(X2_test)[:,1])

plt.gs2()
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve-' + '-' + str(gs2))
plt.show()

In [None]:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

gs3 = GridSearchCV(pipe_svc3, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=1)

In [None]:
gs3= gs3.fit(X3_train, y3_train)

In [None]:
gs3.cv_results_["params"]

In [None]:
gs3.cv_results_["mean_test_score"]

In [None]:
print(gs3.best_score_)
print(gs3.best_params_)

In [None]:
y6_test_predicted = gs.predict(X3_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, gs.predict_proba(X3_test)[:,1])

plt.gs3()
plt.plot(false_positive_rate, true_positive_rate)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve-' + '-' + str(gs3))
plt.show()