In [3]:
import csv
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import matplotlib.pyplot as plt

In [2]:
#!pip install imblearn

Collecting imblearn
  Downloading https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
  Downloading https://files.pythonhosted.org/packages/e6/62/08c14224a7e242df2cef7b312d2ef821c3931ec9b015ff93bb52ec8a10a3/imbalanced_learn-0.5.0-py3-none-any.whl (173kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.5.0 imblearn-0.0


In [4]:
score={'AUC':'roc_auc', 
           'RECALL':'recall',
           'PRECISION':'precision',
           'F1':'f1'}

In [7]:
X_train = pd.read_csv(r"C:\Users\sdauser\Documents\ML\drive-download-20191121T164109Z-001\X_all_train_wo_OS.csv", low_memory=False, index_col = 0).iloc[:, 2:]
y_train = pd.read_csv(r"C:\Users\sdauser\Documents\ML\drive-download-20191121T164109Z-001\y_train_wo_OS.csv", low_memory=False, index_col = 0).squeeze()
X_test = pd.read_csv(r"C:\Users\sdauser\Documents\ML\drive-download-20191121T164109Z-001\X_all_test.csv", low_memory=False, index_col = 0).iloc[:, 2:]
y_test = pd.read_csv(r"C:\Users\sdauser\Documents\ML\drive-download-20191121T164109Z-001\y_test.csv", low_memory=False, index_col = 0).squeeze()

In [8]:
LogReg = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', LogisticRegression(solver='lbfgs', random_state=0))
            ])
LogReg_para = {}
RandF = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', RandomForestClassifier(random_state=0))
            ])
RandF_para = {'classification__n_estimators':[20, 50, 100, 200, 400, 800], 'classification__max_depth':[2, 5, 10, 20]}
AdaBoost = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', AdaBoostClassifier(random_state=0))
            ])
AdaBoost_para = {'classification__n_estimators':[20, 50, 100, 200, 400, 800]}
SVM = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', SVC(decision_function_shape='ovr', degree=3, gamma='auto'))
            ]) 
SVM_para = {'classification__C':[0.01, 0.1, 1, 10], 'classification__kernel':('linear', 'rbf')}
NaivBay = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', GaussianNB())
            ])
NaivBay_para = {}
Knn = Pipeline([
            ('sampling', RandomOverSampler()),
            ('classification', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski'))
            ])
Knn_para = {'classification__n_neighbors': (10, 15, 25)}

In [9]:
clasifier_names = ["Logistic Regression", "Random Forest", "Adaptive Boosting", "Support Vector Machines", "Naive Bayes", "K Nearest Neighbours"]
classifiers = [LogReg, RandF, AdaBoost, SVM, NaivBay, Knn]
parameters = [LogReg_para, RandF_para, AdaBoost_para, SVM_para, NaivBay_para, Knn_para]

In [None]:
results = list()

for i in range(len(classifiers)):
    clf = GridSearchCV(classifiers[i], parameters[i], cv=5, scoring=score, n_jobs=-1, refit=False, return_train_score=True)
    clf.fit(X_train, y_train)
    results.append([clasifier_names[i], clf.cv_results_])
    print(clasifier_names[i])
    print(clf.cv_results_)

with open("results_ML.csv", 'w', newline='') as myfile:
     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
     wr.writerow(results)

Logistic Regression
{'mean_fit_time': array([1.47324486]), 'std_fit_time': array([0.46471457]), 'mean_score_time': array([0.06503663]), 'std_score_time': array([0.02136256]), 'params': [{}], 'split0_test_AUC': array([0.5]), 'split1_test_AUC': array([0.5]), 'split2_test_AUC': array([0.5]), 'split3_test_AUC': array([0.5]), 'split4_test_AUC': array([0.5]), 'mean_test_AUC': array([0.5]), 'std_test_AUC': array([0.]), 'rank_test_AUC': array([1]), 'split0_train_AUC': array([0.5]), 'split1_train_AUC': array([0.5]), 'split2_train_AUC': array([0.5]), 'split3_train_AUC': array([0.5]), 'split4_train_AUC': array([0.5]), 'mean_train_AUC': array([0.5]), 'std_train_AUC': array([0.]), 'split0_test_RECALL': array([0.]), 'split1_test_RECALL': array([0.]), 'split2_test_RECALL': array([0.]), 'split3_test_RECALL': array([0.]), 'split4_test_RECALL': array([0.]), 'mean_test_RECALL': array([0.]), 'std_test_RECALL': array([0.]), 'rank_test_RECALL': array([1]), 'split0_train_RECALL': array([0.]), 'split1_train_R

In [None]:
label, test_precision, test_recall, train_precision, train_recall = list(),list(),list(),list(),list()
for result in results:
    label.append(result[0])
    test_precision.extend([result[1]['mean_test_PRECISION'].tolist()])
    test_recall.extend([result[1]['mean_test_RECALL'].tolist()])
    train_precision.extend([result[1]['mean_train_PRECISION'].tolist()])
    train_recall.extend([result[1]['mean_train_RECALL'].tolist()])

colors = {"Logistic Regression":"red", "Random Forest":"blue", "Adaptive Boosting":"green", "Naive Bayes":"orange", "Support Vector Machines":"black", "K Nearest Neighbours":"purple"}

fig, ax = plt.subplots(figsize=(15,15))
for i in range(len(label)):
    ax.scatter(test_recall[i], test_precision[i], c=colors[label[i]], label=label[i])
ax.axis((0,1,0,1))
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
plt.legend()
plt.title("Test scores: Recall vs. Precision")
plt.savefig("test_scores_recall_vs_precision.png", transparent=True)
plt.show()

fig, ax = plt.subplots(figsize=(15,15))
for i in range(len(label)):
    ax.scatter(train_recall[i], train_precision[i], c=colors[label[i]], label=label[i])
ax.axis((0,1,0,1))
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
plt.legend()
plt.title("Train scores: Recall vs. Precision")
plt.savefig("train_scores_recall_vs_precision.png", transparent=True)
plt.show()