# Classification for the POI fusion validation action

In [None]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from sklearn import svm 
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Load the exported feature table and split it into model inputs and target.
# Only the first 40 columns of the CSV are handed to `transform`
# (presumably provided by the `helpers` star import; the meaning of its
# second argument is not visible here -- TODO confirm).
feat = pd.read_csv("features_export.csv")
first_forty_columns = feat.columns.values[range(40)]
X = transform(feat[first_forty_columns], False)

# The target column name carries a leading space; popping it removes the
# column from X so that X holds the remaining columns only.
y = X.pop(' acceptance')

In [None]:
# Classification process
#
# Every candidate estimator is cross-validated on (X, y). `names` and
# `classifiers` are parallel lists: entry i of `names` labels entry i of
# `classifiers`, so the two must be edited together.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    # "QDA",
    "DecisionTreeClassifier",
    "GradientBoostingClassifier",
    "ExtraTreesClassifier",
]

classifiers = [
    KNeighborsClassifier(3),
    svm.SVC(kernel="linear", C=0.025),
    svm.SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    ExtraTreesClassifier(n_estimators=28, max_depth=None, min_samples_split=11, random_state=0),
]


def _mean_and_spread(values):
    """Return the mean of *values* and twice their standard deviation."""
    return values.mean(), values.std() * 2


# Result containers, all keyed by classifier name:
#   scr   -> raw per-split test/train accuracy arrays
#   inf   -> the estimator's parameters as reported by get_params()
#   df3   -> one column per classifier, rows [mean test acc, mean train acc]
#   error -> same layout as df3, holding 2 * std instead of the mean
scr = {}
inf = {}
df3 = pd.DataFrame()
error = pd.DataFrame()

# Five random 80/20 train/test splits, seeded so every model sees the same ones.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
scoring = ['accuracy', 'precision', 'f1', 'recall']

for name, clf in zip(names, classifiers):
    # Cross-validate this estimator, keeping the train-side scores as well.
    scores = cross_validate(clf, X, y, cv=cv, scoring=scoring, return_train_score=True)

    test_accuracy = scores['test_accuracy']
    train_accuracy = scores['train_accuracy']
    test_mean, test_spread = _mean_and_spread(test_accuracy)
    train_mean, train_spread = _mean_and_spread(train_accuracy)
    f1_mean, f1_spread = _mean_and_spread(scores['test_f1'])
    precision_mean, precision_spread = _mean_and_spread(scores['test_precision'])
    recall_mean, recall_spread = _mean_and_spread(scores['test_recall'])

    inf[name] = clf.get_params()
    scr[name] = {'test_score': test_accuracy,
                 'train_score': train_accuracy}
    df3[name] = [test_mean, train_mean]
    error[name] = [test_spread, train_spread]

    # Report mean (+/- 2 * std) over the splits; the ANSI escape codes wrap
    # the test accuracy so that it stands out in terminal/notebook output.
    accuracy_report = "Test Accuracy: \033[91m %0.2f  (+/- %0.2f)\033[0m, Train Accuracy: %0.2f (+/- %0.2f)" % (
        test_mean, test_spread, train_mean, train_spread)
    print(name, ":", accuracy_report)

    detail_report = "f1: %0.2f (+/- %0.2f), Precision: %0.2f (+/- %0.2f), recall: %0.2f (+/- %0.2f)" % (
        f1_mean, f1_spread, precision_mean, precision_spread, recall_mean, recall_spread)
    print(name, ":", detail_report)

In [None]:
# Grouped bar chart of the mean test/train accuracy of every classifier,
# with the values stored in `error` (2 * std over the CV splits) drawn as
# error bars. The figure is saved to 'test2png.png'.
fig, ax = plt.subplots(figsize=(18.5, 10.5))

# Rows 0 and 1 of `df3` / `error` hold the test and train values. Label them
# and transpose so that each classifier becomes one group on the x axis.
row_labels = {0: 'test', 1: 'train'}
mean_scores = df3.rename(index=row_labels).T
score_spread = error.rename(index=row_labels).T

# pandas derives the x tick positions and labels from the frame's index (the
# classifier names). The tick labels are therefore not set by hand: calling
# set_xticklabels() on the still-empty axes assigns labels before the tick
# positions are fixed, which matplotlib warns about, and pandas would
# overwrite them anyway. The redundant `tick_label` keyword is dropped for
# the same reason.
ax = mean_scores.plot.bar(yerr=score_spread, ax=ax)

# Set the axis label after plotting so it cannot be reset by the plot call.
ax.set_ylabel('Scores')
fig.savefig('test2png.png', dpi=100)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
# Exhaustive hyper-parameter search for ExtraTreesClassifier on (X, y).
# Two metrics are tracked for every candidate; the model that is refit on
# the full data at the end is the one that is best on 'AUC'.
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Candidate grid: n_estimators in 15..29 crossed with min_samples_split in 2..34.
search_space = dict(
    n_estimators=range(15, 30),
    min_samples_split=range(2, 35),
)
base_estimator = ExtraTreesClassifier(random_state=0)

gs = GridSearchCV(
    estimator=base_estimator,
    param_grid=search_space,
    scoring=scoring,
    cv=5,
    refit='AUC',
    n_jobs=4,  # evaluate candidates with 4 parallel jobs
)
gs.fit(X, y)

# Per-candidate scores and timings collected during the search.
results = gs.cv_results_

In [None]:
# Display the hyper-parameter combination selected by the grid search
# (the search was configured with refit='AUC', so "best" refers to that metric).
gs.best_params_