## Classification for POIs fusion validation action - Double data case

In [None]:
# Library imports
import matplotlib.pyplot as plt
from helpers import *
from sklearn import svm 
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Load the exported feature table (the CSV must sit in the same folder).
feat = pd.read_csv("features_export.csv")
# Keep every column except the last two.
X1 = feat[feat.columns[:feat.shape[1] - 2]]
# `test` starts out empty, is handed to transform2 and is then read by
# transform_object_pair -- presumably transform2 fills it in place; confirm
# against helpers.
test = pd.DataFrame()
X1 = transform2(X1, test)
X11, X22 = transform_object_pair(test, ' BcontainsFT_T', ' AcontainsFT_T')

In [None]:
# Classification process
#
# Every candidate classifier is evaluated with a 5-fold split of X1. In each
# fold the training rows are stacked with a second copy produced by
# swap_symetric (presumably the same pairs with the A/B roles exchanged --
# confirm against helpers), the classifier is fitted on that doubled set and
# scored on the untouched held-out rows. Per-fold accuracies are collected in
# `cross[name]['accuracy']`.

names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
    "Naive Bayes",  # "QDA",
    "DecisionTreeClassifier", "GradientBoostingClassifier",
    # The two ExtraTrees variants need distinct labels: labels are used as
    # dictionary keys, so identical labels would merge both models' scores
    # into a single entry.
    "ExtraTreesClassifier (n_estimators=28, min_samples_split=11)",
    "ExtraTreesClassifier (n_estimators=24, min_samples_split=3)",
]

classifiers = [
    KNeighborsClassifier(3),
    svm.SVC(kernel="linear", C=0.025),
    svm.SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    ExtraTreesClassifier(n_estimators=28, max_depth=None, min_samples_split=11, random_state=0),
    ExtraTreesClassifier(n_estimators=24, max_depth=None, min_samples_split=3, random_state=0),
]

# zip() silently truncates and duplicate labels silently share a dict entry,
# so fail loudly if the two lists ever drift apart.
if len(names) != len(classifiers) or len(set(names)) != len(names):
    raise ValueError("names must be unique and match classifiers one-to-one")

# name -> {'accuracy': [one held-out accuracy per fold]}
cross = {}

# Read the data
# NOTE(review): the first 40 columns are hard-coded here, whereas the loading
# cell uses "all but the last two" -- confirm both select the same columns.
feat = pd.read_csv("features_export.csv")
X1 = feat[feat.columns.values[range(40)]]
X1 = transform2(X1, test)

# Target column; it does not change between folds, so fetch it once.
z = X1[' acceptance']

kf = KFold(n_splits=5, random_state=0, shuffle=True)
for train_index, test_index in kf.split(X1):
    # Split the data. KFold yields positional indices, so features and target
    # are both sliced with .iloc to stay aligned whatever the frame's index.
    X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
    y_train, y_test = z.iloc[train_index], z.iloc[test_index]

    X2 = swap_symetric(X_train)

    # Append the expanded object columns; the swapped copy gets them in the
    # opposite order.
    X_train = pd.concat([X_train, X11.iloc[train_index], X22.iloc[train_index]], axis=1)
    X2 = pd.concat([X2, X22.iloc[train_index], X11.iloc[train_index]], axis=1)

    # Doubled training set; its target is popped from the stacked frame, so
    # `y` (not `y_train`) is what the classifiers are fitted on.
    X = pd.concat([X_train, X2], axis=0)
    y = X.pop(' acceptance')

    X_test = pd.concat([X_test, X11.loc[X_test.index], X22.loc[X_test.index]], axis=1)
    X_test.pop(' acceptance')

    for name, clf in zip(names, classifiers):
        clf.fit(X, y)
        fold_accuracy = clf.score(X_test, y_test)
        cross.setdefault(name, {'accuracy': []})['accuracy'].append(fold_accuracy)

       

In [None]:
# Shuffle-split cross-validation report for every classifier, printed next to
# the "real" per-fold accuracy collected in `cross` by the K-fold cell.
# NOTE(review): X and y are whatever the last K-fold iteration left behind
# (the doubled training set of the final fold), so the cross-validation below
# never sees that fold's held-out rows -- confirm this is intended.
inf = {}  # name -> estimator hyper-parameters
scr = {}  # name -> per-split test/train accuracy arrays
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
scoring = ['accuracy', 'precision', 'f1', 'recall']  # same for every model
for name, clf in zip(names, classifiers):
    # cross_validate does the cross-validation for this classifier.
    scores = cross_validate(clf, X, y, cv=cv, scoring=scoring, return_train_score=True)
    test_acc = scores['test_accuracy']
    train_acc = scores['train_accuracy']
    real_acc = np.asarray(cross[name]['accuracy'])

    inf[name] = clf.get_params()
    scr[name] = {'test_score': test_acc, 'train_score': train_acc}

    # Leave the estimator itself fitted on all of X, y, as the original cell did.
    clf.fit(X, y)

    # Report mean and a two-standard-deviation spread of each score.
    print(name, ":", "Test Accuracy: %0.2f (+/- %0.2f), Train Accuracy: %0.2f (+/- %0.2f)" % (
        test_acc.mean(), test_acc.std() * 2,
        train_acc.mean(), train_acc.std() * 2))
    print(name, ":", "Real Accuracy: \033[91m %0.2f  (+/- %0.2f)\033[0m" % (
        real_acc.mean(), real_acc.std() * 2))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
# Grid search over ExtraTrees hyper-parameters, tracking both ROC AUC and
# accuracy for every candidate.
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
param_grid = {
    'n_estimators': range(15, 30),
    'min_samples_split': range(2, 35),
}
gs = GridSearchCV(
    ExtraTreesClassifier(random_state=0),
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    refit='AUC',  # with several metrics, names the one used for the final refit
    n_jobs=4,
)
gs.fit(X, y)
results = gs.cv_results_

In [None]:
# Display the parameter combination the grid search selected.
gs.best_params_