# Classification for POIs fusion action

In [None]:
# Library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE: the star import is kept ahead of the sklearn/keras imports (as in the
# original cell) so that the explicit imports below win on any name clash.
from helpers import *
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Load the exported feature table.
feat = pd.read_csv("features_export.csv")

# Discard the trailing column.
# NOTE(review): presumably an empty artefact of the export — confirm.
feat = feat.iloc[:, :-1]

# Keep only the rows selected by the ' acceptance' column
# (assumes that column is boolean — TODO confirm).
accepted_mask = feat[' acceptance']
X1 = feat[accepted_mask]

# Augment the accepted rows with the frame produced by the `swap_symetric`
# helper, stacking both row-wise.
X2 = swap_symetric(X1)
X = pd.concat((X1, X2), axis=0)

In [None]:
# Label column: kept raw in y1 and one-hot encoded in y.
y1 = X[' nameFusionAction']
y = pd.get_dummies(y1)

# Everything except the acceptance flag and the label is a feature; the
# `transform` helper (from helpers) turns those columns into the model input.
feature_columns = X.drop(columns=[' acceptance', ' nameFusionAction'])
X = transform(feature_columns, True)

In [None]:
# Number of feature columns in X; compare with the `input_dim` of the first
# Dense layer in `baseline_model`, which is fitted on X further down.
X.shape[1]

In [None]:
# A function for the creation of the NN using keras
def baseline_model(input_dim=97, n_classes=4, hidden_units=10):
    """Build and compile a one-hidden-layer softmax classifier.

    Called without arguments (as `KerasClassifier(build_fn=baseline_model)`
    does) it produces exactly the network that used to be hard-coded:
    97 inputs -> 10 ReLU units -> 4 softmax outputs.

    Parameters
    ----------
    input_dim : int, default 97
        Number of input features. Pass `X.shape[1]` when the feature
        matrix changes width.
    n_classes : int, default 4
        Number of units in the softmax output layer (one per one-hot
        target column).
    hidden_units : int, default 10
        Width of the single hidden layer.

    Returns
    -------
    keras.models.Sequential
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy', 'categorical_accuracy'])
    return model

In [None]:
# Classification process: cross-validate every candidate model and collect the
# mean and spread (2 * std) of its train / test scores for the comparison plot.

# Models fitted against the one-hot encoded targets (y).
names1 = ["keras","keras2", "Random Forest", "Neural Net", 
         "DecisionTreeClassifier"]

classifiers1 = [
    KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=15, verbose=0),
    KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=10, verbose=0),
    RandomForestClassifier(max_depth=9, n_estimators=3, max_features=3),
    MLPClassifier(alpha=1.11, solver='lbfgs',max_iter=1000),
    DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
    ]

# Models fitted against the raw label column (y1).
names2 = [ "Random Forest(labeled)", "Lnear SVM(labeld targets)", 
         "SVM OVO(labeld targets)"]

classifiers2 = [
    RandomForestClassifier(max_depth=2, n_estimators=7, max_features=6),
    svm.LinearSVC(multi_class='crammer_singer'),
    svm.SVC(decision_function_shape='ovo'),
    ]

scr = {}                 # name -> raw dict returned by cross_validate
inf = {}                 # name -> estimator hyper-parameters
df3 = pd.DataFrame()     # one column per model; row 0 = mean test, row 1 = mean train
error = pd.DataFrame()   # same layout as df3, holding 2 * std
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)


def _cross_validate_and_report(name, clf, features, targets, splitter):
    """Cross-validate `clf`, print a one-line summary and return the scores.

    The summary shows mean test/train score, each with +/- 2 standard
    deviations over the splits, and the mean fit time in seconds.
    Returns the dict produced by `sklearn.model_selection.cross_validate`.
    """
    scores = cross_validate(clf, features, targets, cv=splitter, return_train_score=True)
    test, train = scores['test_score'], scores['train_score']
    print(name, ":" ,"Test Accuracy: %0.2f (+/- %0.2f), Train Accuracy: %0.2f (+/- %0.2f)| %2.2f sec"
          % (test.mean(), test.std() * 2,
             train.mean(), train.std() * 2, scores['fit_time'].mean()))
    return scores


# Each group of classifiers is paired with the target encoding it is fitted on.
for targets, group_names, group_classifiers in ((y, names1, classifiers1),
                                                (y1, names2, classifiers2)):
    for name, clf in zip(group_names, group_classifiers):
        scores = _cross_validate_and_report(name, clf, X, targets, cv)

        inf[name] = clf.get_params()
        scr[name] = scores
        df3[name] = [scores['test_score'].mean(), scores['train_score'].mean()]
        error[name] = [scores['test_score'].std() * 2, scores['train_score'].std() * 2]

In [None]:
# Grouped bar chart of mean test/train score per classifier with 2*std error bars.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)

# Rows 0/1 of df3 and error hold the test/train values; after transposing,
# each classifier becomes one x position and pandas labels the ticks with the
# classifier names (the DataFrame index), so no manual tick labels are needed.
row_labels = {0: 'test', 1: 'train'}
mean_scores = df3.rename(index=row_labels).T
score_spread = error.rename(index=row_labels).T

ax = mean_scores.plot.bar(yerr=score_spread, ax=ax)
ax.set_ylabel('Scores')

fig.savefig('test2png.png', dpi=100)

An example of a grid search for a classification algorithm

In [None]:
# Exhaustive search over forest size, tree depth and features considered per
# split for a random forest, scored with 5-fold cross-validation on the raw
# labels (y1).
# return_train_score=True is needed because the plotting cell further down
# reads the mean/std *train* scores from cv_results_, which scikit-learn only
# records when this flag is set.
gs = GridSearchCV(RandomForestClassifier(),
                  param_grid={'n_estimators': range(1, 20),
                              'max_depth': range(1, 10),
                              'max_features': range(1, 10)},
                  cv=5, refit=True, n_jobs=4, return_train_score=True)
gs.fit(X, y1)
results = gs.cv_results_

In [None]:
# Get the best parameters: the combination from the grid above that GridSearchCV
# ranked first (refit=True means `gs` was also refitted with it).
gs.best_params_

In [None]:
# Plot the cross-validated score against n_estimators for the grid search.
# The grid has three parameters, so only the rows whose *other* parameters equal
# those of the best-ranked combination are drawn; plotting every row would put
# many unrelated scores on the same x value and produce a zigzag line.
fig, ax = plt.subplots(figsize=(13, 13))
ax.set_title("GridSearchCV score vs. n_estimators (other parameters at their best values)",
             fontsize=16)
ax.set_xlabel("n_estimators")
ax.set_ylabel("Score")
ax.set_xlim(1, 19)
ax.set_ylim(0.0, 1)

# The search used a single (default) scorer, so the result keys end in "_score".
best_index = np.nonzero(results['rank_test_score'] == 1)[0][0]
best_score = results['mean_test_score'][best_index]

# Get regular numpy arrays from the MaskedArrays stored in cv_results_.
n_estimators = np.array(np.ma.getdata(results['param_n_estimators']), dtype=float)

# Select the rows that share every non-n_estimators parameter with the best row.
on_best_slice = np.ones(n_estimators.shape, dtype=bool)
for key in results:
    if key.startswith('param_') and key != 'param_n_estimators':
        values = np.array(np.ma.getdata(results[key]))
        on_best_slice &= values == values[best_index]

slice_idx = np.flatnonzero(on_best_slice)
slice_idx = slice_idx[np.argsort(n_estimators[slice_idx])]  # left-to-right order
X_axis = n_estimators[slice_idx]

color = 'g'
for sample, style in (('train', '--'), ('test', '-')):
    mean_key = 'mean_%s_score' % sample
    std_key = 'std_%s_score' % sample
    if mean_key not in results:
        # Train scores exist only if the search ran with return_train_score=True.
        continue
    sample_score_mean = results[mean_key][slice_idx]
    sample_score_std = results[std_key][slice_idx]
    ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                    sample_score_mean + sample_score_std,
                    alpha=0.1 if sample == 'test' else 0, color=color)
    ax.plot(X_axis, sample_score_mean, style, color=color,
            alpha=1 if sample == 'test' else 0.7,
            label="%s (%s)" % ("score", sample))

# Plot a dotted vertical line at the best score, marked by x
ax.plot([n_estimators[best_index], ] * 2, [0, best_score],
        linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

# Annotate the best score
ax.annotate("%0.2f" % best_score,
            (n_estimators[best_index], best_score + 0.005))

ax.legend(loc="best")
ax.grid(False)
plt.show()