In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier

np.set_printoptions(precision = 3)

In [2]:
def plot_matrix(matrix_seq, xAx, labels):
    true_neg = list(mat[0][0] for mat in matrix_seq)
    false_neg = list(mat[1][0] for mat in matrix_seq)
    false_pos = list(mat[0][1] for mat in matrix_seq)
    true_pos = list(mat[1][1] for mat in matrix_seq)
    #xAx = list(range(len(matrix_seq)))
    plt.plot(xAx, true_neg, color = 'red', label = 'TN')
    plt.plot(xAx, false_neg, color = 'blue', label = 'FN')
    plt.plot(xAx, true_pos, color = 'green', label = 'TP')
    plt.plot(xAx, false_pos, color = 'black', label = 'FP')
    plt.legend(loc='best')
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.grid(linewidth=1)
    plt.show()
    return 


def plot_scores(scores, param, labels):
    plt.plot(param, scores)
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.grid(linewidth=1)
    plt.show()
    return

def plot_metrics(metrics, xAx, labels):
    marks = ['o','v','x','D']
    i = 0
    for key in metrics:
        plt.plot(xAx, metrics[key], label = key, marker = marks[i])
        i += 1
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.grid(linewidth = 1)
    plt.legend(loc='best')
    plt.show()
    return

def full_plot(mat, par, mat_lab):
        
#     plt.figure(figsize=(10,10))
#     plt.subplot(2,1,1)
#     plt.title("Confusion matrix")
#     plot_matrix(mat, par, mat_lab)
    
#     plt.figure(figsize = (10,10))
#     plt.subplot(2,1,1)
    
    plt.figure(figsize = (10,10))
    plt.subplot(2,1,1)
    plt.title("Positive metrics")
    metrics = get_metrics(mat, 'positive')
    best_scores = get_best_scores(metrics)
    best_scores['Parameter'] = par[best_scores['Parameter']]
    plot_metrics(metrics, par, mat_lab)
    
    print_scores(best_scores)
    
#     plt.figure(figsize = (10,10))
#     plt.subplot(2,1,2)
#     plt.title("Negative metrics")
#     metrics = get_metrics(mat, 'negative')
#     best_scores = get_best_scores(metrics)
#     best_scores['Parameter'] = par[best_scores['Parameter']]
#     plot_metrics(metrics, par, mat_lab)
    
#     print_scores(best_scores)
    return best_scores
    
def print_scores(scor):
    for key in scor:
        print(key, '\t\t-\t', "%.4f" % (scor[key]))
    return
    
def get_metrics(matrix_seq, cl):
    true_neg = np.asarray(list(mat[0][0] for mat in matrix_seq))
    false_neg = np.asarray(list(mat[1][0] for mat in matrix_seq))
    false_pos = np.asarray(list(mat[0][1] for mat in matrix_seq))
    true_pos = np.asarray(list(mat[1][1] for mat in matrix_seq))
    
    if cl == 'positive':
        accuracy = 100*(true_pos+true_neg)/(true_pos+true_neg+false_pos+false_neg)
        precision = 100*true_pos/np.add(true_pos, false_pos)
        recall = 100*true_pos/(true_pos + false_neg)
        FM = ((true_pos/(true_pos + false_pos))*(true_pos/(true_pos+false_neg)))**(1/2)
        F1score = 2*precision*recall/(precision+recall)
    elif cl == 'negative':
        accuracy = (true_pos+true_neg)/(true_pos+true_neg+false_pos+false_neg)
        precision = true_neg/np.add(true_neg, false_neg)
        recall = true_neg/(true_neg + false_pos)
        FM = ((true_pos/(true_pos + false_pos))*(true_pos/(true_pos+false_neg)))**(1/2)
        F1score = 2*precision*recall/(precision+recall)
        
    metr = dict()
    metr['Recall'] = recall
    metr['Precision'] = precision
    metr['F1 score'] = F1score
#     metr['Folwkes-Mallows'] = FM
    metr['Accuracy'] = accuracy
    return metr

def get_best_scores(metr):
    best_sc = dict()
    max_ind = np.argmax(metr['F1 score'])
    for key in metr:
        best_sc[key] = metr[key][max_ind]
    best_sc["Parameter"] = max_ind
    return best_sc

In [3]:
df = pd.read_csv('peak_data_mining.csv')

In [4]:
df.head()

Unnamed: 0,name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Target
0,semchuk_toner_high_close_7_0_Con,-1.777306e-12,0.036519,0.19119,0.455394,0.689618,0.813873,0.904906,1.0,0.937068,0.594997,0.094981,-0.348772,-0.600293,-0.676979,-0.699226,0
1,semchuk_toner_high_close_7_1_Con,0.03651911,0.19119,0.455394,0.689618,0.813873,0.904906,1.0,0.937068,0.594997,0.094981,-0.348772,-0.600293,-0.676979,-0.699226,-0.761844,0
2,semchuk_toner_high_close_7_2_Con,0.1911896,0.455394,0.689618,0.813873,0.904906,1.0,0.937068,0.594997,0.094981,-0.348772,-0.600293,-0.676979,-0.699226,-0.761844,-0.882136,0
3,semchuk_toner_high_close_7_3_Con,0.455394,0.689618,0.813873,0.904906,1.0,0.937068,0.594997,0.094981,-0.348772,-0.600293,-0.676979,-0.699226,-0.761844,-0.882136,-1.061923,0
4,semchuk_toner_high_close_7_4_Con,0.6896176,0.813873,0.904906,1.0,0.937068,0.594997,0.094981,-0.348772,-0.600293,-0.676979,-0.699226,-0.761844,-0.882136,-1.061923,-1.343781,0


In [5]:
keys = df.columns.values
keys = keys[1:-1]
print(keys)

['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14']


In [6]:
nulls = df[df['Target']==0].count()
nulls = nulls['name']
ones = df[df['Target']==1].count()
ones = ones['name']

lenDiff = abs(nulls-ones)
halfLength = int(ones/2)
print(halfLength)

df_null = df[df['Target']==0].iloc[lenDiff:]
df_ed = df[df['Target']==1]
df_train = pd.concat([df_null.head(halfLength), df_ed.head(halfLength)], ignore_index = True)
df_ver = pd.concat([df_null.tail(halfLength), df_ed.tail(halfLength)], ignore_index = True)

df_train.sample(frac=1).reset_index(drop=True)
df_ver.sample(frac=1).reset_index(drop=True)

7955


Unnamed: 0,name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Target
0,semchuk_toner_move_2_755_Con,-3.090297,-2.849647,-2.568542,-2.198858,-1.752188,-1.285374,-0.854103,-0.488344,-0.189523,0.058863,0.277234,0.479769,0.670782,0.846593,1.000000,0
1,semchuk_toner_move_3_113_Con,1.000000,0.809691,0.513109,0.197421,-0.043409,-0.139099,-0.079585,0.061579,0.152072,0.078740,-0.179297,-0.535640,-0.844307,-0.976030,-0.873613,0
2,semchuk_toner_move_2_354_Con,0.805867,0.933204,1.000000,0.973661,0.833983,0.581806,0.238929,-0.142994,-0.503175,-0.815862,-1.086277,-1.322977,-1.525722,-1.684973,-1.785632,0
3,semchuk_medium_medium_551_Con,-0.136551,-0.091662,-0.038124,0.014506,0.041371,0.036596,0.024493,0.015537,-0.004903,-0.029819,-0.010352,0.133980,0.429188,0.768393,1.000000,1
4,Omel_2_240_Vid,-1.113046,0.034576,0.486321,0.483146,0.316822,0.205264,0.227401,0.345875,0.485799,0.592368,0.649813,0.683761,0.742996,0.854871,1.000000,0
5,semchuk_toner_move_3_381_Vid,0.030185,0.192625,0.081355,-0.060792,-0.029351,0.116962,0.237748,0.416998,0.741870,1.000000,0.986105,0.688814,0.067011,-0.798350,-1.515345,0
6,semchuk5_prised_318_Con,1.000000,0.390477,-0.535429,-0.687392,0.180305,0.952827,0.895384,0.613790,0.539216,0.399433,0.269841,0.334257,0.256747,0.044268,0.114448,0
7,semchuk_toner_move_3_307_Vid,1.000000,2.893887,4.030004,4.805265,5.388382,5.838946,6.019939,5.760350,5.204484,4.691894,4.430440,4.415268,4.462518,4.351740,4.014856,0
8,semchuk_toner_move_2_465_Vid,0.627549,0.782196,0.900574,0.974208,1.000000,0.987322,0.955012,0.914093,0.860814,0.785781,0.678629,0.536795,0.376576,0.226431,0.108107,0
9,valya1_407_Vid,-0.379858,-0.211522,0.157798,0.538501,0.804918,0.955711,1.000000,0.914470,0.725723,0.506890,0.283320,0.037221,-0.193935,-0.315398,-0.282002,1


In [8]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 5), activation = 'logistic')
features = keys   

scores = []
alphas = []
conf_mat = []
for i in range(200, 500, 10):
    a = 2/i
    clf.alpha = a
    clf.fit(df_train[features].values, df_train['Target'].values)
    scoreRes = clf.score(df_ver[features].values, df_ver['Target'].values)
    res_pred = clf.predict(df_ver[features].values)
    scores.append(scoreRes)
    conf_mat.append(confusion_matrix(res_pred, df_ver['Target'].values)/halfLength)
    alphas.append(a)
full_plot(conf_mat, alphas, ["Regularization parameter", 'Score, %'])
plt.show()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').