# Algorithms

In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix, plot_roc_curve, plot_confusion_matrix, plot_precision_recall_curve
from sklearn.model_selection import GridSearchCV

## First experiment: normalize data

In [10]:
def readCut(dir):
    """Load the four CSV files of one train/test cut as all-text DataFrames.

    Reads ``xTrain.csv``, ``yTrain.csv``, ``xTest.csv`` and ``yTest.csv``
    from the directory ``dir``. Every column is read as text, so label
    values stay strings (the scoring code in this notebook relies on that
    by passing ``pos_label='1'``).

    Parameters
    ----------
    dir : str
        Directory holding the four CSV files. The name shadows the ``dir``
        builtin; it is kept so existing keyword callers keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(xTrain, yTrain, xTest, yTest)``, in that order.
    """
    def _read(name):
        # `error_bad_lines=True` was dropped on purpose: it only restated the
        # pandas default (raise on malformed rows) and the keyword no longer
        # exists in pandas >= 2.0, where passing it raises TypeError.
        # `dtype=str` is the portable spelling of the former dtype='unicode'.
        return pd.read_csv("{}/{}.csv".format(dir, name), sep=',', index_col=False, dtype=str)

    return tuple(_read(name) for name in ("xTrain", "yTrain", "xTest", "yTest"))

### Logistic Regression

In [3]:
# Module-level metric accumulators, all starting at zero.
# NOTE(review): nothing in the visible cells updates or reads these;
# presumably intended for averaging scores across cuts -- confirm or remove.
mean_accuracy_score = mean_precision_score = mean_recall_score = mean_f1_score = 0

In [11]:
def doLogisticRegression(xTrain, yTrain, xTest, yTest):
    """Grid-search a LogisticRegression on the training cut and print test metrics.

    The search covers L2-penalised models over 20 log-spaced values of ``C``
    (1e-4 .. 1e4), five solvers, three ``max_iter`` settings and both
    ``fit_intercept`` options, using GridSearchCV's default settings.

    Prints the best parameter combination followed by accuracy, precision,
    recall and F1 on the test cut. Precision/recall/F1 treat the string
    label ``'1'`` as the positive class. Returns nothing.

    ``yTrain`` / ``yTest`` must expose ``.values.ravel()`` (e.g. a DataFrame).
    """
    search_space = [{
        'penalty': ['l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 500, 1000],
        'fit_intercept': [True, False],
    }]
    search = GridSearchCV(LogisticRegression(), search_space)
    search.fit(xTrain, yTrain.values.ravel())

    print("Tuned hyperparameters :(best parameters) ", search.best_params_)

    predicted = search.predict(xTest)
    actual = yTest.values.ravel()

    print('Accuracy Score : ' + str(accuracy_score(actual, predicted)))
    # The remaining metrics share one call shape, so report them from a table.
    positive_class_metrics = (
        ('Precision Score : ', precision_score),
        ('Recall Score : ', recall_score),
        ('F1 Score : ', f1_score),
    )
    for prefix, metric in positive_class_metrics:
        print(prefix + str(metric(actual, predicted, pos_label='1')))


In [12]:
def doKNN(xTrain, yTrain, xTest, yTest):
    """Grid-search a KNeighborsClassifier on the training cut and print test metrics.

    The search covers ``n_neighbors`` 3..10, both weighting schemes, the
    ball-tree and kd-tree algorithms, ``leaf_size`` 20..40 and three distance
    metrics, using GridSearchCV's default settings.

    Prints the best parameter combination followed by accuracy, precision,
    recall and F1 on the test cut. Precision/recall/F1 treat the string
    label ``'1'`` as the positive class. Returns nothing.
    """
    # NOTE(review): `algorithm` and `leaf_size` are normally speed/memory knobs
    # rather than accuracy knobs -- confirm they are worth the extra grid size.
    search_space = [{
        'n_neighbors': list(range(3, 11)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': list(range(20, 41)),
        'metric': ['euclidean', 'manhattan', 'chebyshev'],
    }]
    search = GridSearchCV(KNeighborsClassifier(), search_space)
    search.fit(xTrain, yTrain.values.ravel())

    print("Tuned hyperparameters :(best parameters) ", search.best_params_)

    predicted = search.predict(xTest)
    actual = yTest.values.ravel()

    print('Accuracy Score : ' + str(accuracy_score(actual, predicted)))
    for prefix, metric in (
        ('Precision Score : ', precision_score),
        ('Recall Score : ', recall_score),
        ('F1 Score : ', f1_score),
    ):
        print(prefix + str(metric(actual, predicted, pos_label='1')))

In [13]:
def doDecisionTree(xTrain, yTrain, xTest, yTest):
    """Grid-search a DecisionTreeClassifier on the training cut and print test metrics.

    The search covers both split criteria, ``max_depth`` 3..14 plus
    unlimited depth (``None``), both splitter strategies and three
    ``max_features`` settings, using GridSearchCV's default settings.

    Prints the best parameter combination followed by accuracy, precision,
    recall and F1 on the test cut. Precision/recall/F1 treat the string
    label ``'1'`` as the positive class. Returns nothing.
    """
    depth_options = list(range(3, 15)) + [None]
    search_space = [{
        'criterion': ['gini', 'entropy'],
        'max_depth': depth_options,
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 'log2', None],
    }]
    search = GridSearchCV(DecisionTreeClassifier(), search_space)
    search.fit(xTrain, yTrain.values.ravel())

    print("Tuned hyperparameters :(best parameters) ", search.best_params_)

    predicted = search.predict(xTest)
    actual = yTest.values.ravel()

    print('Accuracy Score : ' + str(accuracy_score(actual, predicted)))
    for prefix, metric in (
        ('Precision Score : ', precision_score),
        ('Recall Score : ', recall_score),
        ('F1 Score : ', f1_score),
    ):
        print(prefix + str(metric(actual, predicted, pos_label='1')))

In [14]:
def doNeuralNetwork(xTrain, yTrain, xTest, yTest):
    """Grid-search an MLPClassifier on the training cut and print test metrics.

    The search covers two hidden-layer layouts, three activations, two
    solvers, two ``max_iter`` settings and two ``alpha`` values, using
    GridSearchCV's default settings.

    Prints the best parameter combination followed by accuracy, precision,
    recall and F1 on the test cut. Precision/recall/F1 treat the string
    label ``'1'`` as the positive class. Returns nothing.

    ``yTrain`` / ``yTest`` must expose ``.values.ravel()`` (e.g. a DataFrame).
    """
    # Dead code removed: a range of hidden-layer sizes from numberFeatures/2
    # to 2*numberFeatures ("Saul's Heuristic" in the original comment) used to
    # be computed here but was never fed into the grid. Extend
    # 'hidden_layer_sizes' below if that heuristic should be searched.
    #
    # No max_iter is passed to the constructor: the grid always overrides it,
    # so the former max_iter=500 never took effect.
    nnClassifier = MLPClassifier()
    param_grid = [{
        'hidden_layer_sizes': [(10, 30, 10), (20,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd', 'adam'],
        'max_iter': [1000, 1500],
        'alpha': [0.0001, 0.05],
    }]

    nnClassifier_cv = GridSearchCV(nnClassifier, param_grid)
    nnClassifier_cv.fit(xTrain, yTrain.values.ravel())

    print("Tuned hyperparameters :(best parameters) ", nnClassifier_cv.best_params_)

    yPred = nnClassifier_cv.predict(xTest)
    yTrue = yTest.values.ravel()

    print('Accuracy Score : ' + str(accuracy_score(yTrue, yPred)))
    print('Precision Score : ' + str(precision_score(yTrue, yPred, pos_label='1')))
    print('Recall Score : ' + str(recall_score(yTrue, yPred, pos_label='1')))
    print('F1 Score : ' + str(f1_score(yTrue, yPred, pos_label='1')))

# Data only normalized

In [8]:
# Run every classifier on cuts 1..5 stored under Normalized/cut<i>.
for i in range(1,6):
    print("-----------Normalized cut {}------------------".format(i))
    path = "Normalized/cut{}".format(i)
    # Load this cut's train/test features and labels.
    xTrain, yTrain, xTest, yTest = readCut(path)
    # Each helper grid-searches its model on the training data and prints
    # the best parameters plus test-set metrics.
    doLogisticRegression(xTrain, yTrain, xTest, yTest)
    doKNN(xTrain, yTrain, xTest, yTest)
    doDecisionTree(xTrain, yTrain, xTest, yTest)
    doNeuralNetwork(xTrain, yTrain, xTest, yTest)

-----------Normalized cut 1------------------
Tuned hyperparameters :(best parameters)  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'max_iter': 1500, 'solver': 'adam'}
Accuracy Score : 0.725
Precision Score : 0.7515151515151515
Recall Score : 0.7251461988304093
F1 Score : 0.738095238095238
-----------Normalized cut 2------------------
Tuned hyperparameters :(best parameters)  {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (10, 30, 10), 'max_iter': 1000, 'solver': 'adam'}
Accuracy Score : 0.75625
Precision Score : 0.7751479289940828
Recall Score : 0.7660818713450293
F1 Score : 0.7705882352941177
-----------Normalized cut 3------------------
Tuned hyperparameters :(best parameters)  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'max_iter': 1000, 'solver': 'adam'}
Accuracy Score : 0.765625
Precision Score : 0.7857142857142857
Recall Score : 0.7719298245614035
F1 Score : 0.7787610619469025
-----------Normalized cut 4------

# Data standardized

In [15]:
# Run every classifier on cuts 1..5 stored under Standardized/cut<i>.
for i in range(1,6):
    print("-----------Standarized cut {}------------------".format(i))
    path = "Standardized/cut{}".format(i)
    # Load this cut's train/test features and labels.
    xTrain, yTrain, xTest, yTest = readCut(path)
    # Each helper grid-searches its model on the training data and prints
    # the best parameters plus test-set metrics.
    doLogisticRegression(xTrain, yTrain, xTest, yTest)
    doKNN(xTrain, yTrain, xTest, yTest)
    doDecisionTree(xTrain, yTrain, xTest, yTest)
    doNeuralNetwork(xTrain, yTrain, xTest, yTest)

-----------Standarized cut 1------------------
Tuned hyperparameters :(best parameters)  {'C': 0.012742749857031334, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy Score : 0.71875
Precision Score : 0.7396449704142012
Recall Score : 0.7309941520467836
F1 Score : 0.735294117647059
Tuned hyperparameters :(best parameters)  {'algorithm': 'ball_tree', 'leaf_size': 20, 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Accuracy Score : 0.8375
Precision Score : 0.8287292817679558
Recall Score : 0.8771929824561403
F1 Score : 0.8522727272727272
Tuned hyperparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 13, 'max_features': None, 'splitter': 'best'}
Accuracy Score : 0.753125
Precision Score : 0.7613636363636364
Recall Score : 0.783625730994152
F1 Score : 0.7723342939481268
Tuned hyperparameters :(best parameters)  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'max_iter': 1500, 'solver': 'adam'}
Accu

In [16]:
# Better Feature Engineering

In [17]:
# Run every classifier on cuts 1..5 stored under BetterFE/cut<i>.
for i in range(1,6):
    print("-----------BetterFE cut {}------------------".format(i))
    path = "BetterFE/cut{}".format(i)
    # Load this cut's train/test features and labels.
    xTrain, yTrain, xTest, yTest = readCut(path)
    # Each helper grid-searches its model on the training data and prints
    # the best parameters plus test-set metrics.
    doLogisticRegression(xTrain, yTrain, xTest, yTest)
    doKNN(xTrain, yTrain, xTest, yTest)
    doDecisionTree(xTrain, yTrain, xTest, yTest)
    doNeuralNetwork(xTrain, yTrain, xTest, yTest)

-----------BetterFE cut 1------------------
Tuned hyperparameters :(best parameters)  {'C': 0.004832930238571752, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy Score : 0.7364620938628159
Precision Score : 0.7753623188405797
Recall Score : 0.7181208053691275
F1 Score : 0.7456445993031359


KeyboardInterrupt: 