In [1]:
import numpy as np # imports a fast numerical programming library
import scipy as sp #imports stats functions, amongst other things
import matplotlib as mpl # this actually imports matplotlib
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
from pandas import DataFrame
import sklearn
from sklearn import model_selection
import pickle

In [9]:
#!/usr/bin/python

import time

def calculate_metrics(y_test,Y_predicted):

    from sklearn import metrics
    from sklearn.metrics import classification_report,confusion_matrix

    accuracy = metrics.accuracy_score(y_test,Y_predicted)
    #print "accuracy = "+str(round(accuracy * 100,2))+"%"

    confusion_mat = confusion_matrix(y_test,Y_predicted)

    print("Confusion Matrix: ")
    print(confusion_mat)
    print("Confusion Matrix Shape: ", confusion_mat.shape)

    print("TP\tFP\tFN\tTN")
    for i in range(confusion_mat.shape[0]):
        TP = round(float(confusion_mat[i,i]),2)  # correctly labeled as i
        FP = round(float(confusion_mat[:,i].sum()),2) - TP  # incorrectly labeled as i
        FN = round(float(confusion_mat[i,:].sum()),2) - TP  # incorrectly labeled as non-i
        TN = round(float(confusion_mat.sum().sum()),2) - TP - FP - FN
        print(str(TP)+"\t"+str(FP)+"\t"+str(FN)+"\t"+str(TN)),
        sensitivity = round(TP / (TP + FN),2)
        specificity = round(TN / (TN + FP),2)
        print("Sensitivity: "+str(sensitivity))
        print("Specificity: "+str(specificity))


    f_score = metrics.f1_score(y_test,Y_predicted)
    print("The F1 score: ", f_score)
    print("Accuracy: ", accuracy)
    

def neural_network(dataset,class_labels,test_size):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neural_network import MLPClassifier
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42) 
    model = MLPClassifier(hidden_layer_sizes=(100), activation='logistic',random_state = 42)
    model.fit(X_train,y_train)
    Y_predicted = model.predict(X_test)
    
    
    return y_test,Y_predicted


def random_forests(dataset,class_labels,test_size):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42)   
    model = RandomForestClassifier(n_estimators = 5, criterion = 'entropy',random_state = 42)
    model.fit(X_train,y_train)
    Y_predicted = model.predict(X_test)
    return y_test,Y_predicted

def support_vector_machines(dataset,class_labels,test_size):
    import numpy as np
    from sklearn import svm
    import pandas as pd
    from sklearn.model_selection import train_test_split
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42)  
    # 'rbf' value is the gaussian kernel, 'C' is the coefficient used for regularization during training
    model = svm.SVC(kernel='rbf',C=2.0)
    model.fit(X_train,y_train)
    Y_predicted = model.predict(X_test)
    return y_test,Y_predicted

def main():

    dataset = "dataset1.csv"
    class_labels = "target_labels.csv"
    test_size = 0.3
    print("\nrunning neural networks...")
    start_time = time.time()
    y_test,Y_predicted = neural_network(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")
    print("\nrunning random forests...")
    start_time = time.time()
    y_test,Y_predicted = random_forests(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")

    print("\nrunning support vector machines...")
    start_time = time.time()
    y_test,Y_predicted = support_vector_machines(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")


if __name__ == '__main__':
    start_time = time.time()
    main()
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")


running neural networks...


  y = column_or_1d(y, warn=True)


Confusion Matrix: 
[[1246  209]
 [ 155 1707]]
Confusion Matrix Shape:  (2, 2)
TP	FP	FN	TN
1246.0	155.0	209.0	1707.0
Sensitivity: 0.86
Specificity: 0.92
1707.0	209.0	155.0	1246.0
Sensitivity: 0.92
Specificity: 0.86
The F1 score:  0.9036527263102171
Accuracy:  0.8902622851974676
runtime = 9.046810388565063 seconds

running random forests...
Confusion Matrix: 
[[1293  162]
 [ 182 1680]]
Confusion Matrix Shape:  (2, 2)
TP	FP	FN	TN
1293.0	182.0	162.0	1680.0
Sensitivity: 0.89
Specificity: 0.9
1680.0	162.0	182.0	1293.0
Sensitivity: 0.9
Specificity: 0.89
The F1 score:  0.9071274298056156
Accuracy:  0.8962918299668375
runtime = 0.1296522617340088 seconds

running support vector machines...


  y = column_or_1d(y, warn=True)


Confusion Matrix: 
[[1249  206]
 [ 131 1731]]
Confusion Matrix Shape:  (2, 2)
TP	FP	FN	TN
1249.0	131.0	206.0	1731.0
Sensitivity: 0.86
Specificity: 0.93
1731.0	206.0	131.0	1249.0
Sensitivity: 0.93
Specificity: 0.86
The F1 score:  0.9112924453803632
Accuracy:  0.8984021706361169
runtime = 1.7762465476989746 seconds
runtime = 10.953709363937378 seconds


In [None]:
#!/usr/bin/python

import time

def calculate_metrics(y_test,Y_predicted):

    from sklearn import metrics
    from sklearn.metrics import classification_report,confusion_matrix

    accuracy = metrics.accuracy_score(y_test,Y_predicted)
    #print "accuracy = "+str(round(accuracy * 100,2))+"%"

    confusion_mat = confusion_matrix(y_test,Y_predicted)

    print("Confusion Matrix: ")
    print(confusion_mat)
    print("Confusion Matrix Shape: ", confusion_mat.shape)

    print("TP\tFP\tFN\tTN\tSensitivity\tSpecificity")
    for i in range(confusion_mat.shape[0]):
        TP = round(float(confusion_mat[i,i]),2)  # correctly labeled as i
        FP = round(float(confusion_mat[:,i].sum()),2) - TP  # incorrectly labeled as i
        FN = round(float(confusion_mat[i,:].sum()),2) - TP  # incorrectly labeled as non-i
        TN = round(float(confusion_mat.sum().sum()),2) - TP - FP - FN
        print(str(TP)+"\t"+str(FP)+"\t"+str(FN)+"\t"+str(TN)),
        sensitivity = round(TP / (TP + FN),2)
        specificity = round(TN / (TN + FP),2)
        print("Sensitivity: "+str(sensitivity))
        print("Specificity: "+str(specificity))


    f_score = metrics.f1_score(y_test,Y_predicted)
    print("The F1 score: ", f_score)
    print("Accuracy: ", accuracy)
    

def neural_network(dataset,class_labels,test_size):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neural_network import MLPClassifier
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42) 
    model = MLPClassifier(hidden_layer_sizes=(100), activation='logistic',random_state = 42)
    model.fit(X_train,y_train)
    Y_predicted = model.predict(X_test)
    
    
    return y_test,Y_predicted


def random_forests(dataset,class_labels,test_size):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42)   
    model = RandomForestClassifier(n_estimators = 5, criterion = 'entropy',random_state = 42)
    model.fit(X_train,y_train)
    filename = 'finalized_model.pkl'
    pickle.dump(model, open(filename, 'wb'))
    Y_predicted = model.predict(X_test)
    return y_test,Y_predicted

def support_vector_machines(dataset,class_labels,test_size):
    import numpy as np
    from sklearn import svm
    import pandas as pd
    from sklearn.model_selection import train_test_split
    X = pd.read_csv(dataset)
    Y = pd.read_csv(class_labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= test_size, random_state=42)  
    # 'rbf' value is the gaussian kernel, 'C' is the coefficient used for regularization during training
    model = svm.SVC(kernel='rbf',C=2.0)
    model.fit(X_train,y_train)
    Y_predicted = model.predict(X_test)
    return y_test,Y_predicted

def main():

    dataset = "dataset2.csv"
    class_labels = "target_labels.csv"
    test_size = 0.3
    print("\nrunning neural networks...")
    start_time = time.time()
    y_test,Y_predicted = neural_network(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")
    print("\nrunning random forests...")
    start_time = time.time()
    y_test,Y_predicted = random_forests(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")

    print("\nrunning support vector machines...")
    start_time = time.time()
    y_test,Y_predicted = support_vector_machines(dataset,class_labels,test_size)
    calculate_metrics(y_test,Y_predicted)
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")


if __name__ == '__main__':
    start_time = time.time()
    main()
    end_time = time.time()
    print("runtime = "+str(end_time - start_time)+" seconds")


running neural networks...


  y = column_or_1d(y, warn=True)
