In [1]:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
import numpy
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

Using TensorFlow backend.


In [2]:
# Initialize Random Number Generator
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

In [3]:
# load dataset
# dataframe = pandas.read_csv("kddforpandatrain.csv")#, header=True)
dataframe = pandas.read_csv("kdd_dataset.csv")#, header=True)  # read the whole 10% dataset into dataframe

# samples 3000 random data points from 500k
dataframe = dataframe.sample(n=300)

# LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

# apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
dataset = dataframe_encoded.values

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]


In [4]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# print(dummy_y)
print(len(dummy_y[0]))
num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample
# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample
# we need to set output layer to be equal to the length of our dummy_y vectors


6


In [5]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    inputs = 41
    hidden_layer1 = 18
    hidden_layer2 = 6
    hidden_layer3 = 0
    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
    
    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
    if hidden_layer2 != 0:
        model.add(Dense(hidden_layer2, activation='relu'))
    if hidden_layer3 != 0:
        model.add(Dense(hidden_layer3, activation='relu'))
    model.add(Dense(outputs, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [6]:

estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

"""
trained_classifier = estimator.fit(X, Y)
print(type(estimator))
# Predicting the Test set results
y_pred = estimator.predict(X)
"""



kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(numpy.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))



print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[[ 67   0   0   0   0   0]
 [  0  71   1   0   0   0]
 [  0   2   0   0   0   0]
 [  0   1   0 156   0   0]
 [  0   1   0   0   0   0]
 [  0   1   0   0   0   0]]
total: 300
accuracy: 0.98
Matthews correlation coefficient: 0.9677710189539196
Baseline: 89.33% (21.95%)


In [7]:
def true_positive_rate(cm, i, total):
    return cm[i][i]/total
    
def false_positive_rate(cm,j):
    fp_rate = 0
    for i in range(0,len(cm)):
        if (i != j):
            fp_rate += cm[i][j]
    if ((fp_rate + cm[j][j]) != 0):
        return fp_rate/(fp_rate + cm[j][j])
    else:
        return 0;
        
def false_negative_rate(cm, i):
    fn_rate = 0;
    for j in range(0,len(cm)):
        if (i != j):
            fn_rate += cm[i][j]
    if ((fn_rate + cm[j][j]) != 0):
        return fn_rate/(fn_rate + cm[j][j])
    else:
        return 0;

def true_negative_rate(cm,i,total):
    tn_rate = 0
    for j in range(0,len(cm)):
        for k in range(0,len(cm)):
            if (j != i and k != i):
                tn_rate += cm[j][k]
    return tn_rate/total

def misclassification_rate(cm,l):
    fp_rate = 0
    fn_rate = 0
    for i in range(0,len(cm)):
        if (i != l):
            fp_rate += cm[i][l]
    for j in range(0,len(cm)):
        if (l != j):
            fn_rate += cm[l][j]
    return (fp_rate + fn_rate)/(fp_rate + fn_rate + cm[l][l])
    
def avg_true_positive_rate(cm):
    tp_rate = 0
    for i in range(0,len(cm)):
        tp_rate += true_positive_rate(cm,i,cm.sum())
    return tp_rate/len(cm)

def avg_false_positive_rate(cm):
    fp_rate = 0
    for i in range(0,len(cm)):
        fp_rate += false_positive_rate(cm,i)
    return fp_rate/len(cm)

def avg_false_negative_rate(cm):
    fn_rate = 0
    for i in range(0,len(cm)):
        fn_rate += false_negative_rate(cm,i)
    return fn_rate/len(cm)

def avg_true_negative_rate(cm):
    tn_rate = 0
    for i in range(0,len(cm)):
        tn_rate += true_negative_rate(cm,i,cm.sum())
    return tn_rate/len(cm)

def avg_misclassification_rate(cm):
    mc_rate = 0
    for i in range(0,len(cm)):
        mc_rate += misclassification_rate(cm,i)
    return mc_rate/len(cm)

def matthews(TP,TN,FP,FN):
    if ((TP + FP)*(TP + FN)*(TN + FP)*(TN + FN) == 0):
        return 0
    return (TP*TN - FP*FN)/math.sqrt((TP + FP)*(TP + FN)*(TN + FP)*(TN + FN))

def print_table(cm):
    print('{:15}'.format('Field'), end='')
    print('{:15}'.format('TP'), end='')
    print('{:15}'.format('FP'), end='')
    print('{:15}'.format('FN'), end='')
    print('{:15}'.format('TN'), end='')
    print('{:15}'.format('MC Rate'), end='')
    print('{:15}'.format('MCC'), end='')
    print()
    print('---------------------------------------------------------------------------------------------------')
    print()
    for i in range(0,len(cm)):
        print('{:15}'.format(attack_labels[i]), end='')
        print('{:15}'.format('{:.5f}'.format(true_positive_rate(cm,i,cm.sum()))), end='')
        print('{:15}'.format('{:.5f}'.format(false_positive_rate(cm,i))), end='')
        print('{:15}'.format('{:.5f}'.format(false_negative_rate(cm,i))), end='')
        print('{:15}'.format('{:.5f}'.format(true_negative_rate(cm,i,cm.sum()))), end='')
        print('{:15}'.format('{:.5f}'.format(misclassification_rate(cm,i))), end='')
        print('{:15}'.format('{:.5f}'.format(matthews(true_positive_rate(cm,i,cm.sum()),true_negative_rate(cm,i,cm.sum()),
                                                      false_positive_rate(cm,i),false_negative_rate(cm,i)))),end='')
        print()
    print()

print_table(cm)
print("Average true positive rate: " + str(avg_true_positive_rate(cm)))
print("Average false positive rate: " + str(avg_false_positive_rate(cm)))
print("Average false negative rate: " + str(avg_false_negative_rate(cm)))
print("Average true negative rate: " + str(avg_true_negative_rate(cm)))
print("Average Misclassification Rate: " + str(avg_misclassification_rate(cm)))
print("Matthews Correlation Coefficient: " + str(matthews(avg_true_positive_rate(cm),avg_true_negative_rate(cm),avg_false_positive_rate(cm),avg_false_negative_rate(cm))))





Field          TP             FP             FN             TN             MC Rate        MCC            
---------------------------------------------------------------------------------------------------

neptune.       0.22333        0.00000        0.00000        0.77667        0.00000        1.00000        
normal.        0.23667        0.06579        1.00000        0.74333        0.07792        0.15162        
portsweep.     0.00000        1.00000        1.00000        0.99000        1.00000        -0.50251       
smurf.         0.52000        0.00000        1.00000        0.47667        0.00637        0.33231        
teardrop.      0.00000        0.00000        1.00000        0.99667        1.00000        0.00000        
warezclient.   0.00000        0.00000        1.00000        0.99667        1.00000        0.00000        

Average true positive rate: 0.16333333333333333
Average false positive rate: 0.17763157894736845
Average false negative rate: 0.8333333333333334
Average tru