In [1]:
import numpy as np
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def build_discriminator():
    model = Sequential()
    model.add(Dense(41, input_dim=41, activation='relu'))  # discriminator takes 41 values from our dataset
    model.add(Dense(30, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # outputs 0 to 1, 1 being read and 0 being fake

    # model.summary()

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(hidden1, hidden2, hidden3):
    model = Sequential()
    model.add(Dense(hidden1, input_dim=41))  # arbitrarily selected 100 for our input noise vector?
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden2))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden3))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(41, activation='relu'))  # outputs a generated vector of the same size as our data (41)

    # model.summary()

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [10]:
def trainGAN(gen_hidden1, gen_hidden2, gen_hidden3):
    batch_size = 256
    epochs = 7000
    optimizer = Adam(0.0002, 0.5)
    
    dataframe = pd.read_csv('kdd_neptune_only_5000.csv').sample(500) # sample 100 data points randomly from the csv
    
    # apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    #to visually judge results
    print("Real neptune attacks:")
    print(dataset[:2])
    
    # Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]
    
    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    # build the discriminator portion
    discriminator = build_discriminator();
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # build the generator portion
    generator = build_generator(gen_hidden1, gen_hidden2, gen_hidden3)
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    # build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    
    #break condition for training (when diverging)
    loss_increase_count = 0;
    prev_g_loss = 0;
    
    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------
        
        # selecting batch_size random attacks from our training data
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx]
        
        # generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        # create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        # loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 100 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
        # if our generator loss icreased this iteration, increment the counter by 1
        if (g_loss - prev_g_loss) > 0:
            loss_increase_count = loss_increase_count + 1
        else: 
            loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
            
        prev_g_loss = g_loss
            
        if loss_increase_count > 5:
            print('Stoping on iteration: ', epoch)
            break
            
        if epoch % 20 == 0:
            f = open("GANresultsNeptune.txt", "a")
            np.savetxt("GANresultsNeptune.txt", gen_attacks, fmt="%.0f")
            f.close()
            
    # peek at our results
    results = np.loadtxt("GANresultsNeptune.txt")
    print("Generated Neptune attacks: ")
    print(results[:2])
        
        


In [5]:
# Initialize Random Number Generator
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset

dataframe = pd.read_csv("normalAndNeptune.csv")#, header=True) 

# samples 10000 random data points from 500k
dataframe = dataframe.sample(n=15000)
# LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

# apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
indices_of_neptune = np.where(attack_labels == 'neptune.')
neptune_index = indices_of_neptune[0]
dataset = dataframe_encoded.values

print(attack_labels)
print(neptune_index)

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]


['neptune.' 'normal.']
[0]


In [6]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# print(dummy_y)
#print(len(dummy_y[0]))
num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample
# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample
# we need to set output layer to be equal to the length of our dummy_y vectors
print(num_of_classes)


2


In [7]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    inputs = 41
    hidden_layer1 = 10
    hidden_layer2 = 5
    hidden_layer3 = 0
    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
    
    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
    if hidden_layer2 != 0:
        model.add(Dense(hidden_layer2, activation='relu'))
    if hidden_layer3 != 0:
        model.add(Dense(hidden_layer3, activation='relu'))
    model.add(Dense(outputs, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
    return model

In [8]:
#for i in range(0,10):
estimator = KerasClassifier(build_fn=baseline_model, epochs=32, batch_size=200, verbose=2)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

trained_classifier = estimator.fit(X, Y)
print(type(estimator))

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))



print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

f = open("discriminatorResults.txt", "a+")
f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
f.close()


Epoch 1/32
 - 0s - loss: 7.0078 - acc: 0.5446
Epoch 2/32
 - 0s - loss: 1.0455 - acc: 0.8850
Epoch 3/32
 - 0s - loss: 0.0806 - acc: 0.9850
Epoch 4/32
 - 0s - loss: 0.0517 - acc: 0.9879
Epoch 5/32
 - 0s - loss: 0.0356 - acc: 0.9896
Epoch 6/32
 - 0s - loss: 0.0316 - acc: 0.9918
Epoch 7/32
 - 0s - loss: 0.0282 - acc: 0.9936
Epoch 8/32
 - 0s - loss: 0.0239 - acc: 0.9961
Epoch 9/32
 - 0s - loss: 0.0209 - acc: 0.9977
Epoch 10/32
 - 0s - loss: 0.0185 - acc: 0.9990
Epoch 11/32
 - 0s - loss: 0.0170 - acc: 0.9992
Epoch 12/32
 - 0s - loss: 0.0157 - acc: 0.9995
Epoch 13/32
 - 0s - loss: 0.0146 - acc: 0.9996
Epoch 14/32
 - 0s - loss: 0.0137 - acc: 0.9996
Epoch 15/32
 - 0s - loss: 0.0124 - acc: 0.9996
Epoch 16/32
 - 0s - loss: 0.0116 - acc: 0.9996
Epoch 17/32
 - 0s - loss: 0.0084 - acc: 0.9996
Epoch 18/32
 - 0s - loss: 0.0033 - acc: 0.9996
Epoch 19/32
 - 0s - loss: 0.0029 - acc: 0.9996
Epoch 20/32
 - 0s - loss: 0.0028 - acc: 0.9998
Epoch 21/32
 - 0s - loss: 0.0027 - acc: 0.9999
Epoch 22/32
 - 0s - lo

Epoch 16/32
 - 0s - loss: 0.0031 - acc: 0.9996
Epoch 17/32
 - 0s - loss: 0.0027 - acc: 0.9996
Epoch 18/32
 - 0s - loss: 0.0025 - acc: 0.9996
Epoch 19/32
 - 0s - loss: 0.0023 - acc: 0.9996
Epoch 20/32
 - 0s - loss: 0.0020 - acc: 0.9996
Epoch 21/32
 - 0s - loss: 0.0020 - acc: 0.9997
Epoch 22/32
 - 0s - loss: 0.0019 - acc: 0.9997
Epoch 23/32
 - 0s - loss: 0.0018 - acc: 0.9997
Epoch 24/32
 - 0s - loss: 0.0018 - acc: 0.9996
Epoch 25/32
 - 0s - loss: 0.0017 - acc: 0.9997
Epoch 26/32
 - 0s - loss: 0.0016 - acc: 0.9996
Epoch 27/32
 - 0s - loss: 0.0016 - acc: 0.9997
Epoch 28/32
 - 0s - loss: 0.0016 - acc: 0.9997
Epoch 29/32
 - 0s - loss: 0.0015 - acc: 0.9997
Epoch 30/32
 - 0s - loss: 0.0015 - acc: 0.9997
Epoch 31/32
 - 0s - loss: 0.0015 - acc: 0.9998
Epoch 32/32
 - 0s - loss: 0.0015 - acc: 0.9998
Epoch 1/32
 - 0s - loss: 0.7303 - acc: 0.9436
Epoch 2/32
 - 0s - loss: 0.0934 - acc: 0.9967
Epoch 3/32
 - 0s - loss: 0.0645 - acc: 0.9992
Epoch 4/32
 - 0s - loss: 0.0582 - acc: 0.9993
Epoch 5/32
 - 0s 

Epoch 32/32
 - 0s - loss: 0.0031 - acc: 0.9997
Epoch 1/32
 - 0s - loss: 3.6434 - acc: 0.7114
Epoch 2/32
 - 0s - loss: 0.0960 - acc: 0.9890
Epoch 3/32
 - 0s - loss: 0.0254 - acc: 0.9940
Epoch 4/32
 - 0s - loss: 0.0129 - acc: 0.9970
Epoch 5/32
 - 0s - loss: 0.0080 - acc: 0.9979
Epoch 6/32
 - 0s - loss: 0.0052 - acc: 0.9987
Epoch 7/32
 - 0s - loss: 0.0037 - acc: 0.9993
Epoch 8/32
 - 0s - loss: 0.0026 - acc: 0.9993
Epoch 9/32
 - 0s - loss: 0.0019 - acc: 0.9993
Epoch 10/32
 - 0s - loss: 0.0013 - acc: 0.9996
Epoch 11/32
 - 0s - loss: 8.8578e-04 - acc: 0.9996
Epoch 12/32
 - 0s - loss: 6.7544e-04 - acc: 0.9999
Epoch 13/32
 - 0s - loss: 5.4646e-04 - acc: 0.9999
Epoch 14/32
 - 0s - loss: 4.7077e-04 - acc: 0.9999
Epoch 15/32
 - 0s - loss: 4.1357e-04 - acc: 0.9999
Epoch 16/32
 - 0s - loss: 3.8763e-04 - acc: 0.9999
Epoch 17/32
 - 0s - loss: 3.2566e-04 - acc: 0.9999
Epoch 18/32
 - 0s - loss: 2.7569e-04 - acc: 0.9999
Epoch 19/32
 - 0s - loss: 2.5192e-04 - acc: 0.9999
Epoch 20/32
 - 0s - loss: 2.3876e

Epoch 13/32
 - 0s - loss: 0.0024 - acc: 0.9994
Epoch 14/32
 - 0s - loss: 0.0023 - acc: 0.9994
Epoch 15/32
 - 0s - loss: 0.0022 - acc: 0.9994
Epoch 16/32
 - 0s - loss: 0.0021 - acc: 0.9994
Epoch 17/32
 - 0s - loss: 0.0019 - acc: 0.9994
Epoch 18/32
 - 0s - loss: 0.0018 - acc: 0.9994
Epoch 19/32
 - 0s - loss: 0.0017 - acc: 0.9995
Epoch 20/32
 - 0s - loss: 0.0017 - acc: 0.9996
Epoch 21/32
 - 0s - loss: 0.0016 - acc: 0.9996
Epoch 22/32
 - 0s - loss: 0.0015 - acc: 0.9996
Epoch 23/32
 - 0s - loss: 0.0014 - acc: 0.9996
Epoch 24/32
 - 0s - loss: 0.0014 - acc: 0.9996
Epoch 25/32
 - 0s - loss: 0.0013 - acc: 0.9996
Epoch 26/32
 - 0s - loss: 0.0013 - acc: 0.9996
Epoch 27/32
 - 0s - loss: 0.0012 - acc: 0.9996
Epoch 28/32
 - 0s - loss: 0.0011 - acc: 0.9996
Epoch 29/32
 - 0s - loss: 9.9636e-04 - acc: 0.9996
Epoch 30/32
 - 0s - loss: 9.1287e-04 - acc: 0.9996
Epoch 31/32
 - 0s - loss: 8.0776e-04 - acc: 0.9997
Epoch 32/32
 - 0s - loss: 6.6556e-04 - acc: 0.9998
Epoch 1/32
 - 0s - loss: 0.3868 - acc: 0.964

In [None]:
"""
f = open("GeneratorHypersAbove50percentAccuracy.txt", "w")
f.write("""""" Hidden layer counts for Generator model that resulted in over 50% generated attacks labeled correctly:
    ------------------------------------------------------------------------------------------------
    """""")
f.close()
"""

while(1):
    # generate random numbers for the hidden layer sizes of our generator
    gen_hidden1 = np.random.randint(1, 101)
    gen_hidden2 = np.random.randint(1, 101)
    gen_hidden3 = np.random.randint(1, 101)
    
    i = 0
    
    
    # train 5 times on each setup, in case we get unlucky initalization on an otherwise good setup
    while i < 5:
        # create a unique filename in case we want to store the results (good accuracy)
        result_filename = "GANresultsNeptune%.0f%.0f%.0fiter%.0f.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)
        
        # load generate attacks from file
        results = np.loadtxt("GANresultsNeptune.txt")

        # predict attack lables (as encoded integers)
        y_pred = estimator.predict(results)
        print(y_pred)

        # create appropriate labels for our generated neptune attacks
        neptune_labels = np.full((len(results),), neptune_index[0])

        # convert integer labels back to string, get all unique strings and their count
        predicted_as_label = attack_labels[y_pred]
        unique_labels = np.unique(predicted_as_label)

        for label in unique_labels:
            print("Attack type: %s     number predicted:  %.0f" % (label, len(np.where(predicted_as_label == label)[0])))
    
        print()
        # create a confusion matrix of the results
        cm = confusion_matrix(neptune_labels, y_pred)
        
        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))
        
        if accuracy > .50:
            f = open("GeneratorHypersAbove50percentAccuracyNeptune.txt", "a")
            f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))
            f.close()
            
            f = open(result_filename, "w")
            f.close()
            np.savetxt(result_filename, results, fmt="%.0f")
        
        i = i + 1
            


Real neptune attacks:
[[  0   0   9   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0 140   9   0   0   0   0   4   2   0   4   9   4   3   0
    0   0   0   0   0   0]
 [  0   0  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  16  11   0   0   0   0  11   2   0   4  11   5   1   0
    0   0   0   0   0   0]]
0 [D loss: 0.395958, acc.: 63.48%] [G loss: 0.616264] [Loss change: 0.616, Loss increases: 0]
100 [D loss: 0.581769, acc.: 50.39%] [G loss: 0.417016] [Loss change: -0.006, Loss increases: 1]
200 [D loss: 0.609354, acc.: 50.39%] [G loss: 0.379611] [Loss change: -0.006, Loss increases: 0]
300 [D loss: 0.652682, acc.: 50.00%] [G loss: 0.370601] [Loss change: 0.003, Loss increases: 1]
400 [D loss: 0.652847, acc.: 50.20%] [G loss: 0.358442] [Loss change: 0.001, Loss increases: 1]
500 [D loss: 0.682156, acc.: 50.00%] [G loss: 0.338889] [Loss change: -0.012, Loss increases: 2]
600 [D loss: 0.680580, acc.: 50.00%] [G loss: 0.34

0 [D loss: 0.427849, acc.: 50.20%] [G loss: 0.492551] [Loss change: 0.493, Loss increases: 0]
100 [D loss: 0.784889, acc.: 50.00%] [G loss: 0.283358] [Loss change: 0.004, Loss increases: 1]
200 [D loss: 0.857896, acc.: 50.00%] [G loss: 0.258796] [Loss change: 0.003, Loss increases: 1]
300 [D loss: 0.868165, acc.: 50.00%] [G loss: 0.245174] [Loss change: -0.008, Loss increases: 1]
400 [D loss: 0.879024, acc.: 50.00%] [G loss: 0.238002] [Loss change: 0.000, Loss increases: 0]
500 [D loss: 0.866680, acc.: 50.00%] [G loss: 0.253638] [Loss change: -0.002, Loss increases: 3]
600 [D loss: 0.874471, acc.: 50.00%] [G loss: 0.252261] [Loss change: -0.007, Loss increases: 2]
700 [D loss: 0.846795, acc.: 50.78%] [G loss: 0.260655] [Loss change: -0.012, Loss increases: 2]
800 [D loss: 0.829869, acc.: 51.37%] [G loss: 0.272381] [Loss change: 0.010, Loss increases: 0]
900 [D loss: 0.821491, acc.: 50.00%] [G loss: 0.250497] [Loss change: -0.008, Loss increases: 0]
1000 [D loss: 0.829914, acc.: 50.00%]

100 [D loss: 0.572944, acc.: 50.00%] [G loss: 0.405306] [Loss change: -0.001, Loss increases: 0]
200 [D loss: 0.816454, acc.: 50.00%] [G loss: 0.270080] [Loss change: -0.003, Loss increases: 0]
300 [D loss: 0.998530, acc.: 50.00%] [G loss: 0.212990] [Loss change: -0.003, Loss increases: 1]
400 [D loss: 1.145167, acc.: 50.00%] [G loss: 0.168651] [Loss change: -0.006, Loss increases: 0]
500 [D loss: 1.141919, acc.: 50.00%] [G loss: 0.158888] [Loss change: -0.002, Loss increases: 0]
600 [D loss: 1.167729, acc.: 50.00%] [G loss: 0.145288] [Loss change: -0.005, Loss increases: 0]
700 [D loss: 1.122823, acc.: 50.00%] [G loss: 0.151430] [Loss change: -0.005, Loss increases: 0]
800 [D loss: 1.065877, acc.: 50.00%] [G loss: 0.165284] [Loss change: 0.012, Loss increases: 0]
900 [D loss: 1.112819, acc.: 50.00%] [G loss: 0.152839] [Loss change: 0.005, Loss increases: 0]
1000 [D loss: 1.081627, acc.: 50.00%] [G loss: 0.143989] [Loss change: -0.007, Loss increases: 3]
1100 [D loss: 1.116577, acc.: 5