In [1]:
import numpy as np
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def build_discriminator():
    model = Sequential()
    model.add(Dense(41, input_dim=41, activation='relu'))  # discriminator takes 41 values from our dataset
    model.add(Dense(30, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # outputs 0 to 1, 1 being read and 0 being fake

    # model.summary()

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(hidden1, hidden2, hidden3):
    model = Sequential()
    model.add(Dense(hidden1, input_dim=41))  # arbitrarily selected 100 for our input noise vector?
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden2))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden3))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(41, activation='relu'))  # outputs a generated vector of the same size as our data (41)

    # model.summary()

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def trainGAN(gen_hidden1, gen_hidden2, gen_hidden3):
    batch_size = 256
    epochs = 7000
    optimizer = Adam(0.0002, 0.5)
    
    dataframe = pd.read_csv('../../CSV/kdd_neptune_only_5000.csv').sample(500) # sample 100 data points randomly from the csv
    
    # apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    #to visually judge results
    print("Real neptune attacks:")
    print(dataset[:2])
    
    # Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]
    
    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    # build the discriminator portion
    discriminator = build_discriminator();
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # build the generator portion
    generator = build_generator(gen_hidden1, gen_hidden2, gen_hidden3)
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    # build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    
    #break condition for training (when diverging)
    loss_increase_count = 0;
    prev_g_loss = 0;
    
    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------
        
        # selecting batch_size random attacks from our training data
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx]
        
        # generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        # create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        # loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 100 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
        # if our generator loss icreased this iteration, increment the counter by 1
        if (g_loss - prev_g_loss) > 0:
            loss_increase_count = loss_increase_count + 1
        else: 
            loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
            
        prev_g_loss = g_loss
            
        if loss_increase_count > 5:
            print('Stoping on iteration: ', epoch)
            break
            
        if epoch % 20 == 0:
            f = open("GANresultsNeptune.txt", "a")
            np.savetxt("GANresultsNeptune.txt", gen_attacks, fmt="%.0f")
            f.close()
            
    # peek at our results
    results = np.loadtxt("GANresultsNeptune.txt")
    print("Generated Neptune attacks: ")
    print(results[:2])
        
        


In [5]:
# Initialize Random Number Generator
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset

dataframe = pd.read_csv("../../CSV/normalAndNeptune.csv")#, header=True) 

# samples 10000 random data points from 500k
dataframe = dataframe.sample(n=15000)
# LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

# apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
indices_of_neptune = np.where(attack_labels == 'neptune.')
neptune_index = indices_of_neptune[0]
dataset = dataframe_encoded.values

print(attack_labels)
print(neptune_index)

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]


['neptune.' 'normal.']
[0]


In [6]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# print(dummy_y)
#print(len(dummy_y[0]))
num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample
# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample
# we need to set output layer to be equal to the length of our dummy_y vectors
print(num_of_classes)


2


In [7]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    inputs = 41
    hidden_layer1 = 10
    hidden_layer2 = 5
    hidden_layer3 = 0
    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
    
    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
    if hidden_layer2 != 0:
        model.add(Dense(hidden_layer2, activation='relu'))
    if hidden_layer3 != 0:
        model.add(Dense(hidden_layer3, activation='relu'))
    model.add(Dense(outputs, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
    return model

In [8]:
#for i in range(0,10):
estimator = KerasClassifier(build_fn=baseline_model, epochs=32, batch_size=200, verbose=2)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

trained_classifier = estimator.fit(X, Y)
print(type(estimator))

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))



print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

f = open("../../Results/discriminatorResults.txt", "a+")
f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
f.close()


Epoch 1/32
 - 1s - loss: 6.5063 - acc: 0.5590
Epoch 2/32
 - 0s - loss: 1.2563 - acc: 0.8716
Epoch 3/32
 - 0s - loss: 0.2356 - acc: 0.9817
Epoch 4/32
 - 0s - loss: 0.1958 - acc: 0.9830
Epoch 5/32
 - 0s - loss: 0.0743 - acc: 0.9856
Epoch 6/32
 - 0s - loss: 0.0316 - acc: 0.9870
Epoch 7/32
 - 0s - loss: 0.0182 - acc: 0.9920
Epoch 8/32
 - 0s - loss: 0.0072 - acc: 0.9982
Epoch 9/32
 - 0s - loss: 0.0050 - acc: 0.9995
Epoch 10/32
 - 0s - loss: 0.0046 - acc: 0.9996
Epoch 11/32
 - 0s - loss: 0.0026 - acc: 0.9996
Epoch 12/32
 - 0s - loss: 0.0024 - acc: 0.9996
Epoch 13/32
 - 0s - loss: 0.0023 - acc: 0.9996
Epoch 14/32
 - 0s - loss: 0.0023 - acc: 0.9996
Epoch 15/32
 - 0s - loss: 0.0022 - acc: 0.9996
Epoch 16/32
 - 0s - loss: 0.0022 - acc: 0.9996
Epoch 17/32
 - 0s - loss: 0.0022 - acc: 0.9996
Epoch 18/32
 - 0s - loss: 0.0019 - acc: 0.9996
Epoch 19/32
 - 0s - loss: 0.0019 - acc: 0.9996
Epoch 20/32
 - 0s - loss: 0.0017 - acc: 0.9996
Epoch 21/32
 - 0s - loss: 0.0015 - acc: 0.9996
Epoch 22/32
 - 0s - lo

 - 0s - loss: 0.0013 - acc: 0.9997
Epoch 16/32
 - 0s - loss: 0.0012 - acc: 0.9998
Epoch 17/32
 - 0s - loss: 0.0012 - acc: 0.9998
Epoch 18/32
 - 0s - loss: 0.0011 - acc: 0.9998
Epoch 19/32
 - 0s - loss: 0.0011 - acc: 0.9998
Epoch 20/32
 - 0s - loss: 0.0011 - acc: 0.9998
Epoch 21/32
 - 0s - loss: 9.9173e-04 - acc: 0.9998
Epoch 22/32
 - 0s - loss: 0.0010 - acc: 0.9998
Epoch 23/32
 - 0s - loss: 9.2374e-04 - acc: 0.9998
Epoch 24/32
 - 0s - loss: 9.0054e-04 - acc: 0.9998
Epoch 25/32
 - 0s - loss: 8.4276e-04 - acc: 0.9998
Epoch 26/32
 - 0s - loss: 8.3150e-04 - acc: 0.9999
Epoch 27/32
 - 0s - loss: 7.9061e-04 - acc: 0.9998
Epoch 28/32
 - 0s - loss: 7.5055e-04 - acc: 0.9999
Epoch 29/32
 - 0s - loss: 7.2023e-04 - acc: 0.9999
Epoch 30/32
 - 0s - loss: 6.9713e-04 - acc: 0.9999
Epoch 31/32
 - 0s - loss: 6.6152e-04 - acc: 0.9999
Epoch 32/32
 - 0s - loss: 6.1333e-04 - acc: 0.9999
Epoch 1/32
 - 1s - loss: 0.4251 - acc: 0.9350
Epoch 2/32
 - 0s - loss: 0.0608 - acc: 0.9941
Epoch 3/32
 - 0s - loss: 0.009

 - 0s - loss: 0.0020 - acc: 0.9996
Epoch 28/32
 - 0s - loss: 0.0016 - acc: 0.9997
Epoch 29/32
 - 0s - loss: 0.0016 - acc: 0.9997
Epoch 30/32
 - 0s - loss: 0.0015 - acc: 0.9997
Epoch 31/32
 - 0s - loss: 0.0013 - acc: 0.9997
Epoch 32/32
 - 0s - loss: 0.0012 - acc: 0.9997
Epoch 1/32
 - 1s - loss: 2.9310 - acc: 0.7453
Epoch 2/32
 - 0s - loss: 0.1427 - acc: 0.9924
Epoch 3/32
 - 0s - loss: 0.0174 - acc: 0.9954
Epoch 4/32
 - 0s - loss: 0.0088 - acc: 0.9972
Epoch 5/32
 - 0s - loss: 0.0059 - acc: 0.9983
Epoch 6/32
 - 0s - loss: 0.0043 - acc: 0.9987
Epoch 7/32
 - 0s - loss: 0.0031 - acc: 0.9993
Epoch 8/32
 - 0s - loss: 0.0023 - acc: 0.9996
Epoch 9/32
 - 0s - loss: 0.0019 - acc: 0.9996
Epoch 10/32
 - 0s - loss: 0.0015 - acc: 0.9999
Epoch 11/32
 - 0s - loss: 0.0012 - acc: 0.9999
Epoch 12/32
 - 0s - loss: 0.0011 - acc: 0.9999
Epoch 13/32
 - 0s - loss: 9.1586e-04 - acc: 0.9999
Epoch 14/32
 - 0s - loss: 8.1121e-04 - acc: 0.9999
Epoch 15/32
 - 0s - loss: 7.2522e-04 - acc: 0.9999
Epoch 16/32
 - 0s - lo

Epoch 5/32
 - 0s - loss: 0.0083 - acc: 0.9991
Epoch 6/32
 - 0s - loss: 0.0036 - acc: 0.9991
Epoch 7/32
 - 0s - loss: 0.0029 - acc: 0.9993
Epoch 8/32
 - 0s - loss: 0.0025 - acc: 0.9993
Epoch 9/32
 - 0s - loss: 0.0022 - acc: 0.9994
Epoch 10/32
 - 0s - loss: 0.0019 - acc: 0.9996
Epoch 11/32
 - 0s - loss: 0.0017 - acc: 0.9996
Epoch 12/32
 - 0s - loss: 0.0015 - acc: 0.9996
Epoch 13/32
 - 0s - loss: 0.0014 - acc: 0.9996
Epoch 14/32
 - 0s - loss: 0.0013 - acc: 0.9997
Epoch 15/32
 - 0s - loss: 0.0012 - acc: 0.9997
Epoch 16/32
 - 0s - loss: 0.0012 - acc: 0.9997
Epoch 17/32
 - 0s - loss: 0.0011 - acc: 0.9998
Epoch 18/32
 - 0s - loss: 0.0011 - acc: 0.9998
Epoch 19/32
 - 0s - loss: 0.0010 - acc: 0.9998
Epoch 20/32
 - 0s - loss: 9.7702e-04 - acc: 0.9998
Epoch 21/32
 - 0s - loss: 9.4960e-04 - acc: 0.9998
Epoch 22/32
 - 0s - loss: 9.3540e-04 - acc: 0.9998
Epoch 23/32
 - 0s - loss: 8.9780e-04 - acc: 0.9998
Epoch 24/32
 - 0s - loss: 8.7828e-04 - acc: 0.9998
Epoch 25/32
 - 0s - loss: 8.5892e-04 - acc: 0

In [9]:
"""
f = open("GeneratorHypersAbove50percentAccuracy.txt", "w")
f.write("""""" Hidden layer counts for Generator model that resulted in over 50% generated attacks labeled correctly:
    ------------------------------------------------------------------------------------------------
    """""")
f.close()
"""

while(1):
    # generate random numbers for the hidden layer sizes of our generator
    gen_hidden1 = np.random.randint(1, 101)
    gen_hidden2 = np.random.randint(1, 101)
    gen_hidden3 = np.random.randint(1, 101)
    
    i = 0
    
    
    # train 5 times on each setup, in case we get unlucky initalization on an otherwise good setup
    while i < 5:
        # create a unique filename in case we want to store the results (good accuracy)
        result_filename = "../../Results/GANresultsNeptune%.0f%.0f%.0fiter%.0f.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)
        
        # load generate attacks from file
        results = np.loadtxt("../../Results/GANresultsNeptune.txt")

        # predict attack lables (as encoded integers)
        y_pred = estimator.predict(results)
        print(y_pred)

        # create appropriate labels for our generated neptune attacks
        neptune_labels = np.full((len(results),), neptune_index[0])

        # convert integer labels back to string, get all unique strings and their count
        predicted_as_label = attack_labels[y_pred]
        unique_labels = np.unique(predicted_as_label)

        for label in unique_labels:
            print("Attack type: %s     number predicted:  %.0f" % (label, len(np.where(predicted_as_label == label)[0])))
    
        print()
        # create a confusion matrix of the results
        cm = confusion_matrix(neptune_labels, y_pred)
        
        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))
        
        if accuracy > .50:
            f = open("../../Results/GeneratorHypersAbove50percentAccuracyNeptune.txt", "a")
            f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))
            f.close()
            result_filename = "../../Results/" + result_filename
            
            f = open(result_filename, "w")
            f.close()
            np.savetxt(result_filename, results, fmt="%.0f")
        
        i = i + 1
            


Real neptune attacks:
[[ 0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0
   1  0  0  0  1  3  0  6  0  0  3  0  0  0  0  0  0  0]
 [ 0  0 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 34  9
   1  0  0  0  8  2  0  6  9  4  1  0  0  0  0  0  0  0]]
0 [D loss: 0.382158, acc.: 74.80%] [G loss: 0.631542] [Loss change: 0.632, Loss increases: 0]
100 [D loss: 0.622788, acc.: 50.00%] [G loss: 0.401457] [Loss change: -0.003, Loss increases: 0]
200 [D loss: 0.662134, acc.: 50.00%] [G loss: 0.358740] [Loss change: -0.006, Loss increases: 0]
300 [D loss: 0.694411, acc.: 50.00%] [G loss: 0.328874] [Loss change: -0.016, Loss increases: 1]
400 [D loss: 0.710439, acc.: 50.00%] [G loss: 0.320491] [Loss change: 0.007, Loss increases: 1]
500 [D loss: 0.719216, acc.: 50.00%] [G loss: 0.320770] [Loss change: 0.010, Loss increases: 0]
600 [D loss: 0.743956, acc.: 50.00%] [G loss: 0.305907] [Loss change: 0.000, Loss increases: 0]
700 [D loss: 0.758458, acc.: 50.00%] [G lo

FileNotFoundError: [Errno 2] No such file or directory: '../../Results/../../Results/GANresultsNeptune739075iter0.txt'