In [1]:
import numpy as np
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def build_discriminator():
    model = Sequential()
    model.add(Dense(41, input_dim=41, activation='relu'))  # discriminator takes 41 values from our dataset
    model.add(Dense(30, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # outputs 0 to 1, 1 being read and 0 being fake

    # model.summary()

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(hidden1, hidden2, hidden3):
    model = Sequential()
    model.add(Dense(hidden1, input_dim=41))  # arbitrarily selected 100 for our input noise vector?
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden2))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(hidden3))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(41, activation='relu'))  # outputs a generated vector of the same size as our data (41)

    # model.summary()

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def trainGAN(gen_hidden1, gen_hidden2, gen_hidden3):
    batch_size = 256
    epochs = 7000
    optimizer = Adam(0.0002, 0.5)
    
    dataframe = pd.read_csv('../../CSV/portsweep.csv').sample(500) # sample 100 data points randomly from the csv
    
    # apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    #to visually judge results
    print("Real portsweep attacks:")
    print(dataset[:2])
    
    # Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]
    
    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    # build the discriminator portion
    discriminator = build_discriminator();
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # build the generator portion
    generator = build_generator(gen_hidden1, gen_hidden2, gen_hidden3)
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    # build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    
    #break condition for training (when diverging)
    loss_increase_count = 0;
    prev_g_loss = 0;
    
    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------
        
        # selecting batch_size random attacks from our training data
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx]
        
        # generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        # create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        # loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 100 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
        # if our generator loss icreased this iteration, increment the counter by 1
        if (g_loss - prev_g_loss) > 0:
            loss_increase_count = loss_increase_count + 1
        else: 
            loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
            
        prev_g_loss = g_loss
            
        if loss_increase_count > 5:
            print('Stoping on iteration: ', epoch)
            break
            
        if epoch % 20 == 0:
            f = open("../../Results/GANresultsportsweep.txt", "a")
            np.savetxt("../../Results/GANresultsportsweep.txt", gen_attacks, fmt="%.0f")
            f.close()
            
    # peek at our results
    results = np.loadtxt("../../Results/GANresultsportsweep.txt")
    print("Generated portsweep attacks: ")
    print(results[:2])
        
        


In [None]:
# Initialize Random Number Generator
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset

dataframe = pd.read_csv("../../CSV/normalAndPortsweep.csv")#, header=True) 

# samples 10000 random data points from 500k
dataframe = dataframe.sample(n=10000)
# LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

# apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
indices_of_portsweep = np.where(attack_labels == 'portsweep.')
portsweep_index = indices_of_portsweep[0]
dataset = dataframe_encoded.values

print(attack_labels)
print(portsweep_index)

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]


['normal.' 'portsweep.']
[1]


In [None]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# print(dummy_y)
#print(len(dummy_y[0]))
num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample
# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample
# we need to set output layer to be equal to the length of our dummy_y vectors
print(num_of_classes)


2


In [None]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    inputs = 41
    hidden_layer1 = 10
    hidden_layer2 = 5
    hidden_layer3 = 0
    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
    
    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
    if hidden_layer2 != 0:
        model.add(Dense(hidden_layer2, activation='relu'))
    if hidden_layer3 != 0:
        model.add(Dense(hidden_layer3, activation='relu'))
    model.add(Dense(outputs, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
    return model

In [None]:
#for i in range(0,10):
estimator = KerasClassifier(build_fn=baseline_model, epochs=32, batch_size=200, verbose=2)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

trained_classifier = estimator.fit(X, Y)
print(type(estimator))

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))



print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

f = open("../../Results/discriminatorResults.txt", "a+")
f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
f.close()


Epoch 1/32
 - 1s - loss: 3.6850 - acc: 0.7438
Epoch 2/32
 - 0s - loss: 0.6218 - acc: 0.9487
Epoch 3/32
 - 0s - loss: 0.3135 - acc: 0.9707
Epoch 4/32
 - 0s - loss: 0.2408 - acc: 0.9778
Epoch 5/32
 - 0s - loss: 0.2146 - acc: 0.9801
Epoch 6/32
 - 0s - loss: 0.2021 - acc: 0.9813
Epoch 7/32
 - 0s - loss: 0.1929 - acc: 0.9834
Epoch 8/32
 - 0s - loss: 0.1857 - acc: 0.9851
Epoch 9/32
 - 0s - loss: 0.1791 - acc: 0.9866
Epoch 10/32
 - 0s - loss: 0.1733 - acc: 0.9876
Epoch 11/32
 - 0s - loss: 0.1576 - acc: 0.9883
Epoch 12/32
 - 0s - loss: 0.1211 - acc: 0.9904
Epoch 13/32
 - 0s - loss: 0.1069 - acc: 0.9920
Epoch 14/32
 - 0s - loss: 0.0931 - acc: 0.9921
Epoch 15/32
 - 0s - loss: 0.0733 - acc: 0.9932
Epoch 16/32
 - 0s - loss: 0.0468 - acc: 0.9944
Epoch 17/32
 - 0s - loss: 0.0297 - acc: 0.9961
Epoch 18/32
 - 0s - loss: 0.0232 - acc: 0.9969
Epoch 19/32
 - 0s - loss: 0.0205 - acc: 0.9977
Epoch 20/32
 - 0s - loss: 0.0192 - acc: 0.9980
Epoch 21/32
 - 0s - loss: 0.0186 - acc: 0.9982
Epoch 22/32
 - 0s - lo

Epoch 17/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 18/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 19/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 20/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 21/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 22/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 23/32
 - 0s - loss: 7.6599 - acc: 0.5248
Epoch 24/32
 - 0s - loss: 7.6599 - acc: 0.5247
Epoch 25/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 26/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 27/32
 - 0s - loss: 7.6599 - acc: 0.5247
Epoch 28/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 29/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 30/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 31/32
 - 0s - loss: 7.6598 - acc: 0.5248
Epoch 32/32
 - 0s - loss: 7.6598 - acc: 0.5247
Epoch 1/32
 - 1s - loss: 4.7338 - acc: 0.6560
Epoch 2/32
 - 0s - loss: 0.1010 - acc: 0.9837
Epoch 3/32
 - 0s - loss: 0.0341 - acc: 0.9957
Epoch 4/32
 - 0s - loss: 0.0283 - acc: 0.9974
Epoch 5/32
 - 0s - loss: 0.0266 - acc: 0.9976
Epoch 6/32
 - 0s -

Epoch 1/32
 - 1s - loss: 7.5778 - acc: 0.5292
Epoch 2/32
 - 0s - loss: 1.7899 - acc: 0.8459
Epoch 3/32
 - 0s - loss: 0.1810 - acc: 0.9728
Epoch 4/32
 - 0s - loss: 0.0260 - acc: 0.9951
Epoch 5/32
 - 0s - loss: 0.0200 - acc: 0.9972
Epoch 6/32
 - 0s - loss: 0.0187 - acc: 0.9974
Epoch 7/32
 - 0s - loss: 0.0178 - acc: 0.9976
Epoch 8/32
 - 0s - loss: 0.0177 - acc: 0.9978
Epoch 9/32
 - 0s - loss: 0.0167 - acc: 0.9977
Epoch 10/32
 - 0s - loss: 0.0165 - acc: 0.9979
Epoch 11/32
 - 0s - loss: 0.0147 - acc: 0.9981
Epoch 12/32
 - 0s - loss: 0.0134 - acc: 0.9982
Epoch 13/32
 - 0s - loss: 0.0119 - acc: 0.9980
Epoch 14/32
 - 0s - loss: 0.0103 - acc: 0.9981
Epoch 15/32
 - 0s - loss: 0.0087 - acc: 0.9989
Epoch 16/32
 - 0s - loss: 0.0074 - acc: 0.9990
Epoch 17/32
 - 0s - loss: 0.0067 - acc: 0.9991
Epoch 18/32
 - 0s - loss: 0.0064 - acc: 0.9993
Epoch 19/32
 - 0s - loss: 0.0062 - acc: 0.9993
Epoch 20/32
 - 0s - loss: 0.0061 - acc: 0.9991
Epoch 21/32
 - 0s - loss: 0.0059 - acc: 0.9993
Epoch 22/32
 - 0s - lo

Epoch 17/32
 - 0s - loss: 0.0138 - acc: 0.9989
Epoch 18/32
 - 0s - loss: 0.0136 - acc: 0.9989
Epoch 19/32
 - 0s - loss: 0.0134 - acc: 0.9989
Epoch 20/32
 - 0s - loss: 0.0133 - acc: 0.9989
Epoch 21/32
 - 0s - loss: 0.0131 - acc: 0.9989
Epoch 22/32
 - 0s - loss: 0.0130 - acc: 0.9989
Epoch 23/32
 - 0s - loss: 0.0129 - acc: 0.9989
Epoch 24/32
 - 0s - loss: 0.0128 - acc: 0.9989
Epoch 25/32
 - 0s - loss: 0.0127 - acc: 0.9989
Epoch 26/32
 - 0s - loss: 0.0126 - acc: 0.9989
Epoch 27/32
 - 0s - loss: 0.0125 - acc: 0.9988
Epoch 28/32
 - 0s - loss: 0.0124 - acc: 0.9988
Epoch 29/32
 - 0s - loss: 0.0124 - acc: 0.9988
Epoch 30/32
 - 0s - loss: 0.0123 - acc: 0.9989
Epoch 31/32
 - 0s - loss: 0.0122 - acc: 0.9989
Epoch 32/32
 - 0s - loss: 0.0121 - acc: 0.9989
Epoch 1/32
 - 1s - loss: 1.7670 - acc: 0.8672
Epoch 2/32
 - 0s - loss: 0.3490 - acc: 0.9619
Epoch 3/32
 - 0s - loss: 0.0645 - acc: 0.9872
Epoch 4/32
 - 0s - loss: 0.0360 - acc: 0.9918
Epoch 5/32
 - 0s - loss: 0.0285 - acc: 0.9930
Epoch 6/32
 - 0s -

In [None]:
"""
f = open("GeneratorHypersAbove50percentAccuracy.txt", "w")
f.write("""""" Hidden layer counts for Generator model that resulted in over 50% generated attacks labeled correctly:
    ------------------------------------------------------------------------------------------------
    """""")
f.close()
"""

while(1):
    # generate random numbers for the hidden layer sizes of our generator
    gen_hidden1 =  np.random.randint(1, 101)
    gen_hidden2 =  np.random.randint(1, 101)
    gen_hidden3 =  np.random.randint(1, 101)
    
    i = 0
    
    
    # train 5 times on each setup, in case we get unlucky initalization on an otherwise good setup
    while i < 100:
        # create a unique filename in case we want to store the results (good accuracy)
        result_filename = "../../Results/GANresultsportsweep%.0f%.0f%.0fiter%.0ftry2.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)
        
        # load generate attacks from file
        results = np.loadtxt("../../Results/GANresultsportsweep.txt")

        # predict attack lables (as encoded integers)
        y_pred = estimator.predict(results)
        print(y_pred)

        # create appropriate labels for our generated portsweep attacks
        portsweep_labels = np.full((len(results),), portsweep_index[0])

        # convert integer labels back to string, get all unique strings and their count
        predicted_as_label = attack_labels[y_pred]
        unique_labels = np.unique(predicted_as_label)

        for label in unique_labels:
            print("Attack type: %s     number predicted:  %.0f" % (label, len(np.where(predicted_as_label == label)[0])))
    
        print()
        # create a confusion matrix of the results
        cm = confusion_matrix(portsweep_labels, y_pred)
        
        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))
        
        if accuracy > .50:
            f = open("../../Results/GeneratorHypersAbove50percentAccuracyportsweep.txt", "a")
            f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))
            f.close()
            result_filename = "../../Results/" + result_filename
            
            f = open(result_filename, "w")
            f.close()
            np.savetxt(result_filename, results, fmt="%.0f")
        
        i = i + 1
            


Real portsweep attacks:
[[ 0  1 15  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1
   0  0  1  1  1  0  0 27  0  0 51 68  0  0  0 67  3  0]
 [ 0  1 15  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1
   0  0  1  1  1  0  0 27  0  0 63 68  0  0  0 67  3  0]]
0 [D loss: 0.377604, acc.: 53.52%] [G loss: 0.596988] [Loss change: 0.597, Loss increases: 0]
100 [D loss: 0.725978, acc.: 50.39%] [G loss: 0.335499] [Loss change: 0.007, Loss increases: 0]
200 [D loss: 0.771083, acc.: 50.39%] [G loss: 0.293191] [Loss change: -0.012, Loss increases: 1]
300 [D loss: 0.744543, acc.: 49.80%] [G loss: 0.297916] [Loss change: 0.004, Loss increases: 1]
400 [D loss: 0.776364, acc.: 50.20%] [G loss: 0.291112] [Loss change: -0.013, Loss increases: 4]
500 [D loss: 0.776631, acc.: 50.20%] [G loss: 0.299490] [Loss change: 0.011, Loss increases: 0]
600 [D loss: 0.769311, acc.: 49.80%] [G loss: 0.294165] [Loss change: -0.000, Loss increases: 0]
700 [D loss: 0.760264, acc.: 50.00%] [G 