In [2]:
	
import numpy as np
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from keras.optimizers import Adam

# Discriminator

### Pseudocode

Function build_discriminator:
    Initialize a sequential model
    Add a dense layer with 41 units, input dimension 41, and 'relu' activation to the model
    Add a dense layer with 30 units and 'relu' activation to the model
    Add a dense layer with 15 units and 'relu' activation to the model
    Add a dense layer with 1 unit and 'sigmoid' activation to the model
    Define an input 'attack' with shape 41
    Pass 'attack' through the model to get 'validity'
    Return a model with input 'attack' and output 'validity'
End Function

### Code

In [None]:

def build_discriminator():
    """Build the discriminator network of the GAN.

    Four Dense layers (41 -> 30 -> 15 -> 1) are stacked in a Sequential model,
    which is then wrapped in a functional Model with a 41-value input.

    Returns:
        A Model mapping a 41-value record to a single sigmoid output.
    """
    n_features = 41  # the discriminator takes 41 values from our dataset

    classifier = Sequential([
        Dense(41, input_dim=n_features, activation='relu'),
        Dense(30, activation='relu'),
        Dense(15, activation='relu'),
        # Sigmoid output between 0 and 1: 1 being real and 0 being fake.
        Dense(1, activation='sigmoid'),
    ])

    # classifier.summary()

    record_in = Input(shape=(n_features,))
    return Model(record_in, classifier(record_in))

# Generator

### Pseudocode

Function build_generator with parameters hidden1, hidden2, hidden3:
    Initialize a sequential model
    Add a dense layer with 'hidden1' units, input dimension 41 to the model
    Add a LeakyReLU layer with alpha 0.2 to the model
    Add a BatchNormalization layer with momentum 0.8 to the model
    Add a dense layer with 'hidden2' units to the model
    Add a LeakyReLU layer with alpha 0.2 to the model
    Add a BatchNormalization layer with momentum 0.8 to the model
    Add a dense layer with 'hidden3' units to the model
    Add a LeakyReLU layer with alpha 0.2 to the model
    Add a BatchNormalization layer with momentum 0.8 to the model
    Add a dense layer with 41 units and 'relu' activation to the model
    Define an input 'noise' with shape 41
    Pass 'noise' through the model to get 'attack'
    Return a model with input 'noise' and output 'attack'
End Function

### Code

In [None]:

def build_generator(hidden1, hidden2, hidden3):
    """Build the generator network of the GAN.

    Three hidden stages, each a Dense layer followed by LeakyReLU(alpha=0.2)
    and BatchNormalization(momentum=0.8), feed a final 41-unit ReLU layer.

    Args:
        hidden1: Number of units in the first hidden Dense layer.
        hidden2: Number of units in the second hidden Dense layer.
        hidden3: Number of units in the third hidden Dense layer.

    Returns:
        A Model mapping a 41-value noise vector to a generated 41-value vector.
    """
    noise_dim = 41  # the noise vector has the same width as a data record

    net = Sequential()
    for depth, units in enumerate((hidden1, hidden2, hidden3)):
        # Only the first layer declares the input dimension.
        if depth == 0:
            net.add(Dense(units, input_dim=noise_dim))
        else:
            net.add(Dense(units))
        net.add(LeakyReLU(alpha=0.2))
        net.add(BatchNormalization(momentum=0.8))

    # Output a generated vector of the same size as our data (41).
    net.add(Dense(41, activation='relu'))

    # net.summary()

    z = Input(shape=(noise_dim,))
    return Model(z, net(z))

# Train

# Pseudocode for trainGAN function

1. Define function trainGAN with parameters gen_hidden1, gen_hidden2, gen_hidden3
2. Set batch_size, epochs, and optimizer
3. Load and sample data from the CSV file
4. Encode the categorical data in the dataframe
5. Print some real attacks for visual inspection
6. Split the dataset into features (X_train) and labels (Y_train)
7. Define labels for valid and fake attacks
8. Build and compile the discriminator
9. Build the generator
10. Define the input and output for the combined model
11. Build and compile the combined model
12. Initialize variables for tracking the generator's loss
13. For each epoch in the range of epochs:
    1. Select a batch of real attacks from the training data
    2. Generate a batch of fake attacks
    3. Train the discriminator on both real and fake attacks
    4. Train the generator to try to fool the discriminator
    5. Print the losses of the discriminator and generator every 100 epochs
    6. If the generator's loss has increased for more than 5 consecutive epochs, stop the training
    7. Every 20 epochs, save the generated attacks to a text file
14. Load the generated attacks from the text file and print some of them

In [None]:
def trainGAN(gen_hidden1, gen_hidden2, gen_hidden3,
             batch_size=256, epochs=7000,
             data_path='../../CSV/portsweep.csv',
             results_path="../../Results/GANresultsportsweep.txt"):
    """Train a GAN on sampled records and save generated samples to a text file.

    Args:
        gen_hidden1: Units in the generator's first hidden layer.
        gen_hidden2: Units in the generator's second hidden layer.
        gen_hidden3: Units in the generator's third hidden layer.
        batch_size: Number of real and of generated records per training step.
        epochs: Maximum number of training iterations.
        data_path: CSV file the real records are read from.
        results_path: Text file the latest generated batch is written to
            (overwritten on every save).

    Returns:
        None. Progress is printed and generated records are written to
        ``results_path``.

    Raises:
        ValueError: from ``DataFrame.sample`` if the CSV has fewer than 500 rows.
    """
    optimizer = Adam(0.0002, 0.5)

    # Sample 500 data points randomly from the csv.
    dataframe = pd.read_csv(data_path).sample(500)

    # Apply "le.fit_transform" to every column (usually only works on 1 column).
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values

    # To visually judge results.
    print("Real portsweep attacks:")
    print(dataset[:2])

    # The first 41 columns are the features fed to the networks.
    X_train = dataset[:, 0:41].astype(float)

    # Labels for data: 1 for valid attacks, 0 for fake (generated) attacks.
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    # Build and compile the discriminator while it is still trainable, so that
    # its own train_on_batch calls keep updating its weights.
    discriminator = build_discriminator()
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Build the generator portion.
    generator = build_generator(gen_hidden1, gen_hidden2, gen_hidden3)

    # Input and output of our combined model.
    z = Input(shape=(41,))
    attack = generator(z)

    # Freeze the discriminator inside the combined model: generator updates
    # must not also push the discriminator towards labelling fakes as valid.
    discriminator.trainable = False
    validity = discriminator(attack)

    # Build the combined model from generator and discriminator.
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)

    # Break condition for training (when diverging).
    loss_increase_count = 0
    prev_g_loss = None  # there is no previous loss before the first iteration

    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------

        # Select batch_size random attacks (with replacement) from the training data.
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx]

        # Generate a matrix of noise vectors.
        noise = np.random.normal(0, 1, (batch_size, 41))

        # Create an array of generated attacks.
        gen_attacks = generator.predict(noise)

        # Each call returns [loss, accuracy] as specified at compile time.
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # ---------------------
        #  Train Generator
        # ---------------------

        # The generator is rewarded when the discriminator calls its output valid.
        g_loss = combined.train_on_batch(noise, valid)

        # The first iteration has nothing to compare against, so it counts as
        # "no change" rather than as an increase.
        loss_change = 0.0 if prev_g_loss is None else g_loss - prev_g_loss

        if epoch % 100 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, loss_change, loss_increase_count))

        # Count consecutive increases of the generator loss; any non-increase
        # resets the counter because we are still training effectively.
        if loss_change > 0:
            loss_increase_count = loss_increase_count + 1
        else:
            loss_increase_count = 0

        prev_g_loss = g_loss

        if loss_increase_count > 5:
            print('Stoping on iteration: ', epoch)
            break

        if epoch % 20 == 0:
            # np.savetxt opens the path itself and overwrites the file, so only
            # the most recent generated batch is kept.
            np.savetxt(results_path, gen_attacks, fmt="%.0f")

    # Peek at our results.
    results = np.loadtxt(results_path)
    print("Generated portsweep attacks: ")
    print(results[:2])

## *** Preprocessing ***

# model

Function baseline_model:
    Initialize a sequential model
    Define 'inputs' as 41
    Define 'hidden_layer1' as 10
    Define 'hidden_layer2' as 5
    Define 'hidden_layer3' as 0
    Define 'outputs' as the number of classes
    Add a dense layer with 'hidden_layer1' units, input dimension 'inputs', and 'relu' activation to the model
    If 'hidden_layer2' is not 0:
        Add a dense layer with 'hidden_layer2' units and 'relu' activation to the model
    If 'hidden_layer3' is not 0:
        Add a dense layer with 'hidden_layer3' units and 'relu' activation to the model
    Add a dense layer with 'outputs' units and 'softmax' activation to the model
    Compile the model with 'categorical_crossentropy' loss, 'adam' optimizer, and 'accuracy' as a metric
    Return the model
End Function

In [None]:
# define baseline model
def baseline_model():
    """Create and compile the baseline dense classifier.

    The network has a 41-value input, a 10-unit ReLU layer, optional further
    ReLU layers (a width of 0 leaves the layer out), and a softmax output with
    one unit per class.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, 'adam'
        optimizer, accuracy metric).
    """
    n_inputs = 41
    first_hidden = 10
    optional_hidden = (5, 0)  # widths of hidden layers 2 and 3; 0 = skip
    # Must follow the global class count in case we forget to sample:
    # there could end up being 10 classes or 12, etc.
    n_outputs = num_of_classes

    classifier = Sequential()
    classifier.add(Dense(first_hidden, input_dim=n_inputs, activation='relu'))
    for width in optional_hidden:
        if width == 0:
            continue
        classifier.add(Dense(width, activation='relu'))
    classifier.add(Dense(n_outputs, activation='softmax'))

    classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# build

Initialize a KerasClassifier 'estimator' with the function 'baseline_model', 32 epochs, batch size 200, and verbosity level 2
Initialize a KFold 'kfold' with 10 splits, shuffle enabled, and a random state 'seed'
Predict the cross-validated outputs 'y_pred' for the estimator on inputs 'X' and labels 'dummy_y' with cross-validation 'kfold'
Calculate the cross-validation score 'results' for the estimator on inputs 'X' and labels 'dummy_y' with cross-validation 'kfold'
Fit the estimator on inputs 'X' and labels 'Y' and store the trained classifier
Print the type of the estimator
Calculate the confusion matrix 'cm' between labels 'Y' and predicted labels 'y_pred'
Print the confusion matrix
Print the total sum of the confusion matrix
Print the accuracy of the predictions, calculated as the trace of the confusion matrix divided by the total sum of the confusion matrix
Print the Matthews correlation coefficient between labels 'Y' and predicted labels 'y_pred'
Print the mean and standard deviation of the cross-validation scores, multiplied by 100 to get percentages
Open the file "../../Results/discriminatorResults.txt" in append mode as 'f'
Write the true positives, false positives, false negatives, and true negatives from the confusion matrix to the file
Close the file

In [None]:
# NOTE(review): `seed`, `X`, `Y` and `dummy_y` are not defined in the cells
# visible here; presumably they come from an earlier preprocessing cell.

# Wrap the Keras model so scikit-learn's cross-validation helpers can drive it.
estimator = KerasClassifier(build_fn=baseline_model, epochs=32, batch_size=200, verbose=2)

# 10-fold cross-validation: out-of-fold predictions and per-fold scores.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

# Fit on the full data so `estimator` can be used for prediction afterwards.
trained_classifier = estimator.fit(X, Y)
print(type(estimator))

# Confusion matrix of the cross-validated predictions against the labels.
cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))

print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Append the top-left 2x2 corner of the confusion matrix to the results log;
# the context manager closes the file even if the write fails.
with open("../../Results/discriminatorResults.txt", "a+") as f:
    f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))

# results

Start an infinite loop:
    Generate random integers 'gen_hidden1', 'gen_hidden2', and 'gen_hidden3' between 1 and 100 (inclusive)
    Initialize a counter 'i' to 0
    Start a loop that runs 100 times:
        Define a unique result filename 'result_filename' using 'gen_hidden1', 'gen_hidden2', 'gen_hidden3', and 'i'
        Train the GAN with 'gen_hidden1', 'gen_hidden2', and 'gen_hidden3'
        Load the generated attacks 'results' from a file
        Predict the labels 'y_pred' for the generated attacks using the estimator
        Print 'y_pred'
        Create labels 'portsweep_labels' for the generated attacks
        Convert the predicted labels 'y_pred' to string labels 'predicted_as_label'
        Get the unique labels 'unique_labels' in 'predicted_as_label'
        For each label in 'unique_labels':
            Print the label and its count in 'predicted_as_label'
        Print a newline
        Calculate the confusion matrix 'cm' between 'portsweep_labels' and 'y_pred'
        Calculate the accuracy as the trace of 'cm' divided by the sum of 'cm'
        Print 'cm', the total sum of 'cm', and the accuracy
        If the accuracy is greater than 0.5:
            Open a file in append mode and write the accuracy, 'gen_hidden1', 'gen_hidden2', 'gen_hidden3', 'i', and 'result_filename' to the file
            Close the file
            Define a new result filename 'result_filename' with a directory prefix
            Open 'result_filename' in write mode and close it
            Save 'results' to 'result_filename'
        Increment 'i' by 1
End infinite loop

In [None]:
# Random search over generator hidden-layer sizes. Runs until interrupted.
# NOTE(review): `estimator`, `portsweep_index` and `attack_labels` are expected
# to exist already; the latter two are not defined in the cells visible here.
while True:
    # Generate random numbers (1..100) for the hidden layer sizes of our generator.
    gen_hidden1 = np.random.randint(1, 101)
    gen_hidden2 = np.random.randint(1, 101)
    gen_hidden3 = np.random.randint(1, 101)

    # Train 100 times on each setup, in case we get unlucky initialization
    # on an otherwise good setup.
    for i in range(100):
        # Unique filename (already includes the results directory) in case we
        # want to store the results (good accuracy).
        result_filename = "../../Results/GANresultsportsweep%.0f%.0f%.0fiter%.0ftry2.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)

        # Load generated attacks from file.
        results = np.loadtxt("../../Results/GANresultsportsweep.txt")

        # Predict attack labels (as encoded integers).
        y_pred = estimator.predict(results)
        print(y_pred)

        # Every generated record is labelled as portsweep, the class the GAN imitates.
        portsweep_labels = np.full((len(results),), portsweep_index[0])

        # Convert integer labels back to strings and count each unique string.
        predicted_as_label = attack_labels[y_pred]
        unique_labels, label_counts = np.unique(predicted_as_label, return_counts=True)

        for label, count in zip(unique_labels, label_counts):
            print("Attack type: %s     number predicted:  %.0f" % (label, count))

        print()
        # Create a confusion matrix of the results.
        cm = confusion_matrix(portsweep_labels, y_pred)

        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))

        if accuracy > .50:
            # Log the hyperparameters; the context manager closes the file
            # even if the write fails.
            with open("../../Results/GeneratorHypersAbove50percentAccuracyportsweep.txt", "a") as f:
                f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))

            # Save under the same path that was just logged. The directory
            # prefix is already part of result_filename, so it is not prepended
            # again; np.savetxt creates/overwrites the file itself.
            np.savetxt(result_filename, results, fmt="%.0f")