In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import math
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
def build_discriminator(layer1, layer2, layer3):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41)) #discriminator takes 41 values from our dataset
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(layer2))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(layer3))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid')) #outputs 0 to 1, 1 being real and 0 being fake

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(layer1, layer2, layer3):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(layer2))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(layer3))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(41, activation='relu'))

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def GAN_model(layer1, layer2, layer3):
    optimizer = Adam(0.001)
    
    #build generator and discriminator (mirrored)
    generator = build_generator(layer1, layer2, layer3)
    
    discriminator = build_discriminator(layer3, layer2, layer1)
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    #build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    return combined, discriminator, generator

In [5]:
def train_loop(combined, discriminator, generator, epochs):
    epochs = epochs
    batch_size = 4999
    dataframe = pd.read_csv('../CSV/kdd_neptune_only_5000.csv').sample(batch_size)
    
    #apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    #labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    #Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]
    
    #break condition for training (when diverging)
    loss_increase_count = 0
    prev_g_loss = 0
    
    #generating a np array of numbers 0..batch_size-1
    idx = np.arange(batch_size)
    
    for epoch in range(epochs):
        #selecting batch_size random attacks from our training data
        #idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx]
        
        #generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        #create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        #loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        #generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 50 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % 
                  (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
            #saving results to txt to track them as the gan is training
            f = open("GANresultsNeptune.txt", "a")
            np.savetxt("GANresultsNeptune.txt", gen_attacks, fmt="%d")
            f.close()
        
            results = np.loadtxt("GANresultsNeptune.txt")
            y_pred = estimator.predict(results)
        
            right = (y_pred == 1).sum()
            wrong = len(y_pred)-(y_pred == 1).sum()
            accuracy = (right/float(right+wrong))
            print("Number of right predictions: %d" % right)
            print("Number of wrong predictions: %d" % wrong)
            print("Accuracy: %.4f " % accuracy)      

In [6]:
#Initialize Random Number Generator
#fix random seed for reproducibility
seed = 7
np.random.seed(seed)

#load dataset
dataframe = pd.read_csv("../CSV/normalAndNeptune.csv")

#samples n random data points
dataframe = dataframe.sample(n=1000000)
#LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

#apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
indices_of_neptune = np.where(attack_labels == 'neptune.')
neptune_index = indices_of_neptune[0]
dataset = dataframe_encoded.values

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]

In [7]:
#Get validation data
validationToTrainRatio = 0.05
validationSize = int(validationToTrainRatio * len(X))
validationData = X[:validationSize]
validationLabels = Y[:validationSize]
X = X[validationSize:]
Y = Y[validationSize:]

#Get test data
testToTrainRatio = 0.05
testSize = int(testToTrainRatio * len(X))
testData = X[:testSize]
testLabels = Y[:testSize]
X = X[testSize:]
Y = Y[testSize:]

In [8]:
def baseline_model(layers, units, dropout_rate, input_shape, num_classes):
    model = Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(Dense(units=units, activation=tf.nn.relu))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=num_classes, activation=tf.nn.sigmoid))
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [9]:
estimator = baseline_model(layers=2, units=32, dropout_rate=0.5, input_shape=X.shape[1:], num_classes=1)

callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = estimator.fit(X,
                    Y,
                    epochs=5,
                    batch_size=1024,
                    callbacks=callbacks,
                    validation_data=(validationData, validationLabels),
                    verbose=2)

Train on 902500 samples, validate on 50000 samples
Epoch 1/5
 - 6s - loss: 0.5924 - acc: 0.9302 - val_loss: 0.0039 - val_acc: 0.9999
Epoch 2/5
 - 5s - loss: 0.0470 - acc: 0.9861 - val_loss: 0.0065 - val_acc: 0.9998
Epoch 3/5
 - 5s - loss: 0.0325 - acc: 0.9898 - val_loss: 0.0034 - val_acc: 0.9999
Epoch 4/5
 - 5s - loss: 0.0277 - acc: 0.9909 - val_loss: 0.0014 - val_acc: 0.9999
Epoch 5/5
 - 5s - loss: 0.0246 - acc: 0.9917 - val_loss: 0.0011 - val_acc: 0.9999


In [10]:
#Evalueating model on the testset
#[loss, accuracy]
print(estimator.evaluate(testData, testLabels))

[0.001588432508278148, 0.9998947368421053]


In [11]:
#creating GAN model
combined, discriminator, generator = GAN_model(8, 16, 32)

In [None]:
#training GAN model
train_loop(combined, discriminator, generator, 3000)

0 [D loss: 2.912833, acc.: 47.12%] [G loss: 0.773026] [Loss change: 0.773, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 4999
Accuracy: 0.0000 
50 [D loss: 1.059959, acc.: 50.19%] [G loss: 0.201338] [Loss change: 0.201, Loss increases: 0]
Number of right predictions: 10
Number of wrong predictions: 4989
Accuracy: 0.0020 
100 [D loss: 1.275534, acc.: 50.31%] [G loss: 0.134777] [Loss change: 0.135, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 4999
Accuracy: 0.0000 
150 [D loss: 1.429064, acc.: 50.40%] [G loss: 0.117276] [Loss change: 0.117, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 4999
Accuracy: 0.0000 
200 [D loss: 1.550088, acc.: 50.72%] [G loss: 0.108939] [Loss change: 0.109, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 4999
Accuracy: 0.0000 
250 [D loss: 1.587757, acc.: 50.69%] [G loss: 0.113296] [Loss change: 0.113, Loss increases: 0]
Number of r