In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def build_discriminator():
    model = Sequential() #41 30 15
    model.add(Dense(64, input_dim=41))  # discriminator takes 41 values from our dataset
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(16))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))  # outputs 0 to 1, 1 being real and 0 being fake

    # model.summary()

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(hidden1, hidden2, hidden3):
    model = Sequential()
    model.add(Dense(hidden1, input_dim=41))  # arbitrarily selected 100 for our input noise vector?
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(hidden2))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(hidden3))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(41, activation='relu'))  # outputs a generated vector of the same size as our data (41)

    # model.summary()

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def trainGAN(gen_hidden1, gen_hidden2, gen_hidden3):
    batch_size = 4999
    epochs = 3001
    optimizer = Adam(0.001)
    
    dataframe = pd.read_csv('../CSV/kdd_neptune_only_5000.csv').sample(4999)
    
    # apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    #to visually judge results
    print("Real neptune attacks:")
    print(dataset[:2])
    
    # Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]
    
    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    # build the discriminator portion
    discriminator = build_discriminator();
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # build the generator portion
    generator = build_generator(gen_hidden1, gen_hidden2, gen_hidden3)
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    # build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    
    #break condition for training (when diverging)
    loss_increase_count = 0;
    prev_g_loss = 0;
    
    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------
        
        # selecting batch_size random attacks from our training data
        #idx = np.random.randint(0, X_train.shape[0], batch_size)
        idx = np.arange(4999)
        attacks = X_train[idx]
        
        # generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        # create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        # loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 500 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
        #if our generator loss icreased this iteration, increment the counter by 1
#         if (g_loss - prev_g_loss) > 0:
#             loss_increase_count = loss_increase_count + 1
#         else: 
#             loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
            
#         prev_g_loss = g_loss
            
#         if loss_increase_count > 10:
#             print('Stoping on iteration: ', epoch)
#             break

    # generate a matrix of noise vectors
    noise = np.random.normal(0, 1, (batch_size, 41))
        
    # create an array of generated attacks
    gen_attacks = generator.predict(noise)
    f = open("GANresultsNeptune.txt", "a")
    np.savetxt("GANresultsNeptune.txt", gen_attacks, fmt="%d")
    f.close()        

In [5]:
# Initialize Random Number Generator
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset

dataframe = pd.read_csv("../CSV/normalAndNeptune.csv")

# samples 10000 random data points from 500k
dataframe = dataframe.sample(n=500000)
# LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

# apply "le.fit_transform" to every column (usually only works on 1 column)
dataframe_encoded = dataframe.apply(le.fit_transform)
attack_labels = le.classes_
indices_of_neptune = np.where(attack_labels == 'neptune.')
neptune_index = indices_of_neptune[0]
dataset = dataframe_encoded.values

print(attack_labels)
print(neptune_index)

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(float)
Y = dataset[:,41]


['neptune.' 'normal.']
[0]


In [6]:
print((Y == 1).sum())
print((Y == 0).sum())

237615
262385


In [7]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# print(dummy_y)
#print(len(dummy_y[0]))
num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample
# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample
# we need to set output layer to be equal to the length of our dummy_y vectors
print(num_of_classes)

2


In [8]:
#Get validation data
validationToTrainRatio = 0.2
validationSize = int(validationToTrainRatio * len(X))
validationData = X[:validationSize]
validationLabels = Y[:validationSize]
X = X[validationSize:]
Y = Y[validationSize:]

#Get test data
testToTrainRatio = 0.1
testSize = int(testToTrainRatio * len(X))
testData = X[:testSize]
testLabels = Y[:testSize]
X = X[testSize:]
Y = Y[testSize:]

In [9]:
print((Y == 1).sum())
print((Y == 0).sum())

171075
188925


In [10]:
def baseline_model(layers, units, dropout_rate, input_shape, num_classes):
    model = keras.Sequential()
    model.add(keras.layers.Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(keras.layers.Dense(units=units, activation=tf.nn.relu))
        model.add(keras.layers.Dropout(rate=dropout_rate))

    model.add(keras.layers.Dense(units=num_classes, activation=tf.nn.sigmoid))
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [11]:
estimator = baseline_model(layers=2, units=32, dropout_rate=0.5, input_shape=X.shape[1:], num_classes=1)

callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = estimator.fit(X,
                    Y,
                    epochs=40,
                    batch_size=1024,
                    callbacks=callbacks,
                    validation_data=(validationData, validationLabels),
                    verbose=2)

Train on 360000 samples, validate on 100000 samples
Epoch 1/40
 - 3s - loss: 1.8514 - acc: 0.8375 - val_loss: 0.0024 - val_acc: 0.9996
Epoch 2/40
 - 2s - loss: 0.2091 - acc: 0.9586 - val_loss: 0.0023 - val_acc: 0.9998
Epoch 3/40
 - 2s - loss: 0.0717 - acc: 0.9795 - val_loss: 0.0036 - val_acc: 0.9998
Epoch 4/40
 - 2s - loss: 0.0534 - acc: 0.9843 - val_loss: 0.0040 - val_acc: 0.9999


In [12]:
#Evalueating model on the testset
#[loss, accuracy]
print(estimator.evaluate(testData, testLabels))

[0.0041814853875082915, 0.9999]


In [13]:
# generate random numbers for the hidden layer sizes of our generator
gen_hidden1 = 16#np.random.randint(1, 101)
gen_hidden2 = 32#np.random.randint(1, 101)
gen_hidden3 = 64#np.random.randint(1, 101)
    
trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)

# predict attack lables (as encoded integers)
results = np.loadtxt("GANresultsNeptune.txt")
y_pred = estimator.predict(results)
        
right = float((y_pred == 1).sum())
wrong = float(len(y_pred)-(y_pred == 1).sum())
print("Number of right predictions: %d" % right)
print("Number of wrong predictions: %d" % wrong)

Real neptune attacks:
[[  0   0  38   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0 174  17   1   3   0   0   8   2   0  25  17   7   2   0
    0   0   0   0   0   0]
 [  0   0  38   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0 147  19   1   3   0   0   9   2   0  25  19   8   4   0
    0   0   0   0   0   0]]
0 [D loss: 4.258030, acc.: 46.81%] [G loss: 0.815551] [Loss change: 0.816, Loss increases: 0]
500 [D loss: 2.062886, acc.: 50.51%] [G loss: 0.091353] [Loss change: 0.091, Loss increases: 0]
1000 [D loss: 2.015170, acc.: 50.76%] [G loss: 0.128834] [Loss change: 0.129, Loss increases: 0]
1500 [D loss: 1.885279, acc.: 51.58%] [G loss: 0.169821] [Loss change: 0.170, Loss increases: 0]
2000 [D loss: 1.804629, acc.: 51.73%] [G loss: 0.208966] [Loss change: 0.209, Loss increases: 0]
2500 [D loss: 1.660054, acc.: 53.35%] [G loss: 0.273184] [Loss change: 0.273, Loss increases: 0]
3000 [D loss: 1.690592, acc.: 47.33%] [G loss: 0.