In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [None]:
def build_discriminator(layer1, layer2, layer3, alpha):
    """Build the discriminator network.

    The network is a stack of three hidden blocks, each made of a Dense
    layer, a LeakyReLU activation and a Dropout(0.3) layer, followed by a
    single sigmoid unit.

    Args:
        layer1: Number of units in the first Dense layer.
        layer2: Number of units in the second Dense layer.
        layer3: Number of units in the third Dense layer.
        alpha: Negative-slope coefficient passed to every LeakyReLU.

    Returns:
        An uncompiled Model mapping a 41-value input to one sigmoid score
        (per the original comment: 1 means real, 0 means fake).
    """
    critic = Sequential()

    # The first block also declares the 41-value input width.
    critic.add(Dense(layer1, input_dim=41))
    critic.add(LeakyReLU(alpha=alpha))
    critic.add(Dropout(0.3))

    # The remaining hidden blocks share the same shape.
    for width in (layer2, layer3):
        critic.add(Dense(width))
        critic.add(LeakyReLU(alpha=alpha))
        critic.add(Dropout(0.3))

    # Single sigmoid unit: score between 0 and 1.
    critic.add(Dense(1, activation='sigmoid'))

    sample = Input(shape=(41,))
    return Model(sample, critic(sample))

In [None]:
def build_generator(layer1, layer2, layer3, alpha):
    """Build the generator network.

    Two hidden blocks (Dense -> BatchNormalization -> LeakyReLU) of widths
    ``layer1`` and ``layer3`` feed a 41-unit ReLU output layer.

    Args:
        layer1: Number of units in the first Dense layer.
        layer2: Accepted for signature symmetry with the discriminator but
            currently unused; no layer of this width is added.
        layer3: Number of units in the second Dense layer.
        alpha: Negative-slope coefficient passed to every LeakyReLU.

    Returns:
        An uncompiled Model mapping a 41-value noise vector to a 41-value
        generated sample.
    """
    net = Sequential()

    # Only the first Dense layer needs to declare the input width.
    hidden_specs = ((layer1, {'input_dim': 41}), (layer3, {}))
    for width, extra_kwargs in hidden_specs:
        net.add(Dense(width, **extra_kwargs))
        net.add(BatchNormalization())
        net.add(LeakyReLU(alpha=alpha))

    # ReLU keeps every generated value non-negative.
    net.add(Dense(41, activation='relu'))

    z = Input(shape=(41,))
    return Model(z, net(z))

In [None]:
def GAN_model(layer1, layer2, layer3, alpha):
    """Build and compile the generator, discriminator and stacked GAN model.

    The generator is built with widths (layer1, layer2, layer3) and the
    discriminator with the same widths in reverse order ("mirrored").

    Args:
        layer1: First width passed to the generator (last for the discriminator).
        layer2: Second width passed to both builders.
        layer3: Third width passed to the generator (first for the discriminator).
        alpha: Negative-slope coefficient for the LeakyReLU activations.

    Returns:
        A tuple ``(combined, discriminator, generator)`` where
        ``discriminator`` is compiled with binary cross-entropy and an
        accuracy metric, ``combined`` is the generator followed by the
        frozen discriminator (compiled with binary cross-entropy), and
        ``generator`` is left uncompiled.
    """
    optimizer = Adam(0.001)

    #build generator and discriminator (mirrored)
    generator = build_generator(layer1, layer2, layer3, alpha)

    discriminator = build_discriminator(layer3, layer2, layer1, alpha)
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Freeze the discriminator inside the stacked model. Keras fixes which
    # weights a model updates when that model is compiled, so the standalone
    # discriminator (compiled above) keeps learning, while `combined`
    # (compiled below) only updates the generator. Without this, every
    # generator step would also push the discriminator towards labelling
    # generated samples as real.
    discriminator.trainable = False

    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)

    #build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    return combined, discriminator, generator

In [None]:
# Column names: 41 feature names followed by the label name ("attack_type").
# Presumably these match the column order of the KDD CSV files read by this
# notebook -- confirm against the data.
ll = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "attack_type",
]

In [None]:
def train_loop(combined, discriminator, generator, epochs, classifier=None,
               batch_size=4999,
               data_path='../../../../CSV/kdd_neptune_only_5000.csv'):
    """Train the GAN on one fixed batch of records and log progress.

    A random sample of ``batch_size`` rows is read from ``data_path``, every
    column is label-encoded, and the same batch is reused at every step.
    Each step trains the discriminator on the real batch and on a freshly
    generated batch, then trains the generator through ``combined``.

    Every 50 steps the losses are printed, the generated batch is written
    to ``GANresultsNeptune.txt`` as integers, read back, and scored with
    ``classifier``.

    Args:
        combined: Compiled stacked model (generator followed by discriminator).
        discriminator: Compiled discriminator; its ``train_on_batch`` must
            return ``[loss, accuracy]``.
        generator: Generator model mapping 41-value noise to 41-value samples.
        epochs: Number of training steps; ``epochs + 1`` steps are run so
            that step number ``epochs`` itself is reached and logged.
        classifier: Model whose ``predict`` scores the generated samples.
            Defaults to the notebook-level ``estimator``.
        batch_size: Number of rows sampled from the CSV and generated per step.
        data_path: CSV file holding the real records.

    Side effects:
        Overwrites ``GANNeptune.txt`` (encoded real batch) once and
        ``GANresultsNeptune.txt`` (latest generated batch) at every logged
        step; prints progress to stdout.
    """
    if classifier is None:
        # Fall back to the classifier trained at notebook level so existing
        # four-argument calls keep working.
        classifier = estimator

    epochs = epochs + 1

    dataframe = pd.read_csv(data_path).sample(batch_size)

    # apply() calls fit_transform once per column, refitting the encoder each
    # time, so every column gets its own integer encoding.
    dataframe_encoded = dataframe.apply(LabelEncoder().fit_transform)
    dataset = dataframe_encoded.values

    # np.savetxt opens the path itself and overwrites any existing file.
    np.savetxt("GANNeptune.txt", dataset, fmt="%d")

    #labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    # The first 41 columns are the features; the whole sample is one batch.
    attacks = dataset[:, 0:41].astype(int)

    # Generator-loss tracking for the progress log.
    loss_increase_count = 0
    prev_g_loss = None

    for epoch in range(epochs):
        #generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))

        #create an array of generated attacks
        gen_attacks = generator.predict(noise)

        #loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        #generator loss function
        g_loss = combined.train_on_batch(noise, valid)

        # Change relative to the previous step (0 on the very first step).
        loss_change = 0.0 if prev_g_loss is None else g_loss - prev_g_loss
        if loss_change > 0:
            loss_increase_count += 1
        prev_g_loss = g_loss

        if epoch % 50 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" %
                  (epoch, d_loss[0], 100 * d_loss[1], g_loss, loss_change, loss_increase_count))

            # Save the latest generated batch so it can be inspected while
            # training runs. Writing with "%d" and reading back truncates
            # the generated values to integers before they are classified.
            np.savetxt("GANresultsNeptune.txt", gen_attacks, fmt="%d")
            results = np.loadtxt("GANresultsNeptune.txt", ndmin=2)

            # predict() returns scores in [0, 1] for a sigmoid-output model,
            # so threshold at 0.5 instead of testing for exact equality to 1.
            y_pred = np.asarray(classifier.predict(results))
            right = int((y_pred > 0.5).sum())
            wrong = len(y_pred) - right
            accuracy = right / float(right + wrong)
            print("Number of right predictions: %d" % right)
            print("Number of wrong predictions: %d" % wrong)
            print("Accuracy: %.4f " % accuracy)

In [None]:
#Initialize Random Number Generator
#fix random seed for reproducibility
#(seeds NumPy's global generator only; other libraries are not seeded here)
seed = 7
np.random.seed(seed)

#load dataset
dataframe = pd.read_csv("../../../../CSV/normalAndNeptune.csv")
#dataframe = pd.read_csv("CSV/normalAndNeptune.csv")

#samples n random data points
#NOTE(review): presumably this raises if the CSV has fewer than 1,000,000 rows,
#since no replace=True is passed -- confirm against the data file
dataframe = dataframe.sample(n=1000000)
#LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

#apply "le.fit_transform" to every column (usually only works on 1 column)
#the single encoder is refitted for each column in turn
dataframe_encoded = dataframe.apply(le.fit_transform)
#after the apply, le.classes_ holds the classes of whichever column was fitted
#last -- presumably the final (label) column; confirm
attack_labels = le.classes_
#positions in attack_labels whose value is 'neptune.'
indices_of_neptune = np.where(attack_labels == 'neptune.')
neptune_index = indices_of_neptune[0]
#encoded frame as a plain ndarray
dataset = dataframe_encoded.values

#Set X as our input data and Y as our label
#(first 41 columns are the inputs, column 41 is the label)
X = dataset[:,0:41].astype(int)
Y = dataset[:,41]

In [None]:
#Split the encoded dataset into validation, test and training partitions.
#The slices are taken from `dataset` instead of from X/Y themselves, so
#re-running this cell yields the same split rather than shrinking the
#training data a second time.
all_features = dataset[:, 0:41].astype(int)
all_labels = dataset[:, 41]

#Get validation data (first 5% of all rows)
validationToTrainRatio = 0.05
validationSize = int(validationToTrainRatio * len(all_features))
validationData = all_features[:validationSize]
validationLabels = all_labels[:validationSize]

#Get test data (5% of the rows that remain after the validation rows)
testToTrainRatio = 0.05
testSize = int(testToTrainRatio * (len(all_features) - validationSize))
testEnd = validationSize + testSize
testData = all_features[validationSize:testEnd]
testLabels = all_labels[validationSize:testEnd]

#Everything after the validation and test rows is training data
X = all_features[testEnd:]
Y = all_labels[testEnd:]

In [None]:
def baseline_model(layers, units, dropout_rate, input_shape, num_classes):
    """Build and compile a fully connected classifier.

    The network starts with a Dropout layer on the raw input, followed by
    ``layers - 1`` hidden blocks (ReLU Dense + Dropout) and a sigmoid
    output layer.

    Args:
        layers: Total Dense layer count including the output layer, so
            ``layers - 1`` hidden blocks are created.
        units: Number of units in each hidden Dense layer.
        dropout_rate: Rate used by every Dropout layer.
        input_shape: Shape of one input sample (without the batch axis).
        num_classes: Number of units in the sigmoid output layer.

    Returns:
        A Sequential model compiled with Adam(0.001), binary cross-entropy
        loss and an accuracy metric.
    """
    # Layers are created in the same order in which they are stacked.
    stack = [Dropout(rate=dropout_rate, input_shape=input_shape)]
    hidden_blocks = layers - 1
    for _ in range(hidden_blocks):
        stack.append(Dense(units=units, activation='relu'))
        stack.append(Dropout(rate=dropout_rate))
    stack.append(Dense(units=num_classes, activation='sigmoid'))

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        optimizer=Adam(0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [None]:
# Binary classifier: layers=2 gives one hidden block of 32 units plus the
# single sigmoid output unit.
estimator = baseline_model(
    layers=2,
    units=32,
    dropout_rate=0.5,
    input_shape=X.shape[1:],
    num_classes=1,
)

# Early stopping watches the validation loss with a patience of 2.
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

# Train on the training partition and validate on the held-out rows.
history = estimator.fit(
    x=X,
    y=Y,
    epochs=5,
    batch_size=1024,
    callbacks=callbacks,
    validation_data=(validationData, validationLabels),
    verbose=2,
)

In [None]:
#Evaluating the model on the test set
#prints [loss, accuracy] (the model is compiled with an accuracy metric)
print(estimator.evaluate(testData, testLabels))

In [None]:
#creating GAN model: widths 8, 16, 32 and LeakyReLU alpha 0.2
#(GAN_model passes the widths to the discriminator in reverse order)
combined, discriminator, generator = GAN_model(8, 16, 32, 0.2)

In [None]:
#training GAN model for 1000 epochs (train_loop runs epochs+1 steps and
#logs every 50th step); uses the notebook-level `estimator` for scoring
train_loop(combined, discriminator, generator, 1000)