In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from mysql import SQLConnector

Using TensorFlow backend.


In [2]:
def build_discriminator(layer1, layer2, layer3, alpha):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41)) #discriminator takes 41 values from our dataset
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(layer2))
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(layer3))
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid')) #outputs 0 to 1, 1 being real and 0 being fake

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(layer1, layer2, layer3, alpha):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(layer2))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(layer3))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(41, activation='relu'))

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def GAN_model(layer1, layer2, layer3, alpha):
    optimizer = Adam(0.001)
    
    #build generator and discriminator (mirrored)
    generator = build_generator(layer1, layer2, layer3, alpha)
    
    discriminator = build_discriminator(layer3, layer2, layer1, alpha)
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    #build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    return combined, discriminator, generator

In [5]:
ll = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", 
            "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
            "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
            "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", 
            "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
            "dst_host_srv_rerror_rate", "attack_type"]

In [6]:
def train_loop(combined, discriminator, generator, estimator, epochs):
    epochs = epochs+1
    batch_size = 30
    conn = SQLConnector()
    data = conn.pull_kdd99(attack='nmap', num=1554)
    dataframe = pd.DataFrame.from_records(data=data,
                columns=conn.pull_kdd99_columns(allQ=True))
    
    #apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    f = open("NmapReal.txt", "a")
    np.savetxt("NmapReal.txt", dataset, fmt="%d")
    f.close()
    
    #labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    #Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(int)
    Y_train = dataset[:, 41]
    
    #break condition for training (when diverging)
    loss_increase_count = 0
    prev_g_loss = 0
    
    #generating a np array of numbers 0..batch_size-1
    idx = np.arange(batch_size)
    
    for epoch in range(epochs):
        #selecting batch_size random attacks from our training data
        #idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx-1]
        
        #generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        #create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        #loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        #generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 50 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % 
                  (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
            #saving results to txt to track them as the gan is training
            f = open("Nmap.txt", "a")
            np.savetxt("Nmap.txt", gen_attacks, fmt="%d")
            f.close()
            
            y_pred = estimator.predict(gen_attacks)
        
            right = (y_pred > 0.98).sum()
            wrong = len(y_pred)-(y_pred > 0.98).sum()
            accuracy = (right/float(right+wrong))
            print("Number of right predictions: %d" % right)
            print("Number of wrong predictions: %d" % wrong)
            print("Accuracy: %.4f " % accuracy)      

In [7]:
conn = SQLConnector()
data = conn.pull_kdd99(attack='nmap', num=1554)
data += conn.pull_kdd99(attack='normal', num=1554)
dataframe = pd.DataFrame.from_records(data=data,
            columns=conn.pull_kdd99_columns(allQ=True))

#LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

dataframe_encoded = dataframe.apply(le.fit_transform)
dataset = dataframe_encoded.values

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(int)
Y = dataset[:,41]
print(Y)
X, Y = shuffle(X, Y)

[0 0 0 ... 1 1 1]


In [8]:
#Get validation data
validationToTrainRatio = 0.10
validationSize = int(validationToTrainRatio * len(X))
validationData = X[:validationSize]
validationLabels = Y[:validationSize]
X = X[validationSize:]
Y = Y[validationSize:]

#Get test data
testToTrainRatio = 0.10
testSize = int(testToTrainRatio * len(X))
testData = X[:testSize]
testLabels = Y[:testSize]
X = X[testSize:]
Y = Y[testSize:]

In [9]:
def baseline_model(layers, units, dropout_rate, input_shape, num_classes):
    model = Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=num_classes, activation='sigmoid'))
    model.compile(optimizer=Adam(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [10]:
estimator = baseline_model(layers=2, units=32, dropout_rate=0.5, input_shape=X.shape[1:], num_classes=1)

callbacks = [keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = estimator.fit(X,
                    Y,
                    epochs=200,
                    batch_size=256,
                    callbacks=callbacks,
                    validation_data=(validationData, validationLabels),
                    verbose=2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 2519 samples, validate on 310 samples
Epoch 1/200
 - 1s - loss: 6.6433 - acc: 0.5042 - val_loss: 7.1385 - val_acc: 0.5290
Epoch 2/200
 - 0s - loss: 6.0969 - acc: 0.5280 - val_loss: 5.8557 - val_acc: 0.5903
Epoch 3/200
 - 0s - loss: 5.7502 - acc: 0.5534 - val_loss: 5.0466 - val_acc: 0.6452
Epoch 4/200
 - 0s - loss: 5.2581 - acc: 0.5879 - val_loss: 4.2942 - val_acc: 0.6935
Epoch 5/200
 - 0s - loss: 5.0003 - acc: 0.5955 - val_loss: 3.4859 - val_acc: 0.7452
Epoch 6/200
 - 0s - loss: 4.6453 - acc: 0.6252 - val_loss: 3.2013 - val_acc: 0.7677
Epoch 7/200
 - 0s - loss: 4.1059 - acc: 0.6610 - val_loss: 1.9920 - val_acc: 0.7903
Epoch 8/200
 - 0s - loss: 3.6945 - acc: 0.6765 - val_loss: 1.0234 - val_acc: 0.9000
Epoch 9/200
 - 0s - loss: 3.4357 - acc: 0.6900

In [11]:
#Evalueating model on the testset
#[loss, accuracy]
print(estimator.evaluate(testData, testLabels))

[0.6490930596142874, 0.9354838718222888]


In [12]:
#creating GAN model
combined, discriminator, generator = GAN_model(8, 16, 32, 0.2)

In [17]:
#training GAN model
train_loop(combined, discriminator, generator, estimator, 300)

0 [D loss: 0.977581, acc.: 55.00%] [G loss: 0.331942] [Loss change: 0.332, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 30
Accuracy: 0.0000 
50 [D loss: 1.051328, acc.: 53.33%] [G loss: 0.298288] [Loss change: 0.298, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 30
Accuracy: 0.0000 
100 [D loss: 1.030586, acc.: 51.67%] [G loss: 0.211810] [Loss change: 0.212, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 30
Accuracy: 0.0000 
150 [D loss: 1.056385, acc.: 50.00%] [G loss: 0.262110] [Loss change: 0.262, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 30
Accuracy: 0.0000 
200 [D loss: 0.987014, acc.: 53.33%] [G loss: 0.340660] [Loss change: 0.341, Loss increases: 0]
Number of right predictions: 0
Number of wrong predictions: 30
Accuracy: 0.0000 
250 [D loss: 1.050032, acc.: 51.67%] [G loss: 0.316486] [Loss change: 0.316, Loss increases: 0]
Number of right predic

In [14]:
conn = SQLConnector()
data = conn.pull_kdd99(attack='nmap', num=1554)
dataframe = pd.DataFrame.from_records(data=data,
            columns=conn.pull_kdd99_columns(allQ=True))
le = LabelEncoder()
dataframe_encoded = dataframe.apply(le.fit_transform)
dataset = dataframe_encoded.values
pred = estimator.predict(dataset[:, 0:41])
counter = 0
for x in pred:
    print(x)
    if x[0] <= 0.01:
        counter += 1
print('Total %d, predicted attacks %d' % (len(pred), counter))

[2.9802322e-07]
[1.2814999e-06]
[0.00041047]
[0.]
[0.]
[0.0002225]
[2.1010637e-05]
[3.5762787e-07]
[0.]
[0.]
[0.00026593]
[1.013279e-06]
[1.7881393e-07]
[8.940697e-08]
[3.5762787e-07]
[5.9604645e-08]
[0.00415692]
[0.]
[2.1457672e-06]
[2.8729439e-05]
[0.00021309]
[0.00302839]
[1.7464161e-05]
[0.00017974]
[0.]
[5.1259995e-06]
[3.5762787e-07]
[2.3841858e-07]
[0.]
[0.01252121]
[0.]
[0.]
[0.]
[2.3841858e-07]
[0.]
[8.940697e-08]
[1.4007092e-06]
[1.79708e-05]
[0.00021601]
[2.9802322e-07]
[0.]
[0.]
[0.]
[5.9604645e-08]
[0.]
[0.]
[5.9604645e-08]
[2.9802322e-08]
[3.695488e-06]
[0.]
[8.34465e-07]
[0.00023457]
[3.2782555e-07]
[0.]
[0.]
[0.00083429]
[0.]
[0.00017855]
[0.]
[3.2782555e-07]
[0.74377215]
[0.]
[0.]
[4.5865774e-05]
[8.34465e-07]
[8.136034e-06]
[0.8179622]
[0.]
[0.]
[1.4901161e-06]
[0.]
[9.641051e-05]
[2.7030706e-05]
[0.00012609]
[0.]
[0.]
[1.7285347e-06]
[7.209182e-05]
[0.]
[3.5583973e-05]
[0.2788751]
[1.5079975e-05]
[0.]
[1.4305115e-06]
[0.]
[1.3113022e-06]
[8.940697e-08]
[6.2584877e-07

In [15]:
from mysql import SQLConnector
conn = SQLConnector()

In [18]:

noise = np.random.normal(0, 1, (10, 41)) #927

#create an array of generated attacks
gen_attacks = generator.predict(noise).astype('int')
estimator.predict(gen_attacks)
#data = conn.pull_kdd99(attack='nmap', num=8)
# for x in gen_attacks:
#     conn.write_gens('927', x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], 
#                     x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], x[20], 
#                     x[21], x[22], x[23], x[24], x[25], x[26], x[27], x[28], x[29], x[30], 
#                     x[31], x[32], x[33], x[34], x[35], x[36], x[37], x[38], x[39], x[40], '18')