In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from mysql import SQLConnector

Using TensorFlow backend.


In [2]:
def build_discriminator(layer1, layer2, layer3, alpha):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41)) #discriminator takes 41 values from our dataset
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(layer2))
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(layer3))
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid')) #outputs 0 to 1, 1 being real and 0 being fake

    attack = Input(shape=(41,))
    validity = model(attack)

    return Model(attack, validity)

In [3]:
def build_generator(layer1, layer2, layer3, alpha):
    model = Sequential()
    model.add(Dense(layer1, input_dim=41))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(layer2))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(layer3))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=alpha))
    model.add(Dense(41, activation='relu'))

    noise = Input(shape=(41,))
    attack = model(noise)
    return Model(noise, attack)

In [4]:
def GAN_model(layer1, layer2, layer3, alpha):
    optimizer = Adam(0.001)
    
    #build generator and discriminator (mirrored)
    generator = build_generator(layer1, layer2, layer3, alpha)
    
    discriminator = build_discriminator(layer3, layer2, layer1, alpha)
    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    #input and output of our combined model
    z = Input(shape=(41,))
    attack = generator(z)
    validity = discriminator(attack)
    
    #build combined model from generator and discriminator
    combined = Model(z, validity)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    return combined, discriminator, generator

In [5]:
ll = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", 
            "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
            "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
            "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", 
            "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
            "dst_host_srv_rerror_rate", "attack_type"]

In [6]:
def train_loop(combined, discriminator, generator, estimator, epochs):
    epochs = epochs+1
    batch_size = 30
    conn = SQLConnector()
    data = conn.pull_kdd99(attack='back', num=968)
    dataframe = pd.DataFrame.from_records(data=data,
                columns=conn.pull_kdd99_columns(allQ=True))
    
    #apply "le.fit_transform" to every column (usually only works on 1 column)
    le = LabelEncoder()
    dataframe_encoded = dataframe.apply(le.fit_transform)
    dataset = dataframe_encoded.values
    
    f = open("backReal.txt", "a")
    np.savetxt("backReal.txt", dataset, fmt="%d")
    f.close()
    
    #labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    #Set X as our input data and Y as our label
    X_train = dataset[:, 0:41].astype(int)
    Y_train = dataset[:, 41]
    
    #break condition for training (when diverging)
    loss_increase_count = 0
    prev_g_loss = 0
    
    #generating a np array of numbers 0..batch_size-1
    idx = np.arange(batch_size)
    
    for epoch in range(epochs):
        #selecting batch_size random attacks from our training data
        #idx = np.random.randint(0, X_train.shape[0], batch_size)
        attacks = X_train[idx-1]
        
        #generate a matrix of noise vectors
        noise = np.random.normal(0, 1, (batch_size, 41))
        
        #create an array of generated attacks
        gen_attacks = generator.predict(noise)
        
        #loss functions, based on what metrics we specify at model compile time
        d_loss_real = discriminator.train_on_batch(attacks, valid)
        d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        #generator loss function
        g_loss = combined.train_on_batch(noise, valid)
        
        if epoch % 50 == 0:
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % 
                  (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count))
        
            #saving results to txt to track them as the gan is training
            f = open("back.txt", "a")
            np.savetxt("back.txt", gen_attacks, fmt="%d")
            f.close()
            
            y_pred = estimator.predict(gen_attacks)
        
            right = (y_pred >= 0.98).sum()
            wrong = len(y_pred)-(y_pred >= 0.98).sum()
            accuracy = (right/float(right+wrong))
            print("Number of right predictions: %d" % right)
            print("Number of wrong predictions: %d" % wrong)
            print("Accuracy: %.4f " % accuracy)      

In [7]:
conn = SQLConnector()
data = conn.pull_kdd99(attack='back', num=968)
data += conn.pull_kdd99(attack='normal', num=968)
dataframe = pd.DataFrame.from_records(data=data,
            columns=conn.pull_kdd99_columns(allQ=True))

#LabelEncoder, turns all our categorical data into integers
le = LabelEncoder()

dataframe_encoded = dataframe.apply(le.fit_transform)
dataset = dataframe_encoded.values

#Set X as our input data and Y as our label
X = dataset[:,0:41].astype(int)
Y = dataset[:,41]
print(Y)
X, Y = shuffle(X, Y)

[0 0 0 ... 1 1 1]


In [8]:
#Get validation data
validationToTrainRatio = 0.10
validationSize = int(validationToTrainRatio * len(X))
validationData = X[:validationSize]
validationLabels = Y[:validationSize]
X = X[validationSize:]
Y = Y[validationSize:]

#Get test data
testToTrainRatio = 0.10
testSize = int(testToTrainRatio * len(X))
testData = X[:testSize]
testLabels = Y[:testSize]
X = X[testSize:]
Y = Y[testSize:]

In [9]:
def baseline_model(layers, units, dropout_rate, input_shape, num_classes):
    model = Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=num_classes, activation='sigmoid'))
    model.compile(optimizer=Adam(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [10]:
estimator = baseline_model(layers=2, units=32, dropout_rate=0.5, input_shape=X.shape[1:], num_classes=1)

callbacks = [keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

history = estimator.fit(X,
                    Y,
                    epochs=200,
                    batch_size=256,
                    callbacks=callbacks,
                    validation_data=(validationData, validationLabels),
                    verbose=2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 1569 samples, validate on 193 samples
Epoch 1/200
 - 1s - loss: 6.5143 - acc: 0.5322 - val_loss: 2.1891 - val_acc: 0.8446
Epoch 2/200
 - 0s - loss: 6.7403 - acc: 0.5233 - val_loss: 2.3471 - val_acc: 0.8187
Epoch 3/200
 - 0s - loss: 6.6177 - acc: 0.5264 - val_loss: 3.1238 - val_acc: 0.7668


In [11]:
#Evalueating model on the testset
#[loss, accuracy]
print(estimator.evaluate(testData, testLabels))

[2.571803575274588, 0.7816091960874098]


In [12]:
#creating GAN model
combined, discriminator, generator = GAN_model(8, 16, 32, 0.2)

In [18]:
#training GAN model
train_loop(combined, discriminator, generator, estimator, 300)

0 [D loss: 1.014197, acc.: 53.33%] [G loss: 0.250629] [Loss change: 0.251, Loss increases: 0]
Number of right predictions: 30
Number of wrong predictions: 0
Accuracy: 1.0000 
50 [D loss: 0.962994, acc.: 48.33%] [G loss: 0.275464] [Loss change: 0.275, Loss increases: 0]
Number of right predictions: 30
Number of wrong predictions: 0
Accuracy: 1.0000 
100 [D loss: 0.999638, acc.: 45.00%] [G loss: 0.351420] [Loss change: 0.351, Loss increases: 0]
Number of right predictions: 30
Number of wrong predictions: 0
Accuracy: 1.0000 
150 [D loss: 1.073447, acc.: 48.33%] [G loss: 0.271253] [Loss change: 0.271, Loss increases: 0]
Number of right predictions: 30
Number of wrong predictions: 0
Accuracy: 1.0000 
200 [D loss: 0.885713, acc.: 51.67%] [G loss: 0.216677] [Loss change: 0.217, Loss increases: 0]
Number of right predictions: 30
Number of wrong predictions: 0
Accuracy: 1.0000 
250 [D loss: 1.062767, acc.: 50.00%] [G loss: 0.286770] [Loss change: 0.287, Loss increases: 0]
Number of right predic

In [14]:
conn = SQLConnector()
data = conn.pull_kdd99(attack='back', num=968)
dataframe = pd.DataFrame.from_records(data=data,
            columns=conn.pull_kdd99_columns(allQ=True))
le = LabelEncoder()
dataframe_encoded = dataframe.apply(le.fit_transform)
dataset = dataframe_encoded.values
pred = estimator.predict(dataset[:, 0:41])
counter = 0
for x in pred:
    print(x)
    if x[0] >= 0.98:
        counter += 1
print('Total %d, predicted attacks %d' % (len(pred), counter))

[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.9999904]
[1.]
[1.]
[0.978531]
[1.]
[1.]
[1.]
[0.99999994]
[1.]
[1.]
[1.]
[0.9985417]
[1.]
[1.]
[0.18840319]
[1.]
[1.]
[0.994249]
[0.9999976]
[0.99999917]
[0.99760175]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.99938214]
[0.99999976]
[1.]
[0.21893069]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.15312201]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.66860336]
[1.]
[1.]
[1.]
[0.16297367]
[0.21546447]
[1.]
[1.]
[1.]
[0.82340264]
[1.]
[1.]
[0.9997105]
[1.]
[0.18174356]
[1.]
[0.93237096]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.9998952]
[0.9965172]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.99779785]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.9999998]
[1.]
[1.]
[0.9997877]
[0.993406]
[1.]
[1.]
[1.]
[0.51716167]
[1.]
[0.99994504]
[1.]
[1.]
[1.]
[0.9999956]
[1.]
[1.]
[0.4063697]
[1.]
[0.21933219]
[1.]
[0.99999994]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.83482987]
[1.]
[1.]
[0.99926674]
[0.99999976]
[1.]
[1.]
[0.70750886]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.

In [15]:
from mysql import SQLConnector
conn = SQLConnector()

In [19]:

noise = np.random.normal(0, 1, (10, 41)) #927

#create an array of generated attacks
gen_attacks = generator.predict(noise).astype('int')
estimator.predict(gen_attacks)
#data = conn.pull_kdd99(attack='nmap', num=8)
for x in gen_attacks:
    conn.write_gens('927', x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], 
                    x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], x[20], 
                    x[21], x[22], x[23], x[24], x[25], x[26], x[27], x[28], x[29], x[30], 
                    x[31], x[32], x[33], x[34], x[35], x[36], x[37], x[38], x[39], x[40], '14')

array([[0.99999845],
       [1.        ],
       [0.99999994],
       [0.9997455 ],
       [1.        ],
       [0.9999991 ],
       [0.999996  ],
       [1.        ],
       [1.        ],
       [1.        ]], dtype=float32)