In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import pickle
import os
import collections

from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, average_precision_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split

import seaborn as sns

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Input, BatchNormalization, LeakyReLU, Dense, Reshape, Flatten, Activation
from tensorflow.keras.layers import Dropout, multiply, GaussianNoise, MaxPooling2D, concatenate
from tensorflow.keras import initializers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint

from tqdm import tqdm

2023-02-18 19:54:42.269619: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.4.1


In [3]:
# Report the CUDA / cuDNN versions this TensorFlow build was compiled against.
# Uses the public tf.sysconfig API rather than importing the private
# tensorflow.python.platform.build_info module in the middle of the notebook.
# NOTE(review): the traceback recorded further down (CUDA_ERROR_NO_BINARY_FOR_GPU
# on a compute-capability 8.6 GPU) suggests this build cannot target that GPU;
# confirm the TensorFlow/CUDA versions against the hardware before re-running.
build_info = tf.sysconfig.get_build_info()
print(build_info['cuda_version'])
print(build_info['cudnn_version'])


10.1
7


In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [5]:
#df_frag = pd.concat(map(pd.read_csv, glob.glob('data/csv_fragmentedV3/*.csv')))


In [6]:
#df = pd.read_csv('data/test/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv')

In [7]:
# df = pd.concat(map(pd.read_csv, glob.glob('data/test/*.csv')))

In [8]:
filename = 'data/preprocessed_data.pickle'

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted preprocessing run.
# The context manager guarantees the handle is closed even if loading fails.
with open(filename, 'rb') as input_file:
    preprocessed_data = pickle.load(input_file)

Note: loading a pickle can execute arbitrary code, so only load files from a trusted source. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
# Pull the fitted label encoder and the train/test splits out of the
# dictionary that was loaded from the pickle file.
le, x_train, y_train, x_test, y_test = (
    preprocessed_data[key]
    for key in ('le', 'x_train', 'y_train', 'x_test', 'y_test')
)

In [10]:
# List the class names stored on the label encoder; the row index is the
# position of each name in `le.classes_`.
pd.DataFrame(le.classes_, columns=['Type'])

Unnamed: 0,Type
0,Benign
1,Bot


In [11]:
# Sanity checks: features and labels must have the same number of rows in each
# split, and both splits must have the same number of feature columns.
assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert x_train.shape[1] == x_test.shape[1]
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

(1065425, 78)
(1065425,)
(188017, 78)
(188017,)


In [12]:
# Peek at the encoded training labels before they are passed through
# make_labels_binary in the following cells.
y_train

array([0, 0, 0, ..., 1, 0, 1])

In [13]:
# Labels normal data as 0, anomalies as 1

def make_labels_binary(label_encoder, labels, normal_class='Benign'):
    """Collapse encoded class labels into normal (0) versus anomaly (1).

    Parameters
    ----------
    label_encoder : object
        Anything exposing a ``classes_`` sequence of class names whose
        positions are the encoded label values (e.g. a fitted LabelEncoder).
    labels : array-like supporting ``.copy()`` and boolean-mask assignment
        Encoded labels. The input is not modified.
    normal_class : str, default 'Benign'
        Name of the class that counts as normal traffic.

    Returns
    -------
    A copy of ``labels`` in which entries equal to the code of
    ``normal_class`` are 0 and every other entry is 1.

    Raises
    ------
    ValueError
        If ``normal_class`` is not present in ``label_encoder.classes_``.
    """
    matches = np.flatnonzero(np.asarray(label_encoder.classes_) == normal_class)
    if matches.size == 0:
        raise ValueError(
            f'{normal_class!r} is not among the label encoder classes')
    normal_data_index = matches[0]

    # Build the mask from the untouched input so the two assignments below
    # cannot interfere with each other (the normal code itself may be 1).
    is_normal = labels == normal_data_index
    new_labels = labels.copy()
    new_labels[~is_normal] = 1
    new_labels[is_normal] = 0
    return new_labels

In [14]:
# Convert both label vectors to 0 (normal) / 1 (anomaly).
y_train, y_test = (
    make_labels_binary(le, encoded) for encoded in (y_train, y_test)
)

In [15]:
# Labels are now 0/1 with 1 marking an anomaly, so the sum is the anomaly count.
print(f'Number of anomalies in y_train: {y_train.sum():,}')
print(f'Number of anomalies in y_test: {y_test.sum():,}')

Number of anomalies in y_train: 421,003
Number of anomalies in y_test: 74,105


In [16]:
# Work on a copy of the training features and remember the original row count
# so the size of the filtered subset can be reported later.
temp_df = x_train.copy()
prev_len = x_train.shape[0]

In [17]:
# Attach the binary labels as a temporary column so the training rows can be
# filtered by label in the next cell (only normal traffic is kept).
temp_df['label'] = y_train

In [18]:
# Keep only the rows labelled normal (0) and drop the helper column again.
normal_mask = (temp_df['label'] == 0).to_numpy()
temp_df = temp_df.loc[normal_mask].drop('label', axis=1)
x_train = temp_df.copy()

# Filter the labels with the same mask: previously only x_train was reduced,
# which left y_train with a different number of rows than x_train.
y_train = y_train[normal_mask]
assert len(x_train) == len(y_train)

print(
    f'Dataset has now the size of {(len(x_train)/prev_len):.2} of the original dataset')

Dataset has now the size of 0.6 of the original dataset


In [19]:
# Fit the scaler on the training rows only (x_train has been reduced to normal
# traffic above) and reuse the fitted scaler for the test set, so test values
# outside the training min/max are not clipped to the training range.
# NOTE(review): the generator's output layer uses tanh while this scaler uses
# its default feature range - confirm the two ranges are meant to differ.
scaler = MinMaxScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [20]:
# Bundle the float32 versions of all four arrays under fixed keys.
dataset = {
    'x_train': x_train.astype(np.float32),
    'y_train': y_train.astype(np.float32),
    'x_test': x_test.astype(np.float32),
    'y_test': y_test.astype(np.float32),
}

In [21]:
# Count each label once instead of building the Counter twice.
label_counts = collections.Counter(y_test)
normals = label_counts[0]
anomalies = label_counts[1]

# Share of anomalies among ALL test rows (a fraction in [0, 1]); it is reused
# later as the percentile for thresholding. The previous message called this
# the anomalous-to-normal ratio, which would be anomalies / normals.
anomalies_percentage = anomalies / (normals + anomalies)
print('Number of Normal Network packets in the test set:', normals)
print('Number of Anomalous Network packets in the test set:', anomalies)
print('Fraction of anomalous network packets in the test set: ', anomalies_percentage)

Number of Normal Network packets in the test set: 113912
Number of Anomalous Network packets in the test set: 74105
Ratio of anomalous to normal network packets:  0.3941398916055463


In [22]:
def get_generator(optim, latent_dim=78, output_dim=78):
    """Build and compile the generator network.

    Parameters
    ----------
    optim : Keras optimizer
        Optimizer passed to ``compile``.
    latent_dim : int, default 78
        Width of the noise vector fed to the first layer.
    output_dim : int, default 78
        Number of features in each generated sample.

    Returns
    -------
    A compiled ``Sequential`` model: Dense(64) -> tanh -> Dense(128) -> tanh
    -> Dense(output_dim, tanh).
    """
    generator = Sequential()
    generator.add(Dense(64, input_dim=latent_dim,
                  kernel_initializer=initializers.glorot_normal(seed=42)))
    generator.add(Activation('tanh'))

    generator.add(Dense(128))
    generator.add(Activation('tanh'))

    generator.add(Dense(output_dim, activation='tanh'))

    generator.compile(loss='binary_crossentropy', optimizer=optim)

    return generator


def get_discriminator(optim, input_dim=78):
    """Build and compile the discriminator network.

    Parameters
    ----------
    optim : Keras optimizer
        Optimizer passed to ``compile``.
    input_dim : int, default 78
        Number of features in each input sample.

    Returns
    -------
    A compiled ``Sequential`` model: Dense(256) -> leaky ReLU -> Dropout(0.2)
    -> Dense(1) -> sigmoid, trained with binary cross-entropy.
    """
    discriminator = Sequential()

    discriminator.add(Dense(256, input_dim=input_dim,
                      kernel_initializer=initializers.glorot_normal(seed=42)))
    discriminator.add(Activation(tf.nn.leaky_relu))
    discriminator.add(Dropout(0.2))

    discriminator.add(Dense(1))
    discriminator.add(Activation('sigmoid'))

    discriminator.compile(loss='binary_crossentropy', optimizer=optim)

    return discriminator

In [23]:
def make_gan_network(discriminator, generator, optim, input_dim=78):
    """Stack the generator and discriminator into one compiled model.

    The combined model maps an ``input_dim``-wide input through the generator
    and then through the discriminator, and is compiled with binary
    cross-entropy and ``optim``.

    Side effect: sets ``discriminator.trainable = False`` on the model that is
    passed in, before the combined model is compiled.
    """
    discriminator.trainable = False

    noise_in = Input(shape=(input_dim,))
    validity = discriminator(generator(noise_in))

    stacked = Model(inputs=noise_in, outputs=validity)
    stacked.compile(loss='binary_crossentropy', optimizer=optim)
    return stacked

In [24]:
# Training hyperparameters.
learning_rate = 0.00001
momentum = 0.3  # passed to Adam as beta_1
batch_size = 512
epochs = 100
# `lr` is a deprecated alias of `learning_rate`; use the supported keyword.
adam = Adam(learning_rate=learning_rate, beta_1=momentum)

In [25]:
# Rebind the working names to the float32 arrays stored in `dataset`.
x_train, y_train, x_test, y_test = (
    dataset[key] for key in ('x_train', 'y_train', 'x_test', 'y_test')
)

In [26]:
# Number of full batches per epoch; a trailing partial batch is skipped.
batch_count = x_train.shape[0] // batch_size
if batch_count == 0:
    raise ValueError(
        f'x_train has fewer rows ({x_train.shape[0]}) than batch_size ({batch_size})')

gan_loss = []
discriminator_loss = []

# Initializing the networks. Each compiled model gets its own optimizer
# instance so that optimizer state (step counter, moment estimates) is not
# shared between the discriminator update and the generator update.
generator = get_generator(Adam(learning_rate=learning_rate, beta_1=momentum))
discriminator = get_discriminator(Adam(learning_rate=learning_rate, beta_1=momentum))
gan = make_gan_network(
    discriminator, generator,
    Adam(learning_rate=learning_rate, beta_1=momentum), input_dim=78)

print("Number params: ", gan.count_params())

# Targets are the same for every batch, so build them once.
# Discriminator batch = [generated rows, real rows] -> labels [0..., 1...].
y_dis = np.ones(2 * batch_size)
y_dis[:batch_size] = 0
# The generator is trained to make the discriminator output 1 for its samples.
y_gen = np.ones(batch_size)

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=epochs * batch_count, position=0, leave=True) as pbar:
    for epoch in range(epochs):
        for index in range(batch_count):
            # Noise for the samples shown to the discriminator
            noise = np.random.normal(0, 1, size=[batch_size, 78])

            # Generate fake samples
            generated_images = generator.predict_on_batch(noise)

            # Obtain a batch of normal network traffic
            image_batch = x_train[index * batch_size: (index + 1) * batch_size]

            X = np.vstack((generated_images, image_batch))

            # Train discriminator
            discriminator.trainable = True
            d_loss = discriminator.train_on_batch(X, y_dis)

            # Train generator. The noise is drawn from the same N(0, 1)
            # distribution as above; the previous code used U(0, 1) here, so
            # the generator was updated on inputs from a different
            # distribution than the one used to produce the fakes.
            noise = np.random.normal(0, 1, size=[batch_size, 78])
            discriminator.trainable = False
            g_loss = gan.train_on_batch(noise, y_gen)

            # Record the losses
            discriminator_loss.append(d_loss)
            gan_loss.append(g_loss)

            pbar.update(1)

        # Report the losses of the last batch of this epoch
        print("Epoch %d Batch %d/%d [D loss: %f] [G loss:%f]" %
              (epoch, index, batch_count, d_loss, g_loss))

  0%|                                                                                                                                                  | 0/125800 [00:00<?, ?it/s]2023-02-18 19:54:53.539449: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-18 19:54:53.540131: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-02-18 19:54:53.570886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-18 19:54:53.571088: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:25:00.0 name: NVIDIA GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 7.78GiB deviceMemoryBandwidth: 417.29GiB/s
2023-02-18 19:54:53.571110: I tensor

Number params:  43919


2023-02-18 19:54:55.858045: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at cwise_op_gpu_base.cc:89 : Internal: Failed to load in-memory CUBIN: CUDA_ERROR_NO_BINARY_FOR_GPU: no kernel image is available for execution on the device


InternalError:  Failed to load in-memory CUBIN: CUDA_ERROR_NO_BINARY_FOR_GPU: no kernel image is available for execution on the device
	 [[node sequential/activation/Tanh (defined at tmp/ipykernel_11050/4044302038.py:21) ]] [Op:__inference_predict_function_274]

Function call stack:
predict_function


In [None]:
# `figure_path` is used by the plotting cells but is not assigned in any cell
# of the notebook as seen here, so a fresh kernel fails with NameError.
# NOTE(review): 'figures/' is a placeholder - point it at the intended folder.
figure_path = 'figures/'
os.makedirs(figure_path, exist_ok=True)

# Draw on the axes returned by subplots() instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(discriminator_loss, label='Discriminator')
ax.plot(gan_loss, label='Generator')
ax.set_title("Training Loss GAN")
ax.set_xlabel('Batch')  # one loss value is recorded per training batch
ax.set_ylabel('Loss')
ax.legend()
fig.savefig(figure_path + 'loss_gan.png')

In [None]:
# NOTE(review): `results` is not assigned in any cell visible in this notebook;
# it presumably holds one score per test row - confirm where it is computed.
pd.options.display.float_format = '{:20,.7f}'.format

# Put the scores and the true labels side by side, one row per test sample.
score_frame = pd.DataFrame(results)
label_frame = pd.DataFrame(y_test)
results_df = pd.concat([score_frame, label_frame], axis=1)
results_df.columns = ['results', 'y_test']

normal_rows = results_df['y_test'] == 0
anomalous_rows = results_df['y_test'] == 1
print('Mean score for normal packets :',
      results_df.loc[normal_rows, 'results'].mean())
print('Mean score for anomalous packets :',
      results_df.loc[anomalous_rows, 'results'].mean())

## Plots

In [None]:
def plot_confusion_matrix(cm, savefile, name, cmap=plt.cm.Greens):
    """Draw a 2x2 confusion matrix as an annotated heat map and save it.

    Note: this definition shadows the ``plot_confusion_matrix`` name imported
    from ``sklearn.metrics`` at the top of the notebook.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix indexed as ``cm[true][predicted]``; the tick labels
        assume the order Normal, Anomaly.
    savefile : str
        Path the figure is written to.
    name : str
        Suffix appended to the plot title.
    cmap : matplotlib colormap, default ``plt.cm.Greens``
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Confusion matrix ' + name)
    plt.colorbar()
    plt.xticks(np.arange(2), ['Normal', 'Anomaly'], rotation=45)
    plt.yticks(np.arange(2), ['Normal', 'Anomaly'])

    # Rows are true labels (y axis), columns are predicted labels (x axis).
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            plt.annotate(str(cm[row][col]), xy=(col, row),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    # Lay out only after every label exists; calling tight_layout() earlier
    # ignores the axis labels and they can be clipped in the saved file.
    plt.tight_layout()
    plt.savefig(savefile)

def plot_accuracy(history, savefile, name):
    """Plot training and validation accuracy per epoch and save the figure.

    ``history`` must expose a ``history`` mapping with the keys ``'accuracy'``
    and ``'val_accuracy'``; ``name`` is used as the title and ``savefile`` is
    the output path.
    """
    curves = ['accuracy', 'val_accuracy']
    for key in curves:
        plt.plot(history.history[key])
    plt.legend(curves)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy in percent')
    plt.title(name)
    plt.savefig(savefile)
    
def plot_loss(history, savefile, name):
    """Plot training and validation loss per epoch and save the figure.

    ``history`` must expose a ``history`` mapping with the keys ``'loss'`` and
    ``'val_loss'``; ``name`` is used as the title and ``savefile`` is the
    output path.
    """
    curves = ['loss', 'val_loss']
    for key in curves:
        plt.plot(history.history[key])
    plt.legend(curves)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(name)
    plt.savefig(savefile)
    
def plot_roc(tpr, fpr, roc_auc, savefile, name):
    """Plot a ROC curve on a new 10x10 figure and save it.

    Note the argument order: true-positive rates first, false-positive rates
    second. ``roc_auc`` is shown in the legend, ``name`` is appended to the
    title and ``savefile`` is the output path.
    """
    plt.figure(figsize=(10, 10))

    curve_label = 'ROC curve (AUC = %0.2f)' % roc_auc
    plt.plot(fpr, tpr, lw=1, label=curve_label)
    # Dashed diagonal for reference
    plt.plot([0, 1], [0, 1], color='lime', linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC ' + name)
    plt.legend(loc="lower right")
    plt.savefig(savefile)


## Thresholds

In [None]:
# Threshold = the score below which the lowest `anomalies_percentage` share of
# all scores falls (anomalies_percentage is a fraction, hence the * 100).
per = np.percentile(results, anomalies_percentage * 100)
# Independent array copy of the scores that will be overwritten with 0/1.
y_pred = np.array(results.copy())

In [None]:
# Turn scores into labels: above the threshold -> 0 (normal), at or below the
# threshold -> 1 (anomaly). Both masks are computed before anything is
# overwritten so the second assignment cannot be affected by the first.
inds_comp = (y_pred <= per)
inds = (y_pred > per)
y_pred[inds] = 0
y_pred[inds_comp] = 1

precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary')
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score :', accuracy)
print('Precision :', precision)
print('Recall :', recall)
print('F1 :', f1)

In [None]:
# A ROC curve needs a continuous score; feeding the already thresholded 0/1
# predictions yields a single operating point and a misleading AUC.
# The thresholding cell marks scores at or below the percentile as anomalies,
# i.e. a LOWER score means more anomalous, so the score is negated to make
# larger values indicate the positive (anomaly = 1) class.
anomaly_score = -np.ravel(results)
fpr, tpr, thresholds = roc_curve(y_test, anomaly_score)
auc_curve = auc(fpr, tpr)

In [None]:
# Save the ROC curve under its own file name; it was previously written to
# 'confusion_gan.png', which the confusion-matrix plot below then overwrites.
plot_roc(tpr, fpr, auc_curve, figure_path + 'roc_gan.png', 'GAN')

In [None]:
# Confusion matrix of the true test labels against the thresholded
# predictions (both use 0 = normal, 1 = anomaly).
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Draw the confusion matrix and save it as 'confusion_gan.png' under figure_path.
plot_confusion_matrix(cm, figure_path + 'confusion_gan.png', 'GAN')