In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras_tuner as kt

from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn import metrics

In [45]:
# --- Notebook-wide configuration -------------------------------------------
SEED = 1  # single seed for every random number generator used by Keras/TF
# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling repeat across
# Restart-and-Run-All executions.
tf.keras.utils.set_random_seed(SEED)

lw = 1 # line weight for plt
SHUFFLE_BUFFER = 500
BATCH_SIZE = 32 # Model batch size
EPOCHS = 10 # Model number of epoch
MODELS_RESULTS={} # store for all model result for both balanced and resampled datasets
n_classes=10  # number of target classes (width of the softmax output layer)
num_columns = 42  # Number of columns in df
# Single L2 penalty object shared by every Dense layer built in this notebook.
regularizers=tf.keras.regularizers.l2(0.001)
print(tf.__version__)
# High-resolution figures, both inline and when saved to disk.
plt.rcParams['figure.dpi'] = 500
plt.rcParams['savefig.dpi'] = 500

2.16.1


In [46]:
# Load the train and test partitions from CSV files located in the working directory.
# NOTE(review): per the sizes printed further down, the "training-set" file yields
# 65865 + 16467 rows while the "testing-set" file yields 175341 rows, i.e. the test
# partition is the larger one -- confirm the two file names are not swapped.
train_val_csv = pd.read_csv('UNSW_NB15_training-set.csv')
test_csv = pd.read_csv('UNSW_NB15_testing-set.csv')

In [47]:
# Remove the 'id' and 'label' columns from both partitions; 'attack_cat' is kept
# because it is used as the multi-class target when the tensors are built.
train_val=train_val_csv.drop(columns=['id','label'])
test=test_csv.drop(columns=['id','label'])

In [48]:
# Split the training file into 80% train / 20% validation rows.
# random_state is fixed so the same rows land in each subset on every run.
train,val = train_test_split(train_val, test_size=0.2, random_state = 1)

In [49]:
# Report the number of rows in each partition.
for split_frame, caption in (
    (train, 'training examples'),
    (val, 'validation examples'),
    (test, 'test examples'),
):
    print(len(split_frame), caption)

65865 training examples
16467 validation examples
175341 test examples


In [50]:
# Target class names. The position of a name in this list is the integer class id
# used when labels are one-hot encoded (labels are mapped with CLASSES.index).
CLASSES=['Normal','Generic', 'Exploits', 'Fuzzers', 'DoS', 'Backdoor', 'Reconnaissance', 'Analysis', 'Shellcode', 'Worms']
print(CLASSES)
# Five Keras metrics tracked during training and evaluation (loss is reported separately).
METRICS = [
      keras.metrics.CategoricalAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Backdoor', 'Reconnaissance', 'Analysis', 'Shellcode', 'Worms']


In [51]:
def plot_chart(history,name):
    """Plot training vs. validation curves for loss and accuracy.

    Draws two stacked subplots (loss on top, accuracy below) on a new
    10x10 inch figure.

    Args:
        history: object with a ``history`` dict holding the per-epoch lists
            'loss', 'val_loss', 'accuracy' and 'val_accuracy' (as returned
            by ``Model.fit``).
        name: model label used as a prefix in each subplot title.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure(figsize=(10, 10))
    names = ['loss', 'accuracy']
    for position, n in enumerate(names, start=1):
        plt.subplot(len(names), 1, position)
        plt.plot(history.history[n], label='Training {}'.format(n))
        plt.plot(history.history['val_' + n], label='Validation {}'.format(n))
        plt.legend(loc='lower right')
        plt.ylabel(n)
        if n == 'accuracy':
            # Only accuracy is bounded by 1. Loss is unbounded (it regularly
            # exceeds 1), so clamping it to [0.4, 1] would hide the curve;
            # the loss panel therefore keeps matplotlib's automatic limits.
            plt.ylim(0.4, 1)
        plt.title('{} Training and Validation {}'.format(name, n))
    # Applies to the last (bottom) subplot only.
    plt.xlabel('epoch')
    return plt

In [52]:
def group_by_data_type(data):
    """Group the column names of ``data`` by the kind of feature they hold.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        dict with four lists of column names, in original column order:
            'labels'              -- the 'attack_cat' target column,
            'number_col'          -- floating point columns (any width),
            'int_category_col'    -- integer columns (any width/signedness),
            'string_category_col' -- every remaining column.
    """
    result = {'number_col': [], 'string_category_col': [], 'int_category_col': [], 'labels': []}
    for col in data.columns:
        dtype = data[col].dtype
        if col == 'attack_cat':
            group = 'labels'
        elif pd.api.types.is_float_dtype(dtype):
            # Dtype-family check instead of an exact 'float64' match so that
            # float32/float16 columns are not misfiled as string categories.
            group = 'number_col'
        elif pd.api.types.is_integer_dtype(dtype):
            # Same reasoning for int32/int16/unsigned columns.
            group = 'int_category_col'
        else:
            group = 'string_category_col'
        result[group].append(col)
    return result

In [53]:
# Convert a dataframe into the feature tensors and one-hot labels used by the model
def dataframe_to_dataset_multi_feature(dataframe):
    """Turn ``dataframe`` into model inputs and one-hot encoded targets.

    Relies on the module-level column lists ``number_col``,
    ``string_category_col`` and ``int_category_col`` and on ``CLASSES``.
    The input frame is left unmodified.

    Args:
        dataframe: pandas DataFrame containing the feature columns and an
            'attack_cat' column whose values all appear in ``CLASSES``.

    Returns:
        ``([x_num, x_string, x_int], y)`` where the first element is a list
        of tensors (one per column group) and ``y`` is the one-hot label
        array with ``len(CLASSES)`` columns.

    Raises:
        ValueError: if an 'attack_cat' value is not listed in ``CLASSES``.
    """
    # Read the label column instead of pop()-ing it, so the caller's frame
    # keeps its 'attack_cat' column and the function can be called repeatedly.
    class_ids = dataframe['attack_cat'].map(CLASSES.index)
    x_num = tf.convert_to_tensor(dataframe[number_col])
    x_string = tf.convert_to_tensor(dataframe[string_category_col])
    x_int = tf.convert_to_tensor(dataframe[int_category_col])
    # Derive the one-hot width from CLASSES so the two can never disagree.
    y = tf.keras.utils.to_categorical(class_ids, num_classes=len(CLASSES))
    return [x_num, x_string, x_int], y

In [54]:
# Work on independent copies so the frames produced by the split stay untouched.
train_df, val_df, test_df = (split.copy() for split in (train, val, test))

In [55]:
# Separate the structured data columns by type (float / string / integer),
# using the training frame as the reference for column dtypes.
data_types = group_by_data_type(train_df)
number_col, string_category_col, int_category_col = (
    data_types[group]
    for group in ('number_col', 'string_category_col', 'int_category_col')
)

In [56]:
# Show which columns ended up in each group, separated by a ruler line.
column_groups = (
    ('numbers column', number_col),
    ('string column', string_category_col),
    ('integer column', int_category_col),
)
for position, (caption, cols) in enumerate(column_groups):
    if position > 0:
        print('_________________________________________________')
    print(caption, cols)

numbers column ['dur', 'rate', 'sload', 'dload', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'tcprtt', 'synack', 'ackdat']
_________________________________________________
string column ['proto', 'service', 'state']
_________________________________________________
integer column ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'swin', 'stcpb', 'dtcpb', 'dwin', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']


In [57]:
# Build tensors from a copy of the training frame so every preprocessing layer
# can be adapted to the structure of the training data.
dataframe = train_df.copy()
_ds = dataframe_to_dataset_multi_feature(dataframe)
X, y = _ds

# One tensor per column group, in the order (numeric, string, integer).
adapt_input_num, adapt_input_string, adapt_input_int = X

# Per-sample input shapes (batch dimension stripped) for the Keras Input layers.
input_num_shape, input_string_shape, input_int_shape = (
    tensor.shape[1:] for tensor in X
)
adapt_input_string

<tf.Tensor: shape=(65865, 3), dtype=string, numpy=
array([[b'udp', b'dns', b'INT'],
       [b'udp', b'dns', b'INT'],
       [b'tcp', b'-', b'FIN'],
       ...,
       [b'unas', b'-', b'INT'],
       [b'tcp', b'-', b'FIN'],
       [b'tcp', b'-', b'FIN']], dtype=object)>

In [58]:
# Superseded variant kept for reference only: it uses the experimental preprocessing
# namespace. The active cell below builds the same layer with `layers.Normalization`.
# normalizer = layers.experimental.preprocessing.Normalization()
# normalizer.adapt(adapt_input_num)

In [59]:
# `tf` and `layers` come from the import cell at the top of the notebook;
# re-importing them here only scattered the dependency list.

# Create a Normalization layer and set its internal state (the statistics it
# normalises with) from the numeric training features only.
normalizer = layers.Normalization()
normalizer.adapt(adapt_input_num)


In [60]:
# Create a StringLookup layer (multi-hot output) and build its vocabulary from the
# string training features. A single layer is adapted on the whole string tensor,
# so one shared vocabulary is learned across all string columns.
string_lookup = layers.StringLookup(output_mode="multi_hot")
string_lookup.adapt(adapt_input_string)
# Keep the learned vocabulary around for inspection.
string_vocabulary=string_lookup.get_vocabulary()

In [61]:
# Create an IntegerLookup layer (multi-hot output) and build its vocabulary from the
# integer training features. A single layer is adapted on the whole integer tensor,
# so one shared vocabulary is learned across all integer columns.
int_lookup = layers.IntegerLookup( output_mode="multi_hot")
int_lookup.adapt(adapt_input_int)

In [62]:
# Hypermodel definition: three per-modality sub-models fused by averaging (late fusion)
def _dense_branch(hp, branch_input, features, hp_prefix, summary_title):
    """Build one modality sub-model and return its output tensor.

    Stacks three Dense(relu, L2) + Dropout(0.5) pairs whose widths are tuned
    (32..512, step 32) on top of ``features``, adds a fixed 128-unit Dense
    projection, wraps everything in a Keras Model, prints its summary and
    applies it to ``branch_input``.

    Args:
        hp: keras_tuner HyperParameters object used to sample layer widths.
        branch_input: the ``keras.Input`` tensor feeding this branch.
        features: output of the preprocessing layer applied to ``branch_input``.
        hp_prefix: infix of the hyperparameter names, 'units_<prefix>_<i>'.
        summary_title: heading printed before the sub-model summary.

    Returns:
        Output tensor of the sub-model (128 units wide).
    """
    x = features
    for depth in (1, 2, 3):
        units = hp.Int('units_{}_{}'.format(hp_prefix, depth), min_value=32, max_value=512, step=32)
        x = layers.Dense(units=units, activation='relu', kernel_regularizer=regularizers)(x)
        x = layers.Dropout(0.5)(x)
    branch_output = layers.Dense(128, activation='relu', kernel_regularizer=regularizers)(x)
    branch = keras.models.Model(inputs=branch_input, outputs=branch_output)
    print(summary_title)
    branch.summary()
    return branch(branch_input)


def build_model(hp):
    """Build and compile the late-fusion classifier for a hyperparameter set.

    One sub-model is built per feature group (numeric, integer, string), each
    preceded by its already-adapted preprocessing layer. The three 128-unit
    outputs are averaged element-wise and fed to a softmax layer with
    ``n_classes`` units.

    Tuned hyperparameters: 'learning_rate' (1e-2, 1e-3 or 1e-4) and
    'units_{num,int,string}_{1,2,3}' (32..512, step 32).

    Args:
        hp: keras_tuner HyperParameters object.

    Returns:
        A compiled ``keras.Model`` taking ``[numeric, string, integer]``
        inputs, in that order.
    """
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Numeric features -> normalisation -> dense stack.
    input_num = keras.Input(shape=input_num_shape, name='numeric_inputs')
    # Heading fixed: this is the numeric sub-model, not the integer one.
    y1 = _dense_branch(hp, input_num, normalizer(input_num), 'num', 'SUBMODAL-NUMERIC-SUMMARY')

    # Integer category features -> multi-hot lookup -> dense stack.
    input_int = keras.Input(shape=input_int_shape, name='cartegory_integer_inputs')
    y2 = _dense_branch(hp, input_int, int_lookup(input_int), 'int', 'SUBMODAL-INTEGER-SUMMARY')

    # String category features -> multi-hot lookup -> dense stack.
    input_string = keras.Input(shape=input_string_shape, name='cartegory_string_inputs', dtype='string')
    y3 = _dense_branch(hp, input_string, string_lookup(input_string), 'string', 'SUBMODAL-STRING-SUMMARY')

    # Late fusion: element-wise mean of the three branch outputs.
    avg = tf.keras.layers.Average()([y1, y2, y3])
    output = layers.Dense(n_classes, activation="softmax")(avg)
    # Input order must match the [x_num, x_string, x_int] feature list fed to fit().
    model = keras.Model(inputs=[input_num, input_string, input_int], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=METRICS,
    )
    return model

In [63]:
# LATE-FUSION: convert each split into ([numeric, string, integer] tensors, one-hot labels).
# A fresh copy is passed every time because the conversion helper is handed a frame it may modify.
df = train_df.copy()
X_train,y_train= dataframe_to_dataset_multi_feature(df)
df = val_df.copy()
X_val,y_val= dataframe_to_dataset_multi_feature(df)
df = test_df.copy()
X_test,y_test= dataframe_to_dataset_multi_feature(df)

In [64]:
# --- Hyperparameter search, final training and evaluation -------------------
name = 'LATE-FUSION'
print(name)
print('fit model for:{}_____________________________________________________________________________________________________________'.format(name))
MODELS_RESULTS[name] = {}
project_name = 'HYPERPARAM_' + name
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=5,
                     factor=3,
                     directory='hyperparam_dir',
                     project_name=project_name)
print('----')
# Stop a trial once val_loss has failed to improve for 3 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# NOTE(review): Hyperband assigns the number of epochs per trial itself (bounded by
# max_epochs above), so `epochs=10` is presumably overridden -- confirm against the
# keras-tuner documentation before relying on it.
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[stop_early])

# Best hyperparameters found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# First pass: train for the full EPOCHS budget only to find the epoch with the
# highest validation accuracy.
model = tuner.hypermodel.build(best_hps)
print('----')
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_val, y_val))
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1

# Second pass: rebuild from scratch and train for exactly `best_epoch` epochs.
hypermodel = tuner.hypermodel.build(best_hps)
history = hypermodel.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=best_epoch, validation_data=(X_val, y_val))

# Evaluate the final model on every split. Results are read by metric name
# (return_dict=True) rather than unpacked into bare variables: the previous
# unpacking rebound the global name `auc`, shadowing sklearn's `auc` import.
RESULT_KEYS = ('loss', 'accuracy', 'precision', 'recall', 'auc', 'prc')
evaluation_splits = (
    ('train',
     'evaluation result model for:{} on TRAIN_________________________________________________________',
     X_train, y_train),
    ('validation',
     'evaluation result model for:{} on VALIDATION________________________________________________________________________________',
     X_val, y_val),
    ('test',
     'evaluation result model for:{} on TEST________________________________________________________________________________',
     X_test, y_test),
)
for split_key, banner, split_features, split_labels in evaluation_splits:
    print(banner.format(name))
    scores = hypermodel.evaluate(split_features, split_labels, return_dict=True)
    split_scores = {key: scores[key] for key in RESULT_KEYS}
    MODELS_RESULTS[name][split_key] = split_scores
    print('loss:{} -accuracy:{} - precision:{} - recall:{} - auc:{} - prc:{}'.format(
        *(split_scores[key] for key in RESULT_KEYS)))
    print('----')
    print('----')
print('plot:{}_________________________________________________________'.format(name))

LATE-FUSION
fit model for:LATE-FUSION_____________________________________________________________________________________________________________
Reloading Tuner from hyperparam_dir/HYPERPARAM_LATE-FUSION/tuner0.json
----
SUBMODAL-INTEGER-NUMERIC


SUBMODAL-INTEGER-SUMMARY


SUBMODAL-STRING-SUMMARY


----
Epoch 1/10
[1m2059/2059[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 173ms/step - accuracy: 0.6254 - auc: 0.9152 - loss: 2.3936 - prc: 0.7090 - precision: 0.8274 - recall: 0.4814 - val_accuracy: 0.8049 - val_auc: 0.9814 - val_loss: 1.0281 - val_prc: 0.9054 - val_precision: 0.8965 - val_recall: 0.7398
Epoch 2/10
[1m 292/2059[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m4:43[0m 160ms/step - accuracy: 0.7965 - auc: 0.9799 - loss: 1.0449 - prc: 0.8983 - precision: 0.9029 - recall: 0.7217

KeyboardInterrupt: 

In [None]:
# Check if GPU is available (0 means TensorFlow will run on CPU only).
# `tf` comes from the import cell at the top of the notebook; the stable
# tf.config.list_physical_devices API replaces the experimental alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
# Draw the loss/accuracy curves of the final training run. The trailing ';' keeps
# the notebook from echoing the repr of the pyplot module returned by plot_chart.
plot_chart(history, name);

In [None]:
# Class probabilities predicted by the final model for every test row
# (one column per class, coming from the softmax output layer).
y_pred = hypermodel.predict(X_test)

In [None]:
# ROC curve and area under it for each class (one-vs-rest on the one-hot labels),
# followed by the micro-average obtained by flattening all classes together.
fpr, tpr, roc_auc = {}, {}, {}

curve_inputs = {i: (y_test[:, i], y_pred[:, i]) for i in range(n_classes)}
curve_inputs["micro"] = (y_test.ravel(), y_pred.ravel())

for key, (truth, score) in curve_inputs.items():
    fpr[key], tpr[key], _ = roc_curve(truth, score)
    roc_auc[key] = metrics.auc(fpr[key], tpr[key])

In [None]:
# Macro-average, step 1: pool every false-positive rate seen by any class ...
per_class_fpr = [fpr[class_idx] for class_idx in range(n_classes)]
all_fpr = np.unique(np.concatenate(per_class_fpr))

# ... step 2: interpolate each class curve on that common grid and accumulate ...
mean_tpr = np.zeros_like(all_fpr)
for class_idx in range(n_classes):
    np.add(mean_tpr, np.interp(all_fpr, fpr[class_idx], tpr[class_idx]), out=mean_tpr)

# ... step 3: divide by the number of classes to get the mean curve.
np.divide(mean_tpr, n_classes, out=mean_tpr)

In [None]:
# Register the macro-average curve alongside the per-class and micro-average ones.
fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()

# Dotted, thicker lines for the two averaged curves.
average_curves = (
    ("micro", "micro-average ROC curve (area = {0:0.2f})", "deeppink"),
    ("macro", "macro-average ROC curve (area = {0:0.2f})", "navy"),
)
for curve_key, label_template, curve_color in average_curves:
    plt.plot(
        fpr[curve_key],
        tpr[curve_key],
        label=label_template.format(roc_auc[curve_key]),
        color=curve_color,
        linestyle=":",
        linewidth=2,
    )

# One thin solid line per class, each in its own colour.
colors = cycle(["aqua", "darkorange", "cornflowerblue","red","green","peru","tan","salmon","sienna","palegreen"])
for class_idx, class_color in zip(range(n_classes), colors):
    class_label = "ROC for {0} (area = {1:0.2f})".format(CLASSES[class_idx], roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=class_color, lw=1, label=class_label)

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], "k--", lw=lw)
# plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("LATE-FUSION ROC MULTICLASS")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Raw nested dict of evaluation results: model name -> split -> metric -> value.
MODELS_RESULTS

In [None]:
# Same results as a table: one column per split, one row per metric.
pd.DataFrame(MODELS_RESULTS['LATE-FUSION'])