In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
# from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras_tuner as kt

from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Shared one-hot encoder instance referenced by string_category_col_process.
encoder = OneHotEncoder()

# Seed the NumPy and TensorFlow global generators so repeated
# Restart & Run All executions are as repeatable as the backend allows.
# The value matches the random_state used for the train/validation split.
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

lw = 1  # line weight for plt
BATCH_SIZE = 32  # Model batch size
EPOCHS = 10  # Model number of epochs for the final training run
MODELS_RESULTS = {}  # results store: MODELS_RESULTS[model_name][split] -> metric dict
n_classes = 10  # size of the softmax output layer (CLASSES holds 10 categories)
num_columns = 42  # Number of columns in df (same width as the model input)
regularizers = tf.keras.regularizers.l2(0.001)  # L2 penalty applied to the hidden Dense kernels
print(tf.__version__)
# Render and save figures at 500 dpi.
plt.rcParams['figure.dpi'] = 500
plt.rcParams['savefig.dpi'] = 500

In [None]:
# Read the training and testing CSV files (paths are relative to the working directory).
train_val_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in ('UNSW_NB15_training-set.csv', 'UNSW_NB15_testing-set.csv')
)

In [None]:
# Number of rows in the testing file.
print(test_csv.shape[0])

In [None]:
# Remove the `id` and `label` columns; `attack_cat` stays in the frame and is
# used as the target later on.
train_val = train_val_csv.drop(['id', 'label'], axis=1)
test = test_csv.drop(['id', 'label'], axis=1)

In [None]:
# Hold out 20% of the training file as the validation split (80 train / 20 validation);
# random_state is fixed so the split is repeatable.
train, val = train_test_split(train_val, random_state=1, test_size=0.2)

In [None]:
# Report the size of each split, one per line.
print(
    f"{len(train)} training examples",
    f"{len(val)} validation examples",
    f"{len(test)} test examples",
    sep="\n",
)

In [None]:
pip install tf-metrics

In [None]:
# Attack categories; labels_process uses a category's position in this list
# as its integer class id.
CLASSES = [
    'Normal',
    'Generic',
    'Exploits',
    'Fuzzers',
    'DoS',
    'Backdoor',
    'Reconnaissance',
    'Analysis',
    'Shellcode',
    'Worms',
]
print(CLASSES)

# Five metrics tracked during training and evaluation. The order matters:
# evaluation results are read back positionally (after the loss value).
METRICS = [
    keras.metrics.CategoricalAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(curve='PR', name='prc'),  # area under the precision-recall curve
]

In [None]:
def plot_chart(history, name, metric_names=('loss', 'accuracy'), ylim=(0.4, 1)):
    """Plot training and validation curves, one stacked panel per metric.

    Args:
        history: object exposing a ``history`` mapping from metric name to a
            per-epoch sequence (the value returned by ``model.fit``).
        name: model name used as the prefix of every panel title.
        metric_names: metrics to draw, one panel each. Defaults to the
            original ``('loss', 'accuracy')`` pair.
        ylim: ``(low, high)`` y-axis limits applied to every panel, or
            ``None`` to let matplotlib autoscale. The default keeps the
            original fixed 0.4-1 window, which also clips the loss panel;
            pass ``None`` when the loss falls outside that range.

    Returns:
        The ``matplotlib.pyplot`` module, as before, so callers can keep
        chaining ``plt`` calls.
    """
    plt.figure(figsize=(10, 10))
    for position, metric in enumerate(metric_names, start=1):
        plt.subplot(len(metric_names), 1, position)
        plt.plot(history.history[metric], label='Training {}'.format(metric))
        # Validation curves only exist when fit() was given validation data.
        val_key = 'val_' + metric
        if val_key in history.history:
            plt.plot(history.history[val_key], label='Validation {}'.format(metric))
        plt.legend(loc='lower right')
        plt.ylabel(metric)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.title('{} Training and Validation {}'.format(name, metric))
    # Only the bottom panel gets the shared x-axis label.
    plt.xlabel('epoch')
    return plt

In [None]:
def group_by_data_type(data):
    """Group the column names of ``data`` by how they should be preprocessed.

    Args:
        data: pandas DataFrame whose columns are to be classified.

    Returns:
        dict with four lists of column names:
        ``'number_col'`` (floating-point columns),
        ``'string_category_col'`` (everything that is neither float nor integer),
        ``'int_category_col'`` (integer columns) and
        ``'labels'`` (the ``attack_cat`` column, whatever its dtype).
    """
    result = {'number_col': [], 'string_category_col': [], 'int_category_col': [], 'labels': []}
    for col in data.columns:
        dtype = data[col].dtype
        if col == 'attack_cat':
            group = 'labels'
        # Use dtype-family checks so float32/int32 (e.g. after downcasting)
        # land in the same group as float64/int64.
        elif pd.api.types.is_float_dtype(dtype):
            group = 'number_col'
        elif pd.api.types.is_integer_dtype(dtype):
            group = 'int_category_col'
        else:
            group = 'string_category_col'
        result[group].append(col)
    return result

In [None]:
def df_to_dataset(dataframe, num_classes=10):
    """Split an encoded frame into a feature tensor and one-hot labels.

    Args:
        dataframe: DataFrame containing an ``attack_cat`` column of integer
            class ids plus the feature columns. The frame is not modified.
        num_classes: width of the one-hot label matrix. Defaults to 10, the
            value that used to be hard-coded here.

    Returns:
        ``(X, y)`` where ``X`` is ``tf.convert_to_tensor`` applied to the
        remaining feature columns and ``y`` is the result of
        ``to_categorical`` on the ``attack_cat`` column.
    """
    features = dataframe.copy()  # pop() below must not touch the caller's frame
    labels = features.pop('attack_cat')
    y = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    X = tf.convert_to_tensor(features)
    return X, y

In [None]:
def number_col_process(data):
    """Standardise every column to zero mean and unit (sample) standard deviation.

    Args:
        data: DataFrame of numeric columns. It is not modified.

    Returns:
        A new DataFrame ``(data - mean) / std`` computed column-wise. Columns
        whose standard deviation is zero or undefined (constant column,
        single-row frame) are only centred, so they come out as 0 instead of
        NaN/inf.
    """
    centered = data - data.mean()
    spread = data.std()
    # Dividing by a zero/NaN std would fill the column with NaN or inf, which
    # then propagates into the network; fall back to a divisor of 1.
    safe_spread = spread.replace(0, 1).fillna(1)
    return centered / safe_spread

In [None]:
def int_category_col_process(data):
    """Replace every integer column by integer category codes.

    Each column is factorised independently; with ``sort=True`` the code of a
    value is its rank among the column's distinct values, so the encoding no
    longer depends on the order in which rows happen to appear.

    NOTE(review): codes are still derived from the frame passed in, so two
    frames only share an encoding when they contain the same set of values.

    Args:
        data: DataFrame of integer columns. It is not modified.

    Returns:
        A new DataFrame with the same columns holding the category codes.
    """
    print('process int_category_col')
    encoded = data.copy()  # avoid writing into the caller's slice
    for col in encoded.columns:
        codes, _ = pd.factorize(encoded[col], sort=True)
        encoded[col] = codes
    return encoded

In [None]:
def string_category_col_process(data):
    """Encode every string column as a single column of integer category codes.

    The previous implementation one-hot encoded each column and assigned the
    resulting matrix (one column per distinct category) back to the single
    source column, which cannot hold it. Callers expect exactly one numeric
    column per input column, so each column is now mapped to integer codes
    instead; ``sort=True`` ranks the codes by the sorted distinct values,
    independent of row order.

    NOTE(review): codes are derived from the frame passed in, so two frames
    only share an encoding when they contain the same set of categories.

    Args:
        data: DataFrame of string/categorical columns. It is not modified.

    Returns:
        A new DataFrame with the same columns holding the category codes.
    """
    encoded = data.copy()  # avoid writing into the caller's slice
    for col in encoded.columns:
        codes, _ = pd.factorize(encoded[col], sort=True)
        encoded[col] = codes
    return encoded

In [None]:
def labels_process(data):
    """Convert ``attack_cat`` names into integer class ids.

    The id of a category is its position in the module-level ``CLASSES`` list.

    Args:
        data: DataFrame with an ``attack_cat`` column of category names.
            It is not modified.

    Returns:
        A new DataFrame whose ``attack_cat`` column holds the class ids.

    Raises:
        ValueError: if ``attack_cat`` contains a value that is not listed in
            ``CLASSES``; the message names every offending value.
    """
    class_ids = {}
    for position, label in enumerate(CLASSES):
        class_ids.setdefault(label, position)  # first occurrence wins, like list.index

    unknown = [value for value in data['attack_cat'].unique() if value not in class_ids]
    if unknown:
        raise ValueError(
            'attack_cat contains labels missing from CLASSES: {}'.format(unknown)
        )

    encoded = data.copy()  # avoid writing into the caller's slice
    encoded['attack_cat'] = encoded['attack_cat'].map(class_ids)
    return encoded

In [None]:
def process_data(df, data_types):
    """Apply the matching preprocessing routine to each column group of ``df``.

    Args:
        df: DataFrame to encode. It is not modified.
        data_types: mapping from group name (``'number_col'``,
            ``'int_category_col'``, ``'string_category_col'``, ``'labels'``)
            to the list of column names in that group, as produced by
            ``group_by_data_type``.

    Returns:
        A copy of ``df`` in which every listed column has been replaced by
        its processed version.

    Raises:
        ValueError: if ``data_types`` contains a group name with no
            registered processor.
    """
    # Resolved at call time so the processors may be defined in later cells.
    processors = {
        'number_col': number_col_process,
        'int_category_col': int_category_col_process,
        'string_category_col': string_category_col_process,
        'labels': labels_process,
    }
    result = df.copy()
    for name, columns in data_types.items():
        print(name)
        if name not in processors:
            # Previously an unknown group silently reused the previous group's
            # output (or failed with a NameError on the first iteration).
            raise ValueError('no processor registered for column group {!r}'.format(name))
        if not columns:
            continue  # nothing to encode for this group
        # Hand each processor its own copy so it never writes into a slice of df.
        processed = processors[name](df[columns].copy())
        for col in columns:
            result[col] = processed[col]
    return result

In [None]:
# Work on copies so the raw splits stay available unchanged.
train_df, val_df, test_df = train.copy(), val.copy(), test.copy()
# Separate the structured data into column groups by type; the grouping is
# derived from the training split and reused for every split below.
data_types = group_by_data_type(train_df)

In [None]:
# Encode the training split and separate features (X_train) from
# one-hot labels (y_train).
df = train_df.copy()
_ds = process_data(df, data_types)
(X_train, y_train) = df_to_dataset(_ds)

In [None]:
# Encode the validation split with the same column grouping and separate
# features (X_val) from one-hot labels (y_val).
df = val_df.copy()
_ds = process_data(df, data_types)
(X_val, y_val) = df_to_dataset(_ds)

In [None]:
# Encode the test split with the same column grouping and separate
# features (X_test) from one-hot labels (y_test).
df = test_df.copy()
_ds = process_data(df, data_types)
(X_test, y_test) = df_to_dataset(_ds)

In [None]:
# Model factory handed to the Keras Tuner search.
def build_model(hp):
    """Build and compile the dense classifier for one set of hyperparameters.

    Tuned hyperparameters:
        * ``learning_rate``: one of 0.01, 0.001 or 0.0001 (Adam).
        * ``units_1`` .. ``units_3``: width of each hidden Dense layer,
          32-512 in steps of 32.

    Every hidden layer uses ReLU, the module-level L2 ``regularizers`` object
    and is followed by 50% dropout; the output layer is a softmax over
    ``n_classes``.

    NOTE(review): the ``METRICS`` objects are module-level, so every model
    built by this factory is compiled with the same metric instances —
    confirm that sharing them is intended.

    Args:
        hp: Keras Tuner hyperparameter container providing ``Choice``/``Int``.

    Returns:
        The compiled ``keras.Model``.
    """
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Input width comes from the notebook-level constant rather than a
    # repeated literal, so the two cannot drift apart.
    inputs = keras.Input(shape=(num_columns,), name='inputs')

    x = inputs
    for position in (1, 2, 3):
        units = hp.Int('units_{}'.format(position), min_value=32, max_value=512, step=32)
        x = layers.Dense(units=units, activation='relu', kernel_regularizer=regularizers)(x)
        x = layers.Dropout(0.5)(x)

    output = layers.Dense(n_classes, activation="softmax")(x)
    model = keras.Model(inputs, output)
    print('MODAL-SUMMARY')
    model.summary()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=METRICS,
    )
    return model

In [None]:
# Hyperparameter search, final training and evaluation of the dense baseline.
print('TRADITIONAL-NETWORK')
name = 'TRADITIONAL-NETWORK'
print('fit model for:{}_____________________________________________________________________________________________________________'.format(name))
MODELS_RESULTS[name] = {}
project_name = 'HYPERPARAM_' + name
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=5,
                     factor=3,
                     directory='hyperparam_dir',
                     project_name=project_name)
print('----')
# Early stopping when val_loss has not improved for 3 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# Hyperparameter search.
# NOTE(review): the tuner is created with max_epochs=5 while search() is given
# epochs=10 — confirm which budget is actually intended for each trial.
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[stop_early])
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# First fit: train for the full EPOCHS budget only to find the epoch with
# the best validation accuracy.
model = tuner.hypermodel.build(best_hps)
print('----')
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_val, y_val))
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1

# Second fit: rebuild from scratch and train exactly `best_epoch` epochs.
# `hypermodel` is the model all reported metrics refer to.
hypermodel = tuner.hypermodel.build(best_hps)
history = hypermodel.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=best_epoch, validation_data=(X_val, y_val))

# evaluate() returns the loss followed by the compiled metrics in order.
# The values are kept in a dict instead of being unpacked into module-level
# names, which used to overwrite the `auc` function imported from sklearn.
metric_keys = ('loss', 'accuracy', 'precision', 'recall', 'auc', 'prc')
evaluation_plan = (
    ('train',
     'evaluation result model for:{} on TRAIN_________________________________________________________',
     X_train, y_train),
    ('validation',
     'evaluation result model for:{} on VALIDATION________________________________________________________________________________',
     X_val, y_val),
    ('test',
     'evaluation result model for:{} on TEST________________________________________________________________________________',
     X_test, y_test),
)
for split_key, header, split_X, split_y in evaluation_plan:
    print(header.format(name))
    split_scores = dict(zip(metric_keys, hypermodel.evaluate(split_X, split_y)))
    MODELS_RESULTS[name][split_key] = split_scores
    print('loss:{} -accuracy:{} - precision:{} - recall:{} - auc:{} - prc:{}'.format(*split_scores.values()))
    print('----')
    print('----')
print('plot:{}_________________________________________________________'.format(name))

In [None]:
# Plot the curves of the final fit; the trailing semicolon keeps the returned
# pyplot module from being echoed as the cell output.
plot_chart(history, name);

In [None]:
# Predict with `hypermodel`, the model whose train/validation/test metrics are
# stored in MODELS_RESULTS, rather than the intermediate `model` that was only
# trained to locate the best epoch.
y_pred = hypermodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, f1_score

In [None]:
# One-vs-rest ROC curve and area for every class: column i of the predicted
# scores is compared against column i of the one-hot test labels.
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

# Micro-average: flatten labels and scores so every (sample, class) pair
# counts as one binary decision.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])

In [None]:
# Predicted class id = index of the highest score in each row.
y_pred_labels = y_pred.argmax(axis=1)
print(y_pred_labels)

In [None]:
# Recover integer class ids from the one-hot test labels.
print(y_test)
y_true = y_test.argmax(axis=1)
print(y_true)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Overall accuracy of the predicted class ids on the test split.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred_labels)
print(accuracy)

In [None]:
# Macro-average ROC: evaluate every per-class curve on the union of all
# false-positive-rate points, then average the interpolated curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes

# Store the macro curve next to the per-class and micro entries; previously
# the averaged curve was computed but its AUC was never derived or kept.
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

In [None]:
# Display the collected metrics: MODELS_RESULTS[model_name][split] -> metric dict.
MODELS_RESULTS

In [None]:
# Tabulate the baseline's metrics: one column per split, one row per metric.
pd.DataFrame.from_dict(MODELS_RESULTS['TRADITIONAL-NETWORK'])