In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py
import time

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, AveragePooling2D, Dense, Dropout, Flatten, Lambda, MaxPool2D, Conv2DTranspose, UpSampling2D, Concatenate, Add
from tensorflow.keras import regularizers, optimizers
from keras.preprocessing import image
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn import model_selection, feature_selection, utils, ensemble, linear_model, metrics

In [None]:
# Load the two per-window feature tables, put them side by side and separate
# the target column from the features.
print("Import data")

# Both tables get a fresh 0..n-1 index so that the index-based join below
# pairs rows by position.
X = pd.read_hdf('drive/My Drive/data_window_botnet3.h5', key='data').reset_index(drop=True)
X2 = pd.read_hdf('drive/My Drive/data_window3_botnet3(1).h5', key='data').reset_index(drop=True)

X = X.join(X2).drop(columns='window_id')

# `pop` hands back the label column and removes it from the feature frame.
y = X.pop('Label_<lambda>')

# NOTE(review): allow_pickle=True unpickles arbitrary objects and can execute
# code while loading -- only use it on files you trust.
labels = np.load("drive/My Drive/data_window_botnet3_labels.npy", allow_pickle=True)

Import data


In [None]:
# Show the feature names and the label vocabulary, then build the binary
# target: True where the encoded label equals the 'flow=From-Botne' class.
print(X.columns.values)
print(labels)

# Position of the botnet class inside `labels`; looked up once and reused.
botnet_class = np.where(labels == 'flow=From-Botne')[0][0]
print(botnet_class)

y_bin6 = y == botnet_class
print("y", np.unique(y, return_counts=True))

['counts' 'Sport_nunique' 'DstAddr_nunique' 'Dport_nunique' 'Dur_sum'
 'Dur_mean' 'Dur_std' 'Dur_max' 'Dur_median' 'TotBytes_sum'
 'TotBytes_mean' 'TotBytes_std' 'TotBytes_max' 'TotBytes_median'
 'SrcBytes_sum' 'SrcBytes_mean' 'SrcBytes_std' 'SrcBytes_max'
 'SrcBytes_median' 'Sport_RU' 'DstAddr_RU' 'Dport_RU']
['flow=Background' 'flow=From-Norma' 'flow=Normal-V44' 'flow=From-Backg'
 'flow=To-Backgro' 'flow=To-Normal-' 'flow=From-Botne']
6
y (array([0, 1, 2, 3, 4, 5, 6]), array([1903499,    5483,       5,     986,  112392,     679,    1009]))


In [None]:
## NN
# File the training checkpoint writes the model weights to.
filename_weights = "model.h5"


def fprecision(y_true, y_pred):
    """Batch-wise precision, usable as a Keras metric.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` itself
    are clipped to [0, 1] and rounded before being summed, so the result is
    (rounded true positives) / (rounded predicted positives).  ``K.epsilon()``
    is added to the denominator to avoid a division by zero when nothing is
    predicted positive.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the precision of the batch.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

In [None]:
def frecall(y_true, y_pred):
    """Batch-wise recall, usable as a Keras metric.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` itself
    are clipped to [0, 1] and rounded before being summed, so the result is
    (rounded true positives) / (rounded actual positives).  ``K.epsilon()``
    is added to the denominator to avoid a division by zero when the batch
    holds no positive sample.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the recall of the batch.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())

In [None]:
def ff1_score(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Harmonic mean of ``fprecision`` and ``frecall`` evaluated on the same
    batch: ``2 * p * r / (p + r)``.  ``K.epsilon()`` is added to the
    denominator so the result is 0 instead of NaN when both are 0.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the F1 score of the batch.
    """
    precision_value = fprecision(y_true, y_pred)
    recall_value = frecall(y_true, y_pred)
    numerator = 2 * precision_value * recall_value
    denominator = precision_value + recall_value + K.epsilon()
    return numerator / denominator

In [None]:
def get_model(inputs, dropout=0.5, batchnorm=True):
    x = Dense(256, input_shape=(22,))(inputs)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(128, input_shape=(256,))(x)
    if batchnorm:
        x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(dropout)(x)

    x = Dense(1, input_shape=(128,))(x)
    outputs = Activation("sigmoid")(x)
    
    model = Model(inputs=[inputs], outputs=[outputs])
    return model

In [None]:
# Number of independent train/test repetitions and one split seed per
# repetition, drawn from a fixed master seed so the seeds are reproducible.
nb_prediction = 50
np.random.seed(seed=123456)
tab_seed = np.random.randint(0, 10**9, nb_prediction)
print(tab_seed)

# Per-repetition scores, one float slot per run, initialised to 0.
tab_train_precision = np.zeros(nb_prediction)
tab_train_recall = np.zeros(nb_prediction)
tab_train_fbeta_score = np.zeros(nb_prediction)

tab_test_precision = np.zeros(nb_prediction)
tab_test_recall = np.zeros(nb_prediction)
tab_test_fbeta_score = np.zeros(nb_prediction)

[545331265  64051946 930796018 636193841  44994104 883990699 632376047
 123822635 544385883 780062752 370319575 553050788 864905352 385976778
 387642634 926825740 528719691 508226068 184796139 357437743 528333490
 581730774 850389862 904355447 555943458 826610574 471925446 441274154
 302315714 753964207 438316612 682070622 983063469 876566388 115035351
 603291380 407092436 139062484 590788013 387585019 441951173 260506609
 787917988 628041163 776749475 385983453 140379815 317878160 123195836
 263152482]


In [None]:
# Repeated random hold-out evaluation: for every seed, split the data, train a
# fresh network and record precision / recall / F-score of the positive class
# on both the training and the test part.
#
# The whole experiment has to live in ONE cell.  When the loop body is spread
# over several cells, only the split and the model construction are repeated;
# training and scoring then run a single time with the last value of `i`, and
# every other entry of the tab_* arrays stays at its initial 0.
for i in range(nb_prediction):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y_bin6, test_size=0.33, random_state=tab_seed[i])

    print(i)
    print("y_train", np.unique(y_train, return_counts=True))
    print("y_test", np.unique(y_test, return_counts=True))

    # Release the Keras state of the previous repetition so that the models
    # built across iterations do not accumulate in memory.
    K.clear_session()

    # Input width follows the data instead of a hard-coded feature count.
    inputs = Input((X_train.shape[1],), name='input')
    model = get_model(inputs, dropout=0, batchnorm=True)

    # A new checkpoint callback per repetition: with save_best_only=True it
    # keeps, in `filename_weights`, the weights of the best epoch of THIS run.
    callbacks = [
        ModelCheckpoint(filename_weights, verbose=1, save_best_only=True, save_weights_only=True)
    ]

    # `learning_rate` is the supported keyword; `lr` is deprecated.
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss=["binary_crossentropy"], metrics=[fprecision, frecall, ff1_score])

    tps = time.time()
    results = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_split=0.15, shuffle=True, class_weight=None, verbose=0, callbacks=callbacks)
    print("Execution time = ", time.time()-tps)

    # Score with the checkpointed weights rather than the last-epoch weights.
    model.load_weights(filename_weights)

    # Training-set scores; index 1 selects the positive class.
    y_pred_train = model.predict(X_train, batch_size=32, verbose=0)
    y_pred_train_bin = (y_pred_train > 0.5).astype(np.uint8)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_train, y_pred_train_bin)
    tab_train_precision[i] = precision[1]
    tab_train_recall[i] = recall[1]
    tab_train_fbeta_score[i] = fbeta_score[1]

    # Test-set scores; index 1 selects the positive class.
    y_pred_test = model.predict(X_test, batch_size=32, verbose=0)
    y_pred_test_bin = (y_pred_test > 0.5).astype(np.uint8)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred_test_bin)
    tab_test_precision[i] = precision[1]
    tab_test_recall[i] = recall[1]
    tab_test_fbeta_score[i] = fbeta_score[1]

In [None]:
# Summarise the repetitions: for each split print mean, standard deviation
# and the raw per-repetition values of every score.
for split_name, split_scores in (
    ("Train", (
        ("precision = ", tab_train_precision),
        ("recall = ", tab_train_recall),
        ("fbeta_score = ", tab_train_fbeta_score),
    )),
    ("Test", (
        ("precision = ", tab_test_precision),
        ("recall = ", tab_test_recall),
        ("fbeta_score = ", tab_test_fbeta_score),
    )),
):
    print(split_name)
    for caption, scores in split_scores:
        print(caption, scores.mean(), scores.std(), scores)

Train
precision =  0.019763912310286676 0.13834738617200676 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.98819562]
recall =  0.016863309352517987 0.1180431654676259 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.    