In [9]:
import tensorflow as tf

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import click
from functools import partial
import logging as LOGGER
from src.data.data import (
    create_directory,
    split_data,
    preprocess_netflow_data,
    preprocess_pcap_data,
    prepare_netflow_sequantial_data,
    prepare_pcap_sequantial_data,
)
LOGGER.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=LOGGER.INFO)
from keras.models import Model, Input
from keras.layers import Embedding, Dense, TimeDistributed, Dropout, Conv1D, Flatten, Convolution1D, MaxPooling1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D, Lambda
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.optimizers import RMSprop, Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
import keras

In [2]:
data_file = '../data/aggregated_binetflows/ddos_1s.csv'
packet_type = 'netflow'
label_column='label'
transition=0
rnn_seq=10
forward_predict=1
standardize=True
poly=False
test_set_size=0.3
random_seed=1

In [53]:
data = pd.read_csv(data_file)
if packet_type == "netflow":
    data = preprocess_netflow_data(data, label_column, transition)
else:
    data = preprocess_pcap_data(data, label_column)

data = data[sorted(data.columns, reverse=True)]

LOGGER.info(f"Read {len(data)} records")
LOGGER.info(f"Preparing training and testing data ...")

if packet_type == "netflow":
    x, y = prepare_netflow_sequantial_data(
        data, rnn_seq, forward_predict, standardize, poly, transition )
else:
    x, y = prepare_pcap_sequantial_data(
        data, rnn_seq, forward_predict, standardize, poly
    )
x_tr, x_te, y_tr, y_te = split_data(x, y, test_set_size, random_seed, stratified=False)

2019-10-23 23:43:30,234 INFO Read 35590 records
2019-10-23 23:43:30,235 INFO Preparing training and testing data ...


3559.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [54]:
data.columns

Index(['sd_totbytes', 'sd_srcbytes', 'sd_packets', 'sd_duration', 's_totbytes',
       's_state', 's_srcip', 's_srcbytes', 's_src_port>=1024',
       's_src_port<1024', 's_src_ip_d', 's_src_ip_c', 's_src_ip_b',
       's_src_ip_a', 's_packets', 's_duration', 's_dstip', 's_dst_port>=1024',
       's_dst_port<1024', 's_dst_ip_d', 's_dst_ip_c', 's_dst_ip_b',
       's_dst_ip_a', 'n_udp', 'n_tcp', 'n_src_port>=1024', 'n_src_port<1024',
       'n_src_ip_na', 'n_src_ip_c', 'n_src_ip_b', 'n_src_ip_a',
       'n_normal_rate', 'n_normal', 'n_icmp', 'n_dst_port>=1024',
       'n_dst_port<1024', 'n_dst_ip_na', 'n_dst_ip_c', 'n_dst_ip_b',
       'n_dst_ip_a', 'n_conn', 'n_background_rate', 'n_background',
       'm_duration', 'label'],
      dtype='object')

In [55]:
{l : len(data.columns[data.columns.str.contains(l)].values) for l in set([i[:i.find('_')+1]for i in data.columns[:-1]])}

{'m_': 1, 'sd_': 4, 'n_': 20, 's_': 19}

In [56]:
{l : data.columns[data.columns.str.contains(l)].values for l in set([i[:i.find('_')+1]for i in data.columns[:-1]])}

{'m_': array(['m_duration'], dtype=object),
 'sd_': array(['sd_totbytes', 'sd_srcbytes', 'sd_packets', 'sd_duration'],
       dtype=object),
 'n_': array(['n_udp', 'n_tcp', 'n_src_port>=1024', 'n_src_port<1024',
        'n_src_ip_na', 'n_src_ip_c', 'n_src_ip_b', 'n_src_ip_a',
        'n_normal_rate', 'n_normal', 'n_icmp', 'n_dst_port>=1024',
        'n_dst_port<1024', 'n_dst_ip_na', 'n_dst_ip_c', 'n_dst_ip_b',
        'n_dst_ip_a', 'n_conn', 'n_background_rate', 'n_background'],
       dtype=object),
 's_': array(['s_totbytes', 's_state', 's_srcip', 's_srcbytes',
        's_src_port>=1024', 's_src_port<1024', 's_src_ip_d', 's_src_ip_c',
        's_src_ip_b', 's_src_ip_a', 's_packets', 's_duration', 's_dstip',
        's_dst_port>=1024', 's_dst_port<1024', 's_dst_ip_d', 's_dst_ip_c',
        's_dst_ip_b', 's_dst_ip_a'], dtype=object)}

In [99]:
sd_in = Input(shape=(rnn_seq, 4), dtype=tf.float32)
s_in = Input(shape=(rnn_seq, 19), dtype=tf.float32)
n_in = Input(shape=(rnn_seq, 20), dtype=tf.float32)
m_in = Input(shape=(rnn_seq, 1), dtype=tf.float32)
sd_gru = Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True))(sd_in)
n_gru = Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True))(n_in)
s_gru = Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True))(s_in)
m_gru = Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True))(m_in)
all_gru = concatenate([sd_gru, n_gru, s_gru, m_gru])
out = TimeDistributed(Dense(units=2 if transition == 0 else 4, activation='softmax', name='Output'))(all_gru)
model = Model([sd_in, s_in, n_in, m_in], out)

In [None]:
import sc

In [73]:
optimizer = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,
          loss=keras.losses.categorical_crossentropy,
          metrics=["accuracy"])

In [83]:
x_tr = [np.array(x_tr[:, :, :4]), np.array(x_tr[:, :, 4:4+19]), np.array(x_tr[:, :, 4+19:4+19+20]), np.array(x_tr[:, :, -1:])]
x_te = [np.array(x_te[:, :, :4]), np.array(x_te[:, :, 4:4+19]), np.array(x_te[:, :, 4+19:4+19+20]), np.array(x_te[:, :, -1:])]
res = model.fit(xxxx_tr, y_tr, epochs=10, batch_size=32, validation_data=(xxxx_te, y_te))

Train on 2490 samples, validate on 1068 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [109]:
inputs = [Input(shape=(rnn_seq,)) for i in range(x_tr.shape[-1])]
embs = [(Embedding(rnn_seq, 80))(inputs[i]) for i in range(x_tr.shape[-1])]
grus = [Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True))(embs[i]) for i in range(x_tr.shape[-1])]
all_gru = concatenate(grus)
out = TimeDistributed(Dense(units=2 if transition == 0 else 4, activation='softmax', name='Output'))(all_gru)
model = Model(inputs, out)

In [110]:
optimizer = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,
          loss=keras.losses.categorical_crossentropy,
          metrics=["accuracy"])

In [12]:
model = Sequential()
# model.add(Embedding(x.shape[1:], dtype=tf.float32))
model.add(Bidirectional(GRU(80, recurrent_dropout=0.25, return_sequences=True)))
model.add(TimeDistributed(Dense(units=2 if transition == 0 else 4, activation='softmax', name='Output')))

optimizer = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,
          loss=keras.losses.categorical_crossentropy,
          metrics=["accuracy"])

In [111]:
res = model.fit([x_tr[:, :, i] for i in range(x_tr.shape[-1])], y_tr, epochs=10, batch_size=32, validation_data=([x_te[:, :, i] for i in range(x_te.shape[-1])], y_te))

Train on 2490 samples, validate on 1068 samples
Epoch 1/10


InvalidArgumentError: indices[5,0] = -1 is not in [0, 10)
	 [[{{node embedding_130/embedding_lookup}}]]

In [87]:
aa = model.predict(xxxx_te)

In [16]:
inputs = [Input(shape=(10, 1), dtype=tf.float32) for i in range(10)]
rnn_units = [Bidirectional(SimpleRNN(10, recurrent_dropout=0.3, return_sequences=True))(inputs[i]) for i in range(10)]
all_units = concatenate(rnn_units)
out = TimeDistributed(Dense(units=2, activation='softmax', name='Output'))(all_units)
model = Model(inputs, out)
optimizer = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,
          loss=keras.losses.categorical_crossentropy,
          metrics=["accuracy"])