In [None]:
#Original imports from the example code
from __future__ import print_function
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv1D, GlobalMaxPooling1D, Embedding
from keras.datasets import imdb
from keras.utils import plot_model, pad_sequences
from keras import optimizers
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#Implement the data modification by Arnau N.
import os, time
import pandas as pd
import seaborn as sns

In [None]:
# Load the CTU-13 flow records from the semicolon-separated CSV under ./data.
data_path = os.getcwd() + '/data'
df = pd.read_csv(data_path + '/dataset.csv', sep=';')

# Quick sanity check: overall dimensions first, then the leading rows.
for banner, payload in (
    ("\n>>> Shape...\n", df.shape),
    ("\n>>> Headers...", df.head()),
):
    print(banner)
    print(payload)

In [None]:
# Tally the categorical columns. Only the final expression of a cell is
# rendered, so just the Label distribution shows up in the output.
print("\n>>> sTos, dTos, Proto, Dir, Label values (only printing Label values) ...")
for column_name in ('sTos', 'dTos', 'Proto', 'Dir'):
    df[column_name].value_counts()
df['Label'].value_counts()

In [None]:
# Show the dtype pandas inferred for every column, below a banner line.
print("\n>>> Types of different parameters ...\n", df.dtypes, sep="\n")

In [None]:
## DROP COLUMNS NOT NEEDED
# Remove each unused column from df in place, then list the remaining dtypes.
for unused_column in ('StartTime', 'DstAddr', 'State', 'SrcAddr'):
    df.drop(unused_column, axis=1, inplace=True)
print(df.dtypes)

In [None]:
## MAP STRINGS TO INT
# Integer code for every protocol name; a name missing from the table maps to NaN.
pmap = {
    'udp': 0,
    'tcp': 1,
    'icmp': 2,
    'igmp': 3,
    'rtcp': 4,
    'arp': 5,
    'rtp': 6,
    'ipv6-icmp': 7,
    'udt': 8,
    'rarp': 9,
    'ipx/spx': 10,
    'ipv6': 11,
    'pim': 12,
}

# Integer code for every flow-direction marker (the padding spaces are part of the key).
dirmap = {
    '<->': 0,
    ' ->': 1,
    '<?>': 2,
    '<- ': 3,
    ' ?>': 4,
    'who': 5,
}

# Replace the string columns with their integer codes.
for column_name, code_table in (('Proto', pmap), ('Dir', dirmap)):
    df[column_name] = df[column_name].map(code_table)

df['Proto'].value_counts()

In [None]:
# Distribution of the integer direction codes stored in the 'Dir' column.
dir_counts = df['Dir'].value_counts()
dir_counts

In [None]:
## DROP COLUMNS WITH NO CHANGING DATA OR NANs
# Drop every column that contains at least one NaN. The axis has to be passed
# by keyword: pandas 2.0 made the arguments of DataFrame.dropna keyword-only,
# so the positional form dropna('columns') raises a TypeError there.
df = df.dropna(axis='columns') ## ports are dropped due to a wrong register
# Keep only the columns that hold more than one distinct value.
df = df[[col for col in df if df[col].nunique() > 1]]
print(df.dtypes)

In [None]:
# Heat map of the pairwise correlations between the remaining columns.
corr = df.corr()
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr, ax=ax)
plt.show()

In [None]:
# Prune one column out of every pair whose absolute correlation exceeds cor_thr.
cor_thr = 0.98
print('Shape before feature reduction: ', df.shape)
corr_matrix = df.corr().abs()
# Keep only the strict upper triangle so that each pair is inspected once.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
upper = corr_matrix.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > cor_thr).any()]
print('Features to drop')
print(to_drop)
# NOTE(review): every column of df takes part in this check, 'Label' included
# if it is numeric - confirm that is intended.
df.drop(to_drop, axis=1, inplace=True)
print('Shape after feature reduction: ', df.shape)

In [None]:
#
# Get data (from example code, not needed)
#
#print('Loading data...')
#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
#test_data = x_test
#print(len(x_train), 'train sequences')
#print(len(x_test), 'test sequences')
#print(x_train[450])

#print('Pad sequences (samples x time)')
#x_train = tf.keras.preprocessing.sequence.pad_sequences(sequences=x_train, maxlen=800)
#x_test =  tf.keras.preprocessing.sequence.pad_sequences(sequences=x_test, maxlen=800)
#print('x_train shape:', x_train.shape)
#print('x_test shape:', x_test.shape)

#
# Data modification from Arnau N.
#

# Target variable and train set
y = df[['Label']]
x = df.drop(['Label'], axis=1)

# Rescale every feature column with MinMaxScaler. The original call passed the
# undefined name `X`, which raised NameError; the feature frame is `x`.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# test rows contribute to the min/max used for the training features.
sc = MinMaxScaler()
x = sc.fit_transform(x)

# Split test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
#
# Prepare model
#

print('Build model...')
model = Sequential()

# Embedding -> 1D convolution -> global max pooling -> dense head.
# NOTE(review): input_dim and input_length are hard-coded; confirm they match
# the feature matrix that is passed to fit().
model.add(Embedding(input_dim=71858, output_dim=100, input_length=5))
model.add(Dropout(0.5))
model.add(Conv1D(filters=250, kernel_size=3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# A single sigmoid unit: binary_crossentropy compares the output element-wise
# with the target, and the target used for validation and evaluation (y_test)
# is the one-column 'Label' frame. The previous 5-unit head could not be
# matched against that single column.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of `model`.
model.summary()
# Disabled alternative: render the model graph as an inline SVG.
#SVG(model_to_dot(model,show_shapes = True).create(prog='dot', format='svg'))

In [None]:
#
# Training
#
# Fit against the labels. The original call passed x_train as the target as
# well, so the network was trained to reproduce its own input while being
# validated against y_test.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_test, y_test),
)
# Per-epoch metrics recorded during training; the keys are shown as cell output.
history_dict = history.history
history_dict.keys()

In [None]:
#
# Evaluation
#
# Score `model` on the held-out split; index 0 of the result is reported as the
# loss and index 1 as the accuracy.
results = model.evaluate(x_test, y_test)
print("Accuracy on test set:", results)
for caption, position in (('Test loss:', 0), ('Test accuracy:', 1)):
    print(caption, results[position])

In [None]:
#
# Plot
#
# Per-epoch curves recorded in `history` by the most recent fit() call.
val_loss = history.history['val_loss']
loss = history.history['loss']
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(accuracy) + 1)

plt.rcParams['figure.figsize'] = [10, 5]

# Training curves are red markers, validation curves are green lines. The
# format strings carry only the marker / line style: the original 'bo' and 'b'
# also encoded blue, which clashed with the color keyword and made matplotlib
# warn that the colour was defined twice (the keyword won, so the rendered
# colours are unchanged).
plt.subplot(1, 2, 1)
plt.plot(epochs, loss, 'o', label='Training loss', color='red')
plt.plot(epochs, val_loss, '-', label='Validation loss', color='green')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, accuracy, 'o', label='Training acc', color='red')
plt.plot(epochs, val_accuracy, '-', label='Validation acc', color='green')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
#
# PREDICTION
#
# Second network: same convolutional stack, ending in a single sigmoid unit.
model_prediction = Sequential()
# The sequence length has to equal the number of feature columns passed to
# fit(); the former value 800 was the padding length of the IMDB example and
# does not describe this data set.
# NOTE(review): the vocabulary size 10000 also stems from that example - confirm.
model_prediction.add(Embedding(10000, 50, input_length=x_train.shape[1]))
model_prediction.add(Dropout(0.5))
model_prediction.add(Conv1D(filters=250, kernel_size=3, padding='valid', activation='relu', strides=1))
model_prediction.add(GlobalMaxPooling1D())
model_prediction.add(Dense(250))
model_prediction.add(Activation('relu'))
model_prediction.add(Dense(1))
model_prediction.add(Activation('sigmoid'))
model_prediction.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#
# Training
#
# Fit the second network on the same split; `history` now refers to this run.
history = model_prediction.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_data=(x_test, y_test),
)

In [None]:
#
# Evaluation
#
# Score the network that was just trained. The original cell called
# model.evaluate here, which re-scored the first network instead of
# model_prediction.
results = model_prediction.evaluate(x_test, y_test)
# `results` holds the loss at index 0 and the accuracy at index 1.
print ("Accuracy on test set:" , results)
print('Test loss:', results[0])
print('Test accuracy:', results[1])

In [None]:
#
# Plot
#
# Histogram of the scores that model_prediction assigns to the test rows.
test_scores = model_prediction.predict(x_test)
plt.hist(test_scores)

In [None]:
#
# Prediction
#
y_pred = model_prediction.predict(x_test)
prediction_is_positive = y_pred > 0.5
# y_test comes out of the train/test split as a pandas object, which has no
# reshape(); convert it through NumPy and let -1 infer the row count instead
# of the hard-coded 25000 (apparently a leftover from the IMDB example).
label_is_negative = np.asarray(y_test).reshape((-1, 1)) == 0

# Row indices where the model predicts positive although the label is 0.
incorrect_cases = np.where(np.logical_and(prediction_is_positive, label_is_negative))[0]
#print ("All incorrect cases: ",incorrect_cases[0:])
# Despite its caption, the number printed here is the count of those rows.
print ("Predicted score: ", len(incorrect_cases))