In [None]:
import os, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Locate the dataset folder (./data under the current working directory) and
# load the semicolon-separated capture. os.path.join builds the path portably
# instead of concatenating separators by hand.
data_path = os.path.join(os.getcwd(), 'data')
df = pd.read_csv(os.path.join(data_path, 'ctu.csv'), sep=';')
print(df.shape)

In [None]:
# Quick look at the first five rows of the loaded frame.
print(df.head(n=5))

In [None]:
# Frequency of each distinct value in the sTos column.
df.loc[:, 'sTos'].value_counts()

In [None]:
# Frequency of each distinct value in the dTos column.
df.loc[:, 'dTos'].value_counts()

In [None]:
# Frequency of each distinct value in the Proto column (before encoding).
df.loc[:, 'Proto'].value_counts()

In [None]:
# Frequency of each distinct value in the Dir column (before encoding).
df.loc[:, 'Dir'].value_counts()

In [None]:
# Class distribution of the target column.
df.loc[:, 'Label'].value_counts()

In [None]:
# Remove every row whose label occurs fewer than MIN_SAMPLES_PER_LABEL times,
# so that each remaining class has enough samples for a validation phase.
MIN_SAMPLES_PER_LABEL = 10
label_counts = df['Label'].value_counts()
rare_labels = label_counts[label_counts < MIN_SAMPLES_PER_LABEL].index
# One vectorised drop instead of one DataFrame.drop call per rare label.
df = df.drop(index=df.index[df['Label'].isin(rare_labels)])
# Choose the labels (...): for example 3 background labels, Attack type A, Other attacks...

In [None]:
# Inspect the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
## DROP COLUMNS NOT NEEDED
# All four columns are removed in one non-mutating call: either every column
# is dropped or (if one is missing) the frame is left untouched, instead of
# being left half-modified by a sequence of in-place drops.
unused_columns = ['StartTime', 'DstAddr', 'State', 'SrcAddr']
df = df.drop(columns=unused_columns)
print(df.dtypes)

In [None]:
## MAP STRINGS TO INT
pmap = {'udp':0, 'tcp':1, 'icmp':2, 'igmp':3, 'rtcp':4, 'arp':5, 'rtp':6, 'ipv6-icmp':7, 'udt':8, 'rarp':9, 'ipx/spx':10, 'ipv6':11, 'pim':12}
dirmap = {'<->':0, ' ->':1, '<?>':2, '<- ':3, ' ?>':4, 'who':5}

# Series.map turns every value that is missing from the mapping into NaN, and
# the later dropna over columns would then discard the whole column without
# any notice. Report such values so the maps can be extended if needed.
for column, mapping in (('Proto', pmap), ('Dir', dirmap)):
    unmapped = sorted(set(df[column].dropna().unique()) - set(mapping), key=str)
    if unmapped:
        print('WARNING: values of', column, 'without an integer code (become NaN):', unmapped)
    df[column] = df[column].map(mapping)

In [None]:
# Distribution of the integer-encoded Proto column.
df.loc[:, 'Proto'].value_counts()

In [None]:
# Distribution of the integer-encoded Dir column.
df.loc[:, 'Dir'].value_counts()

In [None]:
## DROP COLUMNS WITH NO CHANGING DATA OR NANs
# `axis` is passed by keyword: DataFrame.dropna no longer accepts positional
# arguments on pandas >= 2.0, so dropna('columns') raises TypeError there.
df = df.dropna(axis='columns') ## ports are dropped due to a wrong register
# Keep only the columns that hold more than one distinct value.
df = df[[col for col in df if df[col].nunique() > 1]]
print(df.dtypes)

In [None]:
# Feature-only copy of the frame (target column removed), used for the
# correlation plot below.
df2 = df.drop(columns=["Label"])
df2.head(5)

In [None]:
# Visualise the pairwise correlation between features as a heat map.
plt.figure(figsize=(15, 12))
corr = df2.corr()
sns.heatmap(data=corr)
plt.show()

In [None]:
# Drop one feature out of every pair whose absolute correlation exceeds cor_thr.
cor_thr = 0.95
print('Shape before feature reduction: ', df.shape)
# Correlate features only: the target is excluded explicitly (it must never be
# a candidate for removal) and non-numeric columns are filtered out, because
# DataFrame.corr() raises on string columns with pandas >= 2.0.
corr_matrix = df.drop(columns=['Label']).select_dtypes(include=['number', 'bool']).corr().abs()
# Keep the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation of 1.0) is ignored.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper.columns if (upper[column] > cor_thr).any()]
print('Features to drop')
print(to_drop)
df = df.drop(columns=to_drop)
print('Shape after feature reduction: ', df.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target variable and feature matrix
Y = df[['Label']]
X = df.drop(['Label',], axis=1)

# Split test and train data BEFORE scaling: fitting the scaler on the full
# dataset would leak the mean/variance of the test rows into training.
# The split itself (test_size, random_state) is unchanged.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows and to the full feature matrix.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X = sc.transform(X)

print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

print(X)
print(Y)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping

def ann(input_dim=4, n_classes=37):
    """Build and compile the feed-forward classifier.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features fed to the first layer. Defaults to 4, the
        value that used to be hard-coded.
    n_classes : int, optional
        Number of units of the softmax output layer, i.e. number of classes.
        Defaults to 37, the value that used to be hard-coded.

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(30, input_dim=input_dim, activation='relu', kernel_initializer='random_uniform'))
    model.add(Dense(5, activation='sigmoid', kernel_initializer='random_uniform'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Size the network from the data instead of relying on hard-coded numbers:
# the feature count depends on the feature-reduction step, and the class count
# depends on which labels survived filtering and ended up in the training set.
# NOTE(review): assumes the scikit-learn wrapper one-hot encodes exactly the
# distinct labels passed to fit() - confirm for the installed Keras version.
n_features = X_train.shape[1]
n_labels = len(np.unique(Y_train.values.ravel()))

# Extra keyword arguments are forwarded by the wrapper to ann().
model_ann = KerasClassifier(build_fn=ann, epochs=10, batch_size=64,
                            input_dim=n_features, n_classes=n_labels)
start_time = time.time()
# Stops when the training loss has not improved for 5 consecutive epochs.
callback = EarlyStopping(monitor='loss', patience=5)
history = model_ann.fit(X_train, Y_train.values.ravel(), validation_split=0.1, callbacks=[callback])
time_train_ann = time.time() - start_time

In [None]:
# Per-epoch metrics measured on the validation split.
loss = history.history['val_loss']
accuracy = history.history['val_accuracy']

print(loss)
print(accuracy)

# EarlyStopping may end training before the configured number of epochs, so
# the x axis is derived from the recorded history instead of being fixed.
epochs_run = range(1, len(loss) + 1)

plt.rcParams['figure.figsize'] = [10, 5]
plt.subplot(1, 2, 1)
# Marker-only format string: combining 'bo' with color='red' defines the
# colour twice and triggers a matplotlib warning.
plt.plot(epochs_run, loss, 'o', label='Validation loss', color='red')
plt.title('Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs_run, accuracy, 'o', label='Validation acc', color='red')
plt.title('Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

def plot_confusion_matrix(Y_real, Y_pred, title=None):
    """Plot a row-normalised confusion matrix as a seaborn heat map.

    Parameters
    ----------
    Y_real : array-like
        True labels, one per sample.
    Y_pred : array-like
        Predicted labels, aligned with ``Y_real``.
    title : str, optional
        Figure title; 'Confusion Matrix' is used when omitted.

    Raises
    ------
    ValueError
        If ``Y_real`` and ``Y_pred`` do not hold the same number of samples.

    Notes
    -----
    Each row (true label) is divided by its total, so cells hold the fraction
    of that class assigned to each predicted label. Labels that only occur in
    the predictions get their own row/column; rows without any true sample
    stay at zero instead of producing NaN.
    """
    y_true = np.asarray(Y_real).ravel()
    y_pred = np.asarray(Y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError('Y_real and Y_pred must have the same number of samples')

    # Encode both vectors against one sorted label set so that predictions of
    # a class absent from Y_real are still counted.
    labels, codes = np.unique(np.concatenate([y_true, y_pred]), return_inverse=True)
    codes = codes.ravel()
    rows, cols = codes[:y_true.size], codes[y_true.size:]

    nc = len(labels)
    data = np.zeros((nc, nc))
    # Unbuffered add: repeated (row, col) pairs are accumulated correctly.
    np.add.at(data, (rows, cols), 1)

    row_total = data.sum(axis=1, keepdims=True)
    data = np.divide(data, row_total, out=np.zeros_like(data), where=row_total > 0)

    sns.set(color_codes=True)
    plt.figure(1, figsize=(9, 6))
    plt.title('Confusion Matrix' if title is None else title)
    sns.set(font_scale=1.4)
    # Tick labels are handed to heatmap directly so that every class gets a
    # tick and the label count always matches the tick count.
    ax = sns.heatmap(data, annot=True, cmap='Blues', cbar_kws={'label': 'Scale'},
                     xticklabels=list(labels), yticklabels=list(labels))
    ax.set(ylabel='True Label', xlabel='Predicted Label')
    plt.show()
    
  
print('ANN evaluation')
start_time = time.time()
Y_train_pred = model_ann.predict(X_train)
Y_test_pred = model_ann.predict(X_test)
time_eval_ann = time.time() - start_time

# Flatten the single-column label frames to 1-D arrays for the metrics.
y_train_true = Y_train.values.ravel()
y_test_true = Y_test.values.ravel()

# accuracy_score is the overall accuracy (fraction of correctly classified
# samples); balanced_accuracy_score is the class-balanced accuracy. Both are
# reported under their own name so the figures cannot be confused.
print('Overall accuracy training: ', accuracy_score(y_train_true, Y_train_pred))
print('Overall accuracy test: ', accuracy_score(y_test_true, Y_test_pred))
print('Balanced accuracy training: ', balanced_accuracy_score(y_train_true, Y_train_pred))
print('Balanced accuracy test: ', balanced_accuracy_score(y_test_true, Y_test_pred))

print('Training time (seconds per sample): ', time_train_ann/float(Y_train.shape[0]))
# Prediction was timed over the training and the test set together.
print('Evaluation time (seconds per sample): ',
      time_eval_ann/(float(Y_train.shape[0])+float(Y_test.shape[0])))

#plot_confusion_matrix(Y_train.values.ravel(),Y_train_pred, title='ANN - Training')
#plot_confusion_matrix(Y_test.values.ravel(),Y_test_pred, title='ANN - Testing')