#### <center> Network Intrusion Detection Model - NSL-KDD Dataset - DNN </center>

#### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import Normalizer
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score,confusion_matrix

from keras import callbacks
from keras.utils.np_utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import ModelCheckpoint, CSVLogger

from art.attacks import FastGradientMethod, SaliencyMapMethod
from art.classifiers import KerasClassifier

import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Column names assigned to the header-less data files when they are read:
# the feature columns in file order, followed by the class column 'label'.
names = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    label
""".split()
print(len(names))

42


#### Loading the dataset

In [3]:
# Neither file carries a header row, so the column names are supplied explicitly.
traindata = pd.read_csv('data/onetrain.txt', header=None, names=names)
testdata = pd.read_csv('data/onetest.txt', header=None, names=names)

In [4]:
# Sanity check: (rows, columns) of each split as loaded from disk.
train_shape, test_shape = traindata.shape, testdata.shape
print("Shape of Train and Test DF : ", train_shape, " : ", test_shape)

Shape of Train and Test DF :  (125973, 42)  :  (22544, 42)


#### Preprocessing

In [5]:
# Stack the train rows on top of the test rows so that the relabelling and the
# one-hot encoding below are applied to both splits at once; the combined frame
# is split back by row position afterwards.
frames = [traindata, testdata]
data = pd.concat(frames)

# No rows may be lost or duplicated by the concatenation.
expected_rows = traindata.shape[0] + testdata.shape[0]
assert data.shape[0] == expected_rows
print("data shape : ", data.shape)

data shape :  (148517, 42)


In [6]:
# Preview the first five rows of the combined frame (labels are still strings here).
data.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [7]:
# List every distinct label string; each one is mapped to a class id below.
unique_labels = data['label'].unique()
print("Unique Labels : ", unique_labels)

Unique Labels :  ['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl' 'saint' 'mscan' 'apache2' 'snmpgetattack'
 'processtable' 'httptunnel' 'ps' 'snmpguess' 'mailbomb' 'named'
 'sendmail' 'xterm' 'worm' 'xlock' 'xsnoop' 'sqlattack' 'udpstorm']


In [8]:
# Normal (non-attack) records -> class 0
normal_mask = data.label == 'normal'
data.loc[normal_mask, 'label'] = 0

In [9]:
# DoS attacks -> class 1
dos_attacks = ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop',
               'mailbomb', 'processtable', 'udpstorm', 'apache2', 'worm']
for attack_name in dos_attacks:
    data.loc[data.label == attack_name, 'label'] = 1

In [10]:
# User-to-Root (U2R) attacks -> class 2
u2r_attacks = ['buffer_overflow', 'loadmodule', 'perl', 'rootkit',
               'sqlattack', 'xterm', 'ps']
for attack_name in u2r_attacks:
    data.loc[data.label == attack_name, 'label'] = 2

In [11]:
# Remote-to-Local (R2L) attacks -> class 3
r2l_attacks = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
               'warezclient', 'warezmaster', 'xlock', 'xsnoop', 'snmpgetattack',
               'httptunnel', 'snmpguess', 'sendmail', 'named']
for attack_name in r2l_attacks:
    data.loc[data.label == attack_name, 'label'] = 3

In [12]:
# Probe attacks -> class 4
probe_attacks = ['satan', 'ipsweep', 'nmap', 'portsweep', 'saint', 'mscan']
for attack_name in probe_attacks:
    data.loc[data.label == attack_name, 'label'] = 4

In [13]:
# Preview again: the 'label' column should now hold class ids instead of strings.
data.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [14]:
# One-hot encode only the three categorical feature columns.
# They are listed explicitly because, with the default (columns=None),
# get_dummies encodes *every* object/category-dtype column. The 'label' column
# started out as strings and was relabelled in place, so depending on the
# pandas version it may still be stored with object dtype; it would then be
# split into dummy columns and the later drop(['label']) would fail.
categorical_columns=['protocol_type','service','flag']
data_new=pd.get_dummies(data,columns=categorical_columns)
print("Header : ",list(data_new))
data_new.head()

Header :  ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
# Keep the class ids as a separate vector, then drop the label column from the
# feature frame.
# The explicit integer dtype guarantees a numeric label array even when the
# relabelled column is still stored with object dtype, and it raises
# immediately if any label string was left unmapped by the cells above.
data_label=np.array(data['label'],dtype=np.int64)
data_new=data_new.drop(['label'],axis=1)

In [16]:
# Undo the earlier concatenation by row position: the first
# traindata.shape[0] rows are the training split, the remainder is the test split.
n_train = traindata.shape[0]

x_train = np.array(data_new[:n_train])
x_test = np.array(data_new[n_train:])

y_train = data_label[:n_train]
y_test = data_label[n_train:]

print("Shape of Train and Test : ", x_train.shape, " : ", x_test.shape)

Shape of Train and Test :  (125973, 122)  :  (22544, 122)


In [17]:
# Apply the same Normalizer to both splits; it is fitted on the training data only.
# NOTE(review): sklearn's Normalizer rescales each sample (row) independently,
# so nothing is actually learned from x_train here - confirm that row-wise
# normalisation (rather than per-feature scaling) is what is intended.
scaler = Normalizer()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [18]:
# One-hot encode the integer class ids (0-4).
# num_classes is fixed so the train and test encodings always have the same
# width as the 5-unit output layer; without it the width is inferred from the
# largest id present in each array separately.
NUM_CLASSES=5
y_train_ohe= to_categorical(y_train,num_classes=NUM_CLASSES)
y_test_ohe= to_categorical(y_test,num_classes=NUM_CLASSES)

In [19]:
# Show a few class ids next to their one-hot rows.
# NOTE(review): the caption says "five" but the slice covers ten samples
# (positions 10-19).
preview = slice(10, 20)
print("First five labels : \n", y_train[preview])
print("One hot encoding : \n", y_train_ohe[preview, :])

First five labels : 
 [1 1 0 3 1 1 0 4 0 0]
One hot encoding : 
 [[0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


#### Designing the Model

In [20]:
NO_OF_EPOCHS = 1
BATCH_SIZE = 32

# Fully connected classifier: five ReLU hidden layers shrinking from 1024 to
# 128 units, each followed by Dropout(0.01), then a 5-way softmax output.
# input_dim=122 is the number of feature columns after one-hot encoding.
model = Sequential()
model.add(Dense(1024, input_dim=122, activation='relu'))
model.add(Dropout(0.01))
for hidden_units in (768, 512, 256, 128):
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(0.01))
model.add(Dense(5))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              125952    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 768)               787200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               393728    
_________________________________________________________________
dropout_3 (Dropout)  

#### Training the model

In [21]:
# Write a checkpoint file whenever the monitored training accuracy ('acc')
# improves; the epoch number is embedded in the file name.
checkpointer = ModelCheckpoint(filepath="checkpoint-{epoch:04d}.hdf5",
                               monitor='acc',
                               mode='max',
                               save_best_only=True,
                               verbose=1)

# Log the per-epoch results to a CSV file (overwritten on every run).
csv_logger = CSVLogger('result.csv', separator=',', append=False)

# Train on the normalised features and one-hot labels.
training_callbacks = [checkpointer, csv_logger]
model.fit(x_train, y_train_ohe,
          epochs=NO_OF_EPOCHS,
          batch_size=BATCH_SIZE,
          verbose=1,
          callbacks=training_callbacks)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1

Epoch 00001: acc improved from -inf to 0.96385, saving model to checkpoint-0001.hdf5


<keras.callbacks.History at 0x1d81ab33978>

#### Helper functions

In [25]:
#Prints the common classification metrics
def printMetrics(true,pred):
    """Print accuracy, weighted precision/recall/F1 and the confusion matrix.

    Parameters
    ----------
    true : ground-truth class ids.
    pred : predicted class ids, in the same order as `true`.

    Returns nothing; all results are written to stdout.
    """
    weighted = {"average": "weighted"}
    # (caption, scoring function, extra keyword arguments) - each score is
    # computed right before it is printed, in this order.
    report = (
        ("Accuracy : ", accuracy_score, {}),
        ("Precision", precision_score, weighted),
        ("Recall : ", recall_score, weighted),
        ("F1-score : ", f1_score, weighted),
    )
    for caption, metric, options in report:
        print(caption, metric(true, pred, **options))
    print("Confusion Matrix : ")
    print(confusion_matrix(true, pred))
    
#Computes the average features changed per datapoint
def adversarialFeatures(actual,adversarial):
    """Report how many features differ between original and adversarial samples.

    Parameters
    ----------
    actual : 2-D array-like, one row per sample and one column per feature.
    adversarial : array-like of the same shape holding the perturbed samples.

    Prints the number of distinct feature columns that changed in at least one
    sample and the average number of changed features per sample. Returns
    nothing. For an input with zero rows the average is reported as 0.0
    instead of raising ZeroDivisionError.
    """
    # True wherever the adversarial value differs from the original one.
    changed = (np.asarray(actual) - np.asarray(adversarial)) != 0
    n_samples = changed.shape[0]

    # Total number of (sample, feature) cells that were modified.
    total = int(np.count_nonzero(changed))
    # Feature columns modified in at least one sample.
    unique_features = int(np.count_nonzero(changed.any(axis=0)))
    average = total / n_samples if n_samples else 0.0

    # The number of features that were changed for the adversarial samples
    print(" Number of unique features changed :",unique_features)
    print(" Number of average features changed per datapoint ",average)

#### Loading the saved weights 

In [26]:
# Restore the weights written by the checkpoint callback for epoch 1.
best_checkpoint = "checkpoint-0001.hdf5"
model.load_weights(best_checkpoint)

#### Testing

In [27]:
# Baseline: predicted class ids for the unmodified test set, scored against y_test.
y_pred = model.predict_classes(x_test, verbose=1)
printMetrics(y_test, y_pred)

Accuracy :  0.7376242015613911
Precision 0.6665368730603406
Recall :  0.7376242015613911
F1-score :  0.6868628428152727
Confusion Matrix : 
[[9473   41    0    1  196]
 [1125 5675    0    0  660]
 [  67    0    0    0    0]
 [2836    1    0    0   48]
 [ 614  326    0    0 1481]]


#### Adversarial Attacks

In [28]:
# Wrap the trained Keras model for the ART attacks; clip_values is set to the
# smallest and largest value found anywhere in the normalised training features.
feature_range = (np.min(x_train), np.max(x_train))
classifier = KerasClassifier(clip_values=feature_range, model=model)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [29]:
# Fast Gradient Method attack with the library's default settings.
fgsm = FastGradientMethod(classifier)

# Craft one adversarial sample per test row.
x_test_adv_fgsm = fgsm.generate(x=x_test)

# Score the model on the adversarial samples against the true labels.
y_adv_pred = model.predict_classes(x_test_adv_fgsm, verbose=1)
printMetrics(y_test, y_adv_pred)

# Report how many features the attack modified.
adversarialFeatures(x_test, x_test_adv_fgsm)

Accuracy :  0.0646291696238467
Precision 0.08097859641021306
Recall :  0.0646291696238467
F1-score :  0.05775339059551669
Confusion Matrix : 
[[  21 1072    0    0 8618]
 [ 893  877    0    0 5690]
 [   0    5    0    0   62]
 [   4  606    0    0 2275]
 [ 268 1594    0    0  559]]
 Number of unique features changed : 122
 Number of average features changed per datapoint  64.57447657913414


In [31]:
# Saliency Map Method (JSMA) attack with the library's default settings.
jsma = SaliencyMapMethod(classifier)

# Craft one adversarial sample per test row.
x_test_adv_jsma = jsma.generate(x=x_test)

# Score the model on the adversarial samples against the true labels.
y_adv_pred = model.predict_classes(x_test_adv_jsma, verbose=1)
printMetrics(y_test, y_adv_pred)

# Report how many features the attack modified.
adversarialFeatures(x_test, x_test_adv_jsma)

Accuracy :  0.050390347764371894
Precision 0.04040730830549541
Recall :  0.050390347764371894
F1-score :  0.0229009774383665
Confusion Matrix : 
[[ 196    0    0    0 9515]
 [ 660    0    0    0 6800]
 [   0    0    0    0   67]
 [  48    0    0    0 2837]
 [1481    0    0    0  940]]
 Number of unique features changed : 115
 Number of average features changed per datapoint  16.513795244854506
