In [1]:
# importing required libraries
import numpy as np
import pandas as pd 
import pickle # saving and loading trained model
from os import path

# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# importing library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from keras.layers import Dense, LSTM, MaxPool1D, Flatten, Dropout # importing dense layer
from keras.models import Sequential #importing Sequential layer
from keras.layers import Input
from keras.models import Model




In [2]:
# Column names applied to the NSL-KDD CSV files when they are read: 41 feature
# columns followed by the attack "label" and the "difficulty" score.
feature = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

In [3]:
# Paths of the three NSL-KDD files, relative to the working directory.
train = './nsl-kdd/KDDTrain+.txt'
test = './nsl-kdd/KDDTest+.txt'
test21 = './nsl-kdd/KDDTest-21.txt'

# Read each file with the column names from `feature` (names= means every
# line of the file is treated as data, not as a header).
train_data, test_data, test_data21 = (
    pd.read_csv(csv_path, names=feature) for csv_path in (train, test, test21)
)

# Stack the train and test frames into one frame with a fresh 0..n-1 index.
# test_data21 is loaded above but is not part of this concatenation.
data = pd.concat([train_data, test_data], ignore_index=True)
print(data)

        duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0              0           tcp  ftp_data   SF        491          0     0   
1              0           udp     other   SF        146          0     0   
2              0           tcp   private   S0          0          0     0   
3              0           tcp      http   SF        232       8153     0   
4              0           tcp      http   SF        199        420     0   
...          ...           ...       ...  ...        ...        ...   ...   
148512         0           tcp      smtp   SF        794        333     0   
148513         0           tcp      http   SF        317        938     0   
148514         0           tcp      http   SF      54540       8314     0   
148515         0           udp  domain_u   SF         42         42     0   
148516         0           tcp    sunrpc  REJ          0          0     0   

        wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0      

In [4]:
# Remove the difficulty score column; it is not used in any later step.
data = data.drop(columns=['difficulty'])


In [5]:
# Redistribute across common attack class
def change_label(df):
    """Collapse individual attack names into attack classes and drop U2R rows.

    Each value of the ``label`` column that names a specific attack is replaced
    by its class ('Dos', 'R2L', 'Probe' or 'U2R'); any other value (for example
    'normal') is left unchanged. Rows whose class is 'U2R' are then removed.

    The input frame is not modified. The result is an independent copy, so
    adding or overwriting columns on it later does not raise
    SettingWithCopyWarning and still works under pandas Copy-on-Write.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column of attack names.

    Returns
    -------
    pandas.DataFrame
        New frame with relabelled ``label`` values, without U2R rows, keeping
        the original index values of the surviving rows.

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.
    """
    attack_classes = {
        'Dos': ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod',
                'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm'],
        'R2L': ['ftp_write', 'guess_passwd', 'httptunnel', 'imap', 'multihop',
                'named', 'phf', 'sendmail', 'snmpgetattack', 'snmpguess', 'spy',
                'warezclient', 'warezmaster', 'xlock', 'xsnoop'],
        'Probe': ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan'],
        'U2R': ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit',
                'sqlattack', 'xterm'],
    }
    # Flatten to {attack_name: class_name} so one replace call handles every class.
    # No class name appears among the attack names, so a single pass gives the
    # same result as replacing class by class.
    to_class = {
        attack: attack_class
        for attack_class, attacks in attack_classes.items()
        for attack in attacks
    }

    # assign() builds a new frame, so the caller's frame is left untouched
    # (the previous chained `df.label.replace(..., inplace=True)` mutated it and
    # silently does nothing when Copy-on-Write is enabled).
    relabelled = df.assign(label=df['label'].replace(to_class))

    # Explicit copy so the filtered result is not a view-like slice of `relabelled`.
    return relabelled[relabelled['label'] != 'U2R'].copy()

# Map attack names to their classes and drop U2R rows; `data` is rebound to
# the frame returned by change_label.
data = change_label(data)

In [6]:
# Display the frame after relabelling.
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,Dos
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal
148513,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal
148514,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07,Dos
148515,0,udp,domain_u,SF,42,42,0,0,0,0,...,252,0.99,0.01,0.00,0.00,0.00,0.00,0.00,0.00,normal


In [7]:
# Keep the class names in their own one-column frame for encoding later.
label = data['label'].to_frame()

In [8]:
# Display the one-column label frame.
label

Unnamed: 0,label
0,normal
1,normal
2,Dos
3,normal
4,normal
...,...
148512,normal
148513,normal
148514,Dos
148515,normal


In [9]:
# using standard scaler for normalizing
std_scaler = StandardScaler()


def standardization(df, col):
    """Standardise the given columns of ``df`` with the shared ``std_scaler``.

    All requested columns are fitted and transformed in a single call, so after
    this function returns ``std_scaler`` holds the statistics of every column
    in ``col`` (in that order) and can be reused to transform new data.
    Previously the scaler was refitted once per column and ended up holding
    only the last column's statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place and also returned.
    col : iterable of column labels
        Columns of ``df`` to standardise. May be empty, in which case ``df``
        is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns replaced by their
        standardised float values.
    """
    columns = list(col)
    if not columns:
        # Nothing to scale; avoid handing a zero-width array to the scaler.
        return df

    # StandardScaler works column-wise, so one 2-D fit gives the same values
    # as fitting each column separately.
    df[columns] = std_scaler.fit_transform(df[columns].to_numpy(dtype=float))
    return df


# NOTE(review): the scaler is fitted on the whole of `data`, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.
numeric_col = data.select_dtypes(include='number').columns
data = standardization(data, numeric_col)

In [10]:
# Encode the class names as integer codes; le2 stays fitted on those names.
le2 = preprocessing.LabelEncoder()

# DataFrame.apply runs the encoder over the single 'label' column, giving a
# one-column frame of integer codes that shares label's index.
enc_label = label.apply(lambda names: le2.fit_transform(names))

# Attach the codes as the 'intrusion' target column (aligned on the index).
data['intrusion'] = enc_label['label']
print(data.shape)
data

(148398, 43)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,intrusion
0,-0.112487,tcp,ftp_data,SF,-0.007348,-0.004614,-0.014686,-0.085522,-0.007128,-0.093487,...,-0.813507,-0.278665,0.078551,-0.280670,-0.597963,-0.585070,-0.267276,-0.406920,normal,3
1,-0.112487,udp,other,SF,-0.007412,-0.004614,-0.014686,-0.085522,-0.007128,-0.093487,...,-1.192902,2.658160,2.380077,-0.280670,-0.597963,-0.585070,-0.422162,-0.406920,normal,3
2,-0.112487,tcp,private,S0,-0.007439,-0.004614,-0.014686,-0.085522,-0.007128,-0.093487,...,-0.969729,-0.175618,-0.472519,-0.280670,1.735183,1.741496,-0.422162,-0.406920,Dos,0
3,-0.112487,tcp,http,SF,-0.007396,-0.002414,-0.014686,-0.085522,-0.007128,-0.093487,...,1.038831,-0.433234,-0.375271,0.086775,-0.527968,-0.561804,-0.422162,-0.377103,normal,3
4,-0.112487,tcp,http,SF,-0.007402,-0.004501,-0.014686,-0.085522,-0.007128,-0.093487,...,1.038831,-0.433234,-0.472519,-0.280670,-0.597963,-0.585070,-0.422162,-0.406920,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,-0.112487,tcp,smtp,SF,-0.007292,-0.004524,-0.014686,-0.085522,-0.007128,-0.093487,...,0.413946,-0.124095,-0.440103,-0.188809,-0.574631,-0.585070,-0.422162,-0.406920,normal,3
148513,-0.112487,tcp,http,SF,-0.007380,-0.004361,-0.014686,-0.085522,-0.007128,-0.093487,...,1.038831,-0.433234,-0.440103,-0.188809,-0.574631,-0.585070,-0.422162,-0.406920,normal,3
148514,-0.112487,tcp,http,SF,0.002640,-0.002370,-0.014686,-0.085522,-0.007128,0.900234,...,1.038831,-0.433234,-0.472519,-0.280670,-0.597963,-0.585070,-0.205322,-0.198202,Dos,0
148515,-0.112487,udp,domain_u,SF,-0.007431,-0.004603,-0.014686,-0.085522,-0.007128,-0.093487,...,1.016513,-0.381711,-0.472519,-0.280670,-0.597963,-0.585070,-0.422162,-0.406920,normal,3


In [11]:
# Rows whose encoded class is 2, shown next to their original label.
data.loc[data['intrusion'] == 2, ['label', 'intrusion']]

Unnamed: 0,label,intrusion
13,R2L,2
48,R2L,2
148,R2L,2
190,R2L,2
222,R2L,2
...,...,...
148459,R2L,2
148461,R2L,2
148463,R2L,2
148474,R2L,2


In [12]:
# The string label is no longer needed now that 'intrusion' holds its codes.
data = data.drop(columns=['label'])
print(data.shape)

(148398, 42)


In [13]:
# one-hot-encoding categorical columns
# With an empty prefix and separator, each indicator column is named by the
# bare category value.
data = pd.get_dummies(
    data,
    columns=['protocol_type', 'service', 'flag'],
    prefix="",
    prefix_sep="",
)
print(data.shape)
print(data)

(148398, 123)
        duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0      -0.112487  -0.007348  -0.004614 -0.014686       -0.085522 -0.007128   
1      -0.112487  -0.007412  -0.004614 -0.014686       -0.085522 -0.007128   
2      -0.112487  -0.007439  -0.004614 -0.014686       -0.085522 -0.007128   
3      -0.112487  -0.007396  -0.002414 -0.014686       -0.085522 -0.007128   
4      -0.112487  -0.007402  -0.004501 -0.014686       -0.085522 -0.007128   
...          ...        ...        ...       ...             ...       ...   
148512 -0.112487  -0.007292  -0.004524 -0.014686       -0.085522 -0.007128   
148513 -0.112487  -0.007380  -0.004361 -0.014686       -0.085522 -0.007128   
148514 -0.112487   0.002640  -0.002370 -0.014686       -0.085522 -0.007128   
148515 -0.112487  -0.007431  -0.004603 -0.014686       -0.085522 -0.007128   
148516 -0.112487  -0.007439  -0.004614 -0.014686       -0.085522 -0.007128   

             hot  num_failed_logins  logged_in  n

In [14]:
# Separate the integer target from the model inputs.
y_data = data['intrusion']
X_data = data.drop(columns=['intrusion'])
print(y_data)
# NOTE: the printed text says X_train / y_train, but the shapes reported are
# those of the full X_data / y_data (no split has happened yet).
print('X_train has shape:', X_data.shape, '\ny_train has shape:', y_data.shape)

0         3
1         3
2         0
3         3
4         3
         ..
148512    3
148513    3
148514    0
148515    3
148516    1
Name: intrusion, Length: 148398, dtype: int32
X_train has shape: (148398, 122) 
y_train has shape: (148398,)


In [15]:
# Display the integer-encoded target series.
y_data

0         3
1         3
2         0
3         3
4         3
         ..
148512    3
148513    3
148514    0
148515    3
148516    1
Name: intrusion, Length: 148398, dtype: int32

In [16]:
from sklearn.preprocessing import LabelBinarizer

# Turn the integer class codes into one indicator column per class.
y_data = LabelBinarizer().fit_transform(y_data)
print(y_data)

# Convert features and targets to plain NumPy arrays.
X_data, y_data = np.array(X_data), np.array(y_data)

[[0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 ...
 [1 0 0 0]
 [0 0 0 1]
 [0 1 0 0]]


In [17]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, '\n', X_test.shape)

(118718, 122) 
 (29680, 122)


In [18]:
# reshape input to be [samples, time steps, features]
print(len(X_train))
# Insert a length-1 time-step axis: (samples, features) -> (samples, 1, features).
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
print(X_train.shape)

118718
(118718, 1, 122)


In [19]:
# Seed the Python, NumPy and TensorFlow random generators before any layer
# weights are created, so weight initialisation, dropout and batch shuffling
# repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Stacked LSTM classifier over inputs of shape (1 time step, n_features).
model = Sequential()  # initializing model
# Every LSTM returns its full sequence so the next layer still receives a
# 3-D (batch, time steps, units) tensor.
model.add(LSTM(64, return_sequences=True, input_shape=(1, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
# Collapse the (time steps, units) output into one vector per sample.
model.add(Flatten())
# NOTE(review): no activation is given here, so this layer is a plain linear
# projection -- confirm that is intended.
model.add(Dense(units=50))
# output layer with softmax activation
model.add(Dense(units=4, activation='softmax'))




In [20]:
# Multi-class setup: one-hot targets with categorical cross-entropy, tracking
# accuracy, precision and recall during training and evaluation.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)




In [21]:
# Cast the training inputs and targets to float before fitting.
X_train, y_train = X_train.astype(float), y_train.astype(float)

In [22]:
# Show the one-hot training targets after the float cast.
print(y_train)

[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [23]:
# Train for 30 epochs in batches of 5000 rows, holding out 20% of the training
# data for validation; `history` keeps the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=5000,
    validation_split=0.2,
)

Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [24]:
# Cast the test inputs and targets to float, matching the training arrays.
X_test, y_test = X_test.astype(float), y_test.astype(float)

In [25]:
# Evaluate the trained model on the test split. model.evaluate returns the
# loss followed by the compiled metrics, so index 0 is the loss and index 1
# is the accuracy.
test_results = model.evaluate(X_test, y_test,verbose=1)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]*100}%')

Test results - Loss: 0.04343991354107857 - Accuracy: 98.74663352966309%


In [26]:
# Evaluate on the test split again, this time with an explicit batch size,
# and keep the full list of loss and metric values in `results`.
results = model.evaluate(
    X_test,
    y_test,
    batch_size=128,
)



In [27]:
# Print the metric names followed by the values held in `results`, so each
# number can be matched to its metric.
print(model.metrics_names)
print(results)

['loss', 'accuracy', 'precision', 'recall']
[0.043439920991659164, 0.9874663352966309, 0.987662672996521, 0.9871967434883118]


In [28]:
# Per-class precision / recall / F1 on the test split.
# (classification_report is already imported in the notebook's import cell.)
y_predict = model.predict(X_test)

# Convert softmax outputs and one-hot targets to class indices.
y_pred = y_predict.argmax(axis=-1)
# Use a separate name instead of overwriting y_test: the evaluate cells need
# the one-hot form, and overwriting it made this cell fail when run twice.
y_true = y_test.argmax(axis=-1)

report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10635
           1       0.97      0.98      0.98      2737
           2       0.88      0.86      0.87       740
           3       0.99      0.99      0.99     15568

    accuracy                           0.99     29680
   macro avg       0.96      0.96      0.96     29680
weighted avg       0.99      0.99      0.99     29680



In [65]:
# Write the trained model to "LSTM.h5" in the working directory.
# NOTE(review): the recorded output shows a warning raised from saving_api,
# presumably about the HDF5 format -- confirm before changing the file format,
# since anything that loads the model depends on this file name.
model.save("LSTM.h5")

  saving_api.save_model(
