In [119]:
# importing required libraries
import numpy as np
import pandas as pd 
import pickle # saving and loading trained model
from os import path

# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# importing library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from keras.utils import to_categorical

from keras.layers import Dense, LSTM, MaxPool1D, Flatten, Dropout # importing dense layer
from keras.models import Sequential #importing Sequential layer
from keras.layers import Input
from keras.models import Model

In [120]:
feature=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot",
          "num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells",
          "num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
          "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count", 
          "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
          "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"]

In [121]:
def create_datas(data):
    train_data_90_percent = data.sample(frac=0.9)
    service_data = data.drop(train_data_90_percent.index)
    service_data.to_csv("./nsl-kdd/data_for_service.csv")  # that data needs for checking our analyzing model 
    return train_data_90_percent

data = pd.read_csv("./nsl-kdd/data.csv")
data = create_datas(data)
print(data)


        Unnamed: 0  duration protocol_type  service flag  src_bytes  \
136337      136337         4           tcp    pop_3   SF         32   
75917        75917         0           tcp  private   S0          0   
31492        31492         0           tcp  private   SH          0   
73810        73810         0           tcp     http   SF        235   
31324        31324         0           tcp     http   SF        284   
...            ...       ...           ...      ...  ...        ...   
124420      124420         0          icmp    eco_i   SF          8   
113800      113800         0           tcp     http   SF        344   
134103      134103         0           udp  private   SF         51   
37025        37025         0           tcp      mtp   S0          0   
104051      104051         1           tcp     http   SF      54540   

        dst_bytes  land  wrong_fragment  urgent  ...  dst_host_same_srv_rate  \
136337         93     0               0       0  ...               

In [122]:
data.drop(['difficulty'],axis=1,inplace=True)


In [123]:
# Redistribute across common attack class
def change_label(df):
    df.label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)
    df.label.replace(['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],'R2L',inplace=True)      
    df.label.replace(['ipsweep','mscan','nmap','portsweep','saint','satan'],'Probe',inplace=True)
    df.label.replace(['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],'U2R',inplace=True)
    df = df[df['label'] != 'U2R']
    return df 

data = change_label(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)


In [124]:
data

Unnamed: 0.1,Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
136337,136337,4,tcp,pop_3,SF,32,93,0,0,0,...,42,0.98,0.05,0.02,0.05,0.00,0.00,0.00,0.00,R2L
75917,75917,0,tcp,private,S0,0,0,0,0,0,...,6,0.02,0.06,0.00,0.00,1.00,1.00,0.00,0.00,Dos
31492,31492,0,tcp,private,SH,0,0,0,0,0,...,1,0.01,0.94,0.95,0.00,0.95,1.00,0.00,0.00,Probe
73810,73810,0,tcp,http,SF,235,9132,0,0,0,...,255,1.00,0.00,0.01,0.03,0.00,0.00,0.00,0.00,normal
31324,31324,0,tcp,http,SF,284,6390,0,0,0,...,130,1.00,0.00,0.03,0.05,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124420,124420,0,icmp,eco_i,SF,8,0,0,0,0,...,40,1.00,0.00,1.00,0.50,0.00,0.00,0.00,0.00,Probe
113800,113800,0,tcp,http,SF,344,7160,0,0,0,...,255,1.00,0.00,0.01,0.01,0.00,0.00,0.00,0.00,normal
134103,134103,0,udp,private,SF,51,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,R2L
37025,37025,0,tcp,mtp,S0,0,0,0,0,0,...,7,0.03,0.07,0.00,0.00,1.00,1.00,0.00,0.00,Dos


In [125]:
label = pd.DataFrame(data.label)

In [126]:
label

Unnamed: 0,label
136337,R2L
75917,Dos
31492,Probe
73810,normal
31324,normal
...,...
124420,Probe
113800,normal
134103,R2L
37025,Dos


In [127]:
# using standard scaler for normalizing
std_scaler = StandardScaler()
def standardization(df,col):
    for i in col:
        arr = df[i]
        arr = np.array(arr)
        df[i] = std_scaler.fit_transform(arr.reshape(len(arr),1))
    return df

numeric_col = data.select_dtypes(include='number').columns
data = standardization(data,numeric_col)

In [128]:
le2 = preprocessing.LabelEncoder()
enc_label = label.apply(le2.fit_transform)
data['intrusion'] = enc_label
print(data.shape)
data

(133563, 44)


Unnamed: 0.1,Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,intrusion
136337,1.450498,-0.111272,tcp,pop_3,SF,-0.007285,-0.004145,-0.014219,-0.085872,-0.00683,...,0.994101,-0.175950,-0.406801,0.180853,-0.598506,-0.585522,-0.422043,-0.407067,R2L,2
75917,0.039314,-0.112893,tcp,private,S0,-0.007291,-0.004170,-0.014219,-0.085872,-0.00683,...,-1.147942,-0.124480,-0.471735,-0.280473,1.733656,1.740225,-0.422043,-0.407067,Dos,0
31492,-0.998288,-0.112893,tcp,private,SH,-0.007291,-0.004170,-0.014219,-0.085872,-0.00683,...,-1.170255,4.404910,2.612620,-0.280473,1.617048,1.740225,-0.422043,-0.407067,Probe,1
73810,-0.009898,-0.112893,tcp,http,SF,-0.007250,-0.001734,-0.014219,-0.085872,-0.00683,...,1.038727,-0.433302,-0.439268,-0.003677,-0.598506,-0.585522,-0.422043,-0.407067,normal,3
31324,-1.002211,-0.112893,tcp,http,SF,-0.007241,-0.002465,-0.014219,-0.085872,-0.00683,...,1.038727,-0.433302,-0.374334,0.180853,-0.598506,-0.585522,-0.422043,-0.407067,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124420,1.172162,-0.112893,icmp,eco_i,SF,-0.007290,-0.004170,-0.014219,-0.085872,-0.00683,...,1.038727,-0.433302,2.774955,4.332787,-0.598506,-0.585522,-0.422043,-0.407067,Probe,1
113800,0.924118,-0.112893,tcp,http,SF,-0.007230,-0.002260,-0.014219,-0.085872,-0.00683,...,1.038727,-0.433302,-0.439268,-0.188207,-0.598506,-0.585522,-0.422043,-0.407067,normal,3
134103,1.398320,-0.112893,udp,private,SF,-0.007282,-0.004170,-0.014219,-0.085872,-0.00683,...,1.038727,-0.433302,-0.471735,-0.280473,-0.598506,-0.585522,-0.422043,-0.407067,R2L,2
37025,-0.869058,-0.112893,tcp,mtp,S0,-0.007291,-0.004170,-0.014219,-0.085872,-0.00683,...,-1.125629,-0.073009,-0.471735,-0.280473,1.733656,1.740225,-0.422043,-0.407067,Dos,0


In [129]:
data[['label', 'intrusion']][data['intrusion'] == 2]

Unnamed: 0,label,intrusion
136337,R2L,2
137666,R2L,2
139522,R2L,2
138912,R2L,2
23677,R2L,2
...,...,...
136979,R2L,2
136625,R2L,2
20904,R2L,2
141663,R2L,2


In [130]:
data.drop(labels= ['label'], axis=1, inplace=True)
print(data.shape)

(133563, 43)


In [131]:
# one-hot-encoding categorical columns
data = pd.get_dummies(data,columns=['protocol_type','service','flag'],prefix="",prefix_sep="")  
print(data.shape)
print(data)

(133563, 123)
        Unnamed: 0  duration  src_bytes  dst_bytes      land  wrong_fragment  \
136337    1.450498 -0.111272  -0.007285  -0.004145 -0.014219       -0.085872   
75917     0.039314 -0.112893  -0.007291  -0.004170 -0.014219       -0.085872   
31492    -0.998288 -0.112893  -0.007291  -0.004170 -0.014219       -0.085872   
73810    -0.009898 -0.112893  -0.007250  -0.001734 -0.014219       -0.085872   
31324    -1.002211 -0.112893  -0.007241  -0.002465 -0.014219       -0.085872   
...            ...       ...        ...        ...       ...             ...   
124420    1.172162 -0.112893  -0.007290  -0.004170 -0.014219       -0.085872   
113800    0.924118 -0.112893  -0.007230  -0.002260 -0.014219       -0.085872   
134103    1.398320 -0.112893  -0.007282  -0.004170 -0.014219       -0.085872   
37025    -0.869058 -0.112893  -0.007291  -0.004170 -0.014219       -0.085872   
104051    0.696418 -0.112488   0.002332  -0.001952 -0.014219       -0.085872   

         urgent       hot

In [132]:
X_data= data.drop(labels=['intrusion'], axis=1)
y_data= data['intrusion']
print(y_data)
print('X_train has shape:',X_data.shape,'\ny_train has shape:',y_data.shape)

136337    2
75917     0
31492     1
73810     3
31324     3
         ..
124420    1
113800    3
134103    2
37025     0
104051    0
Name: intrusion, Length: 133563, dtype: int32
X_train has shape: (133563, 122) 
y_train has shape: (133563,)


In [133]:
y_data

136337    2
75917     0
31492     1
73810     3
31324     3
         ..
124420    1
113800    3
134103    2
37025     0
104051    0
Name: intrusion, Length: 133563, dtype: int32

In [134]:
from sklearn.preprocessing import LabelBinarizer
y_data = LabelBinarizer().fit_transform(y_data)
print(y_data)
X_data=np.array(X_data)
y_data=np.array(y_data)

[[0 0 1 0]
 [1 0 0 0]
 [0 1 0 0]
 ...
 [0 0 1 0]
 [1 0 0 0]
 [1 0 0 0]]


In [135]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y_data, test_size=0.2, random_state=42)
print(X_train.shape,'\n',X_test.shape)

(106850, 122) 
 (26713, 122)


In [136]:
# reshape input to be [samples, time steps, features]
print(len(X_train))
X_train = np.reshape(X_train, ( X_train.shape[0], 1 , X_train.shape[1] ))
X_test = np.reshape(X_test, ( X_test.shape[0], 1,  X_test.shape[1] ))
print(X_train.shape)

106850
(106850, 1, 122)


In [137]:
model = Sequential() # initializing model
model.add(LSTM(64,return_sequences=True,input_shape = (1, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(64,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32,return_sequences=True))
model.add(Flatten())
model.add(Dense(units=50))
# output layer with softmax activation
model.add(Dense(units=4,activation='softmax'))



In [138]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',tf.keras.metrics.Precision(),tf.keras.metrics.Recall()])

In [139]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)

In [140]:
print(y_train)

[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]


In [141]:
history = model.fit(X_train, y_train, epochs=30, batch_size=5000,validation_split=0.2)

Epoch 1/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 62ms/step - accuracy: 0.4538 - loss: 1.3584 - precision_2: 0.0000e+00 - recall_2: 0.0000e+00 - val_accuracy: 0.7919 - val_loss: 1.2337 - val_precision_2: 0.0000e+00 - val_recall_2: 0.0000e+00
Epoch 2/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.7967 - loss: 1.1428 - precision_2: 0.5571 - recall_2: 0.0482 - val_accuracy: 0.8297 - val_loss: 0.7425 - val_precision_2: 0.8723 - val_recall_2: 0.4717
Epoch 3/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8315 - loss: 0.6559 - precision_2: 0.8783 - recall_2: 0.6057 - val_accuracy: 0.8459 - val_loss: 0.4445 - val_precision_2: 0.8940 - val_recall_2: 0.8133
Epoch 4/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.8441 - loss: 0.4276 - precision_2: 0.8999 - recall_2: 0.8183 - val_accuracy: 0.8570 - val_loss: 0.3461 - val_precision_2: 0.9358 - v

In [142]:
X_test = X_test.astype(float)
y_test = y_test.astype(float)

In [143]:
# predicting target attribute on testing dataset
test_results = model.evaluate(X_test, y_test,verbose=1)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]*100}%')

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 989us/step - accuracy: 0.9896 - loss: 0.0346 - precision_2: 0.9902 - recall_2: 0.9894
Test results - Loss: 0.036833249032497406 - Accuracy: 98.90689849853516%


In [144]:
results = model.evaluate(X_test, y_test,batch_size = 128)

[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9896 - loss: 0.0346 - precision_2: 0.9902 - recall_2: 0.9894


In [145]:
print(model.metrics_names)
print(results)

['loss', 'compile_metrics']
[0.036833252757787704, 0.9890689849853516, 0.9896215796470642, 0.98876953125]


In [146]:
from sklearn.metrics import classification_report

y_predict = model.predict(X_test)
y_pred = y_predict.argmax(axis=-1)
y_test = y_test.argmax(axis = -1 )
report = classification_report(y_test, y_pred)
print(report)

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9689
           1       0.97      0.98      0.98      2515
           2       0.92      0.89      0.91       708
           3       0.99      0.99      0.99     13801

    accuracy                           0.99     26713
   macro avg       0.97      0.96      0.97     26713
weighted avg       0.99      0.99      0.99     26713


In [149]:
import keras

keras.saving.save_model(model, "LSTM.keras")