In [15]:
import pandas as pd
import numpy as np
import joblib

In [16]:
# Read the three InSDN capture files (normal traffic, OVS attacks,
# Metasploitable-2 attacks) and stack them row-wise into one frame with a
# fresh 0..n-1 index.
csv_paths = (
    '../datasets/InSDN_DatasetCSV/Normal_data.csv',
    '../datasets/InSDN_DatasetCSV/OVS.csv',
    '../datasets/InSDN_DatasetCSV/metasploitable-2.csv',
)
normal, ovs, metasploitable = (pd.read_csv(csv_path) for csv_path in csv_paths)

df = pd.concat((normal, ovs, metasploitable), ignore_index=True)

print(df)

                                         Flow ID          Src IP  Src Port  \
0       185.127.17.56-192.168.20.133-443-53648-6   185.127.17.56       443   
1       185.127.17.56-192.168.20.133-443-53650-6  192.168.20.133     53650   
2         192.168.20.133-192.168.20.2-35108-53-6  192.168.20.133     35108   
3         192.168.20.133-192.168.20.2-35108-53-6    192.168.20.2        53   
4       154.59.122.74-192.168.20.133-443-60900-6  192.168.20.133     60900   
...                                          ...             ...       ...   
343884  192.168.3.130-200.175.2.130-41966-4444-6   192.168.3.130     41966   
343885  192.168.3.130-200.175.2.130-41967-4444-6   192.168.3.130     41967   
343886   192.168.3.130-200.175.2.130-139-44791-6   200.175.2.130     44791   
343887  192.168.3.130-200.175.2.130-41966-4444-6   192.168.3.130     41966   
343888  192.168.3.130-200.175.2.130-41967-4444-6   192.168.3.130     41967   

                Dst IP  Dst Port  Protocol        Timestamp  Fl

### Missing data

Not strictly needed here because the dataset contains no missing data; the imputation below is kept as a safeguard.

In [17]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Count missing values per column. The printed Series is truncated by pandas
# for wide frames, so the grand total is printed as well to make the
# "no missing data" check unambiguous.
missing_per_column = df.isnull().sum()
print("Missing datas by columns :\n", missing_per_column)
print("Total missing values :", int(missing_per_column.sum()))

# Fill gaps in these flow statistics with the column median.
num_cols_median = ['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']
df[num_cols_median] = df[num_cols_median].fillna(df[num_cols_median].median())

# Fill gaps in these count-like columns with 0.
cols_fill0 = ["SYN Flag Cnt", "Tot Fwd Pkts", "Fwd Act Data Pkts"]
df[cols_fill0] = df[cols_fill0].fillna(0)

# Drop the flow identifier, IP address and timestamp columns.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Timestamp'], errors='ignore')

Missing datas by columns :
 Flow ID      0
Src IP       0
Src Port     0
Dst IP       0
Dst Port     0
            ..
Idle Mean    0
Idle Std     0
Idle Max     0
Idle Min     0
Label        0
Length: 84, dtype: int64


### Separating data and labels

In [18]:
# Target: the raw `Label` column.
Y = df['Label']
# Features: every remaining column.
X = df.drop(columns=['Label'])

### Encoding categorical data
Difference between Label encoding and OneHot encoding: \
                                      - Label encoding assigns one integer to each category (simple, but the model could misread the integers as an order or priority between categories)\
                                      - OneHot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories to encode)

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# One-hot encode `Protocol`: one dense indicator column per protocol value.
# handle_unknown='ignore' makes values unseen at fit time encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_protocol = encoder.fit_transform(X[['Protocol']])
protocol_cols = encoder.get_feature_names_out(['Protocol'])

# Wrap the encoded array in a frame aligned on X's index, then swap it in
# for the original `Protocol` column.
encoded_protocol_df = pd.DataFrame(
    data=encoded_protocol,
    index=X.index,
    columns=protocol_cols,
)
X = pd.concat(
    [X.drop(columns=['Protocol']), encoded_protocol_df],
    axis=1,
)

print(X)

# Collapse the labels to a binary target: 0 for 'Normal', 1 for every other
# label (all attack types are merged into one class).
Y = (Y != 'Normal').astype('int64')

print(Y)

        Src Port  Dst Port  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0            443     53648         245230            44            40   
1          53650       443        1605449           107           149   
2          35108        53          53078             5             5   
3             53     35108           6975             1             1   
4          60900       443         190141            13            16   
...          ...       ...            ...           ...           ...   
343884     41966      4444         273133             2             3   
343885     41967      4444         267969             6             7   
343886     44791       139        1552555             6             5   
343887     41966      4444         272141             2             3   
343888     41967      4444         270361             6             7   

        TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0                124937             1071     

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [20]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
print(X_train)

        Src Port  Dst Port  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
325042        80     44370          31423             0             2   
298378     38739        80           3838             4             4   
268971         0         0             24             0             2   
253598         0         0             38             0             2   
253091         0         0             13             0             2   
...          ...       ...            ...           ...           ...   
122579     43468        80          59812             2             6   
304137        80     33495       62379457             3             7   
152315     41830        80          34759             2             6   
117952     37776        80          15896             2             6   
305711        80     41927       63291589             3             7   

        TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
325042                0                0     

### Feature Scaling
Put every column on the same scale (for instance, `Flow Duration` values are far larger than `Tot Fwd Pkts` values, so the model could misinterpret the difference and give more importance to the `Flow Duration` column).

In [21]:
# Standardise the int64/float64 columns. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()

numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

train_scaled = scaler.fit_transform(X_train[numeric_cols])
test_scaled = scaler.transform(X_test[numeric_cols])

X_train[numeric_cols] = train_scaled
X_test[numeric_cols] = test_scaled

print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
         Src Port  Dst Port  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
325042 -0.924461  2.234924      -0.306988     -0.003916     -0.036937   
298378  0.709191 -0.490033      -0.308252     -0.001762     -0.019123   
268971 -0.927842 -0.494955      -0.308426     -0.003916     -0.036937   
253598 -0.927842 -0.494955      -0.308426     -0.003916     -0.036937   
253091 -0.927842 -0.494955      -0.308427     -0.003916     -0.036937   
...          ...       ...            ...           ...           ...   
122579  0.909030 -0.490033      -0.305688     -0.002839     -0.001309   
304137 -0.924461  1.565836       2.548979     -0.002300      0.007599   
152315  0.839811 -0.490033      -0.306835     -0.002839     -0.001309   
117952  0.668497 -0.490033      -0.307699     -0.002839     -0.001309   
305711 -0.924461  2.084618       2.590761     -0.002300      0.007599   

        TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
325042        -0.009755        -0.

### Prepare data for Deep Learning (convert data to float32)

In [22]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM


# Features: cast to float32 and append a trailing axis of size 1, giving the
# 3-D (samples, features, 1) arrays that the models below take as input.
X_train = np.expand_dims(X_train.astype('float32'), axis=-1)
X_test = np.expand_dims(X_test.astype('float32'), axis=-1)

# Labels: plain float32 NumPy vectors.
Y_train = np.asarray(Y_train, dtype='float32')
Y_test = np.asarray(Y_test, dtype='float32')

# Persist the prepared arrays so they can be reloaded without redoing the
# preprocessing.
joblib.dump(X_train, '../preprocessed_data/InSDN/X_train.joblib')
joblib.dump(X_test, '../preprocessed_data/InSDN/X_test.joblib')
joblib.dump(Y_train, '../preprocessed_data/InSDN/Y_train.joblib')
joblib.dump(Y_test, '../preprocessed_data/InSDN/Y_test.joblib')

['../preprocessed_data/InSDN/Y_test.joblib']

### CNN training

In [23]:
# 1-D CNN binary classifier. Each sample is a (n_features, 1) sequence.
CNN_model = Sequential()
CNN_model.add(Input(shape=(X_train.shape[1], 1)))
CNN_model.add(Conv1D(32, kernel_size=3, activation='relu'))
CNN_model.add(MaxPooling1D(pool_size=2))  # keep the max of every 2 values (halves the sequence length)
CNN_model.add(Dropout(0.3))  # drop 30% of the activations during training to reduce overfitting
CNN_model.add(Flatten())  # collapse (steps, filters) into one vector per sample
CNN_model.add(Dense(64, activation='relu'))
CNN_model.add(Dropout(0.3))
CNN_model.add(Dense(1, activation='sigmoid'))  # single probability: 0 = normal, 1 = attack

CNN_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

CNN_model.summary()

# The test split is passed as validation data, so the per-epoch validation
# metrics are computed on the same rows used for the final evaluation.
history = CNN_model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
    batch_size=64,
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 79, 32)            128       
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 39, 32)            0         
 g1D)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 39, 32)            0         
                                                                 
 flatten_1 (Flatten)         (None, 1248)              0         
                                                                 
 dense_7 (Dense)             (None, 64)                79936     
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                      

### LSTM training

In [24]:
# LSTM binary classifier. The feature axis is treated as the time axis:
# n_features steps with one value per step.
LSTM_model = Sequential()
LSTM_model.add(Input(shape=(X_train.shape[1], 1)))
LSTM_model.add(LSTM(64, return_sequences=False))  # only the last hidden state is kept
LSTM_model.add(Dropout(0.3))
LSTM_model.add(Dense(64, activation='relu'))
LSTM_model.add(Dropout(0.3))
LSTM_model.add(Dense(1, activation='sigmoid'))  # single probability: 0 = normal, 1 = attack

LSTM_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

LSTM_model.summary()

# Note: `history` is reassigned here, replacing the value returned by any
# earlier fit call.
history = LSTM_model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
    batch_size=64,
)


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 64)                16896     
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 21121 (82.50 KB)
Trainable params: 21121 (82.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/

### AE-LSTM

In [25]:
# "AE-LSTM" classifier built with the functional API: an LSTM + Dense(32)
# encoder followed by a dense classification head. There is no decoder or
# reconstruction loss in this cell; the whole network is trained end-to-end
# with binary cross-entropy.
input_dim = X_train.shape[1]  # n_features

# Encoder: compress the (n_features, 1) sequence into a 32-unit representation.
inputs = Input(shape=(input_dim, 1))
encoded = LSTM(64, return_sequences=False)(inputs)
encoded = Dense(32, activation='relu')(encoded)

# Classifier head: the hidden layers are applied in order, then the sigmoid output.
x = encoded
for head_layer in (Dropout(0.3), Dense(64, activation='relu'), Dropout(0.3)):
    x = head_layer(x)
output = Dense(1, activation='sigmoid')(x)

AE_LSTM_model = Model(inputs, output)

AE_LSTM_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
AE_LSTM_model.summary()

# Note: `history` is reassigned here, replacing the value returned by any
# earlier fit call.
history = AE_LSTM_model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
    batch_size=64,
)


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 81, 1)]           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                16896     
                                                                 
 dense_11 (Dense)            (None, 32)                2080      
                                                                 
 dropout_10 (Dropout)        (None, 32)                0         
                                                                 
 dense_12 (Dense)            (None, 64)                2112      
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_13 (Dense)            (None, 1)                 65  

Let's save our models

In [26]:
# Save each trained model to its own .keras file.
for trained_model, target_path in (
    (CNN_model, "../models/cnn_insdn_model.keras"),
    (LSTM_model, "../models/lstm_insdn_model.keras"),
    (AE_LSTM_model, "../models/ae-lstm_insdn_model.keras"),
):
    trained_model.save(target_path)

In [None]:
import pickle

In [None]:
# Additionally dump the three models as joblib pickles (.pkl).
# NOTE(review): the same models are also written as .keras files elsewhere in
# this notebook; confirm the pickled copies are still needed. Only load .pkl
# files from trusted sources: unpickling can execute arbitrary code.
# NOTE(review): the recorded output of this cell names
# '../models/ae-lstm_CICIDS2017.pkl', which does not match the paths below.
# The cell was presumably edited without being re-run; re-run it to confirm.
file_name_cnn = '../models/cnn_InSDN.pkl'
file_name_lstm = '../models/lstm_InSDN.pkl'
file_name_ae_lstm = '../models/ae-lstm_InSDN.pkl'

joblib.dump(CNN_model, file_name_cnn)
joblib.dump(LSTM_model, file_name_lstm)
joblib.dump(AE_LSTM_model, file_name_ae_lstm)

['../models/ae-lstm_CICIDS2017.pkl']

In [27]:
from tensorflow.keras.models import load_model

# Reload the CNN from disk and score it on the test split. With the model
# compiled with metrics=['accuracy'], evaluate returns [loss, accuracy].
cnn_loaded = load_model('../models/cnn_insdn_model.keras')
results = cnn_loaded.evaluate(
    x=X_test,
    y=Y_test,
    batch_size=128,
)
print("test loss, test acc:", results)


test loss, test acc: [0.0031358785927295685, 0.9995056390762329]


In [28]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted attack probabilities from the reloaded CNN (one sigmoid output per sample).
y_pred_prob = cnn_loaded.predict(X_test)

# Threshold at 0.5: probabilities above it are classified as attack (1).
y_pred = (y_pred_prob > 0.5).astype("int32")

# Pin the class order explicitly (0 = Normal, 1 = Attacks). Without `labels`,
# sklearn infers the classes from the data, so if only one class appeared in
# Y_test and y_pred the confusion matrix would be 1x1 and the 4-way unpacking
# below would raise ValueError.
class_labels = [0, 1]

# Precision, recall and F1-score per class.
print(classification_report(Y_test, y_pred, labels=class_labels, target_names=["Normal", "Attacks"]))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(Y_test, y_pred, labels=class_labels)
print("Confusion matrix :\n", cm)

# For a 2x2 matrix, ravel() yields the counts in the order tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
print(f"Attacks detected (True Positive) : {tp}")
print(f"Normal traffic detected (True Negative) : {tn}")
print(f"Attacks missed (False Negative) : {fn}")
print(f"False alarms (False Positive) : {fp}")

              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     20566
     Attacks       1.00      1.00      1.00     82601

    accuracy                           1.00    103167
   macro avg       1.00      1.00      1.00    103167
weighted avg       1.00      1.00      1.00    103167

Confusion matrix :
 [[20528    38]
 [   13 82588]]
Attacks detected (True Positive) : 82588
Normal traffic detected (True Negative) : 20528
Attacks missed (False Negative) : 13
False alarms (False Positive) : 38
