In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Every CICIDS2017 capture lives in the same folder, so the folder prefix is
# written once and each file is loaded through a tiny helper.
CICIDS_DIR = '../datasets/CICIDS2017/'


def load_capture(file_name):
    """Read one CICIDS2017 CSV capture from CICIDS_DIR into a DataFrame."""
    return pd.read_csv(CICIDS_DIR + file_name)


ddos = load_capture('Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
portscan = load_capture('Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
friday = load_capture('Friday-WorkingHours-Morning.pcap_ISCX.csv')
monday = load_capture('Monday-WorkingHours.pcap_ISCX.csv')
infiltration = load_capture('Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
webattacks = load_capture('Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
tuesday = load_capture('Tuesday-WorkingHours.pcap_ISCX.csv')
wednesday = load_capture('Wednesday-workingHours.pcap_ISCX.csv')

# Stack all captures into a single frame; the per-file row indices are
# discarded and replaced by one continuous 0..N-1 index.
capture_frames = [ddos, portscan, friday, monday, infiltration, webattacks, tuesday, wednesday]
df = pd.concat(capture_frames, ignore_index=True)

print(df)

          Destination Port   Flow Duration   Total Fwd Packets  \
0                    54865               3                   2   
1                    55054             109                   1   
2                    55055              52                   1   
3                    46236              34                   1   
4                    54863               3                   2   
...                    ...             ...                 ...   
2830738                 53           32215                   4   
2830739                 53             324                   2   
2830740              58030              82                   2   
2830741                 53         1048635                   6   
2830742                 53           94939                   4   

          Total Backward Packets  Total Length of Fwd Packets  \
0                              0                           12   
1                              1                            6   
2           

### Missing data
No missing values are reported here, but the cell below also converts infinite values to NaN and imputes them with the column mean.

In [3]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Infinite values are converted to NaN *before* the report below.
# isnull() does not flag +/-inf, so reporting first would show zero missing
# values for columns that are nevertheless about to be imputed.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Per-column count of values that will be imputed (original NaN + former inf).
print("Missing datas by columns :\n", df.isnull().sum())

# Only numeric columns are imputed; the (string) label column is left untouched.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# column means. Consider fitting it on the training rows only.
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

Missing datas by columns :
  Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64


### Separating data and labels

In [4]:
# The raw CSV headers carry stray spaces (e.g. " Flow Duration"); trim them so
# that columns can be addressed by their clean names.
df.columns = df.columns.str.strip()

# Target: the raw Label column. Features: every other column.
label_column = 'Label'
Y = df[label_column]
X = df.drop(columns=[label_column])

### Encoding categorical data
Difference between Label encoding and One-Hot encoding: \
                                      - Label encoding assigns an integer to each category (simple, but the model could misinterpret the integers as an ordering or priority between categories)\
                                      - One-Hot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# No OneHotEncoder is needed: every column except Label is numerical.

# Collapse the individual attack types into one binary target:
# 0 = BENIGN traffic, 1 = any kind of attack.
# The target is rebuilt from df['Label'] instead of from Y itself so that
# re-running this cell is safe: applying the mapping to an already binarised Y
# would turn every row into 1, because 0 != 'BENIGN'. The comparison is also
# vectorised rather than a per-row Python lambda.
Y = df['Label'].ne('BENIGN').astype('int64')

print(Y)

0          0
1          0
2          0
3          0
4          0
          ..
2830738    0
2830739    0
2830740    0
2830741    0
2830742    0
Name: Label, Length: 2830743, dtype: int64


### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from one run to the next.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

print(X_train)

         Destination Port  Flow Duration  Total Fwd Packets  \
2092860              53.0        64428.0                2.0   
2140044             443.0       757414.0               12.0   
1590488              53.0       184165.0                4.0   
128477            54342.0      1863306.0                6.0   
1336329              53.0          235.0                2.0   
...                   ...            ...                ...   
2249467              80.0     85562173.0                8.0   
963395              443.0    118654701.0               45.0   
2215104              80.0     87497358.0                8.0   
1484405             443.0      1543512.0                8.0   
305711               80.0    115280042.0               16.0   

         Total Backward Packets  Total Length of Fwd Packets  \
2092860                     2.0                         78.0   
2140044                     8.0                        545.0   
1590488                     2.0                    

### Feature Scaling
Put every column on the same scale (for instance, `Flow Duration` is far larger than `Total Fwd Packets`, so the model could misinterpret this and give more importance to the `Flow Duration` column)

In [7]:
# Columns to standardise: every int64/float64 feature column.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# The scaling statistics are learned from the training rows only ...
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# ... and the same fitted scaler is then applied to both splits.
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
          Destination Port  Flow Duration  Total Fwd Packets  \
2092860         -0.438875      -0.437708          -0.009742   
2140044         -0.417554      -0.417124           0.003378   
1590488         -0.438875      -0.434152          -0.007118   
128477           2.529153      -0.384276          -0.004494   
1336329         -0.438875      -0.439615          -0.009742   
...                   ...            ...                ...   
2249467         -0.437399       2.101863          -0.001870   
963395          -0.417554       3.084823           0.046673   
2215104         -0.437399       2.159344          -0.001870   
1484405         -0.417554      -0.393775          -0.001870   
305711          -0.437399       2.984584           0.008626   

         Total Backward Packets  Total Length of Fwd Packets  \
2092860               -0.008360                    -0.041870   
2140044               -0.002448                    -0.000787   
1590488               -0.008360         

### Prepare data for Deep Learning (convert data into float32)

In [8]:
import os

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, LSTM


def _as_model_input(features):
    """Return `features` as a float32 array with a trailing channel axis.

    A 2-D input (n_samples, n_features) becomes (n_samples, n_features, 1).
    An input that is already 3-D is returned unchanged apart from the dtype
    cast, so re-running this cell does not keep appending axes.
    """
    arr = np.asarray(features, dtype='float32')
    return arr if arr.ndim == 3 else np.expand_dims(arr, axis=2)


# Features: float32, with one channel per feature position.
X_train = _as_model_input(X_train)
X_test = _as_model_input(X_test)

# Labels: flat float32 arrays.
Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('../preprocessed_data/CICIDS', exist_ok=True)

# Persist the prepared arrays so they can be reloaded without redoing the
# preprocessing. The last dump is left as the cell's final expression.
joblib.dump(X_train, '../preprocessed_data/CICIDS/X_train.joblib')
joblib.dump(X_test, '../preprocessed_data/CICIDS/X_test.joblib')
joblib.dump(Y_train, '../preprocessed_data/CICIDS/Y_train.joblib')
joblib.dump(Y_test, '../preprocessed_data/CICIDS/Y_test.joblib')

2025-06-12 14:58:42.120218: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


['../preprocessed_data/CICIDS/Y_test.joblib']

### CNN training

In [9]:
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling start from the same random state every time this cell runs.
tf.keras.utils.set_random_seed(0)

# 1-D CNN binary classifier over the feature vector (one channel per feature).
CNN_model = Sequential([
    Input(shape=(X_train.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2), # keep the maximum of each 2 values (halves the number of values)
    Dropout(0.3), # disable 30% of neurons => reduce overfitting
    Flatten(), # transform the structure from 3D to 2D
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid') # binary output (0=normal or 1=attack)
])

CNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

CNN_model.summary()

# NOTE(review): the test split is also used as validation data, so the
# validation metrics shown during training are not an independent estimate.
# The training history is kept under its own name because the other training
# cells also assign to `history`; the alias keeps existing uses working.
cnn_history = CNN_model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_data=(X_test, Y_test))
history = cnn_history

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 76, 32)            128       
                                                                 
 max_pooling1d (MaxPooling1  (None, 38, 32)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 38, 32)            0         
                                                                 
 flatten (Flatten)           (None, 1216)              0         
                                                                 
 dense (Dense)               (None, 64)                77888     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

2025-06-12 14:58:49.479646: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 618234240 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### LSTM training

In [10]:
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling start from the same random state every time this cell runs.
tf.keras.utils.set_random_seed(0)

# LSTM binary classifier: the feature vector is read as a sequence of
# X_train.shape[1] steps with one value per step.
LSTM_model = Sequential([
    Input(shape=(X_train.shape[1], 1)),
    LSTM(64, return_sequences=False), # only the last hidden state is passed on
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid') # binary output (0=normal or 1=attack)
])

LSTM_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

LSTM_model.summary()

# NOTE(review): the test split is also used as validation data, so the
# validation metrics shown during training are not an independent estimate.
# The training history is kept under its own name because the other training
# cells also assign to `history`; the alias keeps existing uses working.
lstm_history = LSTM_model.fit(X_train, Y_train, epochs=3, batch_size=64, validation_data=(X_test, Y_test))
history = lstm_history


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                16896     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 21121 (82.50 KB)
Trainable params: 21121 (82.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3


2025-06-12 15:12:19.659506: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 618234240 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


### AE-LSTM

In [11]:
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling start from the same random state every time this cell runs.
tf.keras.utils.set_random_seed(0)

input_dim = X_train.shape[1]  # n_features

# encoder: LSTM followed by a 32-unit dense bottleneck
# NOTE(review): this cell contains no decoder and no reconstruction loss; the
# "encoder" is trained end-to-end through the classification loss only.
inputs = Input(shape=(input_dim, 1))
encoded = LSTM(64, return_sequences=False)(inputs)
encoded = Dense(32, activation='relu')(encoded)

# classifier head on top of the encoded representation
x = Dropout(0.3)(encoded)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
output = Dense(1, activation='sigmoid')(x)  # binary output (0=normal or 1=attack)

AE_LSTM_model = Model(inputs, output)

AE_LSTM_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
AE_LSTM_model.summary()

# NOTE(review): the test split is also used as validation data, so the
# validation metrics shown during training are not an independent estimate.
# The training history is kept under its own name because the other training
# cells also assign to `history`; the alias keeps existing uses working.
ae_lstm_history = AE_LSTM_model.fit(X_train, Y_train, epochs=3, batch_size=64, validation_data=(X_test, Y_test))
history = ae_lstm_history


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 78, 1)]           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                16896     
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 64)                2112      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 65    

2025-06-12 15:47:01.536638: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 618234240 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


In [12]:
# Write each trained network to disk in the .keras format.
for trained_model, target_path in (
    (CNN_model, "../models/cnn_CICIDS2017_model.keras"),
    (LSTM_model, "../models/lstm_CICIDS2017_model.keras"),
    (AE_LSTM_model, "../models/ae-lstm_CICIDS2017_model.keras"),
):
    trained_model.save(target_path)

In [15]:
import pickle

In [18]:
# Target paths for the pickled copies of the three trained models.
# NOTE(review): these duplicate the .keras files written by the save cell;
# joblib/pickle files should only ever be loaded from trusted sources.
file_name_cnn, file_name_lstm, file_name_ae_lstm = (
    '../models/cnn_CICIDS2017.pkl',
    '../models/lstm_CICIDS2017.pkl',
    '../models/ae-lstm_CICIDS2017.pkl',
)

# One dump per model; the last call stays the final expression of the cell so
# its return value is still what the cell displays.
joblib.dump(CNN_model, file_name_cnn)
joblib.dump(LSTM_model, file_name_lstm)
joblib.dump(AE_LSTM_model, file_name_ae_lstm)

['../models/ae-lstm_CICIDS2017.pkl']

In [17]:
from tensorflow.keras.models import load_model

# Reload the CNN from the saved .keras file and score it on the test split.
saved_cnn_path = '../models/cnn_CICIDS2017_model.keras'
cnn_loaded = load_model(saved_cnn_path)

# evaluate() returns the loss followed by the compiled metrics.
results = cnn_loaded.evaluate(x=X_test, y=Y_test, batch_size=128)
print("test loss, test acc:", results)


test loss, test acc: [0.04541843757033348, 0.980359673500061]


In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted attack probabilities from the reloaded CNN (one sigmoid output per row).
y_pred_prob = cnn_loaded.predict(X_test)

# Threshold at 0.5 and flatten to a 1-D integer vector. The ground truth is
# cast the same way so both arrays share shape and dtype.
y_pred = (y_pred_prob > 0.5).astype("int32").ravel()
y_true = np.asarray(Y_test).astype("int32").ravel()

# Both classes are listed explicitly so the report and the matrix keep the
# same layout even if one class is absent from the predictions or the truth.
class_labels = [0, 1]

# (precision, recall, F1-score)
print(classification_report(y_true, y_pred, labels=class_labels, target_names=["Normal", "Attacks"]))

# Confusion Matrix (always 2x2 thanks to the explicit labels)
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
print("Confusion matrix :\n", cm)

# Displaying the attack number
# Row-major order of a 2x2 matrix with labels [0, 1] is tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
print(f"Attacks detected (True Positive) : {tp}")
print(f"Normal traffic detected (True Negative) : {tn}")
print(f"Attacks missed (False Negative) : {fn}")
print(f"False alarms (False Positive) : {fp}")


              precision    recall  f1-score   support

      Normal       0.99      0.98      0.99    682324
     Attacks       0.93      0.97      0.95    166899

    accuracy                           0.98    849223
   macro avg       0.96      0.98      0.97    849223
weighted avg       0.98      0.98      0.98    849223

Confusion matrix :
 [[670976  11348]
 [  5331 161568]]
Attacks detected (True Positive) : 161568
Normal traffic detected (True Negative) : 670976
Attacks missed (False Negative) : 5331
False alarms (False Positive) : 11348
