In [2]:
import pandas as pd
import numpy as np

In [12]:
# Read the two cleaned CSV exports (attack capture first, benign capture second).
attacks, benign = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/CICIOT2023/tharindu_cleaned_version/nattack.csv',
        '../datasets/CICIOT2023/tharindu_cleaned_version/nbengin.csv',
    )
)

# Stack both frames row-wise; ignore_index rebuilds a continuous 0..n-1 index.
df = pd.concat(objs=[attacks, benign], axis=0, ignore_index=True)
print(df)

         Header_Length   LLC   TCP   UDP  DHCP   ARP  ICMP  IGMP   IPv  \
0                37.80  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1                35.96  0.99  0.96  0.02   0.0  0.01  0.01   0.0  0.99   
2                36.44  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
3                37.96  1.00  0.99  0.01   0.0  0.00  0.00   0.0  1.00   
4                37.04  1.00  0.95  0.05   0.0  0.00  0.00   0.0  1.00   
...                ...   ...   ...   ...   ...   ...   ...   ...   ...   
1054080          29.60  1.00  0.90  0.10   0.0  0.00  0.00   0.0  1.00   
1054081          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1054082          29.60  1.00  0.90  0.10   0.0  0.00  0.00   0.0  1.00   
1054083          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1054084          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   

         Tot sum  ...  fin_flag_number  syn_flag_number  rst_flag_number  \
0           8629  ...             0

### Missing data
This step is not strictly needed here because the dataset has no missing values.

In [13]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("Missing datas by columns :\n", df.isnull().sum())

# isnull() does not count +/-inf, so infinities are turned into NaN first and
# then filled by the imputer below. A new frame is bound instead of mutating
# in place, which keeps the cell safe to re-run.
df = df.replace([np.inf, -np.inf], np.nan)

# Impute feature columns only. The target column must never be mean-imputed:
# a "mean label" is not a valid class, and the transform would also silently
# turn an integer label into floats.
numeric_cols = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if str(col).strip() != 'Label'
]

# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split that happens later in the notebook. With no missing values
# this changes nothing, but if NaN/inf ever appear, consider fitting on the
# training rows only.
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

Missing datas by columns :
 Header_Length      0
LLC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
Tot sum            0
SSH                0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
IRC                0
SMTP               0
Protocol Type      0
ece_flag_number    0
Time_To_Live       0
Rate               0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
cwr_flag_number    0
Label              0
cluster            0
pca1               0
pca2               0
dtype: int64


### Separating data and labels

In [14]:
# Trim stray whitespace from the header names so the 'Label' lookup is exact.
df.columns = [col_name.strip() for col_name in df.columns]

# Target vector: the raw 'Label' column.
Y = df['Label']

# Feature matrix: every column except the target.
# NOTE(review): 'cluster', 'pca1' and 'pca2' stay in X. They look like outputs
# of an earlier clustering/PCA step rather than raw traffic features -- confirm
# they do not leak label information before trusting the scores below.
X = df.loc[:, df.columns != 'Label']

### Encoding categorical data
Difference between Label encoding and OneHot encoding: \
                                      - Label encoding assigns one integer to each category (simple, but the model may read an order or priority into the numbers that does not exist)\
                                      - OneHot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# no need of OneHotEncoder because all columns except Label are numerical columns

# Collapse the target to a binary vector: 0 = benign traffic, 1 = any attack.
#
# The mapping depends on the dtype of Y. Comparing a number with the string
# 'BenignTraffic' is never true, so a purely string-based mapping would turn
# every row (benign included) into 1 when the labels are numeric. It would do
# the same if this cell were run a second time on already-binarised labels.
if pd.api.types.is_numeric_dtype(Y):
    # assumes 0 encodes benign traffic in a numeric Label column -- TODO confirm
    # against the cleaned CSV files
    Y = (Y != 0).astype('int64')
else:
    # Text labels: only the exact class name 'BenignTraffic' counts as benign.
    Y = (Y.astype(str).str.strip() != 'BenignTraffic').astype('int64')

print(Y)

0          1
1          1
2          1
3          1
4          1
          ..
1054080    1
1054081    1
1054082    1
1054083    1
1054084    1
Name: Label, Length: 1054085, dtype: int64


### Splitting into training set and test set
We now split the dataset into a training set and a test set.
The training set holds 70% of the rows and is used to fit the models; the test set holds the remaining 30% (`test_size=0.3`) and is kept for evaluation.

In [15]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same partition is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
print(X_train)

         Header_Length   LLC   TCP   UDP  DHCP   ARP  ICMP  IGMP   IPv  \
1008282          32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
942992           29.60  1.00  0.90  0.10   0.0  0.00   0.0   0.0  1.00   
110254           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
883848           32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
506819           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
...                ...   ...   ...   ...   ...   ...   ...   ...   ...   
359783           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
152315           19.76  1.00  0.98  0.02   0.0  0.00   0.0   0.0  1.00   
963395           32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
117952           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
305711           20.16  0.99  0.98  0.01   0.0  0.01   0.0   0.0  0.99   

         Tot sum  ...          Rate  fin_flag_number  syn_flag_number  \
1008282  33001.0  ...    965.717443   

### Feature Scaling
To put every column on the same scale (for instance, `Tot sum` is far larger than `TCP`, so without scaling the model could give more importance to `Tot sum` simply because of its magnitude)

In [16]:
scaler = StandardScaler()

# train_test_split hands back row selections of X. Take explicit copies before
# assigning into them so the column writes below cannot raise
# SettingWithCopyWarning (depending on the pandas version) or touch X itself.
X_train = X_train.copy()
X_test = X_test.copy()

numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Learn mean/std from the training rows only, then apply the same statistics
# to the test rows so no test information enters the scaler.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
          Header_Length       LLC       TCP       UDP      DHCP       ARP  \
1008282       1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
942992        0.779059  0.249023 -0.646594  0.892451 -0.065496 -0.249023   
110254       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
883848        1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
506819       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
...                ...       ...       ...       ...       ...       ...   
359783       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
152315       -0.875694  0.249023  0.290928 -0.200518 -0.065496 -0.249023   
963395        1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
117952       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
305711       -0.808427 -0.074273  0.290928 -0.337139 -0.065496  0.074273   

             ICMP      IGMP       IPv   Tot sum  ...      Rate  \
1008282 -0

### Prepare data for Deep Learning (convert data to float32)

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM


# Cast features and labels to float32 NumPy arrays for the Keras models.
X_train = np.asarray(X_train, dtype='float32')
X_test = np.asarray(X_test, dtype='float32')

Y_train = np.asarray(Y_train, dtype='float32')
Y_test = np.asarray(Y_test, dtype='float32')

# The models below declare Input(shape=(n_features, 1)), so a trailing axis of
# size 1 is appended: (samples, features) -> (samples, features, 1).
# The ndim guard keeps this cell idempotent: running it a second time would
# otherwise add another axis and break the declared input shape.
if X_train.ndim == 2:
    X_train = np.expand_dims(X_train, axis=2)
if X_test.ndim == 2:
    X_test = np.expand_dims(X_test, axis=2)



### CNN training

In [18]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs (some ops may still be
# non-deterministic on certain hardware).
tf.keras.utils.set_random_seed(0)

# 1D CNN over the feature vector, treated as a length-n_features sequence with
# a single channel.
CNN_model = Sequential([
    Input(shape=(X_train.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),  # 32 filters, window of 3 features
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single probability: attack vs. benign
])

CNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

CNN_model.summary()

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are computed on the same rows used for the final
# evaluation. Consider validation_split on the training data instead.
cnn_history = CNN_model.fit(X_train, Y_train, epochs=10, batch_size=64, validation_data=(X_test, Y_test))

# `history` is kept for backward compatibility; the dedicated name above
# survives even if another training cell reuses `history`.
history = cnn_history

Epoch 1/10


2025-06-10 15:46:05.601553: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 97397388 exceeds 10% of free system memory.


[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - accuracy: 0.9989 - loss: 0.0058 - val_accuracy: 1.0000 - val_loss: 4.0393e-05
Epoch 2/10
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 1.0000 - loss: 1.0606e-04 - val_accuracy: 1.0000 - val_loss: 0.0459
Epoch 3/10
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 1.0000 - loss: 6.9762e-04 - val_accuracy: 1.0000 - val_loss: 2.4659e-05
Epoch 4/10
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 1.0000 - loss: 2.9297e-04 - val_accuracy: 1.0000 - val_loss: 3.2831e-06
Epoch 5/10
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 1.0000 - loss: 4.7883e-05 - val_accuracy: 1.0000 - val_loss: 0.1795
Epoch 6/10
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 1.0000 - loss: 1.9798e-04 - val_accuracy: 1.0

### LSTM training

In [19]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs (some ops may still be
# non-deterministic on certain hardware).
tf.keras.utils.set_random_seed(0)

# LSTM that reads the feature vector as a length-n_features sequence with one
# value per step and keeps only the final hidden state.
LSTM_model = Sequential([
    Input(shape=(X_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single probability: attack vs. benign
])

LSTM_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

LSTM_model.summary()


# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are computed on the same rows used for the final
# evaluation. Consider validation_split on the training data instead.
lstm_history = LSTM_model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_data=(X_test, Y_test))

# `history` is kept for backward compatibility; the dedicated name above makes
# it clear which model the training curves belong to.
history = lstm_history


Epoch 1/5


2025-06-10 15:49:43.296607: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 97397388 exceeds 10% of free system memory.


[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 8ms/step - accuracy: 0.9967 - loss: 0.0143 - val_accuracy: 1.0000 - val_loss: 6.8629e-06
Epoch 2/5
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 8ms/step - accuracy: 1.0000 - loss: 8.5024e-05 - val_accuracy: 1.0000 - val_loss: 2.8048e-06
Epoch 3/5
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 8ms/step - accuracy: 1.0000 - loss: 2.8333e-05 - val_accuracy: 1.0000 - val_loss: 3.7028e-06
Epoch 4/5
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 8ms/step - accuracy: 1.0000 - loss: 1.7180e-04 - val_accuracy: 1.0000 - val_loss: 1.1994e-05
Epoch 5/5
[1m11530/11530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 8ms/step - accuracy: 1.0000 - loss: 2.6804e-05 - val_accuracy: 1.0000 - val_loss: 1.8913e-04


Let's save our models

In [21]:
# Persist the trained CNN in the native Keras format (architecture + weights).
CNN_model.save(filepath="cnn_ciciot_model.keras")

In [22]:
# Persist the trained LSTM in the native Keras format (architecture + weights).
LSTM_model.save(filepath="lstm_ciciot_model.keras")

In [23]:
# Sigmoid outputs of the LSTM for every test row (one score per sample).
prediction = LSTM_model.predict(x=X_test)

[1m9883/9883[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step


In [24]:
from tensorflow.keras.models import load_model

# Reload the CNN from disk to check that the saved artefact is usable as-is.
cnn_loaded = load_model(filepath='cnn_ciciot_model.keras')

# evaluate() returns [loss, accuracy], matching the metrics passed to compile().
results = cnn_loaded.evaluate(x=X_test, y=Y_test, batch_size=128)
print("test loss, test acc:", results)


[1m2471/2471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 820us/step - accuracy: 1.0000 - loss: 1.6210e-05
test loss, test acc: [7.31215623090975e-05, 0.9999936819076538]


In [25]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid scores from the reloaded CNN, one per test row.
y_pred_prob = cnn_loaded.predict(X_test)


# Threshold at 0.5: scores above it are predicted as attacks (1), the rest as normal (0).
y_pred = (y_pred_prob > 0.5).astype("int32")

# Pin the class order explicitly. Without `labels`, sklearn infers the classes
# from the data; if one class is absent from both Y_test and y_pred, the report
# rejects the two target_names and the confusion matrix collapses to 1x1, so
# the 4-way unpacking below raises.
class_ids = [0, 1]

# (precision, recall, F1-score)
print(classification_report(Y_test, y_pred, labels=class_ids, target_names=["Normal", "Attacks"]))

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(Y_test, y_pred, labels=class_ids)
print("Confusion matrix :\n", cm)

# Displaying the attack number
# With labels=[0, 1] the flattened 2x2 matrix is always (tn, fp, fn, tp).
tn, fp, fn, tp = cm.ravel()
print(f"Attacks detected (True Positive) : {tp}")
print(f"Attacks missed (False Negative) : {fn}")
print(f"False alarms (False Positive) : {fp}")


[1m9883/9883[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 436us/step
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00    155152
     Attacks       1.00      1.00      1.00    161074

    accuracy                           1.00    316226
   macro avg       1.00      1.00      1.00    316226
weighted avg       1.00      1.00      1.00    316226

Confusion matrix :
 [[155152      0]
 [     2 161072]]
Attacks detected (True Positive) : 161072
Attacks missed (False Negative) : 2
False alarms (False Positive) : 0
