In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

2025-06-16 11:26:44.204801: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the attack and benign captures, then stack them (attack rows first)
# into one frame with a fresh 0..n-1 index.
attacks, benign = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../datasets/CICIOT2023/tharindu_cleaned_version/nattack.csv',
        '../../datasets/CICIOT2023/tharindu_cleaned_version/nbengin.csv',
    )
)

df = pd.concat((attacks, benign), ignore_index=True)
print(df)

         Header_Length   LLC   TCP   UDP  DHCP   ARP  ICMP  IGMP   IPv  \
0                37.80  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1                35.96  0.99  0.96  0.02   0.0  0.01  0.01   0.0  0.99   
2                36.44  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
3                37.96  1.00  0.99  0.01   0.0  0.00  0.00   0.0  1.00   
4                37.04  1.00  0.95  0.05   0.0  0.00  0.00   0.0  1.00   
...                ...   ...   ...   ...   ...   ...   ...   ...   ...   
1054080          29.60  1.00  0.90  0.10   0.0  0.00  0.00   0.0  1.00   
1054081          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1054082          29.60  1.00  0.90  0.10   0.0  0.00  0.00   0.0  1.00   
1054083          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   
1054084          32.00  1.00  1.00  0.00   0.0  0.00  0.00   0.0  1.00   

         Tot sum  ...  fin_flag_number  syn_flag_number  rst_flag_number  \
0           8629  ...             0

### Missing data
This step has little effect here because the dataset contains no missing values.

In [3]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Treat +/-inf as missing *before* reporting, so the per-column count below
# covers every value the imputer is going to fill.
df = df.replace([np.inf, -np.inf], np.nan)

print("Missing datas by columns :\n", df.isnull().sum())

# Impute numeric *feature* columns only. The target column is excluded: a
# mean-imputed class label is meaningless, and passing it through the imputer
# would also cast it to float. Column names are compared after stripping
# because surrounding whitespace is only cleaned in a later cell.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
feature_cols = [col for col in numeric_cols if str(col).strip() != 'Label']

# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split, so test rows contribute to the fill value. If missing or
# infinite values are actually present, fit it on the training rows only.
imputer = SimpleImputer(strategy='mean')
df[feature_cols] = imputer.fit_transform(df[feature_cols])


# Drop useless columns: a column holding a single distinct value carries no
# information for the model.
distinct_counts = df.nunique()
useless_column = distinct_counts[distinct_counts == 1].index.tolist()

print(useless_column)

df = df.drop(columns=useless_column)

Missing datas by columns :
 Header_Length      0
LLC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
Tot sum            0
SSH                0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
IRC                0
SMTP               0
Protocol Type      0
ece_flag_number    0
Time_To_Live       0
Rate               0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
cwr_flag_number    0
Label              0
cluster            0
pca1               0
pca2               0
dtype: int64
['Protocol Type']


### Separating features and labels

In [4]:
# Normalise the header first: surrounding whitespace in a column name would
# make the 'Label' lookups below fail.
df.columns = df.columns.str.strip()

# Target vector: the 'Label' column on its own.
Y = df['Label']
# Feature matrix: every remaining column.
X = df.drop(columns=['Label'])

### Encoding categorical data
Difference between Label encoding and OneHot encoding:
- Label encoding assigns an integer to each category (simple, but the model may misread the integers as an ordering or priority between categories).
- OneHot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories to encode).

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# OneHotEncoder is not needed here: all columns are already numeric, so there
# is no categorical column to encode. Print the label series as a sanity check.

print(Y)

0          1.0
1          1.0
2          1.0
3          1.0
4          1.0
          ... 
1054080    0.0
1054081    0.0
1054082    0.0
1054083    0.0
1054084    0.0
Name: Label, Length: 1054085, dtype: float64


### Splitting into training set and test set
Split dataset into training and testing sets (70/30)


In [6]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffle (and
# therefore the split) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
print(X_train)

         Header_Length   LLC   TCP   UDP  DHCP   ARP  ICMP  IGMP   IPv  \
1008282          32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
942992           29.60  1.00  0.90  0.10   0.0  0.00   0.0   0.0  1.00   
110254           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
883848           32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
506819           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
...                ...   ...   ...   ...   ...   ...   ...   ...   ...   
359783           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
152315           19.76  1.00  0.98  0.02   0.0  0.00   0.0   0.0  1.00   
963395           32.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
117952           20.00  1.00  1.00  0.00   0.0  0.00   0.0   0.0  1.00   
305711           20.16  0.99  0.98  0.01   0.0  0.01   0.0   0.0  0.99   

         Tot sum  ...          Rate  fin_flag_number  syn_flag_number  \
1008282  33001.0  ...    965.717443   

### Feature Scaling
Put every column on the same scale. For instance, `Tot sum` takes values in the thousands while `TCP` is a fraction of at most 1, so without scaling the model could misinterpret this and give more importance to `Tot sum` simply because its values are larger.

In [7]:
# Standardise each numeric feature (StandardScaler: zero mean, unit variance).
scaler = StandardScaler()

# train_test_split returns row subsets of X. Take explicit copies so the column
# assignments below write to independent frames instead of triggering pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Select every numeric dtype, not only int64/float64, so that columns stored
# with another width (e.g. int32/float32) are not silently left unscaled.
numeric_cols = X_train.select_dtypes(include='number').columns

# Fit on the training rows only, then apply the same statistics to the test
# rows so no information from the test set leaks into the scaling.
# NOTE(review): the fitted scaler is not saved in the cells visible here; it
# will be needed to transform new data at inference time - confirm.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
          Header_Length       LLC       TCP       UDP      DHCP       ARP  \
1008282       1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
942992        0.779059  0.249023 -0.646594  0.892451 -0.065496 -0.249023   
110254       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
883848        1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
506819       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
...                ...       ...       ...       ...       ...       ...   
359783       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
152315       -0.875694  0.249023  0.290928 -0.200518 -0.065496 -0.249023   
963395        1.182657  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
117952       -0.835334  0.249023  0.525308 -0.473760 -0.065496 -0.249023   
305711       -0.808427 -0.074273  0.290928 -0.337139 -0.065496  0.074273   

             ICMP      IGMP       IPv   Tot sum  ...      Rate  \
1008282 -0

### Prepare data for Deep Learning (convert the data to float32)

In [8]:
# Cast features and labels to float32 before handing them to the model.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')


Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

# Append a trailing axis of length 1: (rows, features) -> (rows, features, 1).
# NOTE(review): presumably the layout expected by the downstream model -
# confirm against the training notebook.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# joblib.dump does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('../../preprocessed_data/CICIOT', exist_ok=True)

joblib.dump(X_train, '../../preprocessed_data/CICIOT/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIOT/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIOT/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIOT/Y_test.joblib')


['../../preprocessed_data/CICIOT/Y_test.joblib']