In [66]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

In [67]:
# Read the attack and benign traffic captures (one CSV per class).
attacks = pd.read_csv('../../datasets/CICIOT2023/tharindu_cleaned_version/nattack.csv')
benign = pd.read_csv('../../datasets/CICIOT2023/tharindu_cleaned_version/nbengin.csv')

# Stack both classes into a single frame, shuffle every row with a fixed seed
# so the order is reproducible, and rebuild a clean 0..n-1 index.
df = (
    pd.concat([attacks, benign], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(df)

         Header_Length  LLC  TCP  UDP  DHCP  ARP  ICMP  IGMP  IPv  Tot sum  \
0                 20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1                 25.6  0.8  0.8  0.0   0.0  0.2   0.0   0.0  0.8      648   
2                 32.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0      671   
3                 32.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0      660   
4                 20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
...                ...  ...  ...  ...   ...  ...   ...   ...  ...      ...   
1054080           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054081           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054082           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054083           30.8  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     7945   
1054084           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   

         ...  fin_flag_number  syn_flag_number  rst_flag_number

### Dropping useless columns

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop useless columns: a column that holds a single distinct value for every
# row carries no information for the model.
useless_column = [col for col in df.columns if df[col].nunique() == 1]

print(useless_column)

# Remove the constant columns from the working frame (mutates df in place).
df.drop(columns=useless_column, inplace=True)

['Protocol Type']


### Separating data and labels

In [69]:
# Normalise the column names by trimming leading/trailing whitespace.
df.columns = df.columns.map(str.strip)

# Target vector: the 'Label' column only.
Y = df.loc[:, 'Label']
# Feature matrix: every remaining column.
X = df.drop('Label', axis=1)
print(X)

         Header_Length  LLC  TCP  UDP  DHCP  ARP  ICMP  IGMP  IPv  Tot sum  \
0                 20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1                 25.6  0.8  0.8  0.0   0.0  0.2   0.0   0.0  0.8      648   
2                 32.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0      671   
3                 32.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0      660   
4                 20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
...                ...  ...  ...  ...   ...  ...   ...   ...  ...      ...   
1054080           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054081           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054082           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
1054083           30.8  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     7945   
1054084           20.0  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   

         ...          Rate  fin_flag_number  syn_flag_number  r

### Encoding categorical data
Difference between Label encoding and One-Hot encoding: \
                                      - Label encoding assigns one integer to each category (simple, but the model could read a false ordering or priority into the numbers)\
                                      - One-Hot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories to encode)

In [70]:
# NOTE(review): ColumnTransformer, LabelEncoder and OneHotEncoder are not used
# in the cells visible here — confirm whether a later cell needs them.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# No encoding step is applied in this cell: the features are treated as
# numerical columns, so OneHotEncoder is not needed.

# Display the label vector to confirm it is already stored as numbers.
print(Y)

0          1
1          0
2          0
3          0
4          1
          ..
1054080    1
1054081    1
1054082    1
1054083    0
1054084    1
Name: Label, Length: 1054085, dtype: int64


### Splitting into training set and test set
Split dataset into training and testing sets (70/30)


In [71]:
# 70/30 train/test split with a fixed seed; stratifying on Y keeps the class
# proportions the same in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

print(X_train)

        Header_Length  LLC  TCP  UDP  DHCP  ARP  ICMP  IGMP  IPv  Tot sum  \
660550          20.52  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6034   
550709          20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
174594          20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
770231          20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
32921           20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
...               ...  ...  ...  ...   ...  ...   ...   ...  ...      ...   
191480          20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
719617          32.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     7900   
807111          20.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0     6000   
915925          28.00  1.0  0.8  0.2   0.1  0.0   0.0   0.0  1.0     1909   
377054          32.00  1.0  1.0  0.0   0.0  0.0   0.0   0.0  1.0    13692   

        ...          Rate  fin_flag_number  syn_flag_number  rst_flag_numbe

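### Missing data
The dataset contains no missing (NaN) values, so imputation is mostly a safeguard here: infinite values are converted to NaN and then replaced by the column mean learned on the training set.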

In [72]:
from sklearn.impute import SimpleImputer # to handle missing data

# NaN count per column on the full frame. isnull() does not flag +/-inf, so a
# result of all zeros here does not rule out infinite values; those are
# handled explicitly below.
print("Missing datas by columns :\n", df.isnull().sum())

# Turn +/-inf into NaN so the imputer can treat them as missing values.
# replace() is used without inplace=True: the splits are rebound to new frames
# instead of mutating the train_test_split outputs, which avoids the
# SettingWithCopyWarning that in-place edits of derived frames can trigger on
# some pandas / scikit-learn versions.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Column means are learned on the training split only and then reused for the
# test split, so no test-set statistics are used during preprocessing.
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

Missing datas by columns :
 Header_Length      0
LLC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
Tot sum            0
SSH                0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
IRC                0
SMTP               0
ece_flag_number    0
Time_To_Live       0
Rate               0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
cwr_flag_number    0
Label              0
cluster            0
pca1               0
pca2               0
dtype: int64


### Feature Scaling
Standardize the features so that every column is on the same scale (for instance, `Tot sum` takes values in the thousands while `LLC` is at most 1 in the rows shown above; without scaling, the model could misinterpret this and give more importance to the larger-valued column).

In [73]:
# Columns to standardise: the int64 / float64 features.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print(Y_train)

660550    1
550709    1
174594    1
770231    1
32921     1
         ..
191480    1
719617    0
807111    1
915925    0
377054    0
Name: Label, Length: 737859, dtype: int64


### Prepare data for Deep Learning (convert the data to float32)

In [74]:
# Cast the feature frames to float32.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Labels become plain float32 NumPy arrays.
Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

# Add a trailing axis of length 1: (samples, features) -> (samples, features, 1).
# WARNING: this cell is not idempotent — running it a second time on its own
# appends yet another axis. Re-run the notebook from the split cell instead.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('../../preprocessed_data/CICIOT', exist_ok=True)

# NOTE(review): the fitted imputer and scaler are not saved in the cells shown
# here; persist them as well if new data must be preprocessed identically.
joblib.dump(X_train, '../../preprocessed_data/CICIOT/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIOT/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIOT/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIOT/Y_test.joblib')


['../../preprocessed_data/CICIOT/Y_test.joblib']