In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib

2025-06-18 15:16:09.134556: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Read the two source CSV files (attack and benign traffic, going by the file names).
attacks = pd.read_csv('../../datasets/CICIOT2023/tharindu_cleaned_version/nattack.csv')
benign = pd.read_csv('../../datasets/CICIOT2023/tharindu_cleaned_version/nbengin.csv')

# Stack both frames, shuffle every row with a fixed seed so the order is
# reproducible, then rebuild a clean 0..n-1 index.
df = (
    pd.concat([attacks, benign], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

### Dropping useless columns

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Remove columns that carry no information.
# A column is considered useless when it holds exactly one distinct value
# (nunique() ignores NaN by default).
distinct_counts = df.nunique()
useless_column = distinct_counts.index[distinct_counts == 1].tolist()

print(f"Useless columns : {useless_column}")

# Drop them from the working frame in place.
df.drop(columns=useless_column, inplace=True)

Useless columns : ['Protocol Type']


### Separating data and labels

In [5]:
# Trim surrounding whitespace from the header names so that 'Label' is
# matched even if the CSV header was padded with spaces.
df.columns = df.columns.str.strip()

# Target vector: the 'Label' column only.
Y = df['Label']
# Feature matrix: every remaining column.
X = df.drop(columns=['Label'])

# Show how many rows each class has.
print(Y.value_counts())

Label
1    536894
0    517191
Name: count, dtype: int64


### Encoding categorical data
Difference between Label encoding and One-Hot encoding: \
                                      - Label encoding assigns one integer to each category (simple, but the model may read a false ordering or priority into the numbers)\
                                      - One-Hot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories)

In [79]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# OneHotEncoder is not needed because all columns are numerical

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)


In [80]:
# Hold out 30% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

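### Missing data
The dataset is not expected to contain missing values, but infinite values are replaced with NaN and then filled with the column mean computed on the training set.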

In [81]:
from sklearn.impute import SimpleImputer # to handle missing data

# Even if the CSVs contain no NaN, infinite values may still be present.
# They are converted to NaN here so that the imputer below can fill them.
# The result is rebound instead of using inplace=True, so the frames returned
# by train_test_split are not mutated (NOTE(review): pandas may flag those
# frames as copies of X and warn on in-place edits, depending on the version).
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# 'number' selects every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64, so no numeric column can skip imputation and keep a NaN.
numeric_cols = X_train.select_dtypes(include='number').columns

# Column means are learned on the training set only and reused for the test
# set, so no test-set statistic is used during preprocessing.
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

### Feature Scaling
To put every column on the same scale (for instance, `Flow Duration` is far larger than `Tot Fwd Pkts`, so the model could misinterpret it and give more importance to the larger-valued column)

In [82]:
# Columns to standardise: the int64/float64 features of the training frame.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Learn the scaling statistics from the training data only ...
scaler = StandardScaler().fit(X_train[numeric_cols])

# ... then apply that same fitted transformation to both splits.
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Prepare data for Deep Learning (convert data to float32)

In [83]:
# Features: cast to float32 and append a trailing axis of length 1,
# turning (rows, columns) into (rows, columns, 1).
X_train = np.expand_dims(X_train.astype('float32'), axis=2)
X_test = np.expand_dims(X_test.astype('float32'), axis=2)

# Labels: plain float32 NumPy arrays.
Y_train = np.asarray(Y_train, dtype='float32')
Y_test = np.asarray(Y_test, dtype='float32')

# Persist the four arrays with joblib. The last call is left as a bare
# expression so the notebook still displays the list of written files.
joblib.dump(X_train, '../../preprocessed_data/CICIOT/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIOT/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIOT/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIOT/Y_test.joblib')


['../../preprocessed_data/CICIOT/Y_test.joblib']