In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Paths of the eight CICIDS2017 capture files, listed in concatenation order.
csv_paths = [
    '../../datasets/CICIDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Monday-WorkingHours.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Tuesday-WorkingHours.pcap_ISCX.csv',
    '../../datasets/CICIDS2017/Wednesday-workingHours.pcap_ISCX.csv',
]

# Read every file into its own frame; the individual names stay available.
(ddos, portscan, friday, monday,
 infiltration, webattacks, tuesday, wednesday) = [pd.read_csv(path) for path in csv_paths]

# Stack all captures into one frame with a fresh 0..n-1 index.
frames = [ddos, portscan, friday, monday, infiltration, webattacks, tuesday, wednesday]
df = pd.concat(frames, ignore_index=True)

# Shuffle all rows with a fixed seed, then renumber the index.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

### Dropping Useless columns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Drop columns that carry no information.

# A column with exactly one distinct value is constant over the whole frame
# (nunique() ignores NaN by default).
useless_column = [name for name in df.columns if df[name].nunique() == 1]

print(f"Useless columns : {useless_column}")

df.drop(columns=useless_column, inplace=True)

# The destination port is dropped too (judged not useful for detecting attacks).
# The name still has its leading space: headers are not stripped at this point.
df = df.drop(columns=[' Destination Port'])

Useless columns : [' Bwd PSH Flags', ' Bwd URG Flags', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']


### Separating data and labels

In [4]:
# Remove surrounding whitespace from every column header.
df.columns = df.columns.map(str.strip)

# Features: every column except the target.
X = df.drop('Label', axis=1)
# Target: the `Label` column on its own.
Y = df.loc[:, 'Label']

### Encoding categorical data
Difference between Label encoding and One-Hot encoding: \
                                      - Label encoding assigns an integer to each category (simple, but the model may misread the integers as an order or priority between categories)\
                                      - One-Hot encoding creates one binary column per category (no hierarchy between categories, but many columns when there are many categories)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# No OneHotEncoder is needed: every column except Label is numerical.

# Collapse the labels into a binary target: 0 for 'BENIGN', 1 for anything else,
# so that all attack types are merged into a single positive class.
Y = Y.ne('BENIGN').astype('int64')

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [6]:
# Hold out 30 % of the rows for testing. `stratify=Y` keeps the class ratio of Y
# in both parts, and the fixed seed makes the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
    stratify=Y,
)

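### Missing data
Infinite values are replaced with NaN, and any NaN is then imputed with the column mean learned on the training set (this has no effect if the data contains neither).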

In [7]:
from sklearn.impute import SimpleImputer # to handle missing data

# Turn +/-inf into NaN in both splits so the imputer treats them as missing.
for split_frame in (X_train, X_test):
    split_frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Only the 64-bit integer and float columns are imputed.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Column means are learned on the training split only and reused for the test
# split, so no test-set statistics are used during fitting.
imputer = SimpleImputer(strategy='mean')
train_filled = imputer.fit_transform(X_train[numeric_cols])
test_filled = imputer.transform(X_test[numeric_cols])
X_train[numeric_cols] = train_filled
X_test[numeric_cols] = test_filled

### Feature Scaling
To put every column on the same scale (for instance, `Flow Duration` is much larger than `Tot Fwd Pkts`, so the model could misinterpret this and give more importance to the larger-valued column).

In [8]:
# Re-select the 64-bit numeric columns that will be standardised.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Fit the scaler on the training split only, then apply the same transformation
# to both splits so the test data is scaled with training statistics.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Prepare data for Deep Learning (convert data to float32)

In [9]:
import tensorflow as tf

import os

# Cast the features to float32.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Labels become float32 NumPy arrays.
Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

# Insert a new axis of length 1 at position 2 of each feature array.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# joblib.dump does not create missing parent directories. Create the output
# folder first so a fresh checkout does not fail at this last step.
os.makedirs('../../preprocessed_data/CICIDS', exist_ok=True)

# Persist the four arrays for the training notebooks.
joblib.dump(X_train, '../../preprocessed_data/CICIDS/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIDS/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIDS/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIDS/Y_test.joblib')

2025-06-17 16:55:24.562151: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


['../../preprocessed_data/CICIDS/Y_test.joblib']