In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
ddos = pd.read_csv('../../datasets/CICIDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
portscan = pd.read_csv('../../datasets/CICIDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
friday = pd.read_csv('../../datasets/CICIDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv')
monday = pd.read_csv('../../datasets/CICIDS2017/Monday-WorkingHours.pcap_ISCX.csv')
infiltration = pd.read_csv('../../datasets/CICIDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
webattacks = pd.read_csv('../../datasets/CICIDS2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
tuesday = pd.read_csv('../../datasets/CICIDS2017/Tuesday-WorkingHours.pcap_ISCX.csv')
wednesday = pd.read_csv('../../datasets/CICIDS2017/Wednesday-workingHours.pcap_ISCX.csv')

df = pd.concat([ddos, portscan, friday, monday, infiltration, webattacks, tuesday, wednesday], ignore_index=True)

print(df)

          Destination Port   Flow Duration   Total Fwd Packets  \
0                    54865               3                   2   
1                    55054             109                   1   
2                    55055              52                   1   
3                    46236              34                   1   
4                    54863               3                   2   
...                    ...             ...                 ...   
2830738                 53           32215                   4   
2830739                 53             324                   2   
2830740              58030              82                   2   
2830741                 53         1048635                   6   
2830742                 53           94939                   4   

          Total Backward Packets  Total Length of Fwd Packets  \
0                              0                           12   
1                              1                            6   
2           

### Missing datas
useless here because no missing datas

In [3]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("Missing datas by columns :\n", df.isnull().sum())

#here there is no missing data so we don't have to manage this

df.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns


imputer = SimpleImputer(strategy='mean') 
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

#Drop useless columns

useless_column = []
# columns where we have always the same value
for col in df.columns:
    if df[col].nunique() == 1:
        useless_column.append(col)

print(useless_column)

df.drop(columns=useless_column, inplace=True)

df = df.drop(columns=[' Destination Port']) # useless for detecting attacks

print(df.isnull().sum)

Missing datas by columns :
  Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64
[' Bwd PSH Flags', ' Bwd URG Flags', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
<bound method DataFrame.sum of           Flow Duration   Total Fwd Packets   Total Backward Packets  \
0                 False               False                    False   
1                 False               False                    False   
2                 False               False                    False   
3                 False               False                    False   
4                 

### Separing datas and labels

In [4]:
df.columns = df.columns.str.strip() # to clean spaces on the columns names
X = df.drop(columns=['Label']) # without labels
Y = df['Label'] # just labels

### Encoding categorical data
Difference between Label and OneHot : \
                                      - Label gives a number in int for each line (simple but the model could misunderstand the difference between numbers (priorities issues))\
                                      - OneHot encodes in binary columns (no hierarchy between column but a lot of columns if they are a lot of categories to label)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# no need of OneHotEncoder because all columns except Label are numerical columns

Y = Y.apply(lambda x: 0 if x == 'BENIGN' else 1) # because we have several types of attacks and we wants bianaries Y

print(Y)

0          0
1          0
2          0
3          0
4          0
          ..
2830738    0
2830739    0
2830740    0
2830741    0
2830742    0
Name: Label, Length: 2830743, dtype: int64


### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print(X_train)

         Flow Duration  Total Fwd Packets  Total Backward Packets  \
2092860        64428.0                2.0                     2.0   
2140044       757414.0               12.0                     8.0   
1590488       184165.0                4.0                     2.0   
128477       1863306.0                6.0                     2.0   
1336329          235.0                2.0                     2.0   
...                ...                ...                     ...   
2249467     85562173.0                8.0                     5.0   
963395     118654701.0               45.0                    78.0   
2215104     87497358.0                8.0                     5.0   
1484405      1543512.0                8.0                    10.0   
305711     115280042.0               16.0                    14.0   

         Total Length of Fwd Packets  Total Length of Bwd Packets  \
2092860                         78.0                        148.0   
2140044                        54

### Feature Scaling
To have the same scale between each columns (for instance, `Flow Duration` is way bigger than `Tot Fwd Pkts` and the model could misinterpret it and gives more importance to the Income column)

In [7]:
scaler = StandardScaler()


numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
          Flow Duration  Total Fwd Packets  Total Backward Packets  \
2092860      -0.437708          -0.009742               -0.008360   
2140044      -0.417124           0.003378               -0.002448   
1590488      -0.434152          -0.007118               -0.008360   
128477       -0.384276          -0.004494               -0.008360   
1336329      -0.439615          -0.009742               -0.008360   
...                ...                ...                     ...   
2249467       2.101863          -0.001870               -0.005404   
963395        3.084823           0.046673                0.066519   
2215104       2.159344          -0.001870               -0.005404   
1484405      -0.393775          -0.001870               -0.000478   
305711        2.984584           0.008626                0.003463   

         Total Length of Fwd Packets  Total Length of Bwd Packets  \
2092860                    -0.041870                    -0.007053   
2140044               

### Prepare data for Deep Learning (convert datas into float32)

In [8]:
import tensorflow as tf

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')


Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

joblib.dump(X_train, '../../preprocessed_data/CICIDS/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIDS/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIDS/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIDS/Y_test.joblib')

2025-06-16 11:32:14.942946: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


['../../preprocessed_data/CICIDS/Y_test.joblib']