In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
normal = pd.read_csv('../../datasets/InSDN_DatasetCSV/Normal_data.csv')
ovs = pd.read_csv('../../datasets/InSDN_DatasetCSV/OVS.csv')
metasploitable = pd.read_csv('../../datasets/InSDN_DatasetCSV/metasploitable-2.csv')

df = pd.concat([normal, ovs, metasploitable], ignore_index=True)

print(df)

                                         Flow ID          Src IP  Src Port  \
0       185.127.17.56-192.168.20.133-443-53648-6   185.127.17.56       443   
1       185.127.17.56-192.168.20.133-443-53650-6  192.168.20.133     53650   
2         192.168.20.133-192.168.20.2-35108-53-6  192.168.20.133     35108   
3         192.168.20.133-192.168.20.2-35108-53-6    192.168.20.2        53   
4       154.59.122.74-192.168.20.133-443-60900-6  192.168.20.133     60900   
...                                          ...             ...       ...   
343884  192.168.3.130-200.175.2.130-41966-4444-6   192.168.3.130     41966   
343885  192.168.3.130-200.175.2.130-41967-4444-6   192.168.3.130     41967   
343886   192.168.3.130-200.175.2.130-139-44791-6   200.175.2.130     44791   
343887  192.168.3.130-200.175.2.130-41966-4444-6   192.168.3.130     41966   
343888  192.168.3.130-200.175.2.130-41967-4444-6   192.168.3.130     41967   

                Dst IP  Dst Port  Protocol        Timestamp  Fl

### Missing datas

useless here because no missing datas

In [4]:
from sklearn.impute import SimpleImputer # to handle missing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# print("Missing datas : " )
# print(df.isnull().sum())


# print(df[['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']].head())
# column = ['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']
# df[column] = df[column].fillna(df[column].median())

# column = ["SYN Flag Cnt", "Tot Fwd Pkts", "Fwd Act Data Pkts"]
# df.loc[:, column] = df.loc[:,column].fillna(0)

# print(df[['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']].head())
# print("Missing datas : " )
# print(df.isnull().sum())

print("Missing datas by columns :\n", df.isnull().sum())

num_cols_median = ['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']
df[num_cols_median] = df[num_cols_median].fillna(df[num_cols_median].median())

cols_fill0 = ["SYN Flag Cnt", "Tot Fwd Pkts", "Fwd Act Data Pkts"]
df[cols_fill0] = df[cols_fill0].fillna(0)

#Drop useless columns

useless_column = []
# columns where we have always the same value
for col in df.columns:
    if df[col].nunique() == 1:
        useless_column.append(col)

print(useless_column)

df.drop(columns=useless_column, inplace=True)

df = df.drop(columns=['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp'])

print(df.isnull().sum)

Missing datas by columns :
 Flow ID      0
Src IP       0
Src Port     0
Dst IP       0
Dst Port     0
            ..
Idle Mean    0
Idle Std     0
Idle Max     0
Idle Min     0
Label        0
Length: 72, dtype: int64
[]
<bound method DataFrame.sum of         Protocol  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
0          False          False         False         False            False   
1          False          False         False         False            False   
2          False          False         False         False            False   
3          False          False         False         False            False   
4          False          False         False         False            False   
...          ...            ...           ...           ...              ...   
343884     False          False         False         False            False   
343885     False          False         False         False            False   
343886     False          Fa

### Separing datas and labels

In [5]:
X = df.drop(columns=['Label']) # without labels
Y = df['Label'] # just labels

### Encoding categorical data
Difference between Label and OneHot : \
                                      - Label gives a number in int for each line (simple but the model could misunderstand the difference between numbers (priorities issues))\
                                      - OneHot encodes in binary columns (no hierarchy between column but a lot of columns if they are a lot of categories to label)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_protocol = encoder.fit_transform(X[['Protocol']])
protocol_cols = encoder.get_feature_names_out(['Protocol'])
encoded_protocol_df = pd.DataFrame(encoded_protocol, columns=protocol_cols, index=X.index) # convert in dataframe
X = pd.concat([X.drop(columns=['Protocol']), encoded_protocol_df], axis=1) #concatenate with the encoded version of protocol

print(X)

# Y_encoder = OneHotEncoder()
# Y = Y_encoder.fit_transform(Y)

Y = Y.apply(lambda x: 0 if x == 'Normal' else 1) # because we have several types of attacks and we wants bianaries Y

print(Y)

        Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
0              245230            44            40           124937   
1             1605449           107           149             1071   
2               53078             5             5               66   
3                6975             1             1                0   
4              190141            13            16              780   
...               ...           ...           ...              ...   
343884         273133             2             3                0   
343885         267969             6             7              138   
343886        1552555             6             5               60   
343887         272141             2             3                0   
343888         270361             6             7              108   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
0                  1071             9100                0       2839.477273   
1

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print(X_train)

        Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
325042          31423             0             2                0   
298378           3838             4             4               30   
268971             24             0             2                0   
253598             38             0             2                0   
253091             13             0             2                0   
...               ...           ...           ...              ...   
122579          59812             2             6              415   
304137       62379457             3             7               30   
152315          34759             2             6              332   
117952          15896             2             6               17   
305711       63291589             3             7               30   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
325042                0                0                0               0.0   
2

### Feature Scaling
To have the same scale between each columns (for instance, `Flow Duration` is way bigger than `Tot Fwd Pkts` and the model could misinterpret it and gives more importance to the Income column)

In [8]:
scaler = StandardScaler()

numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
         Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
325042      -0.306988     -0.003916     -0.036937        -0.009755   
298378      -0.308252     -0.001762     -0.019123        -0.009336   
268971      -0.308426     -0.003916     -0.036937        -0.009755   
253598      -0.308426     -0.003916     -0.036937        -0.009755   
253091      -0.308427     -0.003916     -0.036937        -0.009755   
...               ...           ...           ...              ...   
122579      -0.305688     -0.002839     -0.001309        -0.003960   
304137       2.548979     -0.002300      0.007599        -0.009336   
152315      -0.306835     -0.002839     -0.001309        -0.005119   
117952      -0.307699     -0.002839     -0.001309        -0.009518   
305711       2.590761     -0.002300      0.007599        -0.009336   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
325042        -0.023171        -0.174139        -0.149992         -0.

### Prepare data for Deep Learning (convert datas into float32)

In [10]:
from tensorflow.keras.utils import to_categorical

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')


Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)


Y_train = to_categorical(Y_train, num_classes=2)
Y_test = to_categorical(Y_test, num_classes=2)

joblib.dump(X_train, '../../preprocessed_data/InSDN/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/InSDN/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/InSDN/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/InSDN/Y_test.joblib')

2025-06-16 11:24:57.501417: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


['../../preprocessed_data/InSDN/Y_test.joblib']