In [1]:
import os

import joblib
import numpy as np
import pandas as pd

In [2]:
# Read the three InSDN capture files (benign traffic plus the two attack captures).
normal, ovs, metasploitable = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../datasets/InSDN_DatasetCSV/Normal_data.csv',
        '../../datasets/InSDN_DatasetCSV/OVS.csv',
        '../../datasets/InSDN_DatasetCSV/metasploitable-2.csv',
    )
)

# Stack the captures into one frame, then shuffle the rows with a fixed seed so
# the row order is reproducible; the index is rebuilt after each step.
df = (
    pd.concat([normal, ovs, metasploitable], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

### Dropping useless columns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Remove columns that carry no usable signal.

# 1) Constant columns: exactly one distinct (non-null) value over the whole frame.
distinct_counts = df.nunique()
useless_column = distinct_counts[distinct_counts == 1].index.tolist()

print(f"Useless columns : {useless_column}")

df = df.drop(columns=useless_column)

# 2) Flow identifiers, endpoints and timestamp, listed explicitly.
identifier_columns = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp']
df = df.drop(columns=identifier_columns)


Useless columns : ['Fwd PSH Flags', 'Fwd URG Flags', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Fwd Seg Size Min']


### Balancing the dataset (subsampling attacks)

In [4]:
# Strip surrounding whitespace from the labels before comparing them to "Normal".
df["Label"] = df["Label"].str.strip()

# Class sizes before balancing.
is_normal = df["Label"] == "Normal"
N_Normal = int(is_normal.sum())
N_Attacks = int((~is_normal).sum())

# Number of surplus attack rows to discard so both classes end up the same size.
N = N_Attacks - N_Normal
print(N)

# Pick the surplus attack rows at random (fixed seed) among all attack rows.
attacks = df[~is_normal]
to_remove = attacks.sample(n=N, random_state=42)

print(f"Before balancing, there is {N_Normal} normal traffic and {N_Attacks} attacks.\n")

df = df.drop(index=to_remove.index)

# Recount on the reduced frame.
is_normal_after = df["Label"] == "Normal"
print(f"After balancing, there is {int(is_normal_after.sum())} normal traffic and {int((~is_normal_after).sum())} attacks.\n")



207041
Before balancing, there is 68424 normal traffic and 275465 attacks.

After balancing, there is 68424 normal traffic and 68424 attacks.



### Separating data and labels

In [5]:
# Target: the 'Label' column on its own.
Y = df['Label']
# Features: every remaining column.
X = df.drop(columns='Label')

### Encoding categorical data
Difference between Label and OneHot encoding: \
                                      - Label encoding assigns an integer to each category (simple, but the model could misinterpret the integers as an ordering or priority between categories)\
                                      - OneHot encoding creates one binary column per category (no hierarchy between categories, but many columns if there are many categories to encode)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# One-hot encode the categorical 'Protocol' feature as dense binary columns.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
protocol_frame = X[['Protocol']]
encoded_protocol = encoder.fit_transform(protocol_frame)
protocol_cols = encoder.get_feature_names_out(['Protocol'])

# Wrap the encoded array in a DataFrame aligned on X's index so it can be
# placed side by side with the remaining features.
encoded_protocol_df = pd.DataFrame(data=encoded_protocol, index=X.index, columns=protocol_cols)

# Replace the raw 'Protocol' column with its one-hot columns.
features_without_protocol = X.drop(columns=['Protocol'])
X = pd.concat([features_without_protocol, encoded_protocol_df], axis=1)

# Binary target: 0 for 'Normal', 1 for every attack type (the dataset has
# several attack labels, but only normal vs. attack is needed here).
Y = (Y != 'Normal').astype('int64')

print(Y.value_counts())



Label
1    68424
0    68424
Name: count, dtype: int64


### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [7]:
# 70/30 hold-out split with a fixed seed, stratified on Y so both splits keep
# the same class proportions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

# Check the class proportions of the training split.
print(Y_train.value_counts(normalize=True))

Label
1    0.500005
0    0.499995
Name: proportion, dtype: float64


### Missing data
Has no effect here because the dataset contains no missing values

In [8]:
from sklearn.impute import SimpleImputer # to handle missing data

# Continuous features whose missing values are replaced by a median.
num_cols_median = ['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']

# The medians are learned from the training split ONLY and then applied to both
# splits. Using the test split's own medians would impute train and test with
# different values and let test-set statistics influence preprocessing.
train_medians = X_train[num_cols_median].median()
X_train[num_cols_median] = X_train[num_cols_median].fillna(train_medians)
X_test[num_cols_median] = X_test[num_cols_median].fillna(train_medians)

# Count-like features: a missing value is replaced by 0 in both splits.
cols_fill0 = ["SYN Flag Cnt", "Tot Fwd Pkts", "Fwd Act Data Pkts"]
X_train[cols_fill0] = X_train[cols_fill0].fillna(0)
X_test[cols_fill0] = X_test[cols_fill0].fillna(0)


### Feature Scaling
Brings all columns to the same scale (for instance, `Flow Duration` is far larger than `Tot Fwd Pkts`, so the model could misinterpret the difference and give more importance to the `Flow Duration` column)

In [9]:
# Columns to rescale: every int64 / float64 column of the training frame.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# The scaler is fitted on the training split only; the fitted scaler is then
# applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train[numeric_cols])

X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Prepare data for Deep Learning (convert data to float32)

In [10]:
from tensorflow.keras.utils import to_categorical

# Cast the feature frames to float32.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Labels become plain float32 NumPy arrays.
Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

# Append a trailing axis of length 1: (rows, columns) -> (rows, columns, 1).
# NOTE(review): presumably the input layout expected by the downstream model - confirm.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('../../preprocessed_data/InSDN', exist_ok=True)

# Persist the four preprocessed arrays.
joblib.dump(X_train, '../../preprocessed_data/InSDN/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/InSDN/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/InSDN/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/InSDN/Y_test.joblib')

2025-07-01 12:06:39.521730: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


['../../preprocessed_data/InSDN/Y_test.joblib']