In [10]:
import pandas as pd
import numpy as np
import joblib

In [11]:
normal = pd.read_csv('../../datasets/InSDN_DatasetCSV/Normal_data.csv')
ovs = pd.read_csv('../../datasets/InSDN_DatasetCSV/OVS.csv')
metasploitable = pd.read_csv('../../datasets/InSDN_DatasetCSV/metasploitable-2.csv')

df = pd.concat([normal, ovs, metasploitable], ignore_index=True)

df = df.sample(frac=1, random_state=42).reset_index(drop=True) 

print(df)

                                        Flow ID          Src IP  Src Port  \
0           192.168.20.134-79.229.226.185-0-0-0  79.229.226.185         0   
1       192.168.20.134-200.175.2.130-80-34538-6   200.175.2.130     34538   
2            192.168.3.130-35.236.236.113-0-0-0  35.236.236.113         0   
3        192.168.3.130-200.175.2.130-80-43929-6   192.168.3.130        80   
4       192.168.20.134-200.175.2.130-80-33782-6   200.175.2.130     33782   
...                                         ...             ...       ...   
343884  192.168.20.134-200.175.2.130-80-41620-6   200.175.2.130     41620   
343885       192.168.3.130-215.254.199.71-0-0-0  215.254.199.71         0   
343886  192.168.20.134-200.175.2.130-80-53420-6   200.175.2.130     53420   
343887  192.168.20.134-200.175.2.130-80-35814-6   200.175.2.130     35814   
343888  192.168.20.134-200.175.2.130-80-37660-6   200.175.2.130     37660   

                Dst IP  Dst Port  Protocol               Timestamp  \
0    

### Dropping useless columns

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Drop useless columns

useless_column = []
# columns where we have always the same value
for col in df.columns:
    if df[col].nunique() == 1:
        useless_column.append(col)

print(useless_column)

df.drop(columns=useless_column, inplace=True)

df = df.drop(columns=['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp'])

print(df.isnull().sum)

['Fwd PSH Flags', 'Fwd URG Flags', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Fwd Seg Size Min']
<bound method DataFrame.sum of         Protocol  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
0          False          False         False         False            False   
1          False          False         False         False            False   
2          False          False         False         False            False   
3          False          False         False         False            False   
4          False          False         False         False            False   
...          ...            ...           ...           ...              ...   
343884     False          False         False         False            False   
343885     False          False         False         False            False   
343886     False          F

### Separing datas and labels

In [13]:
X = df.drop(columns=['Label']) # without labels
Y = df['Label'] # just labels

### Encoding categorical data
Difference between Label and OneHot : \
                                      - Label gives a number in int for each line (simple but the model could misunderstand the difference between numbers (priorities issues))\
                                      - OneHot encodes in binary columns (no hierarchy between column but a lot of columns if they are a lot of categories to label)

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_protocol = encoder.fit_transform(X[['Protocol']])
protocol_cols = encoder.get_feature_names_out(['Protocol'])
encoded_protocol_df = pd.DataFrame(encoded_protocol, columns=protocol_cols, index=X.index) # convert in dataframe
X = pd.concat([X.drop(columns=['Protocol']), encoded_protocol_df], axis=1) #concatenate with the encoded version of protocol

print(X)

# Y_encoder = OneHotEncoder()
# Y = Y_encoder.fit_transform(Y)

Y = Y.apply(lambda x: 0 if x == 'Normal' else 1) # because we have several types of attacks and we wants bianaries Y

print(Y)

        Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
0                   2             0             2                0   
1                9476             2             5              315   
2                  14             0             2                0   
3            60750633             3             7               30   
4                  53             2             1                0   
...               ...           ...           ...              ...   
343884          24986             2             6               17   
343885             12             0             2                0   
343886           5734             2             1                0   
343887           8989             2             6              327   
343888       26489491          1799          1798            65568   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
0                     0                0                0          0.000000   
1

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print(X_train)

        Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
325042           7507             0             2                0   
298378       60309800             3             7               30   
268971              1             0             2                0   
253598            205             0             2                0   
253091             15             0             2                0   
...               ...           ...           ...              ...   
122579        3218451            12            13             1067   
304137             13             0             2                0   
152315              7             0             2                0   
117952           2806             0             2                0   
305711       61712225             3             7               30   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
325042                0                0                0          0.000000   
2

### Missing datas
useless here because no missing datas

In [16]:
from sklearn.impute import SimpleImputer # to handle missing data


print("Missing datas by columns :\n", df.isnull().sum())

num_cols_median = ['Flow Duration', 'Flow Byts/s', 'Fwd Pkt Len Mean', 'Flow IAT Mean']
X_train[num_cols_median] = X_train[num_cols_median].fillna(X_train[num_cols_median].median())
X_test[num_cols_median] = X_test[num_cols_median].fillna(X_test[num_cols_median].median())

cols_fill0 = ["SYN Flag Cnt", "Tot Fwd Pkts", "Fwd Act Data Pkts"]
X_train[cols_fill0] = X_train[cols_fill0].fillna(0)
X_test[cols_fill0] = X_test[cols_fill0].fillna(0)


Missing datas by columns :
 Protocol           0
Flow Duration      0
Tot Fwd Pkts       0
Tot Bwd Pkts       0
TotLen Fwd Pkts    0
                  ..
Idle Mean          0
Idle Std           0
Idle Max           0
Idle Min           0
Label              0
Length: 66, dtype: int64


### Feature Scaling
To have the same scale between each columns (for instance, `Flow Duration` is way bigger than `Tot Fwd Pkts` and the model could misinterpret it and gives more importance to the Income column)

In [17]:
scaler = StandardScaler()

numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
print("X_train :\n", X_train, "\n")
print("X_test :\n", X_test)

X_train :
         Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  TotLen Fwd Pkts  \
325042      -0.307718     -0.003912     -0.037917        -0.014102   
298378       2.460439     -0.002297      0.008790        -0.013423   
268971      -0.308062     -0.003912     -0.037917        -0.014102   
253598      -0.308053     -0.003912     -0.037917        -0.014102   
253091      -0.308062     -0.003912     -0.037917        -0.014102   
...               ...           ...           ...              ...   
122579      -0.160321      0.002549      0.064839         0.010075   
304137      -0.308062     -0.003912     -0.037917        -0.014102   
152315      -0.308062     -0.003912     -0.037917        -0.014102   
117952      -0.307934     -0.003912     -0.037917        -0.014102   
305711       2.524817     -0.002297      0.008790        -0.013423   

        TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
325042        -0.023990        -0.171920        -0.144635         -0.

### Prepare data for Deep Learning (convert datas into float32)

In [18]:
from tensorflow.keras.utils import to_categorical

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')


Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)


joblib.dump(X_train, '../../preprocessed_data/InSDN/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/InSDN/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/InSDN/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/InSDN/Y_test.joblib')

['../../preprocessed_data/InSDN/Y_test.joblib']