In [31]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib

In [None]:
df = pd.read_csv('../../datasets/CICIOT2023/full_version/lighter_CICIOT2023.csv')

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

  df = pd.read_csv('../../datasets/CICIOT2023/full_version/lighter_CICIOT2023.csv')


### Dropping useless columns

In [33]:
print(df)

              flow_duration Header_Length Protocol Type Duration  \
0                       0.0          54.0           6.0     64.0   
1                       0.0           0.0           1.0     64.0   
2                   0.07496       31247.0          17.0     64.0   
3                       0.0           0.0           1.0     64.0   
4        3.5216370820999146      171704.8           8.2    117.9   
...                     ...           ...           ...      ...   
2103094                 0.0           0.0           1.0     64.0   
2103095                 0.0          54.0           6.0     64.0   
2103096            0.242968      34894.19         16.61    66.74   
2103097            0.185182         65.52          6.11    64.64   
2103098                 0.0          54.0           6.0     64.0   

                      Rate              Srate Drate fin_flag_number  \
0                2097152.0          2097152.0   0.0             0.0   
1               172.310827         172.31

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Drop useless columns

useless_column = []
# columns where we have always the same value
for col in df.columns:
    if df[col].nunique() == 1:
        useless_column.append(col)

print(f"Useless columns : {useless_column}")

cols_to_drop = [
    "Protocol Type",
    "Magnitue", "Radius", "Covariance", "Variance", "Weight",
]
df.drop(columns=cols_to_drop)
df.drop(columns=useless_column, inplace=True)

Useless columns : []


### Balancing the datasets (subsamples attacks)

In [36]:
df["label"] = df["label"].str.strip()

N_BenignTraffic = df[df["label"] == "BenignTraffic"].shape[0]
N_Attacks = df[df["label"] != "BenignTraffic"].shape[0]
N = N_Attacks-N_BenignTraffic
print(N)

attacks = df[df["label"] != "BenignTraffic"]
to_remove = attacks.sample(n=N, random_state=42)

print(f"Before balancing, there is {df[df['label'] == 'BenignTraffic'].shape[0]} normal traffic and {attacks.shape[0]} attacks.\n")

df = df.drop(index=to_remove.index)

print(f"After balancing, there is {df[df['label'] == 'BenignTraffic'].shape[0]} normal traffic and {df[df['label'] != 'BenignTraffic'].shape[0]} attacks.\n")



1875895
Before balancing, there is 113602 normal traffic and 1989497 attacks.

After balancing, there is 113602 normal traffic and 113602 attacks.



### Separing datas and labels

In [37]:
df.columns = df.columns.str.strip() # to clean spaces on the columns names
X = df.drop(columns=['label']) # without labels
Y = df['label'] # just labels

Y = Y.apply(lambda x: 0 if x == 'BenignTraffic' else 1)
print(Y.value_counts())

label
0    113602
1    113602
Name: count, dtype: int64


### Encoding categorical data
Difference between Label and OneHot : \
                                      - Label gives a number in int for each line (simple but the model could misunderstand the difference between numbers (priorities issues))\
                                      - OneHot encodes in binary columns (no hierarchy between column but a lot of columns if they are a lot of categories to label)

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# no need of OneHotEncoder because all columns are numerical columns

### Splitting into training set and test set
Split dataset into training and testing sets (70/30)


In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)
print(X_test)

              flow_duration Header_Length Protocol Type Duration  \
561384                  0.0          54.0           6.0     64.0   
1590905    55.5097695350647       55325.9          11.5     97.2   
352239     30.8857843875885     1224697.6           5.4     67.8   
1649343            0.007652        4654.4         16.89     64.0   
66704              3.473864       12588.2           5.4     56.8   
...                     ...           ...           ...      ...   
731132             8.626672      222138.6           6.0     87.1   
707625            42.032916      119843.4           6.0     63.2   
978586            15.936039      762842.5           5.4    107.2   
1362398  0.0128752350807189        9813.0          17.0     64.0   
1552405           20.166727     1125992.4           5.4     57.2   

                       Rate               Srate Drate fin_flag_number  \
561384             3.355569            3.355569   0.0             0.0   
1590905    9.76636788785718    9.7663

### Missing datas
useless here because no missing datas

In [42]:
from sklearn.impute import SimpleImputer # to handle missing data

#here there is no missing data so we don't have to manage this

X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')
#because numeric values are not really numeric in the csv, so we convert them


X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = X_train.select_dtypes(include=[np.number]).columns


imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

### Feature Scaling
To have the same scale between each columns (for instance, `Flow Duration` is way bigger than `Tot Fwd Pkts` and the model could misinterpret it and gives more importance to the Income column)

In [43]:
scaler = StandardScaler()


numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Prepare data for Deep Learning (convert datas into float32)

In [44]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')


Y_train = np.array(Y_train).astype('float32')
Y_test = np.array(Y_test).astype('float32')

X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

joblib.dump(X_train, '../../preprocessed_data/CICIOT_update/X_train.joblib')
joblib.dump(X_test, '../../preprocessed_data/CICIOT_update/X_test.joblib')
joblib.dump(Y_train, '../../preprocessed_data/CICIOT_update/Y_train.joblib')
joblib.dump(Y_test, '../../preprocessed_data/CICIOT_update/Y_test.joblib')


['../../preprocessed_data/CICIOT_update/Y_test.joblib']