#### Libraries

In [170]:
%%capture
%reset -f  
'generic imports'
import pandas as pd
import json
import time
import sys
import os
import numpy as np
from matplotlib import pyplot as plt
# from IPython.display import display, clear_output

'data processing and augmentation imports'
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from imblearn import over_sampling

# Put the parent directory on the import path so the project-local `src` package can be imported.
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
# Re-execute the already-imported module so edits to src/utils are picked up without restarting the kernel.
importlib.reload(utils)

In [171]:
def load_data(file_train, file_test):
    """Read the train and test CSV files and verify that their schemas agree.

    Parameters
    ----------
    file_train, file_test : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train frame and the test frame, in that order.

    Raises
    ------
    AssertionError
        If the frames have a different number of columns, or if the column
        counts match but the dtypes do not.
    """
    train_data = pd.read_csv(file_train, low_memory=False)
    test_data = pd.read_csv(file_test, low_memory=False)

    print(f"Train data: \n {train_data.shape[0]} rows and {train_data.shape[1]} columns.")
    print(f"Test data:\n {test_data.shape[0]} rows and {test_data.shape[1]} columns.\n")

    # Check the column count first: a count mismatch also makes the two dtype
    # Series unequal, so testing dtypes first would always hide this more
    # specific message behind the generic dtype one.
    assert train_data.shape[1] == test_data.shape[1], "Train and test data have different number of columns"
    assert train_data.dtypes.equals(test_data.dtypes), "Train and test data types are not the same"

    print("Train and Test data types match:")
    print(pd.concat([train_data.dtypes, test_data.dtypes], axis=1, keys=['Train', 'Test']), "\n")

    return train_data, test_data

#### Data Loading

In [172]:
# Read both splits; load_data also prints their shapes and checks that the schemas match.
df_train, df_test = load_data(
    file_train='../data/EdgeIIot_train_100k.csv',
    file_test='../data/EdgeIIot_test.csv',
)

Train data: 
 536515 rows and 48 columns.
Test data:
 381934 rows and 48 columns.

Train and Test data types match:
                             Train     Test
arp.opcode                 float64  float64
arp.hw.size                float64  float64
icmp.checksum              float64  float64
icmp.seq_le                float64  float64
icmp.unused                float64  float64
http.content_length        float64  float64
http.request.method         object   object
http.referer                object   object
http.request.version        object   object
http.response              float64  float64
http.tls_port              float64  float64
tcp.ack                    float64  float64
tcp.ack_raw                float64  float64
tcp.checksum               float64  float64
tcp.connection.fin         float64  float64
tcp.connection.rst         float64  float64
tcp.connection.syn         float64  float64
tcp.connection.synack      float64  float64
tcp.flags                  float64  float64
tcp.

#### Label Encoding

In [173]:
# Training split: the attack name is the target, and both label columns are removed from the features.
y_train = df_train['Attack_type']
X_train = df_train.drop(columns=['Attack_type', 'Attack_label'])

# Test split: same separation.
y_test = df_test['Attack_type']
X_test = df_test.drop(columns=['Attack_type', 'Attack_label'])

# Turn the string targets into integer codes.
# NOTE(review): `le` is presumably the fitted label encoder (it is used for inverse_transform later) — confirm in src/utils.
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


#### Categorical Feature Encoding

In [174]:
# Encode the categorical feature columns of both splits in one call.
# NOTE(review): utils.encode_categorical lives in src/utils and is not visible here; it is
# presumably fitted on both splits together so they end up with the same columns — confirm.
X_train_enc, X_test_enc = utils.encode_categorical(X_train, X_test)

# Verify that the column names of the one-hot encoded training and test datasets are identical
assert X_train_enc.columns.equals(X_test_enc.columns), "Train and test data have different columns after one-hot encoding"

Categorical features to be encoded:

mqtt.topic
http.referer
http.request.version
dns.qry.name.len
mqtt.conack.flags
mqtt.protoname
http.request.method

Encoding complete.
No of features before encoding: 46
No of features after encoding: 85


#### SMOTE Augmentation

In [176]:
# SMOTE augmentation instantiation
# Only random_state is set (fixed seed for repeatable resampling); every other
# SMOTE parameter is left at the library default.
smote = over_sampling.SMOTE(random_state=42)

In [177]:
# SMOTE augmentation: resample the encoded training features together with their encoded labels.
X_train_sm, y_train_sm = smote.fit_resample(X_train_enc, y_train_enc)
# Report the row count of the frame that was actually resampled (X_train_enc),
# not the pre-encoding X_train, so the two numbers describe the same data.
print(f"No rows before SMOTE: {len(X_train_enc):,}\nNo rows after SMOTE: {len(X_train_sm):,}\n")

No rows before SMOTE: 536,515
No rows after SMOTE: 1,500,000



In [178]:
# df_train_smote reconstruction
# Wrap the resampled features in a frame of their own. The shallow copy gives
# df_train_sm a separate column index without copying the data, so the label
# columns added below cannot leak into X_train_sm on pandas versions where
# DataFrame(df) aliases its input, and the cell can be re-run without the
# column assignment failing on a length mismatch.
df_train_sm = pd.DataFrame(X_train_sm).copy(deep=False)
df_train_sm.columns = X_train_enc.columns
df_train_sm['Attack_label'] = y_train_sm
# Add Attack_type column based on Attack_label column
df_train_sm['Attack_type'] = le.inverse_transform(df_train_sm['Attack_label'])

In [190]:
# df_test_enc reconstruction
# Work on a shallow copy so that adding the label columns below never modifies
# the feature frame X_test_enc itself (DataFrame(df) can alias its input on
# some pandas versions). No data is copied; only the column index is separate.
df_test_enc = pd.DataFrame(X_test_enc).copy(deep=False)
df_test_enc.columns = X_test_enc.columns
df_test_enc['Attack_label'] = y_test_enc
# Add Attack_type column based on Attack_label column
df_test_enc['Attack_type'] = le.inverse_transform(df_test_enc['Attack_label'])

In [191]:
# Both frames are written to CSV next; make sure they expose the same columns in the same order.
assert df_train_sm.columns.equals(df_test_enc.columns), "Train and test data have different columns after one-hot encoding"

#### Save datasets

In [192]:
# save df_train_sm and df_test_enc to csv files
# index=False keeps the DataFrame index out of the written files.
df_train_sm.to_csv('../data/EdgeIIot_train_smote_v2.csv', index=False)
df_test_enc.to_csv('../data/EdgeIIot_test_enc_v2.csv', index=False)