<h1>Base data (train and test)</h1>

<h2>Statistics</h2>

In [1]:
#!pip install pytorch-tabnet wget
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn.tree import export_graphviz
from sklearn import metrics
from sklearn import tree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from imblearn.over_sampling import SMOTENC


### Configurations

In [2]:
# Single seed for the notebook: the SMOTE and SMOTENC samplers defined below
# read this global as their random_state.
random_state_seed=42
# Also seed NumPy's global random number generator.
np.random.seed(random_state_seed)

In [3]:
def load_data(file_train, file_test):
    """Read the train and test CSV files and print a short profile of each.

    Parameters
    ----------
    file_train, file_test
        Anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    tuple
        ``(train, test)`` as two DataFrames.
    """
    # Both files are read before anything is printed.
    train, test = pd.read_csv(file_train), pd.read_csv(file_test)

    # Same report for both frames: column dtypes, then summary statistics
    # over every column (numeric and non-numeric alike).
    for title, frame in (('Train data', train), ('Test data', test)):
        print(title)
        print(frame.dtypes)
        print(frame.describe(include='all'))

    return train, test

<h2>Base data - Encoding Categorical Values as Dummies</h2>

In [4]:
def encoding_categorial(d_train, d_test):
    """One-hot encode the categorical columns of the train and test sets jointly.

    The two frames are stacked before encoding so that both end up with the
    same set of dummy columns, then split apart again. The inputs are copied
    and not modified.

    Side effects: writes 'encoded_trainData.csv', 'encoded_testData.csv' and
    'encoded_columns.csv' to the current working directory.

    Parameters
    ----------
    d_train, d_test : pandas.DataFrame
        Frames that contain every column listed in ``categorical_cols``.

    Returns
    -------
    tuple
        ``(train, test, columns)`` where ``columns`` is the column index of
        the encoded training frame.
    """
    categorical_cols = [
        'http.request.method',
        'http.referer',
        'http.request.version',
        'dns.qry.name.len',
        'mqtt.conack.flags',
        'mqtt.protoname',
        'mqtt.topic',
    ]

    train = d_train.copy()
    test = d_test.copy()

    print(f'No. of rows - train (initial): {len(train)}')
    print(f'No. of rows - test (initial): {len(test)}')

    # Temporary origin marker ('r' = train, 't' = test) used to split the
    # stacked frame back apart after encoding.
    train['o'] = 'r'
    test['o'] = 't'
    full = pd.concat((train, test))

    # Append the indicator columns for each categorical column, in order.
    for col in categorical_cols:
        indicator_cols = pd.get_dummies(full[col], prefix=col)
        full = pd.concat([full, indicator_cols], axis=1)

    print(f'No. of columns (before drop): {len(full.columns)}')

    # The raw categorical columns are now redundant.
    full = full.drop(categorical_cols, axis=1)

    origin = full['o']
    train = full[origin == 'r'].drop(['o'], axis=1)
    test = full[origin == 't'].drop(['o'], axis=1)

    columns = train.columns
    print(f'No. of columns (final): {len(columns)}')

    print(f'No. of rows - train (final): {len(train)}')
    print(f'No. of rows - test (final): {len(test)}')

    train.to_csv('encoded_trainData.csv')
    test.to_csv('encoded_testData.csv')
    pd.DataFrame(columns).to_csv("encoded_columns.csv", index=False)

    return train, test, columns
          

<h1> Data augmentation </h1>

In [5]:
def smote_augmentation(seed_data, lec, columns, output_file):
    """Oversample the training data with SMOTE and save the result as CSV.

    Parameters
    ----------
    seed_data : pandas.DataFrame
        Training frame holding the feature columns plus 'Attack_type' and
        'Attack_label'.
    lec : label encoder
        Object exposing ``fit_transform`` and ``classes_`` (e.g. sklearn's
        LabelEncoder). It is fitted here on the 'Attack_type' column.
    columns : sequence of str
        Kept for backward compatibility and no longer used. Feature names
        were previously taken from ``columns[:-2]``, which is only correct
        when the two target columns are the last two columns. After one-hot
        encoding the dummy columns come after them, so that slice mislabelled
        the features and let the target assignments below overwrite real
        feature data. Names are now taken from ``seed_data`` itself.
    output_file : str or path-like
        Destination of the augmented CSV (written without the index).

    Returns
    -------
    None
    """
    target_cols = ['Attack_type', 'Attack_label']

    y_train = seed_data['Attack_type']

    # Keep the feature names next to the matrix they describe so the two can
    # never drift apart.
    features = seed_data.drop(target_cols, axis=1)
    feature_names = list(features.columns)
    X_train = np.array(features)

    print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')

    y_train = lec.fit_transform(y_train)

    unique_labels_df = pd.DataFrame({'Original Label': lec.classes_, 'Encoded Label': range(len(lec.classes_))})

    # Show the unique labels and their corresponding encodings and the number of instances of each label side by side.
    # The join is on the index, which equals the encoded label on both sides.
    label_counts = pd.DataFrame({'Number of instances': pd.DataFrame(y_train)[0].value_counts()})
    unique_labels_df = unique_labels_df.join(label_counts)
    unique_labels_df.index = unique_labels_df.index.rename('Index')
    print("Encoding Mapping and Number of instances:")
    print(unique_labels_df.to_string(index=False))

    smote = SMOTE(random_state=random_state_seed)
    X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

    # print number of lines before and after SMOTE
    print(f"Lines before Augmentation: {len(X_train):,}")
    print(f"Lines after Augmentation: {len(X_train_sm):,}")

    # Rebuild a DataFrame whose column names match the resampled feature matrix.
    df_train_sm = pd.DataFrame(X_train_sm, columns=feature_names)

    # Append the targets as the last two columns: 'Attack_label' holds the
    # encoded class and 'Attack_type' the original class name.
    df_train_sm['Attack_label'] = y_train_sm
    df_train_sm['Attack_type'] = df_train_sm['Attack_label'].map(unique_labels_df.set_index('Encoded Label')['Original Label'])

    # Save df_train_sm to csv file
    df_train_sm.to_csv(output_file, index=False)


def smote_nc_augmentation(df_train, output_file):
    """Oversample the raw training data with SMOTE-NC and save the result as CSV.

    Unlike plain SMOTE, SMOTE-NC is given the positions of the categorical
    columns, so the frame is used without one-hot encoding. The input frame
    is copied and not modified.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame holding the columns listed in ``categorical`` plus
        'Attack_type' and 'Attack_label'.
    output_file : str or path-like
        Destination of the augmented CSV (written without the index).
        Previously this argument was ignored and 'train_smotenc.csv' was
        always written.

    Returns
    -------
    None
    """
    categorical = [
        'http.request.method',
        'http.referer',
        'http.request.version',
        'dns.qry.name.len',
        'mqtt.conack.flags',
        'mqtt.protoname',
        'mqtt.topic',
    ]

    train = df_train.copy()
    # Give every categorical column a single, uniform string dtype.
    train[categorical] = train[categorical].astype(str)

    y_train = train['Attack_type'].copy()
    X_train = train.drop(['Attack_type', 'Attack_label'], axis=1).copy()

    cat_features_idx = [X_train.columns.get_loc(c) for c in categorical]
    sm = SMOTENC(categorical_features=cat_features_idx, random_state=random_state_seed,sampling_strategy="minority")

    # sampling_strategy="minority" resamples only the current smallest class,
    # so one pass is run per class other than "Normal".
    # NOTE(review): this assumes "Normal" is the largest class, so that every
    # other class ends up at its size — confirm for new datasets.
    n_passes = sum(1 for label in np.unique(y_train) if label != "Normal")
    for _ in range(n_passes):
        X_train, y_train = sm.fit_resample(X_train, y_train)

    print(f"Lines before Augmentation: {train.shape[0]}")
    print(f"Lines after Augmentation: {X_train.shape[0]}")
    print('Number of instances of each label after oversampling:')
    print(y_train.value_counts())

    # Only 'Attack_type' is written back as a target; 'Attack_label' was
    # dropped above and is not reconstructed.
    df_smotenc = pd.DataFrame(X_train, columns=X_train.columns)
    df_smotenc['Attack_type']=y_train
    df_smotenc.to_csv(output_file, index=False)


<h1>Main</h1>

In [6]:
#load data
# Read the train and test CSVs; load_data also prints the dtypes and summary
# statistics of both frames.
df_train, df_test = load_data('EdgeIIot_train_100k.csv','EdgeIIot_test.csv') 

  train = pd.read_csv(file_train)
  test = pd.read_csv(file_test)


Train data
arp.opcode                   float64
arp.hw.size                  float64
icmp.checksum                float64
icmp.seq_le                  float64
icmp.unused                  float64
http.content_length          float64
http.request.method           object
http.referer                  object
http.request.version          object
http.response                float64
http.tls_port                float64
tcp.ack                      float64
tcp.ack_raw                  float64
tcp.checksum                 float64
tcp.connection.fin           float64
tcp.connection.rst           float64
tcp.connection.syn           float64
tcp.connection.synack        float64
tcp.flags                    float64
tcp.flags.ack                float64
tcp.len                      float64
tcp.seq                      float64
udp.stream                   float64
udp.time_delta               float64
dns.qry.name                 float64
dns.qry.name.len              object
dns.qry.qu                 

## SMOTE Augmentation

In [7]:
# Label encoder that smote_augmentation fits on the 'Attack_type' column.
le = LabelEncoder()
# One-hot encode the categorical columns of train and test together.
edf_train, edf_test, cols = encoding_categorial(df_train, df_test)
# Oversample the encoded training set with SMOTE and write it to train_smote.csv.
smote_augmentation(edf_train, le, cols, 'train_smote.csv')
# Drop the encoded frames; the cells shown after this one do not use them.
del edf_train, edf_test

No. of rows - train (initial): 536515
No. of rows - test (initial): 381934
No. of columns (before drop): 110
No. of columns (final): 102
No. of rows - train (final): 536515
No. of rows - test (final): 381934
X_train: (536515, 100), y_train: (536515,)
Encoding Mapping and Number of instances:
       Original Label  Encoded Label  Number of instances
             Backdoor              0                19244
            DDoS_HTTP              1                38916
            DDoS_ICMP              2                54438
             DDoS_TCP              3                40053
             DDoS_UDP              4                96966
       Fingerprinting              5                  707
                 MITM              6                  282
               Normal              7               100000
             Password              8                39825
        Port_Scanning              9                15915
           Ransomware             10                 7763
        SQL

## SMOTE-NC Augmentation

In [8]:
# Oversample the raw (not one-hot encoded) training frame with SMOTE-NC, which
# is told which columns are categorical.
smote_nc_augmentation(df_train, 'train_smotenc.csv')

Lines before Augmentation: 536515
Lines after Augmentation: 1500000
Number of instances of each label after oversampling:
Attack_type
DDoS_UDP                 100000
Password                 100000
DDoS_TCP                 100000
Backdoor                 100000
DDoS_ICMP                100000
Port_Scanning            100000
Vulnerability_scanner    100000
SQL_injection            100000
DDoS_HTTP                100000
Uploading                100000
XSS                      100000
Ransomware               100000
MITM                     100000
Fingerprinting           100000
Normal                   100000
Name: count, dtype: int64
