### Libraries

In [24]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os                              
import pandas as pd                     
import numpy as np
import datetime
import re
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)  

'machine learning library imports'
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Load data

In [25]:
# Name of the augmentation variant passed to the dataset loader.
AUGMENTATION = 'RealTabFormer'

# Absolute path of the data folder, resolved relative to the working directory.
data_dir = os.path.abspath('../data')

# Columns the loader is asked to leave out of both frames.
ignored_columns = ['mbtcp.unit_id', 'mbtcp.trans_id']

# Load the train and test datasets.
df_train, df_test = utils.load_dataset(
    data_directory=data_dir,
    augmentation=AUGMENTATION,
    ignore_columns=ignored_columns,
)

Loading complete.
Train data: 1500000 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [26]:
# Both target columns are removed from the feature matrices.
target_columns = ['Attack_label', 'Attack_type']

# Feature matrices: every column except the two targets.
X_train = df_train.drop(columns=target_columns)
X_test = df_test.drop(columns=target_columns)

# Label vectors: the attack type of each row.
y_train = df_train['Attack_type']
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [27]:
# One-hot encode the categorical *feature* columns of the train and test sets.
# The encoded frames get new names, so X_train / X_test stay bound to the
# un-encoded frames (TODO confirm the helper does not mutate its inputs).
X_train_enc, X_test_enc = utils.encode_categorical(X_train, X_test)

Categorical features to be encoded:

mqtt.topic
mqtt.protoname
http.request.method
dns.qry.name.len
http.request.version
http.referer
mqtt.conack.flags

Encoding complete.
No of features before encoding: 44
No of features after encoding: 83


In [28]:
# # check if there is categorical data in the dataset
# if X_train.select_dtypes(include="object").columns.size > 0:
    
#     # Concatenate X_train and X_test to ensure consistent encoding for both
#     concatenated_data = pd.concat([X_train, X_test], axis=0)

#     # Extract categorical features
#     categorical_features = concatenated_data.select_dtypes(include="object").columns

#     # Extract indices of categorical features
#     cat_idxs = [concatenated_data.columns.get_loc(col) for col in categorical_features]

#     # Find number of unique values in each categorical column
#     cat_dims = [len(concatenated_data[col].unique()) for col in categorical_features]

#     # One-hot encode categorical features for both X_train and X_test
#     concatenated_data_encoded = pd.get_dummies(concatenated_data, 
#                                                columns=categorical_features, 
#                                                drop_first=True, 
#                                                dtype='int8')

#     # Split the data back into X_train and X_test
#     X_train = concatenated_data_encoded.iloc[:len(X_train)]
#     X_test = concatenated_data_encoded.iloc[len(X_train):]
    
#     print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

In [29]:
# Diagnostic switch: set to True to print the distinct values observed for the
# categorical features listed below, across both the train and test sets.
valid = False

# Name fragments of the features to inspect. A column is inspected for a
# feature when the fragment occurs anywhere in the column name.
features_to_check = [
    'http.referer',
    'http.request.version',
    'dns.qry.name.len',
    'mqtt.conack.flags',
    'mqtt.protoname',
    'http.request.method',
]


def get_unique_values(df, columns):
    """Return the sorted distinct row signatures of ``df[columns]``.

    Each row's values in ``columns`` are cast to ``str`` and concatenated
    without a separator; the distinct concatenations are returned sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : list
        Column labels of ``df`` that make up one signature.

    Returns
    -------
    numpy.ndarray
        Sorted object array of the distinct signatures; empty when
        ``columns`` is empty.
    """
    # With no columns there is nothing to join, and a row-wise apply on a
    # zero-column frame does not yield a Series to call .unique() on.
    if len(columns) == 0:
        return np.array([], dtype=object)

    signatures = df[columns].apply(lambda row: ''.join(row.astype(str)), axis=1)
    unique_values = np.asarray(signatures.unique(), dtype=object)
    unique_values.sort()
    return unique_values


if valid:
    # NOTE(review): this inspects X_train / X_test, not the encoded
    # X_train_enc / X_test_enc frames -- confirm which pair is intended.
    for feature in features_to_check:
        # Columns are selected from the train frame and reused for the test
        # frame, so both are read through the same column list.
        matching_columns = [col for col in X_train.columns if feature in col]

        # Join the unique values from both train and test data and print them.
        combined_values = np.unique(np.concatenate((
            get_unique_values(X_train, matching_columns),
            get_unique_values(X_test, matching_columns),
        )))
        print(f'Unique values for {feature}: \n{combined_values}\n')

    # check if X_train and X_test have categorical features
    print(f'X_train categorical features: {X_train.select_dtypes(include="object").columns}')
    print(f'X_test categorical features: {X_test.select_dtypes(include="object").columns}')

#### Column name validation for XGBoost

In [31]:
# Regular expression for everything stripped from feature names:
# the characters ( ) , : ; { } > < [ ] $ -, any run of two or more
# underscores, and any whitespace.
pattern = r'[(),:;{}><\[\]\$\-]|_{2,}|\s+'

# The cleaned names are derived from the training frame and then written onto
# BOTH frames, so the two frames must already share the same columns in the
# same order; otherwise the test features would be silently mislabelled.
assert list(X_train_enc.columns) == list(X_test_enc.columns), \
    'X_train_enc and X_test_enc must have identical columns before renaming'

# clean X_train column names using the regular expression pattern
cleaned_columns = [re.sub(pattern, '', col) for col in X_train_enc.columns]

# Stripping characters can make two names collide. Keep the first occurrence
# as-is and give later ones the lowest free numeric suffix (_1, _2, ...).
# A set gives constant-time membership checks.
unique_names = []
seen_names = set()
for name in cleaned_columns:
    candidate = name
    suffix = 1
    while candidate in seen_names:
        candidate = f"{name}_{suffix}"
        suffix += 1
    seen_names.add(candidate)
    unique_names.append(candidate)

# Replace the column names with the cleaned and unique names
X_train_enc.columns = unique_names
X_test_enc.columns = unique_names

# print the unique feature names
print(unique_names)

# check if any of X_train column names have [, ] or <, > characters
for name in X_train_enc.columns:
    if any(char in name for char in '[]<>'):
        print(name)

# Assert all X_train_enc and X_test_enc column names are unique
assert X_train_enc.columns.is_unique
assert X_test_enc.columns.is_unique


['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le', 'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream', 'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len', 'http.request.method_0.0', 'http.request.method_GET', 'http.request.method_OPTIONS', 'http.request.method_POST', 'http.request.method_PROPFIND', 'http.request.method_PUT', 'http.request.method_SEARCH', 'http.request.method_TRACE', 'http.referer_0', 'http.referer_0.0', 'http.referer_127.0.0.1', 'http.referer_TESTING_PURPOSES_ONLY', 'http.request.version

#### Label encoding

In [32]:
# Encode the Attack_type targets of the train and test sets.
# `le` is the encoder object returned by the helper; its `classes_` attribute
# is read later to set the model's number of classes.
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model training

In [33]:
# Shuffle the *encoded* training data -- these are the objects actually
# passed to fit(). Features and labels go through one shuffle() call so
# their rows stay aligned. The result is bound to new names so re-running
# this cell always starts from the same un-shuffled inputs.
X_train_shuffled, y_train_shuffled = shuffle(X_train_enc, y_train_enc, random_state=42)

# Instantiate the XGBClassifier with a fixed seed; the number of classes is
# taken from the fitted label encoder.
xgb = XGBClassifier(random_state=42,
                    seed=42,
                    num_class=(le.classes_).size)

# Train the model on the shuffled, encoded training set
xgb_clf = xgb.fit(X_train_shuffled, y_train_shuffled)

### Model Evaluation

In [34]:
# Predict a class for every row of the encoded test set with the fitted model.
predictions = xgb_clf.predict(X_test_enc)

In [35]:
def _averaged_scores(average):
    """Return (precision, recall, f1) of the test predictions for one averaging mode.

    Precision is computed with ``zero_division=1``; recall and F1 use the
    library defaults.
    """
    precision = metrics.precision_score(y_test_enc, predictions, average=average, zero_division=1)
    recall = metrics.recall_score(y_test_enc, predictions, average=average)
    f1 = metrics.f1_score(y_test_enc, predictions, average=average)
    return precision, recall, f1


# Overall accuracy on the test set.
accuracy = metrics.accuracy_score(y_test_enc, predictions)

# Weighted averages first, then macro averages.
precision_w, recall_w, f1_score_w = _averaged_scores('weighted')
precision_m, recall_m, f1_score_m = _averaged_scores('macro')

In [36]:
# Wall-clock time at which this results record is assembled.
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# One record for this run: identification fields first, then accuracy,
# the macro-averaged scores and the weighted scores.
results = dict(
    model="XGBoost",
    augmentations=AUGMENTATION,
    timestamp=run_timestamp,
    accuracy=accuracy,
    precision_macro=precision_m,
    recall_macro=recall_m,
    f1_macro=f1_score_m,
    precision_weighted=precision_w,
    recall_weighted=recall_w,
    f1_weighted=f1_score_w,
)

# Render the record as a table.
utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 97.97%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 90.09%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 91.64%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 90.26%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 98.20%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 97.97%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 97.99%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [13]:
# Save results to csv
# The helper receives a one-element list holding this run's results dict and
# the target CSV path (relative to the working directory).
# NOTE(review): whether the file is appended to or overwritten is decided
# inside utils.save_results_to_csv -- confirm before re-running.
utils.save_results_to_csv([results], '../results/metrics/xgboost.csv')

#### Confusion Matrix

In [37]:
# Class names in encoded-label order, taken from the label encoder instead of
# a hand-maintained list, so the axis labels cannot drift from the encoding.
# Assumes `le` is a fitted sklearn LabelEncoder, i.e. code i corresponds to
# le.classes_[i] -- TODO confirm against utils.encode_labels.
attack_labels = list(le.classes_)

# labels= pins the matrix to one row/column per known class, even when a class
# is absent from both y_test_enc and predictions. Without it the matrix
# can come out smaller than the label list and the DataFrame below fails.
conf_mat = metrics.confusion_matrix(y_test_enc,
                                    predictions,
                                    labels=np.arange(len(attack_labels)))

# Create a dataframe from the confusion matrix (rows: actual, columns: predicted)
conf_mat_df = pd.DataFrame(conf_mat,
                           index=attack_labels,
                           columns=attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix, creating the output folder first if it is missing
conf_mat_path = f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv"
os.makedirs(os.path.dirname(conf_mat_path), exist_ok=True)
conf_mat_df.to_csv(conf_mat_path)
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4576,0,0,0,1,47,0,0,0,101,57,0,0,0,0
DDoS_HTTP,0,9119,0,0,0,0,0,0,0,0,0,0,0,56,453
DDoS_ICMP,0,0,13429,0,1,71,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,10009,0,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,24601,0,0,0,0,0,0,0,0,0,0
Fingerprinting,10,0,0,0,0,108,0,0,0,24,4,0,0,0,0
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,0,0,0,0,0,0,272776,0,0,0,0,0,0,0
Password,0,806,0,0,0,0,0,0,7910,0,0,1260,132,0,0
Port_Scanning,1,0,0,0,0,0,0,0,0,4061,0,0,0,0,0
