# This is a benchmark comparison for the simplified encoder to see the results in Random Forest and ExplainerDashboard for all threats:
- The procedure focuses on gathering the top 30 correlated values to threats of each categorcial variable and provides some information throughout.
- The printouts include the highest to lowest correlated values in each category (currently top 30).
- Encoders have been optimised with tensors to reduce memory allocation issues.
- The following update will include the implementation into ExplainerDashboard by reducing the categories to top10.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer, InlineExplainer
import pickle
import time
import tensorflow as tf

In [2]:
# Imports the cleaned dataset.
data = pd.read_csv('Cleaned_full_data.csv')



In [3]:
# Reset retained index.
data = data.reset_index(drop=True)
# Set NA to 0.
# Some extra cleaning.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(0)
data['attack_cat'] = data['attack_cat'].str.replace(r'\s+', '', regex=True)
data['attack_cat'] = data['attack_cat'].str.replace('Backdoors', 'Backdoor')
# Drop correlated features 99% +.
data = data.drop(columns=['dwin', 'dloss'])
# Normal rows with hidden strings.
data = data[~data['sport'].str.startswith(('0x', '-'), na=False)]
data = data[~data['dsport'].str.startswith(('0x', '-'), na=False)]

In [None]:
# Technically we can group the IP addresses directly to a router disregarding individual devices. This is done by removing the last value and dot point.
# Can ask the stakeholder which might be better but at the moment we are not facing issues in cardinality with these either.
# NOTE: A similar thing can be done with ports to set the to well known, common, and private ports but this might lose a lot of information.
# By collecting the ports and ips from the training set it becomes more relevant to the data.
# Some websites also offer lists of vulnerable ports but again this might not be relative to the data we have to train the model.
# We can assume that the ports correlated to threats are commonly used anyway.
# Wikipedia offers a list of ports which we can get information from: https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers.
# I will create a selenium scraper that can make a dataframe we can link ports to descriptions.
grouped_ip = data['srcip']
print(len(grouped_ip.value_counts()))
print(grouped_ip.value_counts())
grouped_ip = grouped_ip.str.rsplit('.', n=1).str[0]
print(len(grouped_ip.value_counts()))
print(grouped_ip.value_counts())

0            59.166.0
1            59.166.0
2            59.166.0
3            59.166.0
4            59.166.0
              ...    
2540042      59.166.0
2540043      59.166.0
2540044      59.166.0
2540045      59.166.0
2540046    175.45.176
Name: srcip, Length: 2539739, dtype: object
8
srcip
59.166.0       1942302
175.45.176      361690
149.171.126     217315
10.40.182         9581
10.40.85          6702
10.40.170         2094
192.168.241         54
127.0.0              1
Name: count, dtype: int64


In [7]:
class CorrOnehotEncoder:
    """
    CorrOnehotEncoder: Encodes the given column by creating one-hot encoded columns for categories that have
    a correlation higher than a threshold with the target column.
    """
    def __init__(self, column, target):
        """
        Constructor: Stores the column and target (storing the full data causes memory issues).
        
        Parameters:
            - column (pd.Series): The feature column to encode.
            - target (pd.Series): The target column.
        """
        # Force to string for groups.
        self.column = column.astype(str)
        # Convert to float32 precision to minimise memory load.
        self.target = target.astype(np.float32)

    def corr(self, x, y):
        """
        Calculate the Pearson correlation coefficient (Phi).
        
        Parameters:
            - x (tensor - float32): The first variable.
            - y (tensor - float32): The target to draw correlation to.
        
        Returns:
            - r (float32): The Pearson correlation coefficient (Phi).
        """
        mean_x = tf.reduce_mean(x)
        mean_y = tf.reduce_mean(y)
        covariance = tf.reduce_sum((x - mean_x) * (y - mean_y))
        std_x = tf.sqrt(tf.reduce_sum((x - mean_x) ** 2))
        std_y = tf.sqrt(tf.reduce_sum((y - mean_y) ** 2))
        r = covariance / (std_x * std_y)
        return r

    def encode(self, sparse_n, threshold, max_encoded):
        """
        Encode the feature column by creating one-hot encoded columns for categories that have
        a correlation higher than a threshold with the target.
        
        Parameters:
            - sparse_n (int): Minimum number of occurrences (1's) for a category in the column.
            - threshold (float): The correlation threshold.
            - max_encoded (int): The maximum number of encoded features.
        
        Returns:
            - ohe_df (pd.DataFrame): One-hot encoded columns that meet the correlation threshold.
        """
        # Convert to numpy for tensors.
        column_np = self.column.to_numpy()
        target_np = self.target.to_numpy()

        # Store results.
        ohe_list = []    
        column_names = []
        correlations = []
        # Iterate through each unique category in the column.
        for c in np.unique(column_np):
            # Convert to binary - float32 minimises memory issues.
            corr_column = (column_np == c).astype(np.float32)
            # If the category count is below sparse_n, skip encoding.
            if np.sum(corr_column) < sparse_n:
                continue
            # Convert to tensors for the correlation calculation.
            correlation = self.corr(tf.convert_to_tensor(corr_column, dtype=tf.float32), 
                                    tf.convert_to_tensor(target_np, dtype=tf.float32))
            # If the absolute correlation is greater than the threshold, add to the list.
            if abs(correlation.numpy()) > threshold:
                ohe_list.append(corr_column)
                column_names.append(c)
                # Store correlations to sort.
                correlations.append(abs(correlation.numpy()))

        # Sort the columns by their correlation with the target.
        sorted_indices = np.argsort(correlations)[::-1]
        sorted_ohe_list = []
        sorted_column_names = []
        for i in sorted_indices:
            sorted_ohe_list.append(ohe_list[i])
            sorted_column_names.append(column_names[i])

        # Limit the number of variables to max_encoded.
        if len(sorted_ohe_list) > max_encoded:
            sorted_ohe_list = sorted_ohe_list[:max_encoded]
            sorted_column_names = sorted_column_names[:max_encoded]
        # Add the encoded data to a dataframe.
        ohe_df = pd.DataFrame(np.column_stack(sorted_ohe_list), columns=sorted_column_names)
        
        if ohe_df.empty:
            print("No correlations exceed the threshold.")
            return pd.DataFrame()
        
        return ohe_df

In [None]:
cte = CorrOnehotEncoder(data['sport'], data['label'])
# dsport and sport are the categories with the highest cardinalities. Websites offer lists for suspicious ports but we don't know how relative they
# might be to the data. So they are collected like this. I will compare the results to the ones found online to see how closely they match.
# NOTE: Processing time has decreased by more than half since the tensor conversion and increase in sparse_n (60) in the encoder.
# Port 0: Wildcard Port: Let the port be automatically chosen.
# https://www.lifewire.com/port-0-in-tcp-and-udp-818145
ec1 = cte.encode(60, 0.001, 30)
print(len(ec1.columns))
for i in ec1.columns:
    print(i)

30
1043
47439
0
137
68
138
65535
65532
1024
60986
65534
65527
80
65529
1029
6881
32820
5190
25
65533
143
21
65524
53
111
2013
65531
1103
1911
1230


In [9]:
cte = CorrOnehotEncoder(data['dsport'], data['label'])
ec2 = cte.encode(60, 0.001, 30)
print(len(ec2.columns))
for i in ec2.columns:
    print(i)

30
53
0
445
6881
5190
110
22
520
179
514
5060
143
25
1723
80
69
137
8080
139
21
67
135
23
389
3306
443
5555
161
554
111


In [10]:
cte = CorrOnehotEncoder(data['proto'], data['label'])
ec3 = cte.encode(60, 0.0, 30)
print(len(ec3.columns))
for i in ec3.columns:
    print(i)

30
tcp
udp
unas
sctp
ospf
any
gre
rsvp
ipv6
mobile
sun-nd
swipe
pim
sep
arp
etherip
encap
ipip
gmtp
sccopmce
merit-inp
a/n
emcon
nvp
netblt
mfe-nsp
pri-enc
vines
igp
ax.25


In [11]:
cte = CorrOnehotEncoder(data['service'], data['label'])
ec4 = cte.encode(60, 0.0, 30)
print(len(ec4.columns))
for i in ec4.columns:
    print(i)

11
dns
none
ftp-data
pop3
ssh
smtp
http
ftp
dhcp
ssl
snmp


In [12]:
cte = CorrOnehotEncoder(data['state'], data['label'])
ec5 = cte.encode(60, 0.0, 30)
print(len(ec5.columns))
for i in ec5.columns:
    print(i)

7
INT
FIN
CON
RST
CLO
ECO
REQ


In [13]:
# We can group IPs differently as this might be impractical since they are all coming from the same router.
# IP addresses can be grouped by router (removing the last .value).
# NOTE: This might be due to the nature of the data and how the USNW-NB15 dataset was created. 
# If a company is collecting historic data from previously encountered IPs they can be collected like
# this and a model can be periodically trained for it. Alternatively the IPs could just be flagged instead of training the model on them.
cte = CorrOnehotEncoder(data['srcip'], data['label'])
ec6 = cte.encode(60, 0.0, 30)
print(len(ec6.columns))
for i in ec6.columns:
    print(i)

30
175.45.176.1
175.45.176.3
175.45.176.0
175.45.176.2
59.166.0.4
59.166.0.1
59.166.0.5
59.166.0.2
59.166.0.0
59.166.0.3
59.166.0.9
59.166.0.6
59.166.0.8
59.166.0.7
149.171.126.18
149.171.126.15
149.171.126.14
149.171.126.10
149.171.126.12
10.40.85.1
10.40.182.1
10.40.182.6
10.40.85.30
10.40.182.3
10.40.170.2
10.40.85.10
149.171.126.5
149.171.126.1
149.171.126.3
149.171.126.4


In [14]:
cte = CorrOnehotEncoder(data['dstip'], data['label'])
ec7 = cte.encode(60, 0.0, 30)
print(len(ec7.columns))
for i in ec7.columns:
    print(i)

30
149.171.126.18
149.171.126.15
149.171.126.14
149.171.126.10
149.171.126.12
149.171.126.17
149.171.126.19
149.171.126.13
149.171.126.11
149.171.126.3
149.171.126.2
149.171.126.4
149.171.126.1
149.171.126.5
149.171.126.0
149.171.126.9
149.171.126.7
149.171.126.6
149.171.126.8
149.171.126.16
175.45.176.3
175.45.176.1
175.45.176.0
224.0.0.5
10.40.182.3
10.40.182.255
10.40.85.1
10.40.170.2
10.40.85.30
59.166.0.0
