In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from explainerdashboard import InlineExplainer, ExplainerDashboard, ClassifierExplainer
import random
import tensorflow as tf

# New Encoders with fixed logic.
Added `max_encoded` logic (a cap on the number of encoded features) along with some additional encoders.

In [None]:
class CorrOnehotEncoder:
    """
    CorrOnehotEncoder: Encodes the given column by creating one-hot encoded columns for categories that have
    a correlation higher than a threshold with the target column.
    """
    def __init__(self, column, target):
        """
        Constructor: Stores the column and target (storing the full data causes memory issues).
        
        Parameters:
            - column (pd.Series): The feature column to encode.
            - target (pd.Series): The target column.
        """
        # Force to string so every category is compared as text.
        self.column = column.astype(str)
        # Convert to float32 precision to minimise memory load.
        self.target = target.astype(np.float32)

    def corr(self, x, y):
        """
        Calculate the Pearson correlation coefficient (Phi).

        The division is unguarded: if either input is constant the denominator is zero.
        
        Parameters:
            - x (tensor - float32): The first variable.
            - y (tensor - float32): The target to draw correlation to.
        
        Returns:
            - r (float32): The Pearson correlation coefficient (Phi).
        """
        mean_x = tf.reduce_mean(x)
        mean_y = tf.reduce_mean(y)
        covariance = tf.reduce_sum((x - mean_x) * (y - mean_y))
        std_x = tf.sqrt(tf.reduce_sum((x - mean_x) ** 2))
        std_y = tf.sqrt(tf.reduce_sum((y - mean_y) ** 2))
        r = covariance / (std_x * std_y)
        return r

    def encode(self, sparse_n, threshold, max_encoded):
        """
        Encode the feature column by creating one-hot encoded columns for categories that have
        a correlation higher than a threshold with the target.
        
        Parameters:
            - sparse_n (int): Minimum number of occurrences (1's) for a category in the column.
            - threshold (float): The correlation threshold (compared against the absolute correlation).
            - max_encoded (int): The maximum number of encoded features.
        
        Returns:
            - ohe_df (pd.DataFrame): One-hot encoded columns that meet the correlation threshold, ordered
              from highest to lowest absolute correlation. An empty DataFrame is returned when no category
              qualifies.
        """
        # Convert to numpy for the per-category comparisons.
        column_np = self.column.to_numpy()
        # The target tensor is identical for every category, so it is built once, on first use.
        target_tensor = None

        # Store results.
        ohe_list = []
        column_names = []
        correlations = []
        # Iterate through each unique category in the column.
        for c in np.unique(column_np):
            # Convert to binary - float32 minimises memory issues.
            corr_column = (column_np == c).astype(np.float32)
            # If the category count is below sparse_n, skip encoding.
            if np.sum(corr_column) < sparse_n:
                continue
            if target_tensor is None:
                target_tensor = tf.convert_to_tensor(self.target.to_numpy(), dtype=tf.float32)
            # Absolute correlation between the category indicator and the target.
            abs_correlation = abs(
                self.corr(tf.convert_to_tensor(corr_column, dtype=tf.float32), target_tensor).numpy()
            )
            # A NaN correlation fails this comparison and is therefore skipped.
            if abs_correlation > threshold:
                ohe_list.append(corr_column)
                column_names.append(c)
                # Store correlations to sort.
                correlations.append(abs_correlation)

        # Sort the columns by their correlation with the target (highest first).
        sorted_indices = np.argsort(correlations)[::-1]
        # Limit the number of variables to max_encoded.
        if len(sorted_indices) > max_encoded:
            sorted_indices = sorted_indices[:max_encoded]

        # np.column_stack raises on an empty list, so the empty case is handled before stacking.
        if len(sorted_indices) == 0:
            print("No correlations exceed the threshold.")
            return pd.DataFrame()

        sorted_ohe_list = [ohe_list[i] for i in sorted_indices]
        sorted_column_names = [column_names[i] for i in sorted_indices]
        # Add the encoded data to a dataframe.
        ohe_df = pd.DataFrame(np.column_stack(sorted_ohe_list), columns=sorted_column_names)
        return ohe_df

In [None]:
class CorrVarEncoder:
    """
    CorrVarEncoder: Encodes a given column based on a correlation threshold. All values within the variable that fall below the threshold are
    converted to a given string name. For Example: LowThreshold.
    
    NOTE: It is recommended to include the threshold used in the new value name.

    Initialisation:
        - column (pd.Series): The column that contains the feature.
        - target (pd.Series): The target column to draw correlation to.
    """
    def __init__(self, column, target):
        # Force to string so every category is compared as text.
        self.column = column.astype(str)
        # float32 keeps the target small in memory.
        self.target = target.astype(np.float32)

    def corr(self, x, y):
        """
        Calculate the Pearson correlation coefficient (Phi).

        The division is unguarded: if either input is constant the denominator is zero.
        
        Parameters:
            - x (tensor - float32): The first variable.
            - y (tensor - float32): The target to draw correlation to.
        
        Returns:
            - r (float32): The Pearson correlation coefficient (Phi).
        """
        mean_x = tf.reduce_mean(x)
        mean_y = tf.reduce_mean(y)
        covariance = tf.reduce_sum((x - mean_x) * (y - mean_y))
        std_x = tf.sqrt(tf.reduce_sum((x - mean_x) ** 2))
        std_y = tf.sqrt(tf.reduce_sum((y - mean_y) ** 2))
        r = covariance / (std_x * std_y)
        return r

    def encode(self, threshold, value_name, sparse_n, max_encoded):
        """
        encode: Computes the correlation of every category in the column with the target column. Categories that occur
        fewer than sparse_n times, whose absolute correlation is below the threshold, or that fall outside the
        strongest (max_encoded - 1) categories are replaced with value_name.

        The stored column is not modified, so encode can be called repeatedly with different settings.
        
        Parameters:
            - threshold (float): The threshold for correlation. Categories whose absolute correlation with the target
              is greater than or equal to the threshold are candidates to be kept.
            - value_name (str): The value to replace categories with low correlation.
            - sparse_n (int): The minimum number of occurrences for a category to be considered.
            - max_encoded (int): The maximum number of categories in the result, counting value_name itself
              (at most max_encoded - 1 original categories are kept).
        
        Returns:
            - pd.Series: The converted column (a new Series with the same index and name as the stored column).
        """
        # Count every category once instead of summing an indicator column per category.
        counts = self.column.value_counts()
        # Keep first-appearance order so ties in correlation are ranked deterministically.
        frequent = [c for c in self.column.unique() if counts.get(c, 0) >= sparse_n]

        corr_dict = {}
        if frequent:
            # The target tensor is the same for every category, so build it once.
            target_tensor = tf.convert_to_tensor(self.target.to_numpy(), dtype=tf.float32)
            for c in frequent:
                corr_column = (self.column == c).astype(np.float32).to_numpy()
                # Calculate the correlation with the target label.
                correlation = self.corr(
                    tf.convert_to_tensor(corr_column, dtype=tf.float32), target_tensor
                ).numpy()
                # Only keep categories at or above the threshold (NaN fails the comparison).
                if abs(correlation) >= threshold:
                    corr_dict[c] = correlation

        # Rank by absolute correlation, strongest first.
        ranked = sorted(corr_dict, key=lambda c: abs(corr_dict[c]), reverse=True)
        # One slot is reserved for value_name; clamp so max_encoded=0 cannot become a negative slice.
        limited_categories = set(ranked[:max(max_encoded - 1, 0)])

        # Replace everything outside the kept categories in a single vectorised pass.
        return self.column.where(self.column.isin(limited_categories), value_name)

In [None]:
class CorrBinEncoder:
    """
    CorrBinEncoder: Encodes a variable based on the correlation drawn to the given label based on the number of categories provided. The variable is binarised
    using the correlations with pd.cut.

    NOTE: pd.cut creates relative borders when binarising which means that at times even a value that is labelled as High might still only be low correlation (0.1).
    """
    def __init__(self, column, target):
        # Force to string so every category is compared as text.
        self.column = column.astype(str)
        # float32 keeps the target small in memory.
        self.target = target.astype(np.float32)

    def corr(self, x, y):
        """
        Calculate the Pearson correlation (Phi).

        The division is unguarded: if either input is constant the denominator is zero.
        
        Parameters:
            - x (tensor - float32): The first variable.
            - y (tensor - float32): The target to draw correlation to.
        
        Returns:
            - r (float32): The Pearson correlation coefficient (Phi).
        """
        mean_x = tf.reduce_mean(x)
        mean_y = tf.reduce_mean(y)
        covariance = tf.reduce_sum((x - mean_x) * (y - mean_y))
        std_x = tf.sqrt(tf.reduce_sum((x - mean_x) ** 2))
        std_y = tf.sqrt(tf.reduce_sum((y - mean_y) ** 2))
        r = covariance / (std_x * std_y)
        return r

    def encode(self, column, bin_cut, bin_labels):
        """
        encode: Select a number of bins and corresponding labels to binarize the variable based on correlation to the label.

        Parameters:
            - column: Not used. The column given to the constructor is the one that is encoded; the
              parameter is retained so existing calls keep working.
            - bin_cut (int): The number of bins to create based on pd.cut.
            - bin_labels (list): A list of strings to name the new values (High, Medium, Low) - must match the same number of bins.

        Returns:
            - pd.Series: The encoded column, where each value is the bin label of its category's absolute correlation.
        """
        # The target tensor is the same for every category, so build it once.
        target_tensor = tf.convert_to_tensor(self.target.to_numpy(), dtype=tf.float32)

        corr_dict = {}
        # Go through each unique category in the column.
        for c in self.column.unique():
            # Create a binary indicator for the category.
            corr_column = (self.column == c).astype(np.float32).to_numpy()
            correlation = self.corr(tf.convert_to_tensor(corr_column, dtype=tf.float32), target_tensor)
            corr_dict[c] = correlation.numpy()

        # Create a DataFrame for cut.
        corr_df = pd.DataFrame(list(corr_dict.items()), columns=['category', 'corr'])
        corr_df['abs_corr'] = corr_df['corr'].abs()
        # Binarise using pd.cut (bin edges are relative to the observed range of abs_corr).
        corr_df['binned'] = pd.cut(
            corr_df['abs_corr'],
            bins=bin_cut,
            labels=bin_labels,
            include_lowest=True
        )
        # Map each category to its corresponding bin.
        category_to_bin = dict(zip(corr_df['category'], corr_df['binned']))
        encoded_column = self.column.map(category_to_bin)

        return encoded_column

# Original Version:
- Kept as a reference for the original DosAnalysis.

In [None]:
class CorrEncoder:
    """
    CorrEncoder: Takes a dataset as input and uses it for the encode function. Encodes the filtered categories then draws correlations.
    If correlation is above the threshold adds it to a new dataframe then returns the one hot encoded values.

    Initialisation:
        - data (pd.DataFrame): The Dataset that contains the target column and target label variables.
    """
    
    def __init__(self, data):
        # Work on a private copy with a fresh 0..n-1 index.
        self.data = data.reset_index(drop=True).copy()

    def encode(self, target_column, sparse_n, threshold, target_label='attack_cat'):
        """
        encode: Takes a target column to encode and a target label to draw correlations from. The target column is iterated through
        for all categories that occur more often than sparse_n. This allows for filtering of sparse categories.
        The function then one hot encodes the given category and draws its correlation with the target label. If the absolute
        correlation is greater than threshold then it is added to the new DataFrame.

        The purpose of this function is to resolve the high cardinality problem in one hot encoding.

        Parameters:
            - target_column (string): The name of the target column. The target column should contain the various categories to encode.
            - sparse_n (integer): A category must occur strictly more than this many times to be considered (deals with sparse categories).
            - threshold (float): The threshold for correlation. The function creates onehot encoded columns of all variables that have
              absolute correlation higher than the threshold to the target label.
            - target_label (string): The name of the label column to correlate against. Defaults to 'attack_cat'.

        Returns:
            - ohe_df (pd.DataFrame): The one hot encoded values from the target column, one column per kept category
              named '<target_column>_<category>'. An empty DataFrame if no category passes the threshold.
        """
        # Compare categories as strings without rewriting the stored data.
        column = self.data[target_column].astype(str)
        value_counts = column.value_counts()
        # Keep the categories whose count is above the limit set by sparse_n.
        categories = value_counts[value_counts > sparse_n].index.tolist()
        target = self.data[target_label]

        ohe_list = []
        # Go through each sufficiently frequent category in the target column.
        for c in categories:
            # Binary indicator for the current category.
            corr_column = (column == c).astype(int)
            correlation = corr_column.corr(target)

            # Check if absolute correlation is greater than threshold (NaN fails the comparison).
            if abs(correlation) > threshold:
                corr_column.name = f'{target_column}_{c}'
                ohe_list.append(corr_column)

        print('Number of Encoded Features for', target_column)
        print(len(ohe_list))
        if ohe_list:
            return pd.concat(ohe_list, axis=1)

        # This avoids errors from concatenating nothing (if really high thresholds are used).
        print("No correlations exceed the threshold.")
        return pd.DataFrame()

In [5]:
# Load the cleaned dataset and give it a fresh 0..n-1 index.
data = pd.read_csv('../Cleaned_full_data.csv')
data = data.reset_index(drop=True)
# Set NA to 0.
# NOTE(review): this fill is applied once here; if a second column was also meant to be
# filled (the statement looks copy-pasted), add it explicitly - TODO confirm.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(0)
# Normalise the labels: strip all whitespace, then merge the 'Backdoors' spelling into 'Backdoor'.
data['attack_cat'] = data['attack_cat'].str.replace(r'\s+', '', regex=True)
data['attack_cat'] = data['attack_cat'].str.replace('Backdoors', 'Backdoor', regex=False)
# Select a threat category.
category = 'DoS'
# Binary target: 1 for rows of the selected category, 0 otherwise.
data['attack_cat'] = (data['attack_cat'] == category).astype(int)



In [7]:
# One-hot encode 'proto', keeping categories whose absolute correlation with the label exceeds the threshold.
encoder = CorrEncoder(data)
threshold = 0.01
ohe4a = encoder.encode(target_column='proto', sparse_n=30, threshold=threshold)

Number of Encoded Features for proto
129


In [None]:
# Report how many distinct categories each high-cardinality column holds.
# dsport has as many unique categories as sport, but it is less sensitive to the threshold.
encoded_columns = ['dsport', 'proto', 'sport', 'srcip', 'dstip']
for column in encoded_columns:
    unique_categories = data[column].nunique()
    print(f"Column '{column}': {unique_categories}")

Column 'dsport': 128310
Column 'proto': 135
Column 'sport': 100341
Column 'srcip': 43
Column 'dstip': 47


In [5]:
# Runtime is around 90 minutes (likely because many sport values sit just above
# sparse_n for this attack type).
encoder = CorrEncoder(data)

# sport: 0.001 fails with a memory allocation error when the columns are concatenated.
# 0.005 could be tried.
threshold = 0.01
ohe1a = encoder.encode('sport', 30, threshold)

# Low-cardinality columns: keep every category with a non-zero correlation.
threshold = 0
ohe2, ohe3, ohe4 = (
    encoder.encode(name, 30, threshold) for name in ('state', 'service', 'proto')
)

# This could be reduced to 0 too, but the aim here is to see whether the current
# settings already give an improvement.
threshold = 0.001
ohe5, ohe6, ohe7 = (
    encoder.encode(name, 30, threshold) for name in ('dsport', 'srcip', 'dstip')
)

Number of Encoded Features for sport
11
Number of Encoded Features for state
9
Number of Encoded Features for service
13
Number of Encoded Features for proto
132
Number of Encoded Features for dsport
76
Number of Encoded Features for srcip
26
Number of Encoded Features for dstip
29


In [8]:
encoded_columns = {
    'proto2': ohe4a,
}

# Save data. This is the most optimal set I have been able to get without memory issues.
# NOTE: Batching may not be possible because we won't be drawing correlation on the full variable.
for column_name, encoded_data in encoded_columns.items():
    # Name the file after the selected threat category so it always matches the label
    # the correlations were computed against.
    output_file = f'{category}_{column_name}_encoded.csv'
    encoded_data.to_csv(output_file, index=False)
    print(output_file)

DoS_proto2_encoded.csv


In [12]:
# Append the encoded features, then drop the raw categorical columns they replace.
data_encoded = pd.concat(
    [data, ohe1a, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7],
    axis=1,
).drop(
    columns=['sport', 'state', 'service', 'proto', 'dsport', 'srcip', 'dstip']
)

In [None]:
# Number of columns after encoding.
print(data_encoded.shape[1])

338
