In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import random

In [None]:
class CorrEncoder:
    """
    CorrEncoder: Correlation-filtered one hot encoder for high cardinality columns.

    Takes a dataset as input and uses it for the encode function. Each sufficiently frequent category of a column is one hot
    encoded and correlated with a binary "is an attack" indicator built from the 'attack_cat' column. Only the categories
    whose correlation passes the threshold are kept.

    Initialisation:
        - data (pd.DataFrame): The dataset that contains the columns to encode and the 'attack_cat' column. The encoder
          works on its own copy; a 'Label' column is removed from that copy when present.
    """

    def __init__(self, data):
        # errors='ignore' lets frames that never had a 'Label' column through instead of raising KeyError.
        self.data = data.copy().drop(columns=['Label'], errors='ignore')

    def encode(self, target_column, sparse_n, threshold):
        """
        encode: One hot encodes the categories of target_column that are both frequent enough and correlated with the
        attack indicator.

        Every category of target_column that occurs more than sparse_n times is turned into a 0/1 column. That column is
        correlated with the indicator (self.data['attack_cat'] != 'Normal') and kept only if the absolute correlation is
        greater than threshold. Kept columns are named '<target_column>_<category>'.

        The purpose of this function is to resolve the high cardinality problem in one hot encoding.

        Parameters:
            - target_column (string): The name of the column whose categories are encoded. Values are compared as strings.
            - sparse_n (integer): A category must occur more than this many times to be considered (deals with sparse categories).
            - threshold (float): A category is kept only if the absolute correlation of its 0/1 column with the attack
              indicator is greater than this value.

        Returns:
            - ohe_df (pd.DataFrame): One 0/1 column per kept category, indexed like the stored data. An empty DataFrame
              is returned when no category passes the threshold.
        """
        # Work on a local string version of the column so that calling encode never alters self.data.
        values = self.data[target_column].astype(str)
        value_counts = values.value_counts()
        # Keep only the categories that occur more often than sparse_n.
        categories = value_counts[value_counts > sparse_n].index.tolist()
        # Binary indicator: 1 for every row that is not 'Normal'.
        attack_cat = (self.data['attack_cat'] != 'Normal').astype(int)

        ohe_list = []
        # Go through each remaining category in the target column.
        for c in categories:
            # Create the binary encoding column for the current category.
            corr_column = (values == c).astype(int)
            correlation = corr_column.corr(attack_cat)

            # A NaN correlation (e.g. one of the two series is constant) compares False and is skipped.
            if abs(correlation) > threshold:
                corr_column.name = f'{target_column}_{c}'
                ohe_list.append(corr_column)

        print('Number of Encoded Features for', target_column)
        print(len(ohe_list))
        if ohe_list:
            return pd.concat(ohe_list, axis=1)

        # Returning an empty frame avoids errors downstream when a very high threshold filters everything out.
        print("No correlations exceed the threshold.")
        return pd.DataFrame()

In [6]:
def _clean_ports(df):
    """Drop rows whose 'sport' or 'dsport' starts with '0x' or '-' and convert both columns to numbers."""
    keep = pd.Series(True, index=df.index)
    for column in ('sport', 'dsport'):
        as_text = df[column].astype(str)
        keep &= ~(as_text.str.startswith('0x') | as_text.str.startswith('-'))
    # .copy() yields an independent frame, so the assignments below cannot raise SettingWithCopyWarning.
    cleaned = df[keep].copy()
    for column in ('sport', 'dsport'):
        # Vectorised conversion; the element-wise .apply(pd.to_numeric) did the same work one value at a time.
        cleaned[column] = pd.to_numeric(cleaned[column])
    return cleaned


def _downsample_normal(df, downsample, rs):
    """Reduce the number of 'Normal' rows of one file according to ``downsample`` ('full', a fraction, or None)."""
    if downsample is None:
        return df

    is_normal = df['attack_cat'] == 'Normal'

    if downsample == 'full':
        threat_rows = df[~is_normal]
        normal_rows = df[is_normal]
        print(df['attack_cat'].value_counts())
        # Never request more Normal rows than exist; sample() would raise otherwise.
        n_keep = min(len(threat_rows), len(normal_rows))
        sampled_data = normal_rows.sample(n=n_keep, random_state=rs)
        return pd.concat([threat_rows, sampled_data]).reset_index(drop=True)

    # Downsample by a given percentage. Only Normal rows whose previous and next row do not have
    # Label == 1 are eligible for removal.
    label = df['Label']
    mask = (label.shift(-1) != 1) & (label.shift(1) != 1)
    normal_rows = df[is_normal & mask]
    rows_to_remove = normal_rows.sample(n=int(len(normal_rows) * downsample), random_state=rs)
    print(f"Downsampled Rows: {len(rows_to_remove)}")
    return df.drop(rows_to_remove.index)


def _split_train_val(final_data, size, rs, split_method):
    """Split ``final_data`` into (train, validation) using a contiguous 'slice' or a random 'shuffle'."""
    if split_method == 'slice':
        slice_size = int(size * len(final_data))
        # NOTE(review): the upper bound keeps a margin of two slice lengths, as in the original code;
        # confirm whether the second slice is reserved for something else (e.g. a test window).
        upper = len(final_data) - 2 * slice_size
        if upper <= 0:
            raise ValueError(
                f"size={size!r} is too large for a 'slice' split of {len(final_data)} rows"
            )
        # A dedicated generator seeded with rs makes the window reproducible without touching
        # the global random state.
        val_start = random.Random(rs).randrange(0, upper)
        val_data = final_data.iloc[val_start:val_start + slice_size]
        train_data = final_data.drop(val_data.index)
        return train_data, val_data

    train_data, val_data = train_test_split(final_data, test_size=size, random_state=rs)
    return train_data, val_data


def get_data(size, rs, threshold, downsample, split_method, category):
    """
    get_data: Loads the four UNSW-NB15 CSV files, builds a binary "category vs Normal" dataset and splits it
    into training and validation sets.

    Steps:
        1. Read the column names from the 'Name' column of 'features2.csv' and load 'UNSW-NB15_1.csv' to 'UNSW-NB15_4.csv'.
        2. Per file: fill missing 'attack_cat' with 'Normal', normalise the attack names, drop three sparse columns,
           remove rows with malformed ports, keep only 'Normal' and ``category`` rows, then optionally downsample 'Normal'.
        3. One hot encode 'state' and 'service'; encode 'dsport', 'proto', 'sport', 'srcip' and 'dstip' with CorrEncoder.
        4. Min-max scale every feature and attach the binary target 'attack_cat' (1 = ``category``, 0 = Normal).
        5. Split into train and validation data.

    Parameters:
        - size (float): Fraction of the rows used for validation.
        - rs (integer): Random state used for sampling, for the shuffle split and to seed the slice split.
        - threshold (float): Correlation threshold passed to CorrEncoder.encode.
        - downsample ('full', float or None): 'full' keeps as many Normal rows as there are attack rows in each file
          (at most all of them); a float removes that fraction of the eligible Normal rows; None keeps everything.
        - split_method (string): 'slice' takes one contiguous block of rows as validation data, 'shuffle' uses
          train_test_split.
        - category (string): The attack category to keep alongside 'Normal'.

    Returns:
        - train_data (pd.DataFrame), val_data (pd.DataFrame): Scaled features plus the 'attack_cat' target column.

    Raises:
        - ValueError: If split_method is not 'slice' or 'shuffle', or if size is too large for a slice split.
    """
    # Fail fast: without this check an unknown method only surfaced after all the loading, as an UnboundLocalError.
    if split_method not in ('slice', 'shuffle'):
        raise ValueError(f"split_method must be 'slice' or 'shuffle', got {split_method!r}")

    feature_names_list = pd.read_csv('features2.csv')['Name'].tolist()

    filtered_datasets = []
    for i in range(1, 5):
        # low_memory=False infers each column's dtype from the whole file, which avoids the mixed-dtype warning.
        df = pd.read_csv(f'UNSW-NB15_{i}.csv', header=None, low_memory=False)
        df.columns = feature_names_list
        df.loc[df['attack_cat'].isnull(), 'attack_cat'] = 'Normal'

        rows_before = len(df)
        df['attack_cat'] = df['attack_cat'].str.replace(r'\s+', '', regex=True)
        df['attack_cat'] = df['attack_cat'].str.replace('Backdoors', 'Backdoor', regex=False)
        # Very sparse data - optional.
        df = df.drop(columns=['ct_ftp_cmd', 'ct_flw_http_mthd', 'is_ftp_login'])
        df = _clean_ports(df)
        print(f"Filtered Rows (Cleaning): {rows_before - len(df)}")

        df = df[df['attack_cat'].isin(['Normal', category])]
        filtered_datasets.append(_downsample_normal(df, downsample, rs))

    full_data = pd.concat(filtered_datasets).reset_index(drop=True)

    # NOTE(review): the correlation encoding and the scaler below are fitted on the full data before it is split,
    # so information from the validation rows reaches the preprocessing. Consider fitting on the training part only.
    categorical_columns = ['state', 'service']
    onehot = OneHotEncoder(sparse_output=False, dtype='float32')
    encoded_df = pd.DataFrame(
        onehot.fit_transform(full_data[categorical_columns]),
        columns=onehot.get_feature_names_out(categorical_columns),
        index=full_data.index,
    )
    full_encoded = pd.concat([full_data.drop(columns=categorical_columns), encoded_df], axis=1)

    # High cardinality columns: keep only frequent categories (more than 30 rows) that correlate with attacks.
    high_cardinality_columns = ['dsport', 'proto', 'sport', 'srcip', 'dstip']
    corr_encoder = CorrEncoder(full_encoded)
    ohe_frames = [corr_encoder.encode(column, 30, threshold) for column in high_cardinality_columns]

    filtered_data = full_encoded.drop(columns=high_cardinality_columns)
    combined_data = pd.concat([filtered_data, *ohe_frames], axis=1)
    combined_data['attack_cat'] = (combined_data['attack_cat'] == category).astype(int)

    df_features = combined_data.drop(columns=['attack_cat', 'Label'])
    scaled_data = MinMaxScaler().fit_transform(df_features)
    final_data = pd.DataFrame(scaled_data, columns=df_features.columns, index=combined_data.index)
    final_data['attack_cat'] = combined_data['attack_cat']

    return _split_train_val(final_data, size, rs, split_method)


In [8]:
def select_features(data, type_of, k, random_state=None):
    """
    select_features: Rank the features of data against the 'attack_cat' target and return the names of the top k.

    - 'correlation': Ranks by the absolute correlation of each feature with the target.
    - 'ft_importance': Ranks by the feature importances of a fitted Random Forest.
    - 'kbest': Uses Select K-Best (f_classif) from scikit.

    Parameters:
        - data (pd.DataFrame): The features together with the 'attack_cat' target column.
        - type_of (string): The selection method: 'correlation', 'ft_importance' or 'kbest'.
        - k (integer): The number of features to return.
        - random_state (integer, optional): Seed for the Random Forest used by 'ft_importance'. The default (None)
          leaves the forest unseeded, so that ranking can change between runs.

    Returns:
        top_k_features (list): The names of the selected features. For 'correlation' and 'ft_importance' the list is
        ordered from highest to lowest score; for 'kbest' it follows the column order of data.

    Raises:
        ValueError: If type_of is not one of the supported methods.
    """
    X = data.drop(columns=['attack_cat'])
    y = data['attack_cat']

    if type_of == 'correlation':
        corr_data = pd.Series({feature: X[feature].corr(y) for feature in X.columns}, dtype=float).abs()
        return corr_data.nlargest(k).index.tolist()

    if type_of == 'ft_importance':
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X, y)
        feature_importances = pd.Series(model.feature_importances_, index=X.columns)
        return feature_importances.nlargest(k).index.tolist()

    if type_of == 'kbest':
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        # get_support() is a boolean mask over the columns of X.
        return X.columns[selector.get_support()].tolist()

    # Previously an unknown method fell through and silently returned None.
    raise ValueError(
        f"Unknown type_of {type_of!r}; expected 'correlation', 'ft_importance' or 'kbest'"
    )

In [12]:
# Fuzzers-vs-Normal dataset: 0.9 downsampling of Normal rows, correlation threshold 0.1,
# and a shuffled split with 20% of the rows held out for validation.
train_data, val_data = get_data(
    category='Fuzzers',
    downsample=0.9,
    threshold=0.1,
    split_method='shuffle',
    size=0.2,
    rs=42,
)

  df = pd.read_csv(f'UNSW-NB15_{i}.csv', header=None)
  df = pd.read_csv(f'UNSW-NB15_{i}.csv', header=None)


Filtered Rows (Cleaning): 67
Downsampled Rows: 606946
Filtered Rows (Cleaning): 61
Downsampled Rows: 578493
Filtered Rows (Cleaning): 105
Downsampled Rows: 479475
Filtered Rows (Cleaning): 75
Downsampled Rows: 310883
Number of Encoded Features for dsport
9
Number of Encoded Features for proto
1
Number of Encoded Features for sport
2
Number of Encoded Features for srcip
4
Number of Encoded Features for dstip
10


In [13]:
# Class balance of the target in the training split, then in the validation split.
print(
    train_data['attack_cat'].value_counts(),
    val_data['attack_cat'].value_counts(),
    sep='\n',
)

attack_cat
0    194112
1     19412
Name: count, dtype: int64
attack_cat
0    48547
1     4834
Name: count, dtype: int64


In [14]:
# Separate the features from the binary 'attack_cat' target for both splits.
X_train, y_train = train_data.drop(columns=['attack_cat']), train_data['attack_cat']
X_val, y_val = val_data.drop(columns=['attack_cat']), val_data['attack_cat']

# Baseline: a seeded random forest trained on every feature.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_classifier.predict(X_val)

# Report the headline metrics on the validation split, then the full per-class report.
for _label, _scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(_label, _scorer(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

Accuracy: 0.9894531762237501
Precision: 0.9014852415867645
Recall: 0.991932147290029
F1 Score: 0.9445484093371417

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     48547
           1       0.90      0.99      0.94      4834

    accuracy                           0.99     53381
   macro avg       0.95      0.99      0.97     53381
weighted avg       0.99      0.99      0.99     53381



In [15]:
# Top 30 features of the training data under each selection method.
features1, features2, features3 = (
    select_features(train_data, method, 30)
    for method in ('correlation', 'ft_importance', 'kbest')
)

print(features1, features2, features3, sep='\n')

['sttl', 'ct_state_ttl', 'dttl', 'srcip_175.45.176.1', 'srcip_175.45.176.0', 'tcprtt', 'dsport_445', 'synack', 'dsport_179', 'ackdat', 'srcip_175.45.176.3', 'srcip_175.45.176.2', 'dsport_520', 'dstip_149.171.126.10', 'dstip_149.171.126.11', 'dstip_149.171.126.12', 'dstip_149.171.126.14', 'dstip_149.171.126.17', 'dstip_149.171.126.15', 'dstip_149.171.126.13', 'dsport_514', 'dstip_149.171.126.16', 'sport_0', 'dstip_149.171.126.19', 'dsport_0', 'dmeansz', 'dsport_1723', 'service_-', 'state_INT', 'proto_unas']
['ct_state_ttl', 'sttl', 'dttl', 'tcprtt', 'Dload', 'synack', 'ackdat', 'srcip_175.45.176.1', 'dmeansz', 'service_-', 'srcip_175.45.176.2', 'smeansz', 'service_dns', 'Dintpkt', 'dbytes', 'Stime', 'Ltime', 'dsport_520', 'ct_dst_sport_ltm', 'sbytes', 'Sintpkt', 'ct_srv_dst', 'dsport_445', 'srcip_175.45.176.0', 'Sload', 'dsport_53', 'srcip_175.45.176.3', 'ct_dst_src_ltm', 'dsport_179', 'dtcpb']
['sttl', 'dttl', 'dmeansz', 'tcprtt', 'synack', 'ackdat', 'ct_state_ttl', 'state_INT', 'servi