In [None]:
# Core data manipulation and analysis libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and arrays

# Visualization libraries
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization

# Machine learning libraries
from sklearn import metrics  # For model evaluation metrics
from sklearn.model_selection import train_test_split  # For splitting datasets

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import random

In [None]:
# Load the cleaned dataset and report how many rows it has before filtering.
df = pd.read_csv('Cleaned_full_data.csv')
print(len(df))

# Replace the 'Backdoors' spelling with 'Backdoor' so both end up under one category name.
df['attack_cat'] = df['attack_cat'].str.replace('Backdoors', 'Backdoor')

# Discard every row whose source or destination port is written as a hex string ('0x...').
# The original author's note says these values caused an error during encoding.
hex_sport = df['sport'].astype(str).str.startswith('0x')
hex_dsport = df['dsport'].astype(str).str.startswith('0x')
df = df[~(hex_sport | hex_dsport)]

# Row count after the hex-port rows have been removed.
print(len(df))

In [3]:
# Show the first rows of the frame, then the number of rows in each attack category.
for summary in (df.head(), df['attack_cat'].value_counts()):
    print(summary)

        srcip  sport          dstip dsport proto state       dur  sbytes  \
0  59.166.0.0   1390  149.171.126.6     53   udp   CON  0.001055     132   
1  59.166.0.0  33661  149.171.126.9   1024   udp   CON  0.036133     528   
2  59.166.0.6   1464  149.171.126.7     53   udp   CON  0.001119     146   
3  59.166.0.5   3593  149.171.126.5     53   udp   CON  0.001209     132   
4  59.166.0.3  49664  149.171.126.0     53   udp   CON  0.001169     146   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0     164    31  ...         0.0           3           7          1   
1     304    31  ...         0.0           2           4          2   
2     178    31  ...         0.0          12           8          1   
3     164    31  ...         0.0           6           9          1   
4     178    31  ...         0.0           7           9          1   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm  attack_cat  \
0           3                 1    

In [6]:
def select_features(data, type_of, k, random_state=None):
    """
    select_features: Pick the k most relevant feature columns using one of three scoring methods.

    The columns 'attack_cat' and 'Label' are removed from `data` to form the feature matrix;
    'Label' is used as the target for every method.

    - 'corr' / 'correlation': Ranks features by the absolute correlation of each column with 'Label'.
    - 'ft_importance': Ranks features by the feature importances of a fitted RandomForestClassifier.
    - 'kbest' / 'k-best': Uses scikit-learn's SelectKBest with the f_classif score function.

    Parameters:
        - data (pd.DataFrame): Dataset containing the feature columns plus 'attack_cat' and 'Label'.
        - type_of (string): Which method to use. Accepted values are 'corr' or 'correlation',
          'ft_importance', and 'kbest' or 'k-best'.
        - k (integer): The number of features to return.
        - random_state (integer or None): Seed passed to the Random Forest for the 'ft_importance'
          method. The default (None) keeps the previous unseeded behaviour.

    Returns:
        top_k_features (list): The names of the selected features. For the correlation and
        feature-importance methods the list is ordered from highest to lowest score; for
        k-best the names are in the order they appear in the data columns.

    Raises:
        ValueError: If type_of is not one of the accepted method names.
    """

    X = data.drop(columns=['attack_cat', 'Label'])
    y = data['Label']

    if type_of in ('corr', 'correlation'):
        # Correlate every feature with the label individually, then rank by magnitude.
        corr_values = {feature: X[feature].corr(y) for feature in X.columns}
        # dtype is fixed so an empty feature set still yields a numeric Series.
        corr_data = pd.Series(corr_values, dtype='float64').abs()
        return corr_data.nlargest(k).index.tolist()

    if type_of == 'ft_importance':
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X, y)
        feature_importances = pd.Series(model.feature_importances_, index=X.columns)
        return feature_importances.nlargest(k).index.tolist()

    if type_of in ('kbest', 'k-best'):
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        # get_support() is a boolean mask aligned with X.columns.
        return X.columns[selector.get_support()].tolist()

    # Previously an unknown method name fell through and silently returned None.
    raise ValueError(
        f"Unknown type_of {type_of!r}; expected 'corr'/'correlation', 'ft_importance' or 'kbest'/'k-best'."
    )

In [None]:
class CorrEncoder:
    """
    CorrEncoder: Correlation-filtered one hot encoder for high cardinality columns.

    Takes a dataset as input and keeps a private copy of it. The encode function one hot encodes
    only those categories of a column that are frequent enough and whose indicator column is
    sufficiently correlated with the label column.

    Initialisation:
        - data (pd.DataFrame): The Dataset that contains the columns to encode and the label column.
        - label_column (string or None): Name of the label column to correlate against. When None
          (the default) the encoder uses 'label' if present, otherwise 'Label'.
    """

    # Column names tried, in order, when no explicit label column is given.
    _LABEL_CANDIDATES = ('label', 'Label')

    def __init__(self, data, label_column=None):
        self.data = data.copy()
        # Drop the multi-class 'attack_cat' column; correlations are drawn against the label column only.
        # errors='ignore' lets the encoder be used on frames where it was already removed.
        self.data = self.data.drop(columns=['attack_cat'], errors='ignore')
        self.label_column = label_column

    def _resolve_label_column(self):
        """Return the name of the label column, raising KeyError if none can be found."""
        if self.label_column is not None:
            if self.label_column not in self.data.columns:
                raise KeyError(f"Label column {self.label_column!r} not found in data.")
            return self.label_column
        for candidate in self._LABEL_CANDIDATES:
            if candidate in self.data.columns:
                return candidate
        raise KeyError(f"None of the label columns {self._LABEL_CANDIDATES} were found in data.")

    @staticmethod
    def _indicator_correlations(categories, label):
        """
        Pearson correlation between each category's 0/1 indicator column and the label.

        For an indicator x with n_c ones out of n rows and s_c = sum of the label over those rows,
        the Pearson formula reduces to
            r = (n * s_c - n_c * sum(y)) / sqrt(n_c * (n - n_c) * (n * sum(y^2) - sum(y)^2)),
        so every category is scored from a single grouped pass instead of one full-length
        correlation per category. Rows with a missing label are left out of the statistics.
        Results may differ from Series.corr in the last floating point digits.

        Returns a pd.Series indexed by category; undefined correlations (constant indicator or
        constant label) are NaN.
        """
        observed = label.notna().to_numpy()
        y = pd.Series(label.to_numpy()[observed]).astype(float)
        # Group by position (numpy keys) so a non-unique index cannot misalign rows.
        keys = categories.to_numpy()[observed]
        stats = y.groupby(keys).agg(['count', 'sum'])

        n = len(y)
        total = y.sum()
        label_spread = n * (y ** 2).sum() - total ** 2
        count = stats['count']

        numerator = n * stats['sum'] - count * total
        denominator = (count * (n - count) * label_spread) ** 0.5
        # A zero denominator means the correlation is undefined; mask it to NaN rather than +/-inf.
        denominator = denominator.where(denominator > 0)
        return numerator / denominator

    def encode(self, target_column, sparse_n, threshold, print_data):
        """
        encode: One hot encodes the categories of target_column that correlate with the label.

        The target column is converted to strings (this conversion is stored on the encoder's copy
        of the data). Only categories that occur more than sparse_n times are considered, which
        filters out sparse categories. For each remaining category the correlation between its
        0/1 indicator column and the label column is computed; the indicator is kept when the
        absolute correlation is greater than threshold. Kept columns are named
        '<target_column>_<category>' and appear in descending order of category frequency.

        The purpose of this function is to resolve the high cardinality problem in one hot encoding.

        Parameters:
            - target_column (string): The name of the column whose categories are encoded.
            - sparse_n (integer): A category must occur more than this many times to be considered.
            - threshold (float): The correlation threshold. Only categories whose absolute correlation
              with the label is higher than the threshold are encoded.
            - print_data (bool): When true, prints how many columns were encoded for target_column.

        Returns:
            - ohe_df (pd.DataFrame): The one hot encoded columns that passed the threshold, or an
              empty DataFrame when no category passed.

        Raises:
            KeyError: If the label column cannot be found in the data.
        """

        self.data[target_column] = self.data[target_column].astype(str)
        category_values = self.data[target_column]
        value_counts = category_values.value_counts()
        # Keep only categories whose number of 1s would exceed sparse_n.
        frequent = value_counts[value_counts > sparse_n].index
        label = self.data[self._resolve_label_column()]

        # Score every category at once, then restrict (and order) to the frequent ones.
        correlations = self._indicator_correlations(category_values, label).reindex(frequent)
        # NaN correlations compare False and are therefore skipped.
        selected = correlations[correlations.abs() > threshold].index

        # Indicator columns are only materialised for the categories that passed the threshold.
        ohe_list = [
            (category_values == category).astype(int).rename(f'{target_column}_{category}')
            for category in selected
        ]

        if print_data:
            print('Number of Encoded Features for', target_column)
            print(len(ohe_list))
        if ohe_list:
            return pd.concat(ohe_list, axis=1)
        return pd.DataFrame()

In [9]:
# Load the feature description table and normalise its 'Name' column
# (surrounding whitespace stripped, lower-cased, internal spaces removed).
feature_names_df = pd.read_csv('NUSW-NB15_features.csv', encoding='ISO-8859-1')
feature_names_df['Name'] = feature_names_df['Name'].str.strip().str.lower().str.replace(' ', '')

# NOTE: `df` is intentionally NOT re-read from 'Cleaned_full_data.csv' here. Reloading it
# silently discarded the cleaning applied when the data was first loaded (the
# 'Backdoors' -> 'Backdoor' replacement and the removal of rows with hex '0x...' ports),
# so every later cell ran on the uncleaned frame.


In [10]:
# Split the column names by dtype: 'object' columns are treated as categorical,
# 'number' columns as numerical.
categorical_features, numerical_features = [
    df.select_dtypes(include=[dtype_kind]).columns.tolist()
    for dtype_kind in ('object', 'number')
]

# How many columns fell into each group.
num_categorical_features, num_numerical_features = (
    len(categorical_features),
    len(numerical_features),
)

In [None]:
# 'state' and 'service' have few unique values, so a plain one-hot encoding is workable for them.
categorical_columns = ['state', 'service']
onehot = OneHotEncoder(sparse_output=False, dtype='float32')
encoded_data = onehot.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=onehot.get_feature_names_out(categorical_columns),
    index=df.index,
)
full_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

# Correlation threshold: a category is only encoded when the absolute correlation of its
# indicator column with the label is greater than this value.
# NOTE: Lowering the threshold to 0.01 or below significantly increases the number of columns
# (around 244 on the downsampled data per the original note, usually more on the full data).
threshold = 0.1

# High-cardinality columns: each is encoded through CorrEncoder (min. category count 30)
# and then replaced in the frame by its surviving indicator columns.
cols_to_drop = ['dsport', 'proto', 'sport', 'srcip', 'dstip']
encoder = CorrEncoder(full_encoded)
ohe1, ohe2, ohe3, ohe4, ohe5 = [
    encoder.encode(column, 30, threshold, True) for column in cols_to_drop
]
filtered_data = full_encoded.drop(columns=cols_to_drop)
combined_data = pd.concat([filtered_data, ohe1, ohe2, ohe3, ohe4, ohe5], axis=1)

Number of Encoded Features for dsport
3
Number of Encoded Features for proto
3


KeyboardInterrupt: 

# Example usage of Downsampling.

In [None]:
# Alternative (disabled): full downsample. Keeps every non-Normal row plus an equally sized
# random sample of Normal rows. It reads `rs`, so the seed below has to be set before enabling it.
#threat_rows = df[df['attack_cat'] != 'Normal']
#num_threat_rows = len(threat_rows)
#print(df['attack_cat'].value_counts())
#normal_rows = df[df['attack_cat'] == 'Normal']
#sampled_data = normal_rows.sample(n=num_threat_rows, random_state=rs)
#df = pd.concat([threat_rows, sampled_data]).reset_index(drop=True)

# Seed that makes the random sampling repeatable.
rs = 42

# Fraction of the eligible Normal rows to remove.
downsample = 0.5

# A row is eligible only when neither the next nor the previous row (in the current
# row order) has Label == 1; rows at the edges compare against NaN and stay eligible.
labels = df['Label']
mask = labels.shift(-1).ne(1) & labels.shift(1).ne(1)
normal_rows = df.loc[df['attack_cat'].eq('Normal') & mask]

# Despite its name this is a row count, not a percentage.
percentage_to_remove = int(downsample * len(normal_rows))
rows_to_remove = normal_rows.sample(n=percentage_to_remove, random_state=rs)
df = df.drop(index=rows_to_remove.index)
print(f"Downsampled Rows: {len(rows_to_remove)}")