In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from numba import cuda

There are two types of features: numeric and categorical. Each type uses a different formula for calculating the similarity.

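Numeric features use (the implementation below squares the normalized difference, and treats a feature whose max equals its min as fully similar):
$$
\mathrm{sim}_{\mathrm{feature}} = 1 - \mathrm{diff}_{\mathrm{feature}}^{2}
$$

$$
\mathrm{diff}_{\mathrm{feature}} = \frac{|q - c|}{\max_{\mathrm{feature}} - \min_{\mathrm{feature}}}
$$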

Categorical features use an exact-match rule:
$$
\mathrm{sim}_{\mathrm{feature}} =
\begin{cases}
1, & \text{if } q = c \\
0, & \text{otherwise}
\end{cases}
$$

The per-feature similarities are then combined into a weighted sum:
$$
\mathrm{Total\_Sim} = \sum_{\mathrm{feature}} \mathrm{weight}_{\mathrm{feature}} \cdot \mathrm{sim}_{\mathrm{feature}}
$$

In [None]:
@cuda.jit
def similarity_kernel(d_casebase_numeric, d_casebase_categorical,
                      d_query_numeric, d_query_categorical,
                      d_weights_numeric, d_weights_categorical,
                      d_min_vals, d_max_vals,
                      d_similarities_out):
    """
    CUDA kernel that scores one stored case against the query.

    Each thread handles the case row selected by its 1-D grid index and
    writes the weighted similarity of that row into
    ``d_similarities_out`` at the same index. Threads whose index falls
    past the last row do nothing.

    Numeric features contribute ``weight * (1 - d**2)`` where ``d`` is the
    absolute query/case difference divided by ``max - min`` of the feature;
    a feature whose max equals its min contributes its full weight.
    Categorical features contribute their weight on an exact match and
    nothing otherwise.
    """
    row = cuda.grid(1)
    if row < d_casebase_numeric.shape[0]:
        score = 0.0

        # Numeric part: squared, range-normalised distance per feature.
        for col in range(d_casebase_numeric.shape[1]):
            lo = d_min_vals[col]
            hi = d_max_vals[col]
            feature_sim = 1.0
            if hi != lo:  # guard against division by zero on constant features
                scaled = abs(d_query_numeric[col] - d_casebase_numeric[row, col]) / (hi - lo)
                feature_sim = 1.0 - (scaled * scaled)
            score += d_weights_numeric[col] * feature_sim

        # Categorical part: all-or-nothing match on the encoded value.
        for col in range(d_casebase_categorical.shape[1]):
            if d_query_categorical[col] == d_casebase_categorical[row, col]:
                match_sim = 1.0
            else:
                match_sim = 0.0
            score += d_weights_categorical[col] * match_sim

        d_similarities_out[row] = score

The weighting formula is the part that is probably not implemented accurately, because the paper gives only minimal information about it.

For the numeric weights, I use this formula:
$$
\mathbf{w}_{\mathrm{numerical}} = |\mathrm{mean_{agent}} - \mathrm{mean_{full}}|
$$

The categorical weights use this:
$$
\mathbf{w}_{\mathrm{categorical}} = \begin{bmatrix} 0.1 & 0.1 & \dots & 0.1 \end{bmatrix}_{1 \times n}
$$

When a new case is received, each agent retrieves the 10 most similar cases from its own casebase. With 10 agents, this means 100 cases are retrieved in total.

From those 100 cases, the 10 most similar overall are selected. For example, suppose the 10 most similar of the 100 cases come from:\
Shellcode = 0.999 \
Shellcode = 0.997 \
Generic = 0.988 \
Generic = 0.987 \
Generic = 0.977 \
Generic = 0.945 \
Generic = 0.943 \
Generic = 0.931 \
Generic = 0.919 \
Fuzzer = 0.681 \
Although Shellcode has the highest similarity, the majority of the top 10 cases are from the Generic category. Therefore, the new case is classified as Generic.

In [None]:
class CBRAgent:
    """Case-based-reasoning agent holding the casebase of one attack category.

    The agent keeps every training row whose ``attack_cat`` equals
    ``attack_category``, uploads the numeric/categorical feature matrices,
    the feature weights and the per-feature min/max to the GPU once, and
    then answers similarity queries with ``similarity_kernel``.

    Attributes set for every agent:
        attack_category, numeric_features, categorical_features,
        cat_mappings: stored as given.
        casebase_df: copy of the training rows of this category.
        is_active: False when the category has no training rows; such an
            agent allocates nothing on the GPU and always returns ``[]``.
    """

    def __init__(self, attack_category, training_data, numeric_features, categorical_features, cat_mappings):
        """Build the casebase and upload it to the GPU.

        Args:
            attack_category: value of ``attack_cat`` this agent is responsible for.
            training_data: DataFrame with an ``attack_cat`` column plus all
                listed feature columns.
            numeric_features: names of the numeric feature columns.
            categorical_features: names of the (integer-encoded) categorical
                feature columns.
            cat_mappings: category encoding mappings; stored, not used here.
        """
        self.attack_category = attack_category
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.cat_mappings = cat_mappings
        self.casebase_df = training_data[training_data['attack_cat'] == attack_category].copy()

        self.is_active = not self.casebase_df.empty
        if not self.is_active:
            return

        # --- Data Preparation for GPU ---
        casebase_numeric_np = self.casebase_df[numeric_features].to_numpy(dtype=np.float32)
        casebase_categorical_np = self.casebase_df[categorical_features].to_numpy(dtype=np.int32)

        self.d_casebase_numeric = cuda.to_device(casebase_numeric_np)
        self.d_casebase_categorical = cuda.to_device(casebase_categorical_np)

        # Numeric weight = how far this agent's feature mean is from the
        # mean over the whole training set; categorical weights are a flat 0.1.
        full_data_mean = training_data[numeric_features].mean()
        agent_data_mean = self.casebase_df[numeric_features].mean()

        numeric_weights = (agent_data_mean - full_data_mean).abs().values
        categorical_weights = np.full(len(categorical_features), 0.1)

        # Normalise so that all weights together sum to 1 (skipped when the
        # total is not positive, e.g. no features at all).
        total_weight = numeric_weights.sum() + categorical_weights.sum()
        if total_weight > 0:
            numeric_weights /= total_weight
            categorical_weights /= total_weight

        self.d_weights_numeric = cuda.to_device(numeric_weights.astype(np.float32))
        self.d_weights_categorical = cuda.to_device(categorical_weights.astype(np.float32))

        # Normalisation range is taken from this agent's own casebase.
        min_vals = self.casebase_df[numeric_features].min().values.astype(np.float32)
        max_vals = self.casebase_df[numeric_features].max().values.astype(np.float32)
        self.d_min_vals = cuda.to_device(min_vals)
        self.d_max_vals = cuda.to_device(max_vals)

    @staticmethod
    def _top_n_indices(similarities, n):
        """Return the positions of the ``n`` largest scores, best first.

        ``n`` is capped at ``len(similarities)``; a non-positive ``n`` yields
        an empty index array (``argpartition(..., -0)[-0:]`` would otherwise
        select every element).
        """
        n = min(n, len(similarities))
        if n <= 0:
            return np.empty(0, dtype=np.intp)
        candidates = np.argpartition(similarities, -n)[-n:]
        return candidates[np.argsort(similarities[candidates])][::-1]

    def find_most_similar_numba(self, query_case_numeric, query_case_categorical, n=10):
        """Return the ``n`` cases of this agent most similar to the query.

        Args:
            query_case_numeric: 1-D array of the query's numeric features,
                in the order of ``numeric_features``.
            query_case_categorical: 1-D array of the query's encoded
                categorical features, in the order of ``categorical_features``.
            n: maximum number of cases to return.

        Returns:
            A list of ``(similarity, case_row)`` tuples sorted by descending
            similarity; empty when the agent is inactive or ``n <= 0``.
        """
        if not self.is_active or n <= 0:
            return []

        # --- Kernel Launch Setup ---
        d_query_numeric = cuda.to_device(query_case_numeric)
        d_query_categorical = cuda.to_device(query_case_categorical)

        num_cases = self.d_casebase_numeric.shape[0]
        d_similarities = cuda.device_array(num_cases, dtype=np.float32)

        threads_per_block = 128
        # Ceiling division so the last partial block is still launched.
        blocks_per_grid = (num_cases + (threads_per_block - 1)) // threads_per_block

        # --- Launch the Kernel ---
        similarity_kernel[blocks_per_grid, threads_per_block](
            self.d_casebase_numeric, self.d_casebase_categorical,
            d_query_numeric, d_query_categorical,
            self.d_weights_numeric, self.d_weights_categorical,
            self.d_min_vals, self.d_max_vals,
            d_similarities
        )

        # --- Retrieve and Process Results ---
        similarities_host = d_similarities.copy_to_host()
        top_n_indices = self._top_n_indices(similarities_host, n)

        top_scores = similarities_host[top_n_indices]
        top_cases = self.casebase_df.iloc[top_n_indices]

        return [(score, top_cases.iloc[pos]) for pos, score in enumerate(top_scores)]

In [None]:
class IDS:
    """Intrusion detection system built from one ``CBRAgent`` per attack category.

    A query is sent to every agent; the overall most similar cases are
    pooled and the query is labelled by majority vote over their categories.
    """

    def __init__(self, training_data, numeric_features, categorical_features, cat_mappings):
        """Create one agent for every distinct ``attack_cat`` in ``training_data``.

        Args:
            training_data: DataFrame with an ``attack_cat`` column and all
                listed feature columns.
            numeric_features: names of the numeric feature columns.
            categorical_features: names of the encoded categorical columns.
            cat_mappings: category encoding mappings, forwarded to the agents.
        """
        self.agents = {}
        self.categories = training_data['attack_cat'].unique()
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features

        print("Initializing agents and transferring data to GPU...")
        for category in self.categories:
            self.agents[category] = CBRAgent(category, training_data, numeric_features, categorical_features, cat_mappings)
        print("All agents initialized.")

    def _get_top_n_votes(self, test_case, n=10):
        """Return the categories of the ``n`` most similar cases over all agents.

        Every agent contributes up to ``n`` ``(similarity, case)`` pairs; the
        pooled list is sorted by descending similarity and cut to ``n``.
        The returned list is therefore ordered from most to least similar.
        """
        all_votes = []
        query_numeric = test_case[self.numeric_features].to_numpy(dtype=np.float32)
        query_categorical = test_case[self.categorical_features].to_numpy(dtype=np.int32)

        for agent in self.agents.values():
            all_votes.extend(agent.find_most_similar_numba(query_numeric, query_categorical, n))

        # Sort on the score only; the case payload is never compared.
        all_votes.sort(reverse=True, key=lambda x: x[0])
        top_n_overall = all_votes[:n]

        return [case['attack_cat'] for _, case in top_n_overall]

    def classify(self, test_case, n=10):
        """Classify ``test_case`` by majority vote over its top ``n`` cases.

        Returns:
            The most frequent category among the votes, or ``"Unknown"`` when
            there are no votes. Ties are resolved deterministically in favour
            of the category that appears first in the vote list, i.e. the one
            owning the most similar case.
        """
        from collections import Counter

        vote_list = self._get_top_n_votes(test_case, n)
        if not vote_list:
            return "Unknown"
        # Counter keeps first-seen order, so equal counts fall back to the
        # similarity ranking instead of to set iteration order.
        return Counter(vote_list).most_common(1)[0][0]

In [None]:
def preprocess_data(df, numeric_features, categorical_features, cat_mappings=None):
    """Integer-encode the categorical columns of ``df`` in place.

    Without ``cat_mappings`` the function runs in training mode: each
    categorical column is factorized and the label -> code mapping is
    recorded. With ``cat_mappings`` it runs in test mode: the recorded
    mapping is applied, labels missing from the mapping become -1 and the
    column is cast to int32.

    Args:
        df: DataFrame to encode; it is modified and also returned.
        numeric_features: accepted for symmetry with the callers; not used.
        categorical_features: names of the columns to encode.
        cat_mappings: mappings from a previous training-mode call, or None.

    Returns:
        ``(df, cat_mappings)``.
    """
    fit_mode = cat_mappings is None
    if fit_mode:
        cat_mappings = {}

    for column in categorical_features:
        if not fit_mode:
            lookup = cat_mappings[column]
            encoded = df[column].map(lookup).fillna(-1)
            df[column] = encoded.astype(np.int32)
            continue

        codes, uniques = pd.factorize(df[column])
        df[column] = codes
        cat_mappings[column] = dict(zip(uniques, range(len(uniques))))

    return df, cat_mappings

def run_table_2_evaluation(ids, test_df, samples_per_category=1000, n=10, labels_df=None):
    """
    Runs the vote distribution analysis to generate results similar to Table 2.

    For every category known to ``ids`` it draws up to
    ``samples_per_category`` test cases of that category (without
    replacement) and counts, per case, how many of the top ``n`` votes
    belong to the correct category.

    Args:
        ids: object exposing ``categories`` and ``_get_top_n_votes(case, n=...)``.
        test_df: preprocessed test DataFrame; rows are looked up by index
            label, so its index must line up with ``labels_df``.
        samples_per_category: maximum number of test cases per category.
        n: number of top votes requested for each test case.
        labels_df: DataFrame whose ``attack_cat`` column holds the original
            labels. When None, 'UNSW_NB15_testing-set.csv' is read from the
            working directory (the original behaviour).

    Returns:
        An integer DataFrame indexed by "Vote Count" (0..n) with one column
        per category; each cell is the number of sampled cases that received
        exactly that many correct votes. Categories without test cases get
        an all-zero column.
    """
    print("\n--- Running Table 2 Evaluation (Agent Vote Distribution) ---")

    # Use the original, non-encoded labels for filtering
    if labels_df is None:
        labels_df = pd.read_csv('UNSW_NB15_testing-set.csv')
        labels_df.columns = labels_df.columns.str.strip()

    histograms = {}
    for category in ids.categories:
        print(f"  Analyzing category: {category}")

        # histogram[k] = number of sampled cases with exactly k correct votes
        histogram = [0] * (n + 1)

        category_cases_indices = labels_df[labels_df['attack_cat'] == category].index
        sample_size = min(samples_per_category, len(category_cases_indices))
        if sample_size > 0:
            sample_indices = np.random.choice(category_cases_indices, sample_size, replace=False)
            for _, test_case in test_df.loc[sample_indices].iterrows():
                top_votes = ids._get_top_n_votes(test_case, n=n)
                histogram[top_votes.count(category)] += 1

        histograms[category] = histogram

    results = pd.DataFrame(histograms, index=pd.RangeIndex(n + 1, name="Vote Count"))
    return results.astype(int)

Table 2 below is produced by sampling up to 1000 test cases for each agent's attack category. Some agents do not reach the full 1000 cases, probably because the test data does not contain that many cases of that agent's category.

In [None]:
# Script entry point: load the UNSW-NB15 CSVs, encode the categorical
# columns, build the IDS on the GPU, then print the Table 2 vote
# distribution and a classification report on a random test sample.
if __name__ == "__main__":
    try:
        print("Loading data with pandas...")
        train_df = pd.read_csv('UNSW_NB15_training-set.csv')
        test_df = pd.read_csv('UNSW_NB15_testing-set.csv')
        train_df.columns = train_df.columns.str.strip()
        test_df.columns = test_df.columns.str.strip()

        print("Preprocessing data for GPU...")
        # 'id'/'label' are not features; 'attack_cat' is the target and
        # stays un-encoded so it can be compared with the predicted labels.
        numeric_features = train_df.select_dtypes(include=np.number).columns.drop(['id', 'label'], errors='ignore').tolist()
        categorical_features = train_df.select_dtypes(include=['object']).columns.drop(['attack_cat'], errors='ignore').tolist()

        train_df, cat_mappings = preprocess_data(train_df, numeric_features, categorical_features)
        test_df, _ = preprocess_data(test_df, numeric_features, categorical_features, cat_mappings)

        ids = IDS(train_df, numeric_features, categorical_features, cat_mappings)

        # --- Run Table 2 Analysis ---
        table_2_results = run_table_2_evaluation(ids, test_df)
        print("\n--- Table 2: Agent Voting Performance (1000 cases per agent) ---")
        print(table_2_results.to_string())

        # --- Run Table 3 Analysis (Classification Report) ---
        EVALUATION_SAMPLE_SIZE = 50000
        # DataFrame.sample without replacement raises ValueError when asked
        # for more rows than exist, so never request more than the test set has.
        evaluation_size = min(EVALUATION_SAMPLE_SIZE, len(test_df))
        print(f"\n--- Running Table 3 Evaluation (Classification Report on {evaluation_size} cases) ---")
        test_sample = test_df.sample(evaluation_size)

        true_labels = []
        predicted_labels = []

        for processed, (_, test_case) in enumerate(test_sample.iterrows(), start=1):
            if processed % 500 == 0:
                print(f"  Processed {processed}/{len(test_sample)} cases...")

            true_labels.append(test_case['attack_cat'])
            predicted_labels.append(ids.classify(test_case))

        print("Evaluation Complete.")

        from sklearn.metrics import classification_report
        print("\n--- Classification Report ---")
        print(classification_report(true_labels, predicted_labels, zero_division=0))

    except FileNotFoundError:
        print("\nERROR: Make sure 'UNSW_NB15_training-set.csv' and 'UNSW_NB15_testing-set.csv' are present.")
    except Exception as e:
        # Top-level boundary of the script: report and exit instead of a traceback.
        print(f"\nAn unexpected error occurred: {e}")
        print("Please ensure your Numba and CUDA environments are set up correctly.")

Loading data with pandas...
Preprocessing data for GPU...
Initializing agents and transferring data to GPU...
All agents initialized.

--- Running Table 2 Evaluation (Agent Vote Distribution) ---
  Analyzing category: Normal




  Analyzing category: Backdoor
  Analyzing category: Analysis
  Analyzing category: Fuzzers
  Analyzing category: Shellcode
  Analyzing category: Reconnaissance
  Analyzing category: Exploits
  Analyzing category: DoS
  Analyzing category: Worms
  Analyzing category: Generic

--- Table 2: Agent Voting Performance (1000 cases per agent) ---
            Normal  Backdoor  Analysis  Fuzzers  Shellcode  Reconnaissance  Exploits  DoS  Worms  Generic
Vote Count                                                                                               
0               29       210       650      311        301             525       318  903     42      394
1               68        10        22      126         51             212        57   75      2        4
2               87         6         5      155         16              93       139   18      0       14
3              104         1         0      142          4              30       159    2      0       11
4               59    