In [8]:
#
# A Multi-agent Case-based Reasoning Intrusion Detection System
# -------------------------------------------------------------------
# Re-implementation using a CPU-Based Decision Tree with scikit-learn
#
# This version evaluates the final model on a 50,000-case random sample
# from the test set for consistency with other methods.
#
# Setup:
# pip install pandas numpy scikit-learn
#

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import warnings

# Keep the printed report readable: ignore every UserWarning raised from here
# on (scikit-learn is the expected source of them in this script).
warnings.filterwarnings(action="ignore", category=UserWarning)

def _load_features_and_labels(train_path, test_path):
    """Read both CSV splits and return ``(X, y, n_train)`` for the stacked data.

    The two frames are stacked so feature dtypes are inferred once over both
    splits. The first ``n_train`` rows of ``X``/``y`` are the training split,
    the remainder is the test split.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Strip header whitespace *before* stacking, so a column spelled with
    # stray spaces in only one file does not turn into two half-empty columns.
    for frame in (train_df, test_df):
        frame.columns = frame.columns.str.strip()

    full_df = pd.concat([train_df, test_df], ignore_index=True)

    # Drop columns that are not used as features (ignored if absent).
    full_df = full_df.drop(['id', 'label'], axis=1, errors='ignore')

    X = full_df.drop('attack_cat', axis=1)
    y = full_df['attack_cat']
    return X, y, len(train_df)


def _build_pipeline(numeric_features, categorical_features, max_depth, random_state):
    """Return an unfitted preprocessing + DecisionTreeClassifier pipeline."""
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            # Categories seen only at prediction time are encoded as all zeros.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ])
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(
            max_depth=max_depth,
            random_state=random_state,
        )),
    ])


def _print_evaluation(y_true, y_pred, labels):
    """Print the results banner, classification report and confusion matrix."""
    print("\n" + "="*50)
    print("            CPU-Based Decision Tree Results")
    print("="*50)

    print("\n--- Classification Report ---")
    print(classification_report(y_true, y_pred, zero_division=0))

    print("\n--- Confusion Matrix ---")
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(pd.DataFrame(cm, index=labels, columns=labels))


def run_decision_tree_classifier(
    *,
    train_path: str = 'UNSW_NB15_training-set.csv',
    test_path: str = 'UNSW_NB15_testing-set.csv',
    sample_size: int = 50000,
    max_depth: int = 15,
    random_state: int = 42,
) -> None:
    """
    Loads, preprocesses, trains, and evaluates a Decision Tree model on the CPU.

    The model is fitted on every row of the training CSV and evaluated on a
    random sample of the testing CSV. Results are printed to stdout; nothing
    is returned. Called with no arguments, it uses the original file names,
    a 50,000-row evaluation sample, ``max_depth=15`` and seed 42.

    Args:
        train_path: CSV file holding the training split.
        test_path: CSV file holding the testing split.
        sample_size: Number of test rows to evaluate on. If the test split is
            smaller than this, the whole test split is used instead.
        max_depth: Maximum depth passed to ``DecisionTreeClassifier``.
        random_state: Seed used for both the tree and the evaluation sample.

    Raises:
        ValueError: If ``sample_size`` is smaller than 1.

    Note:
        A missing input file, and any other error raised while loading,
        training or evaluating, is reported on stdout rather than propagated.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be at least 1, got {sample_size}")

    try:
        # --- 1. Load Data ---
        print("Loading UNSW-NB15 training and testing data... 📂")
        X, y, n_train = _load_features_and_labels(train_path, test_path)

        # --- 2. Data Preparation ---
        print("Preparing data for the model... 📊")
        numeric_features = X.select_dtypes(include=np.number).columns.tolist()
        # Treat *every* non-numeric column as categorical. Selecting only the
        # 'object' dtype would let string-dtype columns fall through to the
        # ColumnTransformer's default remainder='drop' and vanish silently.
        categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

        # Split the stacked data back into training and full testing sets
        X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
        y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

        # --- 3. Preprocessing Pipeline ---
        print("Building preprocessing pipeline... ⚙️")
        dt_pipeline = _build_pipeline(
            numeric_features, categorical_features, max_depth, random_state
        )

        # --- 4. Model Training ---
        print("Training DecisionTreeClassifier... 🌳")
        dt_pipeline.fit(X_train, y_train)

        # --- 5. Evaluation on a Sample of the Test Set ---
        # DataFrame.sample raises when asked for more rows than exist
        # (sampling is without replacement), so clamp to the rows available.
        n_eval = min(sample_size, len(X_test))
        print(f"Evaluating the model on a random sample of {n_eval} test cases... 📈")

        X_test_sample = X_test.sample(n=n_eval, random_state=random_state)
        y_test_sample = y_test.loc[X_test_sample.index]
        y_pred = dt_pipeline.predict(X_test_sample)

        # Sorted labels keep the confusion-matrix rows in the same order as
        # the rows of the classification report.
        all_labels = sorted(y.unique())
        _print_evaluation(y_test_sample, y_pred, all_labels)

    except FileNotFoundError:
        print(f"\nERROR: Make sure '{train_path}' and '{test_path}' are present.")
    except Exception as e:
        # Top-level boundary of the script: report the problem instead of
        # surfacing a traceback.
        print(f"\nAn unexpected error occurred: {e}")

# =================================
# Main Execution Block
# =================================
# Run the full load/train/evaluate workflow only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    run_decision_tree_classifier()

Loading UNSW-NB15 training and testing data... 📂
Preparing data for the model... 📊
Building preprocessing pipeline... ⚙️
Training DecisionTreeClassifier... 🌳
Evaluating the model on a random sample of 50000 test cases... 📈

            CPU-Based Decision Tree Results

--- Classification Report ---
                precision    recall  f1-score   support

      Analysis       0.02      0.06      0.03       398
      Backdoor       0.04      0.09      0.06       378
           DoS       0.41      0.11      0.18      2452
      Exploits       0.59      0.82      0.69      6801
       Fuzzers       0.30      0.46      0.36      3685
       Generic       0.99      0.98      0.98     11369
        Normal       0.94      0.79      0.86     22552
Reconnaissance       0.93      0.78      0.85      2089
     Shellcode       0.40      0.73      0.51       243
         Worms       0.66      0.64      0.65        33

      accuracy                           0.77     50000
     macro avg       0.53  