In [9]:
#
# A Multi-agent Case-based Reasoning Intrusion Detection System
# -------------------------------------------------------------------
# Re-implementation using a CPU-Based Support Vector Machine (SVM)
#
# This version trains on a sample of the training data (for performance)
# and evaluates on a 50,000-case sample of the test data (for consistency).
#
# Setup:
# pip install pandas numpy scikit-learn
#

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import warnings

# Suppress potential warnings from scikit-learn for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

def run_svm_classifier():
    """
    Loads, preprocesses, trains, and evaluates an SVM model on the CPU.
    """
    try:
        # --- 1. Load Data ---
        print("Loading UNSW-NB15 training and testing data... üìÇ")
        train_df = pd.read_csv('UNSW_NB15_training-set.csv')
        test_df = pd.read_csv('UNSW_NB15_testing-set.csv')

        full_df = pd.concat([train_df, test_df], ignore_index=True)
        full_df.columns = full_df.columns.str.strip()
        full_df = full_df.drop(['id', 'label'], axis=1, errors='ignore')

        # --- 2. Data Preparation ---
        print("Preparing data for the model... üìä")
        X = full_df.drop('attack_cat', axis=1)
        y = full_df['attack_cat']

        numeric_features = X.select_dtypes(include=np.number).columns.tolist()
        categorical_features = X.select_dtypes(include=['object']).columns.tolist()

        # Get original train/test indices
        train_indices = range(len(train_df))
        X_train_full = X.iloc[train_indices]
        y_train_full = y.iloc[train_indices]
        X_test_full = X.iloc[len(train_df):]
        y_test_full = y.iloc[len(train_df):]

        # --- !! IMPORTANT: Subsample the Training Data !! ---
        # Training SVM on the full dataset is very slow.
        TRAIN_SAMPLE_SIZE = 20000
        print(f"Using a random sample of {TRAIN_SAMPLE_SIZE} cases for training to reduce computation time.")
        X_train_sample = X_train_full.sample(n=TRAIN_SAMPLE_SIZE, random_state=42)
        y_train_sample = y_train_full.loc[X_train_sample.index]

        # --- 3. Preprocessing Pipeline ---
        print("Building preprocessing pipeline... ‚öôÔ∏è")
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ])

        # --- 4. Model Training ---
        print("Training Support Vector Classifier (SVC)... This may take several minutes. ‚è≥")
        svm_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', SVC(
                kernel='rbf',
                random_state=42
            ))
        ])

        # Train the model on the smaller training sample
        svm_pipeline.fit(X_train_sample, y_train_sample)

        # --- 5. Evaluation on a Sample of the Test Set ---
        EVALUATION_SAMPLE_SIZE = 50000
        print(f"Evaluating the model on a random sample of {EVALUATION_SAMPLE_SIZE} test cases... üìà")

        # Create the random sample from the test set
        X_test_sample = X_test_full.sample(n=EVALUATION_SAMPLE_SIZE, random_state=42)
        y_test_sample = y_test_full.loc[X_test_sample.index]

        # Run predictions on the test sample
        y_pred = svm_pipeline.predict(X_test_sample)

        print("\n" + "="*50)
        print("          Support Vector Machine Results")
        print("="*50)

        print("\n--- Classification Report ---")
        report = classification_report(y_test_sample, y_pred, zero_division=0)
        print(report)

        print("\n--- Confusion Matrix ---")
        all_labels = y.unique()
        cm = confusion_matrix(y_test_sample, y_pred, labels=all_labels)
        cm_df = pd.DataFrame(cm, index=all_labels, columns=all_labels)
        print(cm_df)

    except FileNotFoundError:
        print("\nERROR: Make sure 'UNSW_NB15_training-set.csv' and 'UNSW_NB15_testing-set.csv' are present.")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")

# =================================
# Main Execution Block
# =================================
# Entry point: run the whole load/train/evaluate workflow when this code is
# executed as the main module. The output recorded for this cell shows the
# guard is also satisfied when the cell is run inside the notebook.
if __name__ == "__main__":
    run_svm_classifier()

Loading UNSW-NB15 training and testing data... 📂
Preparing data for the model... 📊
Using a random sample of 20000 cases for training to reduce computation time.
Building preprocessing pipeline... ⚙️
Training Support Vector Classifier (SVC)... This may take several minutes. ⏳
Evaluating the model on a random sample of 50000 test cases... 📈

          Support Vector Machine Results

--- Classification Report ---
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       398
      Backdoor       0.00      0.00      0.00       378
           DoS       0.82      0.01      0.01      2452
      Exploits       0.52      0.88      0.65      6801
       Fuzzers       0.24      0.70      0.36      3685
       Generic       1.00      0.96      0.98     11369
        Normal       0.98      0.62      0.76     22552
Reconnaissance       0.48      0.59      0.53      2089
     Shellcode       0.00      0.00      0.00       243
         W