# Network Intrusion Detection Classifier using Support Vector Machine

Imports and Data Preparation


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from urllib.request import urlretrieve
from tqdm import tqdm
import os

# URLs for NSL-KDD dataset
train_url = "https://github.com/jmnwong/NSL-KDD-Dataset/raw/master/KDDTrain+.txt"
test_url = "https://github.com/jmnwong/NSL-KDD-Dataset/raw/master/KDDTest+.txt"

train_file = "KDDTrain+.txt"
test_file = "KDDTest+.txt"

if not os.path.exists(train_file):
    print("Downloading training dataset...")
    urlretrieve(train_url, train_file)
if not os.path.exists(test_file):
    print("Downloading testing dataset...")
    urlretrieve(test_url, test_file)

columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "class", "difficulty"
]


train_df = pd.read_csv(train_file, names=columns, header=None)
test_df = pd.read_csv(test_file, names=columns, header=None)


#Preprocessing data (Label encoding for categorical columns and convert class labels to binary (0 for normal, 1 for attack)]
df = pd.concat([train_df, test_df], axis=0)

categorical_cols = ["protocol_type", "service", "flag"]
encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])

df["class"] = df["class"].apply(lambda x: 0 if x == "normal" else 1)

X = df.drop("class", axis=1).values
y = df["class"].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train = X[:len(train_df)]
y_train = y[:len(train_df)]
X_test = X[len(train_df):]
y_test = y[len(train_df):]

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")


Training set: 125973 samples
Testing set: 22544 samples


SVM Classifier Implementation

In [19]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, epochs=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.epochs = epochs

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in tqdm(range(self.epochs), desc="Training Progress"):
            for idx, x_i in enumerate(X):
            # Margin condition: if the sample is correctly classified and outside the margin
                cndn = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1

                if cndn:
                # Apply L2 regularization if correctly classified and outside margin
                  self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                  # Else if it is within margin or misclassified, update both weights and bias
                  # (gradient step considers hinge loss and regularization)
                  self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                  self.b -= self.lr * y_[idx]  # Adjusting bias


    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.where(approx >= 0, 1, 0)


Training and Evaluating the SVM Model

In [20]:
# Initialize and train SVM model
svm_classifier = SVM(learning_rate=0.001, lambda_param=0.01, epochs=10)
svm_classifier.fit(X_train, y_train)

# Predict and check accuracy on test data
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = np.mean(y_pred_svm == y_test) * 100
print(f"SVM Model Accuracy: {accuracy_svm:.2f}%")


Training Progress: 100%|██████████| 10/10 [00:03<00:00,  2.79it/s]

SVM Model Accuracy: 85.63%





In [21]:
from sklearn.metrics import classification_report, confusion_matrix

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[ 8927   784]
 [ 2455 10378]]


Scikit learn SVM implementation

In [22]:
from sklearn.svm import SVC

svm_sklearn = SVC(kernel='linear')
svm_sklearn.fit(X_train, y_train)
y_pred_sklearn = svm_sklearn.predict(X_test)

accuracy_sklearn = np.mean(y_pred_sklearn == y_test) * 100
print(f"Scikit-learn SVM Accuracy: {accuracy_sklearn:.2f}%")

Scikit-learn SVM Accuracy: 84.67%
