In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Load the dataset
df = pd.read_csv('UNSW_NB15_training-set.csv')

In [3]:
# Drop the 'id' column
df = df.drop(columns=['id'])

In [4]:
# Define the features (X) and target (y)
X = df.drop(columns=['label'])
y = df['label']

In [5]:
# Identify categorical columns for one-hot encoding
categorical_cols = ['proto', 'service', 'state', 'attack_cat']

In [6]:
# Preprocessing: One-hot encode categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Pass through the remaining columns without changes
)

In [7]:
# Define the SVM model
svm = SVC(C=100, gamma=0.01, kernel='rbf', random_state=42)

In [8]:
# Create a pipeline that preprocesses the data and then fits the SVM
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm)
])

In [9]:
# Split the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Fit the pipeline to the training set
pipeline.fit(X_train, y_train)

In [11]:
# Predict on the test set
y_pred = pipeline.predict(X_test)

In [12]:
# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"Test set accuracy: {accuracy:.4f}")
print(f"Test set F1 Score (Macro): {f1_macro:.4f}")

Test set accuracy: 0.7738
Test set F1 Score (Macro): 0.7692


In [13]:
# Save the model to disk
import pickle
filename = 'finalized_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))

In [14]:
# Do cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean():.4f}")
print(f"Standard deviation: {scores.std():.4f}")