In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import pickle
from skopt import BayesSearchCV
import warnings

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# Read the UNSW-NB15 training CSV into a DataFrame.
# The path is relative, so the file must sit in the current working directory.
dataset_path = 'UNSW_NB15_training-set.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Drop the 'id' column (presumably a row identifier rather than a feature).
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to `df`, re-running the cell would otherwise raise KeyError once 'id'
# has already been removed.
df = df.drop(columns=['id'], errors='ignore')


In [None]:
# Missing-value handling: discard every row that has at least one NaN
df = df.dropna(axis=0, how='any')

In [None]:
# Define the features (X) and target (y).
#
# The UNSW-NB15 CSVs ship a binary `label` column (0 = normal, 1 = attack)
# next to `attack_cat`. It is derived from the same ground truth as the target,
# so leaving it in X leaks the answer (it separates "Normal" from every attack
# class outright) and inflates the reported scores. It is therefore removed
# whenever it is present.
# NOTE(review): confirm the column name against the CSV header in use.
target_col = 'attack_cat'
leaky_cols = [col for col in ('label',) if col in df.columns]

y = df[target_col]
X = df.drop(columns=[target_col, *leaky_cols])

In [None]:
# Columns that will be one-hot encoded
categorical_cols = [
    'proto',
    'service',
    'state',
]
# Every other column of X is treated as numeric; Index.difference returns
# the remaining labels as a (sorted) pandas Index.
numerical_cols = X.columns.difference(other=categorical_cols)

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numerical ones. Categories not seen during fit are ignored by the encoder
# at transform time instead of raising.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
scaling_step = ('num', StandardScaler(), numerical_cols)

preprocessor = ColumnTransformer(
    [one_hot_step, scaling_step],
    # Columns listed in neither step are forwarded unchanged
    remainder='passthrough',
)

In [None]:
# Encode the target variable (attack_cat) into numeric labels.
# LabelEncoder assigns each distinct value of y an integer code; the fitted
# encoder object is kept in `label_encoder` (it is pickled at the end of the
# notebook) so the integer codes can be related back to the original names.
label_encoder = LabelEncoder()
# Learn the class -> integer mapping and encode y in one call.
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Multi-class support vector classifier: RBF kernel with a one-vs-rest shaped
# decision function; the seed is pinned for reproducibility.
svm = SVC(
    kernel='rbf',
    decision_function_shape='ovr',
    random_state=42,
)

In [None]:
# Chain preprocessing and the classifier so both are fitted together.
# The step names matter: hyperparameters are addressed as '<step>__<param>'.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', svm),
]
pipeline = Pipeline(model_steps)

In [None]:
# Search space handed to BayesSearchCV. Each entry is (low, high, prior);
# the 'classifier__' prefix targets the pipeline step named 'classifier'.
param_space = dict(
    classifier__C=(1e-6, 1e+6, 'log-uniform'),
    classifier__gamma=(1e-6, 1e+1, 'log-uniform'),
)

In [None]:
# Bayesian hyperparameter search over the pipeline:
# 32 sampled configurations, each scored with 5-fold cross-validation,
# using all available cores, with verbose progress output and a fixed seed.
search_settings = dict(
    n_iter=32,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
opt = BayesSearchCV(pipeline, param_space, **search_settings)

In [None]:
# Split the dataset into a training set (80%) and a held-out test set (20%).
# Only the training portion is used for the hyperparameter search; the test
# portion is reserved for the final evaluation.
#
# The split is stratified on the encoded target so every class keeps the same
# proportion in both parts. A purely random split can leave rare classes almost
# absent from the test set, which distorts the macro-averaged F1 reported later.
# Stratification needs at least two samples per class, so fall back to a plain
# random split when that does not hold.
class_counts = pd.Series(y_encoded).value_counts()
stratify_labels = y_encoded if class_counts.min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

In [None]:
# Run the hyperparameter search on the training portion only
opt.fit(X=X_train, y=y_train)

In [None]:
# Get the best estimator: the pipeline configured with the best parameters the
# search found.
# NOTE(review): best_estimator_ only exists when the search refits the winner
# (presumably the BayesSearchCV default, since no refit argument is passed) — confirm.
best_model = opt.best_estimator_

In [None]:
# Produce encoded class predictions for the held-out test rows
y_pred = best_model.predict(X=X_test)

In [None]:
# Score the held-out predictions: overall accuracy plus macro-averaged F1
# (macro = unweighted mean of the per-class F1 scores).
true_and_pred = (y_test, y_pred)
accuracy = accuracy_score(*true_and_pred)
f1_macro = f1_score(*true_and_pred, average='macro')
print(
    f"Test set accuracy: {accuracy:.4f}",
    f"Test set F1 Score (Macro): {f1_macro:.4f}",
    sep="\n",
)

In [None]:
# Save the best model (the full preprocessing + SVM pipeline) to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() left the handle for the garbage collector.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
filename = 'finalized_model_multiclass.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Save the fitted label encoder to disk so the integer predictions of the
# saved model can be mapped back to the original attack_cat names.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() left the handle for the garbage collector.
label_encoder_filename = 'label_encoder.sav'
with open(label_encoder_filename, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)