In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import pickle
from skopt import BayesSearchCV
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset
df = pd.read_csv('UNSW_NB15_training-set.csv')

In [4]:
# Drop the 'id' column
df = df.drop(columns=['id'])


In [5]:
# Handle missing values
df = df.dropna()

In [6]:
# Sample 10% of the dataset
df_sampled = df.sample(frac=0.1, random_state=42)

In [7]:
# Define the features (X) and target (y)
X_sampled = df_sampled.drop(columns=['attack_cat'])
y_sampled = df_sampled['attack_cat']

In [8]:
# Identify categorical columns for one-hot encoding
categorical_cols = ['proto', 'service', 'state']
numerical_cols = X_sampled.columns.difference(categorical_cols)

In [9]:
# Preprocessing: One-hot encode categorical columns and scale numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'  # Pass through the remaining columns without changes
)

In [10]:
# Encode the target variable (attack_cat) into numeric labels
label_encoder = LabelEncoder()
y_sampled_encoded = label_encoder.fit_transform(y_sampled)

In [11]:
# Define the SVM model for multi-class classification
svm = SVC(kernel='rbf', decision_function_shape='ovr', random_state=42)

In [12]:
# Create a pipeline that preprocesses the data and then fits the SVM
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm)
])

In [13]:
# Define the parameter space for BayesSearchCV
param_space = {
    'classifier__C': (1e-6, 1e+6, 'log-uniform'),
    'classifier__gamma': (1e-6, 1e+1, 'log-uniform')
}

In [14]:
# Create the BayesSearchCV object
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [15]:
# Split the sampled dataset into the Training set and Test set
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(X_sampled, y_sampled_encoded, test_size=0.2, random_state=42)

In [16]:
# Fit the BayesSearchCV to the sampled training set
opt.fit(X_train_sampled, y_train_sampled)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [17]:
# Get the best estimator
best_model = opt.best_estimator_

In [18]:
# Predict on the sampled test set
y_pred_sampled = best_model.predict(X_test_sampled)

In [19]:
# Calculate accuracy and F1 score
accuracy_sampled = accuracy_score(y_test_sampled, y_pred_sampled)
f1_macro_sampled = f1_score(y_test_sampled, y_pred_sampled, average='macro')
print(f"Sampled test set accuracy: {accuracy_sampled:.4f}")
print(f"Sampled test set F1 Score (Macro): {f1_macro_sampled:.4f}")

Sampled test set accuracy: 0.8611
Sampled test set F1 Score (Macro): 0.5282


In [22]:
# Save the best model from the sampled dataset to disk
filename_sampled = 'finalized_model_multiclass_sampled.sav'
pickle.dump(best_model, open(filename_sampled, 'wb'))

In [23]:
# Save the label encoder to disk
label_encoder_filename_sampled = 'label_encoder_sampled.sav'
pickle.dump(label_encoder, open(label_encoder_filename_sampled, 'wb'))

In [24]:
# Print best parameters found from the sampled dataset
print(f"Best parameters from sampled dataset: {opt.best_params_}")

Best parameters from sampled dataset: OrderedDict([('classifier__C', 125242.13240720892), ('classifier__gamma', 0.0004718206977608264)])


In [None]:
# Do the same on 60% of the dataset
df_60 = df.sample(frac=0.6, random_state=42)

# Define the features (X) and target (y)
X_60 = df_60.drop(columns=['attack_cat'])
y_60 = df_60['attack_cat']

# Encode the target variable (attack_cat) into numeric labels
y_60_encoded = label_encoder.transform(y_60)

# Split the 60% dataset into the Training set and Test set
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X_60, y_60_encoded, test_size=0.2, random_state=42)

# Fit the BayesSearchCV to the 60% training set
opt.fit(X_train_60, y_train_60)

# Get the best estimator
best_model_60 = opt.best_estimator_

# Predict on the 60% test set
y_pred_60 = best_model_60.predict(X_test_60)

# Calculate accuracy and F1 score
accuracy_60 = accuracy_score(y_test_60, y_pred_60)
f1_macro_60 = f1_score(y_test_60, y_pred_60, average='macro')
print(f"60% test set accuracy: {accuracy_60:.4f}")
print(f"60% test set F1 Score (Macro): {f1_macro_60:.4f}")

# Save the best model from the 60% dataset to disk
filename_60 = 'finalized_model_multiclass_60.sav'
pickle.dump(best_model_60, open(filename_60, 'wb'))

# Print best parameters found from the 60% dataset
print(f"Best parameters from 60% dataset: {opt.best_params_}")

In [25]:
# Load the full dataset again for training on the full dataset
X_full = df.drop(columns=['attack_cat'])
y_full = df['attack_cat']

In [26]:
# Encode the target variable (attack_cat) into numeric labels
y_full_encoded = label_encoder.fit_transform(y_full)

In [27]:
# Split the full dataset into the Training set and Test set
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full_encoded, test_size=0.2, random_state=42)

In [28]:
# Update the pipeline with the best parameters found
best_params = opt.best_params_
svm.set_params(C=best_params['classifier__C'], gamma=best_params['classifier__gamma'])

In [1]:
# Create the full pipeline with the best parameters
pipeline_full = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm)
])

NameError: name 'Pipeline' is not defined

In [30]:
# Fit the pipeline to the full training set
pipeline_full.fit(X_train_full, y_train_full)

In [None]:
# Predict on the full test set
y_pred_full = pipeline_full.predict(X_test_full)

In [None]:
# Calculate accuracy and F1 score
accuracy_full = accuracy_score(y_test_full, y_pred_full)
f1_macro_full = f1_score(y_test_full, y_pred_full, average='macro')
print(f"Full test set accuracy: {accuracy_full:.4f}")
print(f"Full test set F1 Score (Macro): {f1_macro_full:.4f}")

In [None]:
# Save the final model trained on the full dataset to disk
filename_full = 'finalized_model_multiclass_full.sav'
pickle.dump(pipeline_full, open(filename_full, 'wb'))

In [None]:
# Save the label encoder used for the full dataset
label_encoder_filename_full = 'label_encoder_full.sav'
pickle.dump(label_encoder, open(label_encoder_filename_full, 'wb'))

In [None]:
# do the cross valuation

from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_full, X_full, y_full_encoded, cv=5)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

In [None]:
# do the confusion matrix
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test_full, y_pred_full)
print("Confusion matrix:\n{}".format(confusion))

In [None]:
# do the classification report
from sklearn.metrics import classification_report
print("Classification report:\n{}".format(classification_report(y_test_full, y_pred_full)))

In [None]:
# do the ROC curve
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test_full, pipeline_full.decision_function(X_test_full))
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.title("roc_curve")
plt.show()

In [None]:
# do the precision-recall curve
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test_full, pipeline_full.decision_function(X_test_full))
plt.plot(precision, recall, label="precision_recall_curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("precision_recall_curve")
plt.show()

In [None]:
# do the AUC
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test_full, pipeline_full.decision_function(X_test_full))
print("AUC: {:.3f}".format(roc_auc))

In [None]:
# do the average precision score
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test_full, pipeline_full.decision_function(X_test_full))
print("Average precision score: {:.3f}".format(average_precision))

In [None]:
# do the f1 score
f1 = f1_score(y_test_full, y_pred_full, average='macro')
print("F1 Score: {:.3f}".format(f1))

In [None]:
# do the accuracy score
accuracy = accuracy_score(y_test_full, y_pred_full)
print("Accuracy Score: {:.3f}".format(accuracy))

In [None]:
# do the precision score
from sklearn.metrics import precision_score

In [None]:

precision = precision_score(y_test_full, y_pred_full, average='macro')
print("Precision Score: {:.3f}".format(precision))

In [None]:
# do the recall score
from sklearn.metrics import recall_score

In [None]:
recall = recall_score(y_test_full, y_pred_full, average='macro')
print("Recall Score: {:.3f}".format(recall))

In [None]:

# do the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy = balanced_accuracy_score(y_test_full, y_pred_full)
print("Balanced Accuracy Score: {:.3f}".format(balanced_accuracy))

In [None]:
# do the matthews correlation coefficient
from sklearn.metrics import matthews_corrcoef

mcc = matthews_corrcoef(y_test_full, y_pred_full)
print("Matthews Correlation Coefficient: {:.3f}".format(mcc))

In [None]:
# do the zero one loss
from sklearn.metrics import zero_one_loss

zero_one = zero_one_loss(y_test_full, y_pred_full)
print("Zero One Loss: {:.3f}".format(zero_one))

In [None]:
# do the hamming loss
from sklearn.metrics import hamming_loss

hamming = hamming_loss(y_test_full, y_pred_full)
print("Hamming Loss: {:.3f}".format(hamming))

In [None]:
# do the jaccard score
from sklearn.metrics import jaccard_score

jaccard = jaccard_score(y_test_full, y_pred_full, average='macro')
print("Jaccard Score: {:.3f}".format(jaccard))

In [None]:
# do the brier score
from sklearn.metrics import brier_score_loss

brier = brier_score_loss(y_test_full, pipeline_full.predict_proba(X_test_full)[:, 1])
print("Brier Score: {:.3f}".format(brier))

In [None]:
# do the log loss
from sklearn.metrics import log_loss

log = log_loss(y_test_full, pipeline_full.predict_proba(X_test_full))
print("Log Loss: {:.3f}".format(log))