In [1]:
import os
import joblib
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from utils.preprocess_data_KDD_plus_test import preprocess_data
from utils.plot import plot_roc_curve, plot_threshold_metrics, save_classification_reports_for_thresholds, plot_combined_roc_curve


In [2]:
# Input CSVs, resolved relative to the working directory
train_file_path, test_file_path = (
    os.path.join('data', csv_name)
    for csv_name in ('KDDTrain+.csv', 'KDDTest+.csv')
)

# Folder name forwarded to preprocess_data as `preprocessing_models_folder`
# (presumably where the preprocessing artefacts live -- confirm in utils).
preprocessing_models_folder = 'preprocessing_pipeline'

# Load and preprocess the train and test files in a single call
X_train, X_test, y_train, y_test = preprocess_data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    preprocessing_models_folder=preprocessing_models_folder,
)

# Everything this notebook produces goes under <working directory>/svm
current_directory = os.path.abspath(os.curdir)
output_folder = os.path.join(current_directory, "svm")


In [3]:
def svm_kfold(X, y, k=5, output_folder=output_folder):
    """Cross-validate an SVM with K-Fold and plot per-fold and combined ROC curves.

    For each of the ``k`` shuffled folds (fixed ``random_state=42``) a fresh
    ``SVC(probability=True)`` is fitted on the training part and scored on the
    held-out part with ``predict_proba``. The fold's ROC curve and AUC are
    passed to ``plot_roc_curve``; after the last fold all curves are passed to
    ``plot_combined_roc_curve``.

    Args:
        X: Feature matrix. Must support integer-array row indexing
            (``X[index_array]``), e.g. a NumPy array.
        y: Labels aligned with ``X`` and indexed the same way. Column 1 of
            ``predict_proba`` is used as the score for the ROC curve.
        k: Number of folds.
        output_folder: Directory handed to the plotting helpers; created if it
            does not exist. The default is the notebook-level
            ``output_folder`` as it was when this cell was executed.

    Returns:
        None. Progress messages are printed.
    """
    os.makedirs(output_folder, exist_ok=True)
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)

    all_fpr, all_tpr, all_auc = [], [], []

    # enumerate(start=1) yields the human-facing fold number directly, so no
    # separate counter has to be kept in sync with the loop.
    for fold_number, (train_index, val_index) in enumerate(splitter.split(X), start=1):
        print(f"running fold-{fold_number}")

        # Fold-local names: deliberately not X_train / y_train, which are also
        # notebook-level variables holding the full training split.
        X_fold_train, X_val = X[train_index], X[val_index]
        y_fold_train, y_val = y[train_index], y[val_index]

        # Fit a fresh model per fold so no state leaks between folds
        fold_model = SVC(probability=True, random_state=42)
        fold_model.fit(X_fold_train, y_fold_train)

        # Score for the ROC curve: probability of the second class
        y_prob_val = fold_model.predict_proba(X_val)[:, 1]

        # ROC curve and AUC on the held-out part of this fold
        fpr, tpr, _ = roc_curve(y_val, y_prob_val)
        roc_auc = auc(fpr, tpr)
        all_fpr.append(fpr)
        all_tpr.append(tpr)
        all_auc.append(roc_auc)

        plot_roc_curve(fpr, tpr, roc_auc, f'fold-{fold_number}', output_folder)

        print(f"fold-{fold_number} complete")

    # One figure overlaying the ROC curves of all folds
    plot_combined_roc_curve(all_fpr, all_tpr, all_auc, k, output_folder)

    print(f"K-Fold cross-validation completed. Results saved in {output_folder}")


In [4]:
def _precision_recall_at_thresholds(y_true, y_prob, thresholds):
    """Return (precisions, recalls), one value per threshold, for positive label 1.

    A sample is predicted positive when ``y_prob >= threshold``.
    Precision is TP / (TP + FP) and recall is TP / (TP + FN). When the
    denominator is zero (nothing predicted positive, or no positive labels)
    the metric is reported as 0.0 instead of NaN.
    """
    actual_positive = np.asarray(y_true) == 1
    scores = np.asarray(y_prob)
    n_actual_positive = actual_positive.sum()

    precisions, recalls = [], []
    for threshold in thresholds:
        predicted_positive = scores >= threshold
        n_predicted_positive = predicted_positive.sum()
        true_positives = np.sum(predicted_positive & actual_positive)
        precisions.append(
            true_positives / n_predicted_positive if n_predicted_positive else 0.0
        )
        recalls.append(
            true_positives / n_actual_positive if n_actual_positive else 0.0
        )
    return precisions, recalls


def train_svm(X, y, X_test, y_test, output_folder=output_folder):
    """Train the final SVM on the full training set and evaluate it on the test set.

    Steps, in order: fit ``SVC(probability=True, random_state=42)`` on
    ``(X, y)``; dump the fitted model to ``<output_folder>/svm_model.pkl`` with
    joblib; compute the test ROC curve / AUC and pass it to ``plot_roc_curve``;
    compute precision and recall for thresholds 0.10, 0.15, ... and pass them
    to ``plot_threshold_metrics``; finally call
    ``save_classification_reports_for_thresholds`` with the same thresholds.

    Args:
        X: Training features.
        y: Training labels.
        X_test: Test features.
        y_test: Test labels; label ``1`` is treated as the positive class for
            the threshold metrics.
        output_folder: Directory for the model file and for the plotting /
            report helpers. Created if it does not exist, so this function no
            longer depends on ``svm_kfold`` having been run first. The default
            is the notebook-level ``output_folder`` as it was when this cell
            was executed.

    Returns:
        None. Progress messages are printed.
    """
    # joblib.dump does not create missing directories
    os.makedirs(output_folder, exist_ok=True)

    thresholds = np.arange(0.1, 1.0, 0.05)

    # Final evaluation on test set (X_test, y_test)
    print("Final evaluation on test set:")
    svm_final = SVC(probability=True, random_state=42)
    svm_final.fit(X, y)  # Train the final model on the entire training set

    # Save the trained model
    model_path = os.path.join(output_folder, "svm_model.pkl")
    joblib.dump(svm_final, model_path)

    # Score used everywhere below: probability of the second class
    y_prob_test = svm_final.predict_proba(X_test)[:, 1]

    # ROC Curve for test data
    fpr_test, tpr_test, _ = roc_curve(y_test, y_prob_test)
    roc_auc_test = auc(fpr_test, tpr_test)
    plot_roc_curve(fpr_test, tpr_test, roc_auc_test, 'test_final', output_folder)

    # Threshold analysis for test data. The previous code plotted the share of
    # samples predicted positive as "Precision" and the actual precision as
    # "Recall"; both series are now computed from TP / FP / FN counts.
    test_precision_at_threshold, test_recall_at_threshold = (
        _precision_recall_at_thresholds(y_test, y_prob_test, thresholds)
    )

    # Plot Precision and Recall vs Threshold for test data
    plot_threshold_metrics(thresholds, test_precision_at_threshold, 'Precision', 'test_final', output_folder)
    plot_threshold_metrics(thresholds, test_recall_at_threshold, 'Recall', 'test_final', output_folder)

    # Save classification reports for different thresholds for the test set
    save_classification_reports_for_thresholds(thresholds, y_test, y_prob_test, output_folder, 'test_final')

    print(f"Final test evaluation and plots saved in {output_folder}")


In [None]:
# Run 5-fold cross-validation on the preprocessed training split; the function
# prints a "running"/"complete" line per fold and hands ROC data to the
# utils.plot helpers together with output_folder.
svm_kfold(X_train, y_train, k=5, output_folder=output_folder)

running fold-1
fold-1 complete
running fold-2
fold-2 complete
running fold-3


In [None]:
# Fit the final SVM on the full training split, dump it to
# <output_folder>/svm_model.pkl and evaluate it on the held-out test split.
train_svm(X_train, y_train, X_test, y_test, output_folder=output_folder)