In [1]:
import pandas as pd
import numpy as np
import time
import joblib
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
import warnings
import shap
from tqdm import tqdm
import itertools
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'scikeras'

# Data Preprocessing

In [None]:
# Seed NumPy's global RNG so NumPy-based randomness is reproducible across runs
np.random.seed(42)

# Load the dataset from Parquet (path is relative to the notebook's working directory)
data = pd.read_parquet("data/cic-collection.parquet")

# Features are every column except 'Label' and 'ClassLabel'; 'ClassLabel' is the target
X = data.drop(['Label', 'ClassLabel'], axis=1)
y = data['ClassLabel']

# Integer-encode any non-numeric target. Testing only for dtype 'object' would skip
# 'category' and 'string' columns (both can come back from a Parquet file), leaving
# labels un-encoded for estimators that require integer class ids.
if not pd.api.types.is_numeric_dtype(y):
    y = pd.factorize(y)[0]

# Stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Function to calculate metrics

In [None]:
def calculate_metrics(y_true, y_pred, training_time, inference_time):
    """Build a dictionary of evaluation metrics, each rounded to 4 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        training_time: Measured training duration.
        inference_time: Measured prediction duration.

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1",
        "Training Time" and "Inference Time". Precision, recall and F1
        are computed with ``average="weighted"``.
    """
    digits = 4
    report = {"Accuracy": round(accuracy_score(y_true, y_pred), digits)}

    # The three class-averaged scores share the same call signature.
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, scorer in weighted_scorers:
        report[label] = round(scorer(y_true, y_pred, average="weighted"), digits)

    report["Training Time"] = round(training_time, digits)
    report["Inference Time"] = round(inference_time, digits)
    return report

# Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=5, random_state=42)

# Train the model.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
rf.fit(X_train, y_train)
training_time = time.perf_counter() - start

# Make predictions on the held-out split and time the call
start = time.perf_counter()
y_pred = rf.predict(X_test)
inference_time = time.perf_counter() - start

# Calculate metrics
metrics = calculate_metrics(y_test, y_pred, training_time, inference_time)
print(metrics)

# Save the fitted model to disk
joblib.dump(rf, "random_forest.joblib")

In [None]:
# `plt` must be the pyplot interface: the top-level `matplotlib` package has no
# `savefig`, so calling plt.savefig() on it raises AttributeError.
import matplotlib.pyplot as plt

# XAI: SHAP Analysis
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# Plot global feature importance; show=False keeps the figure open so it can be saved
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
# bbox_inches="tight" avoids clipping long feature names at the figure edge
plt.savefig("shap_feature_importance.png", bbox_inches="tight")
print("SHAP global feature importance saved as 'shap_feature_importance.png'.")

# Identify top 10 important features using the forest's built-in importances
# (requires X_train to still be a DataFrame so that .columns is available)
feature_importance = rf.feature_importances_
important_features = pd.Series(feature_importance, index=X_train.columns).sort_values(ascending=False)
top_features = important_features.head(10)
print("Top 10 Features:\n", top_features)

# Normalize data (standardization)

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# fitted transform to both splits so no test-set information leaks into the scaler.
# (Swap in MinMaxScaler() if scaling to [0, 1] is preferred.)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost

In [None]:
xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=10, subsample=0.8, random_state=42)

# Train the model.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
xgb_model.fit(X_train, y_train)
training_time = time.perf_counter() - start

# Make predictions on the held-out split and time the call
start = time.perf_counter()
y_pred = xgb_model.predict(X_test)
inference_time = time.perf_counter() - start

# Calculate metrics
metrics = calculate_metrics(y_test, y_pred, training_time, inference_time)
print(metrics)

# Save the fitted model to disk
joblib.dump(xgb_model, "xgboost.joblib")

In [None]:
# XAI: SHAP Analysis
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)