In [2]:
import pandas as pd
import numpy as np
import time
import joblib
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from tensorflow.python.keras.models import Sequential
#from tensorflow.python.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Conv1D, #MaxPooling1D
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
import warnings
import shap
from tqdm import tqdm
import itertools
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

In [3]:
# Seed NumPy's global RNG; the split below is additionally pinned via random_state.
np.random.seed(42)

# Load the dataset (path is relative to the notebook's working directory).
data = pd.read_parquet("data/cic-collection.parquet")

# For a binary benign-vs-attack task, map 'Benign' -> 0 and every other
# 'ClassLabel' value -> 1 before the target is extracted below.

# Features are every column except the two label columns; 'ClassLabel' is the target.
X = data.drop(['Label', 'ClassLabel'], axis=1)
y = data['ClassLabel']

# Integer-encode the target whenever it is not already numeric. Checking only for
# `object` would skip `category` and string-dtype columns, which later cells
# (XGBoost, to_categorical) cannot consume as-is.
if not pd.api.types.is_numeric_dtype(y):
    y = pd.factorize(y)[0]

# Stratified 70/30 train/test split so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Function to calculate metrics

In [4]:
def calculate_metrics(y_true, y_pred, training_time, inference_time):
    """Summarise classification quality and timing in a single dictionary.

    Precision, recall and F1 use ``average="weighted"``, i.e. per-class scores
    weighted by class support.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        training_time: Training duration in seconds.
        inference_time: Prediction duration in seconds.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1",
        "Training Time" and "Inference Time". Every value is a built-in
        ``float`` rounded to 4 decimals, so the dictionary prints and
        serialises uniformly instead of mixing ``float`` and ``np.float64``.
    """
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1": f1_score(y_true, y_pred, average="weighted"),
        "Training Time": training_time,
        "Inference Time": inference_time,
    }
    # float() strips NumPy scalar wrappers before rounding.
    return {name: round(float(value), 4) for name, value in scores.items()}

# Random Forest

In [None]:
# Random forest: 200 trees of unrestricted depth, fitted on all available cores.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Wall-clock the fit.
tick = time.time()
rf.fit(X_train, y_train)
training_time = time.time() - tick

# Wall-clock prediction on the held-out split.
tick = time.time()
y_pred = rf.predict(X_test)
inference_time = time.time() - tick

# Report quality metrics together with both timings.
metrics = calculate_metrics(y_test, y_pred, training_time, inference_time)
print(metrics)

# Persist the fitted forest; the SHAP cell reloads it from this file.
joblib.dump(rf, "random_forest.joblib")

In [None]:
# XAI: SHAP analysis of the Random Forest
import shap
import matplotlib.pyplot as plt

# Reload the persisted model so this cell does not require re-training.
rf = joblib.load("random_forest.joblib")

# Rows explained per TreeExplainer call; bounds peak memory of a single call.
batch_size = 500000
# Number of features shown in the SHAP plot.
top_k = 10

explainer = shap.TreeExplainer(rf)

# NOTE(review): exact TreeSHAP over the full test split of a 200-tree,
# unbounded-depth forest may be very slow; consider explaining a random sample.
abs_shap_batches = []
for i in tqdm(range(0, len(X_test), batch_size)):
    batch_shap = explainer.shap_values(X_test.iloc[i:i + batch_size])

    # Multi-class output is either a list of per-class (samples, features)
    # arrays or a single (samples, features, classes) array, depending on the
    # SHAP version. Bring both to the 3-D layout.
    if isinstance(batch_shap, list):
        batch_shap = np.stack(batch_shap, axis=-1)

    # Take magnitudes BEFORE collapsing the class axis: signed per-class
    # contributions offset each other (class probabilities sum to one), so a
    # signed average would wipe out the importance signal.
    batch_shap = np.abs(np.asarray(batch_shap))
    if batch_shap.ndim == 3:
        batch_shap = batch_shap.mean(axis=-1)

    abs_shap_batches.append(batch_shap)

# (n_samples, n_features) matrix of class-averaged |SHAP| values.
abs_shap_values = np.concatenate(abs_shap_batches, axis=0)

# Global importance per feature and the positions of the top-k, most important first.
mean_abs_shap = abs_shap_values.mean(axis=0)
top_k_indices = np.argsort(mean_abs_shap)[-top_k:][::-1]

# Bar summary plot restricted to the top-k features.
shap.summary_plot(
    abs_shap_values[:, top_k_indices],
    X_test.iloc[:, top_k_indices],
    plot_type="bar",
    show=False,
)

# Save the generated figure and release it.
plt.gcf().savefig("shap_rf_top10_features.png", bbox_inches="tight")
plt.close()

# For comparison: the forest's own impurity-based feature importances.
feature_importance = rf.feature_importances_
important_features = pd.Series(feature_importance, index=X_train.columns).sort_values(ascending=False)
top_features = important_features.head(10)

print("Top 10 Features:\n", top_features)
print("SHAP global feature importance saved as 'shap_rf_top10_features.png'.")

  0%|          | 0/6 [00:00<?, ?it/s]

# Normalize data

In [None]:
# Learn per-feature mean/variance on the training split only, then apply the
# same transform to both splits (swap in MinMaxScaler() for scaling to [0, 1]).
# By default the transformed outputs are NumPy arrays, so column labels are
# no longer attached to X_train / X_test after this cell.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost

In [None]:
# Gradient-boosted trees; device="cuda" requests GPU execution.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    subsample=0.8,
    random_state=42,
    device="cuda",
)

# Wall-clock the fit.
tick = time.time()
xgb_model.fit(X_train, y_train)
training_time = time.time() - tick

# Wall-clock prediction on the held-out split.
tick = time.time()
y_pred = xgb_model.predict(X_test)
inference_time = time.time() - tick

# Report quality metrics together with both timings.
metrics = calculate_metrics(y_test, y_pred, training_time, inference_time)
print(metrics)

# Persist the fitted booster; the SHAP cell reloads it from this file.
joblib.dump(xgb_model, "xgboost.joblib")

In [None]:
# Sanity check: the first line should read (n_samples, n_features).
# `top_k_indices` comes from the most recently executed SHAP cell.
print(
    X_test.shape,
    f"X_test shape: {X_test.shape}",
    f"Top k indices: {top_k_indices}",
    sep="\n",
)



In [None]:
# XAI: SHAP analysis of the XGBoost model
import shap
import joblib
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Reload the persisted model so this cell does not require re-training.
xgb_model = joblib.load("xgboost.joblib")

# Rows explained per TreeExplainer call; bounds peak memory of a single call.
batch_size = 500000
# Number of features shown in the SHAP plot.
top_k = 10

# After the scaling cell X_test is a plain NumPy array (no `.iloc`, no column
# labels), so slice the array directly and take the names from the unscaled
# feature frame `X`, whose column order the scaler preserves.
X_test_values = np.asarray(X_test)
feature_names = np.asarray(X.columns)

explainer = shap.TreeExplainer(xgb_model)

abs_shap_batches = []
for i in tqdm(range(0, X_test_values.shape[0], batch_size)):
    batch_shap = explainer.shap_values(X_test_values[i:i + batch_size])

    # Multi-class output is either a list of per-class (samples, features)
    # arrays or a single (samples, features, classes) array, depending on the
    # SHAP version. Bring both to the 3-D layout.
    if isinstance(batch_shap, list):
        batch_shap = np.stack(batch_shap, axis=-1)

    # Take magnitudes BEFORE collapsing the class axis so that opposite-signed
    # per-class contributions do not cancel each other out.
    batch_shap = np.abs(np.asarray(batch_shap))
    if batch_shap.ndim == 3:
        batch_shap = batch_shap.mean(axis=-1)

    abs_shap_batches.append(batch_shap)

# (n_samples, n_features) matrix of class-averaged |SHAP| values.
abs_shap_values = np.concatenate(abs_shap_batches, axis=0)

# Global importance per feature and the positions of the top-k, most important first.
mean_abs_shap = abs_shap_values.mean(axis=0)
top_k_indices = np.argsort(mean_abs_shap)[-top_k:][::-1]

# Bar summary plot restricted to the top-k features.
shap.summary_plot(
    abs_shap_values[:, top_k_indices],
    X_test_values[:, top_k_indices],
    feature_names=list(feature_names[top_k_indices]),
    plot_type="bar",
    show=False,
)

# Save the generated figure and release it.
plt.gcf().savefig("shap_xgboost_force_plot_top10_features.png", bbox_inches="tight")
plt.close()


# LightGBM

In [None]:
import lightgbm as lgbm

# LightGBM classifier: 200 boosting rounds, depth capped at 10 with up to 63 leaves.
lgbm_model = lgbm.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    num_leaves=63,
    random_state=42,
    force_col_wise="true",
)

# Wall-clock the fit.
tick = time.time()
lgbm_model.fit(X_train, y_train)
training_time = time.time() - tick

# Wall-clock prediction on the held-out split.
tick = time.time()
y_pred = lgbm_model.predict(X_test)
inference_time = time.time() - tick

# Report quality metrics together with both timings.
metrics = calculate_metrics(y_test, y_pred, training_time, inference_time)
print(metrics)

# Persist the fitted model; the SHAP cell reloads it from this file.
joblib.dump(lgbm_model, "lightgbm.joblib")

In [None]:
# XAI: SHAP analysis of the LightGBM model
import shap
import joblib
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Reload the persisted model so this cell does not require re-training.
lgbm_model = joblib.load("lightgbm.joblib")

# Rows explained per TreeExplainer call; bounds peak memory of a single call.
batch_size = 500000
# Number of features shown in the SHAP plot.
top_k = 10

# After the scaling cell X_test is a plain NumPy array (no `.iloc`, no column
# labels), so slice the array directly and take the names from the unscaled
# feature frame `X`, whose column order the scaler preserves.
X_test_values = np.asarray(X_test)
feature_names = np.asarray(X.columns)

explainer = shap.TreeExplainer(lgbm_model)

abs_shap_batches = []
for i in tqdm(range(0, X_test_values.shape[0], batch_size)):
    batch_shap = explainer.shap_values(X_test_values[i:i + batch_size])

    # Multi-class output is either a list of per-class (samples, features)
    # arrays or a single (samples, features, classes) array, depending on the
    # SHAP version. Bring both to the 3-D layout.
    if isinstance(batch_shap, list):
        batch_shap = np.stack(batch_shap, axis=-1)

    # Take magnitudes BEFORE collapsing the class axis so that opposite-signed
    # per-class contributions do not cancel each other out.
    batch_shap = np.abs(np.asarray(batch_shap))
    if batch_shap.ndim == 3:
        batch_shap = batch_shap.mean(axis=-1)

    abs_shap_batches.append(batch_shap)

# (n_samples, n_features) matrix of class-averaged |SHAP| values.
abs_shap_values = np.concatenate(abs_shap_batches, axis=0)

# Global importance per feature and the positions of the top-k, most important first.
mean_abs_shap = abs_shap_values.mean(axis=0)
top_k_indices = np.argsort(mean_abs_shap)[-top_k:][::-1]

# Bar summary plot restricted to the top-k features.
shap.summary_plot(
    abs_shap_values[:, top_k_indices],
    X_test_values[:, top_k_indices],
    feature_names=list(feature_names[top_k_indices]),
    plot_type="bar",
    show=False,
)

# Save and release the figure, mirroring the other SHAP cells.
plt.gcf().savefig("shap_lightgbm_top10_features.png", bbox_inches="tight")
plt.close()



# CNN_RNN

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, BatchNormalization, MaxPooling1D, LSTM, Dropout, Dense
import numpy as np
import tensorflow as tf

def create_best_cnn_rnn_model(n_features=None, n_classes=None, learning_rate=0.001):
    """Build and compile the Conv1D + LSTM classifier.

    Architecture: Conv1D(64, kernel 3) -> BatchNormalization -> MaxPooling1D(2)
    -> LSTM(64) -> Dropout(0.3) -> Dense(128) -> Dropout(0.3) -> softmax output.
    Compiled with Adam and categorical cross-entropy, so targets must be
    one-hot encoded.

    Args:
        n_features: Length of each input sequence; inputs have shape
            ``(n_features, 1)``. Defaults to ``X_train_dl.shape[1]`` from the
            notebook namespace.
        n_classes: Width of the softmax output layer. Defaults to the number of
            distinct values in the notebook-level ``y_train``.
        learning_rate: Adam learning rate.

    Returns:
        The compiled Keras ``Sequential`` model (not yet trained).
    """
    # Fall back to the notebook globals only when the caller gives no sizes,
    # which keeps the zero-argument call working as before.
    if n_features is None:
        n_features = X_train_dl.shape[1]
    if n_classes is None:
        n_classes = len(np.unique(y_train))

    model = Sequential([
        Conv1D(64, 3, activation='relu', kernel_initializer='he_uniform', input_shape=(n_features, 1)),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        LSTM(64, return_sequences=False),  # only the final LSTM state feeds the dense head
        Dropout(0.3),
        Dense(128, activation='relu', kernel_initializer='he_uniform'),
        Dropout(0.3),
        Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Prepare the data for training

In [6]:
# Public Keras API (keras.src is an internal package path).
from tensorflow.keras.utils import to_categorical

# The Conv1D input layer takes (steps, channels) per sample: add a trailing
# channel axis of size 1 to the 2-D feature matrices.
X_train_dl = np.expand_dims(X_train, axis=2)
X_test_dl = np.expand_dims(X_test, axis=2)

# One-hot encode with one explicit width for both splits. It matches the size of
# the model's softmax layer (number of distinct training labels), so the train
# and test encodings cannot end up with different column counts.
num_classes = len(np.unique(y_train))
y_train_dl = to_categorical(y_train, num_classes=num_classes)
y_test_dl = to_categorical(y_test, num_classes=num_classes)

In [15]:

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling start from the same state on every run (seeding NumPy
# alone does not cover TensorFlow).
tf.keras.utils.set_random_seed(42)

best_model = create_best_cnn_rnn_model()

# NOTE(review): the test split doubles as validation data here, so the reported
# val_* curves are not independent of the final test metrics. Carve a separate
# validation split from the training data before using them for model selection.
start_time = time.time()
history = best_model.fit(
    X_train_dl, y_train_dl,
    validation_data=(X_test_dl, y_test_dl),
    batch_size=32,
    epochs=10,
    verbose=1
)
training_time = time.time() - start_time

Epoch 1/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1938s[0m 10ms/step - accuracy: 0.9343 - loss: 0.2207 - val_accuracy: 0.8383 - val_loss: 0.5497
Epoch 2/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2053s[0m 10ms/step - accuracy: 0.9730 - loss: 0.1049 - val_accuracy: 0.8522 - val_loss: 0.6124
Epoch 3/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2021s[0m 10ms/step - accuracy: 0.9763 - loss: 0.0952 - val_accuracy: 0.8671 - val_loss: 0.5734
Epoch 4/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2050s[0m 10ms/step - accuracy: 0.9774 - loss: 0.0916 - val_accuracy: 0.8704 - val_loss: 0.5611
Epoch 5/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2120s[0m 11ms/step - accuracy: 0.9782 - loss: 0.0901 - val_accuracy: 0.8690 - val_loss: 0.6528
Epoch 6/10
[1m200541/200541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1970s[0m 10ms/step - accuracy: 0.9789 - loss: 0.0888 - val_accu

# Evaluate the model


In [16]:
# Time the forward pass together with the arg-max that turns class
# probabilities into label indices.
start_inference = time.time()
y_pred = best_model.predict(X_test_dl).argmax(axis=1)
inference_time = time.time() - start_inference

# Undo the one-hot encoding of the reference labels.
y_true = y_test_dl.argmax(axis=1)

# Report quality metrics together with both timings.
metrics = calculate_metrics(y_true, y_pred, training_time, inference_time)
print(metrics)

# Persist the metrics dictionary.
joblib.dump(metrics, 'model_metrics.joblib')

[1m85947/85947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 3ms/step
{'Accuracy': 0.9197, 'Precision': np.float64(0.907), 'Recall': np.float64(0.9197), 'F1': np.float64(0.909), 'Training Time': 20822.1642, 'Inference Time': 248.5197}


['model_metrics.joblib']

In [17]:
# Serialise the trained network with joblib.
# NOTE(review): Keras' native `model.save(...)` format is the usual choice for
# Keras models; confirm that consumers of this file expect a joblib pickle.
joblib.dump(value=best_model, filename='cnn_rnn.joblib')

['cnn_rnn_model.joblib']