# Imports

In [None]:
import pandas as pd
import numpy as np
import time
import joblib
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
import warnings
import shap
from tqdm import tqdm
import itertools
warnings.filterwarnings('ignore')

# Data Preprocessing

In [None]:
# Seed the random number generators used in this notebook for reproducibility.
np.random.seed(42)
# The Keras models further down draw weights/shuffling from TensorFlow's RNG,
# which np.random.seed does not affect, so seed it as well.
tf.random.set_seed(42)

# Load dataset
data = pd.read_parquet("data/cic-collection.parquet")  # Replace with the correct path to the dataset

# Separate features and target: 'ClassLabel' is the prediction target, and both
# label columns are removed from the feature matrix.
X = data.drop(['Label', 'ClassLabel'], axis=1)
y = data['ClassLabel']

# Encode the target as integer codes whenever it is not already numeric.
# Testing only for dtype == 'object' would skip pandas 'category' / 'string'
# columns and leave text labels in y for the models below.
if not pd.api.types.is_numeric_dtype(y):
    y = pd.factorize(y)[0]

# Stratified 70/30 train/test split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


In [None]:
# Shrink both splits for faster training: keep the first 10,000 training rows
# and the first 1,000 test rows (features and labels are sliced together so
# they stay aligned).
X_train, y_train = X_train[:10000], y_train[:10000]
X_test, y_test = X_test[:1000], y_test[:1000]

# Function to calculate metrics

In [None]:
def calculate_metrics(y_true, y_pred, training_time, inference_time):
    """Summarise a model's predictions and timings in a single dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        training_time: Training duration to report (rounded, not measured here).
        inference_time: Inference duration to report (rounded, not measured here).

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1", "Training Time"
        and "Inference Time", every value rounded to 4 decimal places.
        Precision, recall and F1 use ``average="weighted"``.
    """
    summary = {"Accuracy": round(accuracy_score(y_true, y_pred), 4)}

    # The three class-averaged scores share the same call signature.
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, scorer in weighted_scorers:
        summary[label] = round(scorer(y_true, y_pred, average="weighted"), 4)

    summary["Training Time"] = round(training_time, 4)
    summary["Inference Time"] = round(inference_time, 4)
    return summary



```
# This is formatted as code
```

# Random Forest

In [None]:
# Base estimator; its hyperparameters are overwritten for every candidate below.
rf = RandomForestClassifier(random_state=42)

# Search space: 4 * 4 * 3 * 3 * 3 = 432 candidate configurations.
param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
}

# Cartesian product of the grid values, in the grid's own key order.
all_params = list(itertools.product(*param_grid_rf.values()))

progress_bar = tqdm(total=len(all_params), desc="Training models", unit="model")

best_score, best_params = -1, None
cv_results = []

start_time = time.time()

with progress_bar:
    for params in all_params:
        # Pair each value with its hyperparameter name.
        param_dict = dict(zip(param_grid_rf, params))
        rf.set_params(**param_dict)

        # 5-fold cross-validated macro F1 for this candidate.
        scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
        mean_score, std_score = scores.mean(), scores.std()

        cv_results.append(dict(param_dict, mean_f1=mean_score, std_f1=std_score))

        # Strict '>' keeps the first candidate when scores tie.
        if mean_score > best_score:
            best_score, best_params = mean_score, param_dict

        progress_bar.update(1)

# Reported "training time" is the wall-clock duration of the whole search.
training_time_rf = time.time() - start_time

# Refit a fresh forest with the winning configuration on the full training set.
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train, y_train)

# Time test-set prediction only.
start_time = time.time()
y_pred_rf = best_rf.predict(X_test)
inference_time_rf = time.time() - start_time

metrics_rf = calculate_metrics(y_test, y_pred_rf, training_time_rf, inference_time_rf)
print("Random Forest Metrics:", metrics_rf)

# Persist the per-candidate scores and the fitted model.
results_rf = pd.DataFrame(cv_results)
results_rf.to_csv('gridsearch_rf_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_rf_results.csv'.")

joblib.dump(best_rf, "best_random_forest_model.pkl")
print("Model saved as 'best_random_forest_model.pkl'.")

In [None]:
## XAI: SHAP Analysis
#explainer = shap.TreeExplainer(best_rf)
#shap_values = explainer.shap_values(X_test)
#
## Plot global feature importance
#shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
#plt.savefig("shap_feature_importance.png")
#print("SHAP global feature importance saved as 'shap_feature_importance.png'.")
#
## Identify top 10 important features
#feature_importance = best_rf.feature_importances_
#important_features = pd.Series(feature_importance, index=X_train.columns).sort_values(ascending=False)
#top_features = important_features.head(10)
#print("Top 10 Features:\n", top_features)

# Normalize data

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()  # Use MinMaxScaler() if you prefer normalization to [0, 1]
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler: the XGBoost, KNN and CNN+RNN models saved later in
# this notebook are trained on scaled inputs, so anyone reloading those models
# needs this exact transform to prepare new data.
joblib.dump(scaler, "standard_scaler.pkl")
print("Scaler saved as 'standard_scaler.pkl'.")

# XGBoost


In [None]:
# Constructor arguments shared by the search estimator and the final model so
# that both are trained with exactly the same configuration.
# `tree_method='hist'` together with `device='cuda'` is the XGBoost >= 2.0
# spelling of the deprecated `tree_method='gpu_hist'`.
xgb_base_params = dict(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    tree_method='hist',
    device='cuda',
)
xgb_model = xgb.XGBClassifier(**xgb_base_params)

# Search space: only n_estimators and max_depth vary (3 * 4 = 12 candidates);
# the remaining hyperparameters are pinned to a single value. Add more values
# to any list to widen the search.
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10, 20],
    'learning_rate': [0.01],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0.1],
}

# Generate all combinations of parameters
all_params = list(ParameterGrid(param_grid_xgb))

# Initialize progress bar
progress_bar = tqdm(total=len(all_params), desc="Training XGBoost models", unit="model")

# To store results
cv_results = []
best_score = -1
best_params = None

# Start grid search
start_time = time.time()
for params in all_params:
    # Update the model with the current parameters
    xgb_model.set_params(**params)

    # 5-fold cross-validated weighted F1. With cross_val_score's default
    # error_score, a fold whose fit fails is reported as NaN instead of raising.
    fold_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1)
    scores = fold_scores[~np.isnan(fold_scores)]

    if scores.size:
        mean_score = scores.mean()
        std_score = scores.std()
    else:
        # No usable fold: record the candidate as NaN so it can never be selected.
        mean_score = std_score = np.nan

    # Save results, including how many folds actually contributed to the mean
    # so that silently dropped folds are visible in the CSV.
    cv_results.append({
        **params,
        'mean_f1': mean_score,
        'std_f1': std_score,
        'n_valid_folds': int(scores.size),
    })

    # Update best parameters (a NaN mean compares False and is skipped)
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

    # Update progress bar
    progress_bar.update(1)

progress_bar.close()
training_time_xgb = time.time() - start_time

# Fail with a clear message instead of a TypeError from `**None` below.
if best_params is None:
    raise RuntimeError(
        "XGBoost grid search produced no valid cross-validation score; "
        "inspect the fit failures before training a final model."
    )

# Train the best model on the entire training set
best_xgb = xgb.XGBClassifier(**xgb_base_params, **best_params)
best_xgb.fit(X_train, y_train)

# Test set predictions
start_time = time.time()
y_pred_xgb = best_xgb.predict(X_test)
inference_time_xgb = time.time() - start_time

# Calculate metrics
metrics_xgb = calculate_metrics(y_test, y_pred_xgb, training_time_xgb, inference_time_xgb)
print("XGBoost Metrics:", metrics_xgb)

# Save results
results_xgb = pd.DataFrame(cv_results)
results_xgb.to_csv('gridsearch_xgb_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_xgb_results.csv'.")

# Save the best model
joblib.dump(best_xgb, "best_xgboost_model.pkl")
print("XGBoost model saved as 'best_xgboost_model.pkl'.")

In [None]:
# Debug check: number of entries in the most recently computed `scores` array
# (the last XGBoost candidate when the cells are run in order).
print(np.size(scores))

In [None]:
## SHAP Analysis
#explainer = shap.TreeExplainer(best_xgb)
#shap_values = explainer.shap_values(X_test)
#
## Global Feature Importance
#shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
#plt.savefig("xgb_shap_feature_importance.png")
#print("SHAP global feature importance saved as 'xgb_shap_feature_importance.png'.")
#
## Local Explanation for a Single Prediction
#sample_index = 0  # Change this index to visualize a specific sample
#shap.force_plot(
#    explainer.expected_value,
#    shap_values[sample_index],
#    X_test.iloc[sample_index],
#    matplotlib=True,
#).savefig("xgb_shap_local_explanation.png")
#print("SHAP local explanation for a sample saved as 'xgb_shap_local_explanation.png'.")
#
## Identify Top 10 Features
#feature_importance = best_xgb.feature_importances_
#important_features = pd.Series(feature_importance, index=X_train.columns).sort_values(ascending=False)
#top_features = important_features.head(10)
#print("Top 10 Features:\n", top_features)

# KNN


In [None]:
# Base estimator; reconfigured in place for every candidate of the search.
knn = KNeighborsClassifier()

# Search space: 5 * 2 * 2 = 20 candidate configurations.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 15],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Expand the grid into a list of parameter dicts.
all_params = list(ParameterGrid(param_grid_knn))

progress_bar = tqdm(total=len(all_params), desc="Training KNN models", unit="model")

# Search bookkeeping.
cv_results = []
best_score, best_params = -1, None

start_time = time.time()
with progress_bar:
    for params in all_params:
        knn.set_params(**params)

        # 5-fold cross-validated macro F1 for this candidate.
        scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
        mean_score, std_score = scores.mean(), scores.std()

        cv_results.append(dict(params, mean_f1=mean_score, std_f1=std_score))

        # Strict '>' keeps the first candidate when scores tie.
        if mean_score > best_score:
            best_score, best_params = mean_score, params

        progress_bar.update(1)

# Reported "training time" is the wall-clock duration of the whole search.
training_time_knn = time.time() - start_time

# Refit a fresh classifier with the winning configuration on the full training set.
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train, y_train)

# Time test-set prediction only.
start_time = time.time()
y_pred_knn = best_knn.predict(X_test)
inference_time_knn = time.time() - start_time

metrics_knn = calculate_metrics(y_test, y_pred_knn, training_time_knn, inference_time_knn)
print("KNN Metrics:", metrics_knn)

# Persist the per-candidate scores.
results_knn = pd.DataFrame(cv_results)
results_knn.to_csv('gridsearch_knn_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_knn_results.csv'.")

# Persist the fitted model.
joblib.dump(best_knn, "best_knn_model.pkl")
print("KNN model saved as 'best_knn_model.pkl'.")

In [None]:
## SHAP Analysis for KNN
#explainer = shap.KernelExplainer(best_knn.predict_proba, shap.sample(X_train, 100))  # Use a sample of the training data for efficiency
#shap_values = explainer.shap_values(X_test, nsamples=100)
#
## Global Feature Importance
#shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
#plt.savefig("knn_shap_feature_importance.png")
#print("SHAP global feature importance saved as 'knn_shap_feature_importance.png'.")
#
## Local Explanation for a Single Prediction
#sample_index = 0  # Change this index to visualize a specific sample
#shap.force_plot(
#    explainer.expected_value,
#    shap_values[1][sample_index],  # Assumes binary classification; change index if multi-class
#    X_test.iloc[sample_index],
#    matplotlib=True,
#).savefig("knn_shap_local_explanation.png")
#print("SHAP local explanation for a sample saved as 'knn_shap_local_explanation.png'.")
#
#mean_shap_values = np.abs(shap_values[1]).mean(axis=0)  # Change `1` to the index of the desired class if multi-class
#top_features = pd.Series(mean_shap_values, index=X_test.columns).sort_values(ascending=False).head(10)
#print("Top 10 Features Based on SHAP Values:\n", top_features)

# CNN + RNN


In [None]:
# Function to build the CNN+RNN model
def create_cnn_rnn_model(conv_filters=64, lstm_units=64, dense_units=128, dropout_rate=0.5,
                         learning_rate=0.001, n_features=None, num_classes=None):
    """Build and compile the Conv1D -> MaxPooling1D -> LSTM -> Dense classifier.

    Args:
        conv_filters: Number of filters of the Conv1D layer (kernel size 3).
        lstm_units: Number of units of the LSTM layer.
        dense_units: Number of units of the hidden Dense layer.
        dropout_rate: Rate used by both Dropout layers.
        learning_rate: Learning rate of the Adam optimizer.
        n_features: Length of each input sequence. Defaults to the second
            dimension of the notebook-level ``X_train_dl``.
        num_classes: Number of softmax outputs. Defaults to the width of the
            notebook-level one-hot target ``y_train_dl``.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    if n_features is None:
        n_features = X_train_dl.shape[1]
    if num_classes is None:
        # Take the width from the one-hot targets themselves so the output
        # layer always matches them. Counting the distinct labels in y_train
        # gives a smaller number whenever some label id is absent from the
        # training subset, which makes model.fit fail on a shape mismatch.
        num_classes = y_train_dl.shape[1]

    model = Sequential([
        Conv1D(conv_filters, 3, activation='relu', input_shape=(n_features, 1)),
        MaxPooling1D(pool_size=2),
        LSTM(lstm_units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Reshape data for CNN+RNN: add a trailing channel axis of size 1.
X_train_dl = np.expand_dims(X_train, axis=2)
X_test_dl = np.expand_dims(X_test, axis=2)

# One-hot encode both splits with the same explicit width. Without
# num_classes, to_categorical infers max(label) + 1 per call, so the train and
# test encodings can end up with different numbers of columns.
num_classes = int(max(np.max(y_train), np.max(y_test))) + 1
y_train_dl = to_categorical(y_train, num_classes=num_classes)
y_test_dl = to_categorical(y_test, num_classes=num_classes)

# Define parameter grid. Only conv_filters varies (3 candidates); the other
# entries are pinned to a single value. Add more values to any list to widen
# the search.
param_grid_cnn_rnn = {
    'conv_filters': [32, 64, 128],
    'lstm_units': [64],
    'dense_units': [64],
    'dropout_rate': [0.5,],
    'batch_size': [16, ],
    'epochs': [10],
}

# Generate all parameter combinations
all_params = list(ParameterGrid(param_grid_cnn_rnn))

# Initialize progress bar
progress_bar = tqdm(total=len(all_params), desc="Training CNN+RNN models", unit="model")

# To store results
cv_results = []
best_score = -1
best_params = None
best_model = None

# Start grid search
start_time = time.time()
for params in all_params:
    # Create the model with current parameters
    model = create_cnn_rnn_model(
        conv_filters=params['conv_filters'],
        lstm_units=params['lstm_units'],
        dense_units=params['dense_units'],
        dropout_rate=params['dropout_rate']
    )

    # Train the model; validation_split holds out 20% of the training data.
    history = model.fit(
        X_train_dl, y_train_dl,
        validation_split=0.2,
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        verbose=0
    )

    # Model selection uses the held-out validation accuracy of the last epoch.
    # Choosing the configuration by test-set accuracy would leak the test set
    # into the search and bias the metrics reported for the winner.
    val_accuracy = history.history['val_accuracy'][-1]

    # Test accuracy is still recorded for reference, but not used to choose.
    score = model.evaluate(X_test_dl, y_test_dl, verbose=0)
    accuracy = score[1]  # [loss, accuracy], following the metrics passed to compile()

    # Save results
    cv_results.append({**params, 'val_accuracy': val_accuracy, 'accuracy': accuracy})

    # Update best model parameters
    if val_accuracy > best_score:
        best_score = val_accuracy
        best_params = params
        best_model = model  # Keep the trained winner

    # Update progress bar
    progress_bar.update(1)

progress_bar.close()
training_time_cnn_rnn = time.time() - start_time

# Fail with a clear message instead of a NameError/AttributeError below.
if best_model is None:
    raise RuntimeError(
        "CNN+RNN search did not produce a usable model "
        "(no candidate reached a valid validation accuracy)."
    )

# Best model inference
start_time = time.time()
y_pred_cnn_rnn = np.argmax(best_model.predict(X_test_dl), axis=1)
inference_time_cnn_rnn = time.time() - start_time

# Calculate metrics
metrics_cnn_rnn = calculate_metrics(y_test, y_pred_cnn_rnn, training_time_cnn_rnn, inference_time_cnn_rnn)
print("CNN+RNN Metrics:", metrics_cnn_rnn)

# Save the best model
best_model.save("best_cnn_rnn_model.h5")
print("Best CNN+RNN model saved as 'best_cnn_rnn_model.h5'.")

# Save Grid Search Results
results_cnn_rnn = pd.DataFrame(cv_results)
results_cnn_rnn.to_csv('gridsearch_cnn_rnn_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_cnn_rnn_results.csv'.")

In [None]:
## SHAP Analysis for CNN+RNN
## Create a SHAP explainer. We use KernelExplainer because the model is a black-box.
#explainer = shap.KernelExplainer(best_model.predict, shap.sample(X_train_dl, 100))  # Use a sample of the training data for efficiency
#shap_values = explainer.shap_values(X_test_dl, nsamples=100)
#
## Global Feature Importance (Bar plot)
#shap.summary_plot(shap_values, X_test_dl, plot_type="bar", show=False)
#plt.savefig("cnn_rnn_shap_feature_importance.png")
#print("SHAP global feature importance saved as 'cnn_rnn_shap_feature_importance.png'.")
#
## Local Explanation for a Single Prediction (change sample_index for a specific instance)
#sample_index = 0  # Change this index to visualize a specific sample
#shap.force_plot(
#    explainer.expected_value,
#    shap_values[1][sample_index],  # Assumes binary classification; change index if multi-class
#    X_test_dl[sample_index],
#    matplotlib=True,
#).savefig("cnn_rnn_shap_local_explanation.png")
#print("SHAP local explanation for a sample saved as 'cnn_rnn_shap_local_explanation.png'.")
#
## Calculate the mean absolute SHAP value for each feature
#mean_shap_values = np.abs(shap_values[1]).mean(axis=0)  # Adjust index for multi-class
#top_features = pd.Series(mean_shap_values, index=X_test.columns).sort_values(ascending=False).head(10)
#print("Top 10 Features Based on SHAP Values:\n", top_features)


# Save metrics


In [None]:
# Collect the four per-model metric dicts into one table (one row per model)
# and write it to disk.
metric_rows = [metrics_rf, metrics_xgb, metrics_knn, metrics_cnn_rnn]
row_labels = ["Random Forest", "XGBoost", "KNN", "CNN+RNN"]
all_metrics = pd.DataFrame(metric_rows, index=row_labels)

all_metrics.to_csv('model_metrics.csv')
print("Metrics saved to 'model_metrics.csv'.")