# Imports

In [7]:
import pandas as pd
import numpy as np
import time
import joblib
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.model_selection import ParameterGrid, StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
from tqdm import tqdm
import numpy as np
import pandas as pd
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
import warnings
import shap
from tqdm import tqdm
import itertools
from lightgbm import LGBMClassifier
from sklearn.model_selection import ParameterGrid, cross_val_score
from tqdm import tqdm
import time
import pandas as pd
warnings.filterwarnings('ignore')

# Data Preprocessing

In [2]:
# Seed NumPy's global RNG so NumPy-based randomness further down is repeatable
np.random.seed(42)

# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_parquet("data/cic-collection.parquet")

# Features are every column except the two label columns; 'ClassLabel' is the target
X = data.drop(['Label', 'ClassLabel'], axis=1)
y = data['ClassLabel']

# Encode any non-numeric target as integer codes. Testing only for dtype == 'object'
# would let categorical or string-dtype labels through unencoded, which the
# estimators and the one-hot encoding further down cannot consume.
if not pd.api.types.is_numeric_dtype(y):
    y = pd.factorize(y)[0]

# Stratified split with a fixed seed.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, stratify=y, random_state=42)


In [18]:
# Keep only the leading rows of each split so the experiments below train faster:
# the first 10000 training rows and the first 1000 test rows.
X_train, y_train = X_train[:10000], y_train[:10000]
X_test, y_test = X_test[:1000], y_test[:1000]

# Function to calculate metrics

In [3]:
def calculate_metrics(y_true, y_pred, training_time, inference_time):
    """Collect classification scores and timings into one dictionary.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        training_time: Training duration to report.
        inference_time: Prediction duration to report.

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1", "Training Time"
        and "Inference Time". Precision, recall and F1 use weighted averaging;
        every value is rounded to 4 decimal places.
    """
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )

    summary = {"Accuracy": round(accuracy_score(y_true, y_pred), 4)}
    for label, scorer in weighted_scorers:
        summary[label] = round(scorer(y_true, y_pred, average="weighted"), 4)
    summary["Training Time"] = round(training_time, 4)
    summary["Inference Time"] = round(inference_time, 4)
    return summary




# Random Forest

In [None]:
# Base estimator; each hyperparameter combination is applied to it in turn
rf = RandomForestClassifier(random_state=42)

# Search space
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5,]
}

# Cartesian product of the grid values, as tuples ordered
# (n_estimators, max_depth, min_samples_split)
all_params = list(itertools.product(
    param_grid_rf['n_estimators'],
    param_grid_rf['max_depth'],
    param_grid_rf['min_samples_split']
))

best_score = -1
best_params = None
cv_results = []

start_time = time.time()

# Iterating the bar advances it once per combination and closes it when exhausted
progress_bar = tqdm(all_params, desc="Training models", unit="model")
for params in progress_bar:
    # Pair each value with its parameter name (the grid's key order matches the tuple order)
    param_dict = dict(zip(param_grid_rf, params))
    rf.set_params(**param_dict)

    # 5-fold cross-validated macro F1 for this combination
    scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
    mean_score, std_score = scores.mean(), scores.std()

    # One result row per combination
    cv_results.append(dict(param_dict, mean_f1=mean_score, std_f1=std_score))

    # Remember the best combination seen so far
    if mean_score > best_score:
        best_score, best_params = mean_score, param_dict

# Elapsed time of the whole search (the final refit below is not included)
training_time_rf = time.time() - start_time

# Refit a fresh forest on the full training data with the winning parameters
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train, y_train)

# Time prediction on the test split
start_time = time.time()
y_pred_rf = best_rf.predict(X_test)
inference_time_rf = time.time() - start_time

metrics_rf = calculate_metrics(y_test, y_pred_rf, training_time_rf, inference_time_rf)
print("Random Forest Metrics:", metrics_rf)

# Persist the per-combination scores
results_rf = pd.DataFrame(cv_results)
results_rf.to_csv('gridsearch_rf_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_rf_results.csv'.")

# Persist the fitted model
joblib.dump(best_rf, "best_random_forest_model.pkl")
print("Model saved as 'best_random_forest_model.pkl'.")

# Normalize data

In [7]:
# Standardise the features: the scaler's statistics are learned from the training
# split only, then the same transform is applied to both splits.
# (Swap in MinMaxScaler() if normalization to [0, 1] is preferred.)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost


In [None]:
# Define the model; the searched hyperparameters are applied per iteration below.
# NOTE(review): `use_label_encoder` and tree_method='gpu_hist' are reported as deprecated
# by recent XGBoost releases — confirm against the installed version.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, eval_metric='mlogloss', random_state=42, tree_method='gpu_hist'
)

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# XGBClassifier expects class labels to be the contiguous integers 0..n_classes-1.
# The labels were encoded on the full dataset and the training set was truncated
# afterwards, so some codes may be missing; re-index the labels that are actually
# present (this is the identity mapping when they are already contiguous).
_, y_train_xgb = np.unique(y_train, return_inverse=True)

# Generate all combinations of parameters
all_params = list(ParameterGrid(param_grid_xgb))

# To store results
cv_results = []
best_score = -1
best_params = None

# Start grid search; iterating the bar advances it and closes it when exhausted
start_time = time.time()
progress_bar = tqdm(all_params, desc="Training XGBoost models", unit="model")
for params in progress_bar:
    # Update the model with the current parameters
    xgb_model.set_params(**params)

    # Perform cross-validation
    scores = cross_val_score(xgb_model, X_train, y_train_xgb, cv=5, scoring='f1_weighted', n_jobs=-1)

    # A fold whose fit or scoring failed comes back as NaN. Keep the successful
    # folds, but report the failures instead of dropping them silently.
    n_failed = int(np.isnan(scores).sum())
    scores = scores[~np.isnan(scores)]
    if n_failed:
        tqdm.write(
            f"Warning: {n_failed} of {n_failed + scores.size} CV folds failed for params {params}"
        )

    if scores.size:
        mean_score = scores.mean()
        std_score = scores.std()
    else:
        # Every fold failed: record NaN explicitly rather than averaging an empty array
        mean_score = std_score = np.nan

    # Save results
    cv_results.append({**params, 'mean_f1': mean_score, 'std_f1': std_score})

    # Update best parameters (a NaN mean never compares greater, so it is never selected)
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

training_time_xgb = time.time() - start_time

if best_params is None:
    print("No XGBoost parameter combination produced a valid cross-validation score.")

# Save results
results_xgb = pd.DataFrame(cv_results)
results_xgb.to_csv('gridsearch_xgb_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_xgb_results.csv'.")

In [None]:
# Debugging aid: number of entries in `scores`, i.e. the fold scores left over from
# whichever grid-search loop assigned `scores` most recently in this kernel.
print(scores.size)

# LightGBM


In [4]:
# Define the model. The seed is fixed to match the other estimators in this
# notebook, which all pass random_state=42.
lgbm = LGBMClassifier(random_state=42)

param_grid_lgbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [-1, 10],
    'num_leaves': [31, 63]
}

# Generate all combinations of parameters
all_params = list(ParameterGrid(param_grid_lgbm))

# To store results
cv_results = []
best_score = -1
best_params = None

# Start grid search; iterating the bar advances it once per combination and
# closes it when the loop finishes
start_time = time.time()
progress_bar = tqdm(all_params, desc="Training LGBM models", unit="model")
for params in progress_bar:
    # Update the model with the current parameters
    lgbm.set_params(**params)

    # 5-fold cross-validated macro F1 for this combination
    scores = cross_val_score(lgbm, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
    mean_score = scores.mean()
    std_score = scores.std()

    # Save results
    cv_results.append({**params, 'mean_f1': mean_score, 'std_f1': std_score})

    # Update best parameters
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

# Elapsed time of the whole search
training_time_lgbm = time.time() - start_time

# Save results
results_lgbm = pd.DataFrame(cv_results)
results_lgbm.to_csv('gridsearch_lgbm_results.csv', index=False)
print("GridSearchCV results saved to 'gridsearch_lgbm_results.csv'.")


Training LGBM models: 100%|██████████| 16/16 [53:01<00:00, 198.86s/model]

GridSearchCV results saved to 'gridsearch_lgbm_results.csv'.





# CNN + RNN


In [None]:
# Function that builds the CNN+RNN model
def create_cnn_rnn_model(conv_filters=64, lstm_units=64, dense_units=128, dropout_rate=0.5, learning_rate=0.001,
                         n_features=None, n_classes=None):
    """Build and compile a Conv1D -> LSTM -> Dense softmax classifier.

    Layer stack: Conv1D (kernel size 3, ReLU) -> BatchNormalization ->
    MaxPooling1D (pool 2) -> LSTM -> Dropout -> Dense (ReLU) -> Dropout ->
    Dense (softmax). Compiled with Adam and categorical cross-entropy,
    tracking accuracy.

    Args:
        conv_filters: Number of filters in the Conv1D layer.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden Dense layer.
        dropout_rate: Rate used by both Dropout layers.
        learning_rate: Learning rate passed to the Adam optimizer.
        n_features: Length of the input sequence. Defaults to the second
            dimension of the module-level ``X_train_dl``.
        n_classes: Number of output units. Defaults to the width of the
            module-level one-hot targets ``y_train_dl``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if n_features is None:
        n_features = X_train_dl.shape[1]
    if n_classes is None:
        # The output layer must be as wide as the one-hot targets. to_categorical
        # infers that width as max(label) + 1, which is larger than the number of
        # distinct labels whenever a class code is missing from the training set.
        n_classes = y_train_dl.shape[1]

    model = Sequential([
        Conv1D(conv_filters, 3, activation='relu', kernel_initializer='he_uniform', input_shape=(n_features, 1)),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        LSTM(lstm_units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu', kernel_initializer='he_uniform'),
        Dropout(dropout_rate),
        Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Reduced parameter grid
param_grid_cnn_rnn = {
    'conv_filters': [32, 64],
    'lstm_units': [64, 128],
    'dense_units': [64, 128],
    'dropout_rate': [0.3, 0.5],
    'batch_size': [32],
    'epochs': [10, 20],
    'learning_rate': [0.001]  # Kept fixed to shrink the search
}

# Reshape the data: add a trailing axis for Conv1D and one-hot encode the targets
X_train_dl = np.expand_dims(X_train, axis=2)
X_test_dl = np.expand_dims(X_test, axis=2)
y_train_dl = to_categorical(y_train)
y_test_dl = to_categorical(y_test)

# Seed TensorFlow as well; only NumPy is seeded elsewhere in the notebook
tf.random.set_seed(42)

# Grid search state
all_params = list(ParameterGrid(param_grid_cnn_rnn))
cv_results = []
best_score = -1
best_params = None
best_model = None  # defined up front so it exists even if no combination improves on best_score
start_time = time.time()

# k-fold strategy
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

print(f"Total parameter combinations: {len(all_params)}")
progress_bar = tqdm(total=len(all_params), desc="Training CNN+RNN models", unit="model")

# Run the grid search
for params in all_params:
    fold_scores = []

    for train_idx, val_idx in kfold.split(X_train_dl, y_train):
        # Split into training and validation folds
        X_train_fold, X_val_fold = X_train_dl[train_idx], X_train_dl[val_idx]
        y_train_fold, y_val_fold = y_train_dl[train_idx], y_train_dl[val_idx]

        # Release Keras' accumulated global state before building another model;
        # otherwise memory grows with every model created in this loop.
        # NOTE(review): best_model keeps a reference to an earlier model — confirm it
        # is still usable after clear_session() on the installed TensorFlow version.
        tf.keras.backend.clear_session()

        # Create the model
        model = create_cnn_rnn_model(
            conv_filters=params['conv_filters'],
            lstm_units=params['lstm_units'],
            dense_units=params['dense_units'],
            dropout_rate=params['dropout_rate'],
            learning_rate=params['learning_rate']
        )

        # Train the model with early stopping
        early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, verbose=0)
        model.fit(
            X_train_fold, y_train_fold,
            validation_data=(X_val_fold, y_val_fold),
            batch_size=params['batch_size'],
            epochs=params['epochs'],
            verbose=0,
            callbacks=[early_stop]
        )

        # Evaluate on the validation fold
        score = model.evaluate(X_val_fold, y_val_fold, verbose=0)
        fold_scores.append(score[1])  # index 1 is the accuracy metric

    # Mean accuracy across folds
    avg_score = np.mean(fold_scores)
    cv_results.append({**params, 'accuracy': avg_score})

    # Update the best result if needed. `model` here is the one trained on the
    # last fold of this parameter combination.
    if avg_score > best_score:
        best_score = avg_score
        best_params = params
        best_model = model

    progress_bar.update(1)

progress_bar.close()
training_time_cnn_rnn = time.time() - start_time

# Save results
results_cnn_rnn = pd.DataFrame(cv_results)
results_cnn_rnn.to_csv('gridsearch_cnn_rnn_results.csv', index=False)

print(f"GridSearch completed in {training_time_cnn_rnn:.2f} seconds.")
print(f"Best Accuracy: {best_score:.4f} with params: {best_params}")
print("GridSearch results saved to 'gridsearch_cnn_rnn_results.csv'.")

# Save metrics
