# Bibliotecas usadas

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from hiclass import LocalClassifierPerNode, LocalClassifierPerLevel, LocalClassifierPerParentNode

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, f1_score, precision_score

# Funções de utilidade

In [2]:
def _test_metrics(y_true, y_pred):
  """Return (accuracy, balanced accuracy, per-class precision, recall, F1)."""
  # average = None yields one value per class. zero_division = np.nan is
  # passed to all three per-class metrics so an undefined score is handled
  # the same way everywhere (previously recall/F1 of the flat branch used
  # the library default instead).
  per_class = {"average": None, "zero_division": np.nan}
  return (accuracy_score(y_true, y_pred),
          balanced_accuracy_score(y_true, y_pred),
          precision_score(y_true, y_pred, **per_class),
          recall_score(y_true, y_pred, **per_class),
          f1_score(y_true, y_pred, **per_class))


def run_model(model, X_train, X_test, y_train, y_test, scores = False, hierarchical = False):
  """Fit ``model`` on the training data and optionally score it on the test data.

  Parameters:
    model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
      in place.
    X_train, X_test: feature sets used for fitting and for prediction.
    y_train, y_test: targets. When ``hierarchical`` is true, ``y_test`` is
      indexed by the keys ``"Label"`` and ``"attack_cat"``.
    scores: when false (the default) the model is only fitted and used for
      prediction, and nothing is returned.
    hierarchical: when true, column 0 of the prediction is scored against
      ``y_test["Label"]`` and column 1 against ``y_test["attack_cat"]``.

  Returns:
    ``None`` when ``scores`` is false. Otherwise a list:
      * flat case: ``[metrics DataFrame, confusion matrix]``;
      * hierarchical case: ``[metrics dict, binary confusion matrix,
        multiclass confusion matrix]``.
  """
  model.fit(X_train, y_train)
  # Predictions are cast to int before being compared with the targets.
  y_predicted = model.predict(X_test).astype(int)

  if not scores:
    return None

  if hierarchical:
    binary_true, binary_predicted = y_test["Label"], y_predicted[:, 0]
    multiclass_true, multiclass_predicted = y_test["attack_cat"], y_predicted[:, 1]

    (test_accuracy_binary, test_balanced_accuracy_binary, test_precision_binary,
     test_recall_binary, test_f1_binary) = _test_metrics(binary_true, binary_predicted)
    (test_accuracy_multiclass, test_balanced_accuracy_multiclass, test_precision_multiclass,
     test_recall_multiclass, test_f1_multiclass) = _test_metrics(multiclass_true, multiclass_predicted)

    return [{"Acurácia no Teste Binário": test_accuracy_binary,
             "Acurácia no Teste Multiclasse": test_accuracy_multiclass,
             "Acurácia Balanceada no Teste Binário": test_balanced_accuracy_binary,
             "Acurácia Balanceada no Teste Multiclasse": test_balanced_accuracy_multiclass,
             "Precisão Binário": test_precision_binary,
             "Precisão Multiclasse": test_precision_multiclass,
             "Recall Binário": test_recall_binary,
             "Recall Multiclasse": test_recall_multiclass,
             "F1 Binário": test_f1_binary,
             "F1 Multiclasse": test_f1_multiclass},
            confusion_matrix(binary_true, binary_predicted),
            confusion_matrix(multiclass_true, multiclass_predicted)]

  (test_accuracy, test_balanced_accuracy, test_precision,
   test_recall, test_f1) = _test_metrics(y_test, y_predicted)

  # The scalar accuracies are broadcast against the per-class arrays, giving
  # one DataFrame row per class.
  return [pd.DataFrame({"Acurácia no Teste": test_accuracy,
                        "Acurácia Balanceada no Teste": test_balanced_accuracy,
                        "Precisão": test_precision,
                        "Recall": test_recall,
                        "F1": test_f1}),
          confusion_matrix(y_test, y_predicted)]

# Carrega-se os dados pré-processados

In [3]:
# Load the pickled dataset through a context manager so the file handle is
# always closed, even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only open files
# that were produced by this project.
with open("../data/files/Normalized_Data.pkl", "rb") as normalized_data_file:
  df = pkl.load(normalized_data_file)

In [4]:
# Features: every column except the two target columns.
X = df.drop(["attack_cat", "Label"], axis = 1)
# Targets: the attack category and the label, kept together in one frame.
y = df.loc[:, ["attack_cat", "Label"]]

In [5]:
# Hold out 20% of the rows for testing. The split is stratified on both
# target columns of ``y`` and seeded so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y.loc[:, ["Label", "attack_cat"]],
  random_state = 42,
  stratify = y,
  test_size = 0.2,
)

# The full frame is no longer referenced below; drop the name to free memory.
del df

# Separação da base em K-Folds

In [6]:
# Five shuffled folds with a fixed seed; reused by every cross-validation loop.
k_folds = KFold(
  random_state = 42,
  shuffle = True,
  n_splits = 5,
)

# Seleção das características mais importantes

In [7]:
# Restrict both splits to the eight features selected for the binary task
# (same columns, same order, for train and test).
X_train_binary = X_train.loc[
  :, ["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]
]
X_test_binary = X_test.loc[
  :, ["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]
]

# Execução dos modelos Adaptive Boosting, Random Forest, Bagging, Extreme Gradient Boosting e Multilayer Perceptron | Classificação binária

In [8]:
# Fresh, unfitted estimators for the binary task. Every model is seeded with
# the same random_state so runs are repeatable.
ada_boost = AdaBoostClassifier(
  n_estimators = 400,
  learning_rate = 1.5,
  algorithm = "SAMME",
  random_state = 42,
)
random_forest = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = -1,
  random_state = 42,
)
# NOTE(review): max_features = 1 is an integer; scikit-learn documents int
# values as a feature count (1.0 would be a fraction) - confirm this is intended.
bagging = BaggingClassifier(
  n_estimators = 100,
  max_samples = 0.3,
  max_features = 1,
  n_jobs = -1,
  random_state = 42,
)
mlp = MLPClassifier(
  hidden_layer_sizes = (100, 100, 100),
  activation = "tanh",
  random_state = 42,
)
xgboost = xgb.XGBClassifier(
  n_estimators = 400,
  max_depth = 5,
  learning_rate = 0.1,
  random_state = 42,
)

In [10]:
# Fit each binary model on the selected features of the hold-out split.
# run_model is called without scores, so it only fits and predicts.
for title, estimator in (
  ("Ada Boost", ada_boost),
  ("\nRandom Forest", random_forest),
  ("\nBagging", bagging),
  ("\nMLP", mlp),
  ("\nXGBoost", xgboost),
):
  print(title)
  run_model(estimator, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

Ada Boost
Acurácia no treino 0.9898077141691523 
Acurácia no teste: 0.989616284814973 
Acurácia balanceada no teste: 0.9810401500313333

Random Forest
Acurácia no treino 0.9954498636090957 
Acurácia no teste: 0.9925965587088367 
Acurácia balanceada no teste: 0.9812687681136072

Bagging
Acurácia no treino 0.9865562809942978 
Acurácia no teste: 0.9862777481412658 
Acurácia balanceada no teste: 0.9913534674646067

MLP
Acurácia no treino 0.9893485667267217 
Acurácia no teste: 0.9891458188568267 
Acurácia balanceada no teste: 0.9802983852649803

XGBoost
Acurácia no treino 0.9922550241704465 
Acurácia no teste: 0.9918583799042139 
Acurácia balanceada no teste: 0.9826362914765465

Stacking
Acurácia no treino 0.9943352130655744 
Acurácia no teste: 0.9925296304972175 
Acurácia balanceada no teste: 0.9809576240479746


## Salva-se os modelos

In [13]:
# Pickle each binary model to its own file. Opening the file in a ``with``
# block guarantees the handle is flushed and closed, even if pickling fails.
binary_model_files = {
  "../data/utilities/models/ada_boost_binary.pkl": ada_boost,
  "../data/utilities/models/random_forest_binary.pkl": random_forest,
  "../data/utilities/models/bagging_binary.pkl": bagging,
  "../data/utilities/models/mlp_binary.pkl": mlp,
  "../data/utilities/models/xgboost_binary.pkl": xgboost,
}

for model_path, model_to_save in binary_model_files.items():
  with open(model_path, "wb") as model_file:
    pkl.dump(model_to_save, model_file)

# Execução dos modelos Adaptive Boosting, Random Forest, Bagging, Extreme Gradient Boosting e Multilayer Perceptron | Classificação multiclasse

In [7]:
# Fresh, unfitted estimators for the multiclass task, configured exactly like
# the binary ones and seeded for repeatability.
ada_boost = AdaBoostClassifier(
  n_estimators = 400,
  learning_rate = 1.5,
  algorithm = "SAMME",
  random_state = 42,
)
random_forest = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = -1,
  random_state = 42,
)
# NOTE(review): max_features = 1 is an integer; scikit-learn documents int
# values as a feature count (1.0 would be a fraction) - confirm this is intended.
bagging = BaggingClassifier(
  n_estimators = 100,
  max_samples = 0.3,
  max_features = 1,
  n_jobs = -1,
  random_state = 42,
)
mlp = MLPClassifier(
  hidden_layer_sizes = (100, 100, 100),
  activation = "tanh",
  random_state = 42,
)
xgboost = xgb.XGBClassifier(
  n_estimators = 400,
  max_depth = 5,
  learning_rate = 0.1,
  random_state = 42,
)

In [None]:
# Fit each multiclass model on all features of the hold-out split, using the
# attack category as the target. No scores are requested here.
for title, estimator in (
  ("Ada Boost", ada_boost),
  ("\nRandom Forest", random_forest),
  ("\nBagging", bagging),
  ("\nMLP", mlp),
  ("\nXGBoost", xgboost),
):
  print(title)
  run_model(estimator, X_train, X_test, y_train["attack_cat"], y_test["attack_cat"])

## Salva-se os modelos

In [16]:
# Pickle each multiclass model to its own file. Opening the file in a ``with``
# block guarantees the handle is flushed and closed, even if pickling fails.
multiclass_model_files = {
  "../data/utilities/models/ada_boost_multiclass.pkl": ada_boost,
  "../data/utilities/models/random_forest_multiclass.pkl": random_forest,
  "../data/utilities/models/bagging_multiclass.pkl": bagging,
  "../data/utilities/models/mlp_multiclass.pkl": mlp,
  "../data/utilities/models/xgboost_multiclass.pkl": xgboost,
}

for model_path, model_to_save in multiclass_model_files.items():
  with open(model_path, "wb") as model_file:
    pkl.dump(model_to_save, model_file)

# Execução dos modelos: Classificador Local por Nó, Classificador Local por Nó Pai e Classificador Local por Nível | Classificação Hierárquica

In [7]:
# One random forest per hierarchical strategy: the three base estimators are
# separate instances that share the same settings.
local_classifier_per_node_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)
local_classifier_per_parent_node_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)
local_classifier_per_level_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)

# Wrap each base estimator in its hiclass strategy.
local_classifier_per_node = LocalClassifierPerNode(
  local_classifier_per_node_base_estimator
)
local_classifier_per_parent_node = LocalClassifierPerParentNode(
  local_classifier_per_parent_node_base_estimator
)
local_classifier_per_level = LocalClassifierPerLevel(
  local_classifier_per_level_base_estimator
)

In [8]:
# Fit each hierarchical classifier on the hold-out split. The targets are
# passed as two columns ordered "Label" then "attack_cat".
for title, estimator in (
  ("Local Classifier Per Node", local_classifier_per_node),
  ("\nLocal Classifier Per Parent Node", local_classifier_per_parent_node),
  ("\nLocal Classifier Per Level", local_classifier_per_level),
):
  print(title)
  run_model(estimator, X_train, X_test, y_train[["Label", "attack_cat"]], y_test[["Label", "attack_cat"]])

Local Classifier Per Node

Local Classifier Per Parent Node

Local Classifier Per Level


## Salva-se os modelos

In [9]:
# Pickle each hierarchical classifier to its own file. Opening the file in a
# ``with`` block guarantees the handle is flushed and closed, even if pickling fails.
hierarchical_model_files = {
  "../data/utilities/models/local_classifier_per_node.pkl": local_classifier_per_node,
  "../data/utilities/models/local_classifier_per_parent_node.pkl": local_classifier_per_parent_node,
  "../data/utilities/models/local_classifier_per_level.pkl": local_classifier_per_level,
}

for model_path, model_to_save in hierarchical_model_files.items():
  with open(model_path, "wb") as model_file:
    pkl.dump(model_to_save, model_file)

# Execução dos modelos Adaptive Boosting, Random Forest, Bagging, Extreme Gradient Boosting e Multilayer Perceptron em validação cruzada | Classificação binária

In [47]:
# Fresh, unfitted estimators for the cross-validated binary task, seeded for
# repeatability.
ada_boost = AdaBoostClassifier(
  n_estimators = 400,
  learning_rate = 1.5,
  algorithm = "SAMME",
  random_state = 42,
)
random_forest = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = -1,
  random_state = 42,
)
# NOTE(review): max_features = 1 is an integer; scikit-learn documents int
# values as a feature count (1.0 would be a fraction) - confirm this is intended.
bagging = BaggingClassifier(
  n_estimators = 100,
  max_samples = 0.3,
  max_features = 1,
  n_jobs = -1,
  random_state = 42,
)
mlp = MLPClassifier(
  hidden_layer_sizes = (100, 100, 100),
  activation = "tanh",
  random_state = 42,
)
xgboost = xgb.XGBClassifier(
  n_estimators = 400,
  max_depth = 5,
  learning_rate = 0.1,
  random_state = 42,
)

In [None]:
# One list of per-fold results for each binary model.
ada_boost_scores_binary = []
random_forest_scores_binary = []
bagging_scores_binary = []
mlp_scores_binary = []
xgboost_scores_binary = []

# NOTE: the loop rebinds X_train / X_test / y_train / y_test, so after this
# cell those names hold the data of the last fold.
for i, j in k_folds.split(X):
  # i and j are the positional row indices of the fold's train and test parts.
  fold_features = ["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]
  X_train = X.iloc[i][fold_features]
  X_test = X.iloc[j][fold_features]
  y_train = y.iloc[i]
  y_test = y.iloc[j]

  # Refit and score every model on this fold, keeping the result per model.
  for title, estimator, fold_results in (
    ("AdaBoost", ada_boost, ada_boost_scores_binary),
    ("\nRandom Forest", random_forest, random_forest_scores_binary),
    ("\nBagging", bagging, bagging_scores_binary),
    ("\nMLP", mlp, mlp_scores_binary),
    ("\nXGBoost", xgboost, xgboost_scores_binary),
  ):
    print(title)
    model_scores = run_model(estimator, X_train, X_test, y_train["Label"], y_test["Label"], scores = True)
    fold_results.append(model_scores)

## Salva-se os resultados

In [50]:
# Pickle the per-fold binary results, one file per model. Each score list is
# wrapped in an outer list, as before, so the stored layout is unchanged.
# The ``with`` block guarantees each file is flushed and closed.
binary_score_files = {
  "../data/files/results/ada_boost_scores_binary.pkl": ada_boost_scores_binary,
  "../data/files/results/random_forest_scores_binary.pkl": random_forest_scores_binary,
  "../data/files/results/bagging_scores_binary.pkl": bagging_scores_binary,
  "../data/files/results/mlp_scores_binary.pkl": mlp_scores_binary,
  "../data/files/results/xgboost_scores_binary.pkl": xgboost_scores_binary,
}

for scores_path, fold_scores in binary_score_files.items():
  with open(scores_path, "wb") as scores_file:
    pkl.dump([fold_scores], scores_file)

# Execução dos modelos Adaptive Boosting, Random Forest, Bagging, Extreme Gradient Boosting e Multilayer Perceptron em validação cruzada | Classificação multiclasse

In [7]:
# Fresh, unfitted estimators for the cross-validated multiclass task. The
# random forest and bagging models are limited to two parallel jobs here.
ada_boost = AdaBoostClassifier(
  n_estimators = 400,
  learning_rate = 1.5,
  algorithm = "SAMME",
  random_state = 42,
)
random_forest = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)
# NOTE(review): max_features = 1 is an integer; scikit-learn documents int
# values as a feature count (1.0 would be a fraction) - confirm this is intended.
bagging = BaggingClassifier(
  n_estimators = 100,
  max_samples = 0.3,
  max_features = 1,
  n_jobs = 2,
  random_state = 42,
)
mlp = MLPClassifier(
  hidden_layer_sizes = (100, 100, 100),
  activation = "tanh",
  random_state = 42,
)
xgboost = xgb.XGBClassifier(
  n_estimators = 400,
  max_depth = 5,
  learning_rate = 0.1,
  random_state = 42,
)

In [8]:
# One list of per-fold results for each multiclass model.
ada_boost_scores_multiclass = []
random_forest_scores_multiclass = []
bagging_scores_multiclass = []
mlp_scores_multiclass = []
xgboost_scores_multiclass = []

# NOTE: the loop rebinds X_train / X_test / y_train / y_test, so after this
# cell those names hold the data of the last fold.
for i, j in k_folds.split(X):
  # i and j are the positional row indices of the fold's train and test parts.
  X_train = X.iloc[i]
  X_test = X.iloc[j]
  y_train = y.iloc[i]
  y_test = y.iloc[j]

  # Refit and score every model on this fold, keeping the result per model.
  for title, estimator, fold_results in (
    ("AdaBoost", ada_boost, ada_boost_scores_multiclass),
    ("\nRandom Forest", random_forest, random_forest_scores_multiclass),
    ("\nBagging", bagging, bagging_scores_multiclass),
    ("\nMLP", mlp, mlp_scores_multiclass),
    ("\nXGBoost", xgboost, xgboost_scores_multiclass),
  ):
    print(title)
    model_scores = run_model(estimator, X_train, X_test, y_train["attack_cat"], y_test["attack_cat"], scores = True)
    fold_results.append(model_scores)

AdaBoost

Random Forest

Bagging

MLP

XGBoost
AdaBoost

Random Forest

Bagging

MLP

XGBoost
AdaBoost

Random Forest

Bagging

MLP

XGBoost
AdaBoost


KeyboardInterrupt: 

## Salva-se os resultados

In [9]:
# Pickle the per-fold multiclass results, one file per model. Each score list
# is wrapped in an outer list, as before, so the stored layout is unchanged.
# The ``with`` block guarantees each file is flushed and closed.
multiclass_score_files = {
  "../data/files/results/ada_boost_scores_multiclass_.pkl": ada_boost_scores_multiclass,
  "../data/files/results/random_forest_scores_multiclass_.pkl": random_forest_scores_multiclass,
  "../data/files/results/bagging_scores_multiclass_.pkl": bagging_scores_multiclass,
  "../data/files/results/mlp_scores_multiclass_.pkl": mlp_scores_multiclass,
  "../data/files/results/xgboost_scores_multiclass_.pkl": xgboost_scores_multiclass,
}

for scores_path, fold_scores in multiclass_score_files.items():
  with open(scores_path, "wb") as scores_file:
    pkl.dump([fold_scores], scores_file)

# Execução dos modelos: Classificador Local por Nó, Classificador Local por Nó Pai e Classificador Local por Nível em validação cruzada | Classificação Hierárquica

In [10]:
# One random forest per hierarchical strategy for the cross-validated run:
# three separate instances that share the same settings.
local_classifier_per_node_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)
local_classifier_per_parent_node_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)
local_classifier_per_level_base_estimator = RandomForestClassifier(
  n_estimators = 400,
  n_jobs = 2,
  random_state = 42,
)

# Wrap each base estimator in its hiclass strategy.
local_classifier_per_node = LocalClassifierPerNode(
  local_classifier_per_node_base_estimator
)
local_classifier_per_parent_node = LocalClassifierPerParentNode(
  local_classifier_per_parent_node_base_estimator
)
local_classifier_per_level = LocalClassifierPerLevel(
  local_classifier_per_level_base_estimator
)

In [11]:
# One list of per-fold results for each hierarchical classifier. Every entry
# is what run_model returns in hierarchical mode.
local_classifier_per_node_scores = []
local_classifier_per_parent_node_scores = []
local_classifier_per_level_scores = []

# NOTE: the loop rebinds X_train / X_test / y_train / y_test, so after this
# cell those names hold the data of the last fold.
for i, j in k_folds.split(X):
  # i and j are the positional row indices of the fold's train and test parts.
  X_train = X.iloc[i]
  X_test = X.iloc[j]
  y_train = y.iloc[i]
  y_test = y.iloc[j]

  # Refit and score every classifier on this fold; targets are passed as two
  # columns ordered "Label" then "attack_cat".
  for title, estimator, fold_results in (
    ("Local Classifier Per Node", local_classifier_per_node, local_classifier_per_node_scores),
    ("\nLocal Classifier Per Parent Node", local_classifier_per_parent_node, local_classifier_per_parent_node_scores),
    ("\nLocal Classifier Per Level", local_classifier_per_level, local_classifier_per_level_scores),
  ):
    print(title)
    model_scores = run_model(estimator, X_train, X_test, y_train[["Label", "attack_cat"]], y_test[["Label", "attack_cat"]], scores = True, hierarchical = True)
    fold_results.append(model_scores)

Local Classifier Per Node

Local Classifier Per Parent Node

Local Classifier Per Level
Local Classifier Per Node

Local Classifier Per Parent Node

Local Classifier Per Level
Local Classifier Per Node

Local Classifier Per Parent Node

Local Classifier Per Level
Local Classifier Per Node


KeyboardInterrupt: 

## Salva-se os resultados

In [12]:
# Pickle the per-fold hierarchical results, one file per classifier. Each
# score list is wrapped in an outer list, as before, so the stored layout is
# unchanged. The ``with`` block guarantees each file is flushed and closed.
hierarchical_score_files = {
  "../data/files/results/local_classifier_per_node_scores_.pkl": local_classifier_per_node_scores,
  "../data/files/results/local_classifier_per_parent_node_scores_.pkl": local_classifier_per_parent_node_scores,
  "../data/files/results/local_classifier_per_level_scores_.pkl": local_classifier_per_level_scores,
}

for scores_path, fold_scores in hierarchical_score_files.items():
  with open(scores_path, "wb") as scores_file:
    pkl.dump([fold_scores], scores_file)