# Bibliotecas usadas

In [39]:
import pandas as pd
import numpy as np
import sweetviz as sv
import pickle as pkl

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Funções de utilidade

In [40]:
def run_model(model, X_train, X_test, y_train, y_test, verbose = False):
  """Fit ``model`` on the training split and evaluate it on both splits.

  Parameters
  ----------
  model : estimator
      Object exposing ``fit``, ``predict`` and ``score``. It is fitted in
      place, so the caller can inspect it afterwards.
  X_train, X_test : array-like
      Feature matrices used for fitting and for evaluation.
  y_train, y_test : array-like
      Targets matching ``X_train`` and ``X_test``.
  verbose : bool, default False
      When True, print the three metrics.

  Returns
  -------
  dict
      ``train_accuracy`` (value of ``model.score`` on the training split),
      ``test_accuracy`` and ``test_balanced_accuracy``. Previously the
      metrics were only printed and then lost.
  """
  model.fit(X_train, y_train)
  y_predicted = model.predict(X_test)

  train_accuracy = model.score(X_train, y_train)
  test_accuracy = accuracy_score(y_test, y_predicted)
  test_balanced_accuracy = balanced_accuracy_score(y_test, y_predicted)

  if verbose:
    print(f"Acurácia no treino {train_accuracy} \nAcurácia no teste: {test_accuracy} \nAcurácia balanceada no teste: {test_balanced_accuracy}")

  # Hand the metrics back so callers can aggregate them across folds.
  return {
    "train_accuracy": train_accuracy,
    "test_accuracy": test_accuracy,
    "test_balanced_accuracy": test_balanced_accuracy,
  }


def run_grid_search(model, param_grid, X_train, X_test, y_train, y_test, verbose = True, cv = 3, n_jobs = None):
  """Run an exhaustive cross-validated grid search and report the outcome.

  Parameters
  ----------
  model : estimator
      Base estimator whose hyper-parameters are searched.
  param_grid : dict
      Mapping from parameter name to the list of candidate values.
  X_train, y_train : array-like
      Data on which the cross-validated search is fitted.
  X_test, y_test : array-like
      Held-out data; only used to score the refitted best estimator when
      ``verbose`` is True.
  verbose : bool, default True
      When True, print the best parameters, the best cross-validation score
      and the score on the held-out data.
  cv : int, default 3
      Forwarded to ``GridSearchCV`` as ``cv``.
  n_jobs : int or None, default None
      Forwarded to ``GridSearchCV`` as ``n_jobs``.

  Returns
  -------
  GridSearchCV
      The fitted search object, so the caller can reuse its results instead
      of re-running the search. Previously nothing was returned.
  """
  grid_search = GridSearchCV(model, param_grid, cv = cv, n_jobs = n_jobs)
  grid_search.fit(X_train, y_train)

  if verbose:
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    print(grid_search.score(X_test, y_test))

  return grid_search

# Carregamento da base pré-processada

In [3]:
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
# The context manager closes the file handle as soon as the object is loaded
# (the previous bare open() left it open until garbage collection).
with open("../data/files/Normalized_Data.pkl", "rb") as normalized_data_file:
  df = pkl.load(normalized_data_file)

# Separa-se matriz de características e vetor alvo. Separa-se também o conjunto de treino e teste

In [4]:
# Feature matrix: every column except the two targets.
X = df.drop(["attack_cat", "Label"], axis = 1)
# Target frame: multiclass target ("attack_cat") and binary target ("Label").
y = df.loc[:, ["attack_cat", "Label"]]

In [5]:
# 80/20 hold-out split, stratified on both target columns at once.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y[["Label", "attack_cat"]],
  stratify = y,
  random_state = 42,
  test_size = 0.2,
)

# X and y carry everything needed from here on; drop the name of the full frame.
del df

# Buscam-se os melhores parâmetros para o modelo Random Forest | Classificação binária

In [None]:
# Base forest; the tuned hyper-parameters come from the grid below.
random_forest = RandomForestClassifier(random_state = 42)

# 3 x 3 x 3 = 27 candidate combinations.
random_forest_param_grid = dict(
  n_estimators = [50, 100, 200],
  max_depth = [5, 10, 20],
  min_samples_split = [2, 5, 10],
)

# Binary task: the target is the "Label" column.
run_grid_search(
  random_forest,
  random_forest_param_grid,
  X_train,
  X_test,
  y_train["Label"],
  y_test["Label"],
  n_jobs = -1,
)

# Separação da base em K-Folds

In [8]:
# Five shuffled folds with a fixed seed, so the same splits are produced on every run.
k_folds = KFold(
  n_splits = 5,
  shuffle = True,
  random_state = 42,
)

# Seleção de características para classificação binária

## A seleção de características se dará com base na média da importância das características dos modelos Random Forest

In [18]:
# Forest refitted on each fold with fixed hyper-parameters.
random_forest = RandomForestClassifier(
  n_estimators = 200,
  max_depth = 10,
  min_samples_split = 10,
  random_state = 42,
  n_jobs = -1,
)

# Running sum of the per-fold importances, one row per feature column of X.
feature_importance_random_forest = pd.DataFrame(
  data = {
    "Feature": X.columns,
    "Importance": np.zeros(X.shape[1]),
  }
)

for train_idx, test_idx in k_folds.split(X):
  X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
  X_test_fold, y_test_fold = X.iloc[test_idx], y.iloc[test_idx]

  print("Random Forest")
  # Binary task: fit and evaluate against the "Label" column.
  run_model(random_forest, X_train_fold, X_test_fold, y_train_fold["Label"], y_test_fold["Label"], verbose = True)
  print("\n")

  # Add this fold's importances to the running sum.
  feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"].add(random_forest.feature_importances_)

# Turn the sum into the mean importance across folds.
feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"].div(k_folds.n_splits)


Random Forest
Acurácia no treino 0.9933258760833394 
Acurácia no teste: 0.9934587515526361 
Acurácia balanceada no teste: 0.984551543178408




Random Forest
Acurácia no treino 0.993384930416771 
Acurácia no teste: 0.9930335605611734 
Acurácia balanceada no teste: 0.9834435154975727




Random Forest
Acurácia no treino 0.9933534347722742 
Acurácia no teste: 0.9928701769857502 
Acurácia balanceada no teste: 0.9829033831087333




Random Forest
Acurácia no treino 0.9931762717719794 
Acurácia no teste: 0.9931949756597842 
Acurácia balanceada no teste: 0.983762673891661




Random Forest
Acurácia no treino 0.9932806044011204 
Acurácia no teste: 0.9930650425388676 
Acurácia balanceada no teste: 0.9831589263530347





In [30]:
# Snapshot of the averaged importances for the binary task, under a descriptive column name.
feature_importance_binary = pd.DataFrame(
  {
    "Feature": X.columns,
    "Random Forest Feature Importance": feature_importance_random_forest["Importance"],
  }
)

In [31]:
# Most important features first.
feature_importance_binary.sort_values("Random Forest Feature Importance", ascending = False)

Unnamed: 0,Feature,Random Forest Feature Importance
2,dstip,0.194365
36,ct_state_ttl,0.183802
9,sttl,0.156999
0,srcip,0.08502
7,sbytes,0.084182
23,dmeansz,0.033043
22,smeansz,0.030818
17,Dpkts,0.026701
14,Sload,0.025492
8,dbytes,0.021634


# Buscam-se os melhores parâmetros para o modelo Random Forest | Classificação multiclasse

In [None]:
# Base forest; the tuned hyper-parameters come from the grid below.
random_forest = RandomForestClassifier(random_state = 42)

# 3 x 3 x 3 = 27 candidate combinations.
random_forest_param_grid = dict(
  n_estimators = [50, 100, 200],
  max_depth = [5, 10, 20],
  min_samples_split = [2, 5, 10],
)

# Multiclass task: the target is the "attack_cat" column.
run_grid_search(
  random_forest,
  random_forest_param_grid,
  X_train,
  X_test,
  y_train["attack_cat"],
  y_test["attack_cat"],
  n_jobs = -1,
)

# Seleção de características para classificação multiclasse

## A seleção de características se dará com base na média da importância das características dos modelos Random Forest

In [38]:
# Forest refitted on each fold with fixed hyper-parameters.
random_forest = RandomForestClassifier(
  n_estimators = 200,
  max_depth = 10,
  min_samples_split = 10,
  random_state = 42,
  n_jobs = -1,
)

# Running sum of the per-fold importances, one row per feature column of X.
feature_importance_random_forest = pd.DataFrame(
  data = {
    "Feature": X.columns,
    "Importance": np.zeros(X.shape[1]),
  }
)

for train_idx, test_idx in k_folds.split(X):
  X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
  X_test_fold, y_test_fold = X.iloc[test_idx], y.iloc[test_idx]

  print("Random Forest")
  # Multiclass task: fit and evaluate against the "attack_cat" column.
  run_model(random_forest, X_train_fold, X_test_fold, y_train_fold["attack_cat"], y_test_fold["attack_cat"], verbose = True)
  print("\n")

  # Add this fold's importances to the running sum.
  feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"].add(random_forest.feature_importances_)

# Turn the sum into the mean importance across folds.
feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"].div(k_folds.n_splits)


Random Forest
Acurácia no treino 0.9794825560880834 
Acurácia no teste: 0.9792719391661926 
Acurácia balanceada no teste: 0.44012510943225136




Random Forest
Acurácia no treino 0.979520941404814 
Acurácia no teste: 0.9791203664516434 
Acurácia balanceada no teste: 0.44506302502470596




Random Forest
Acurácia no treino 0.9794279308296593 
Acurácia no teste: 0.9793250880401254 
Acurácia balanceada no teste: 0.4382971723829529




Random Forest
Acurácia no treino 0.9795361971076172 
Acurácia no teste: 0.9793309934705624 
Acurácia balanceada no teste: 0.44147085611759407




Random Forest
Acurácia no treino 0.97962921770763 
Acurácia no teste: 0.978945130569324 
Acurácia balanceada no teste: 0.4436652268804737





In [41]:
# Snapshot of the averaged importances for the multiclass task, under a descriptive column name.
feature_importance_multiclass = pd.DataFrame(
  {
    "Feature": X.columns,
    "Random Forest Feature Importance": feature_importance_random_forest["Importance"],
  }
)

In [42]:
# Most important features first.
feature_importance_multiclass.sort_values("Random Forest Feature Importance", ascending = False)

Unnamed: 0,Feature,Random Forest Feature Importance
36,ct_state_ttl,0.14575
2,dstip,0.139563
9,sttl,0.126505
7,sbytes,0.112985
0,srcip,0.073725
22,smeansz,0.052694
14,Sload,0.034237
23,dmeansz,0.029781
45,ct_dst_sport_ltm,0.02872
17,Dpkts,0.022272


# Busca de melhores parâmetros para os modelos: Adaptive Boosting, Extreme Gradient Boosting, Bagging e MLP | Classificação binária

In [None]:
# The five features ranked highest by the averaged Random Forest importance for the binary task.
binary_feature_names = ["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes"]

X_train_binary = X_train[binary_feature_names]
X_test_binary = X_test[binary_feature_names]

In [None]:
# Base estimators for the binary task; every tuned hyper-parameter comes from the grids below.
ada_boost = AdaBoostClassifier(random_state = 42, algorithm = "SAMME")
xgboost = xgb.XGBClassifier(random_state = 42)
bagging = BaggingClassifier(random_state = 42, n_jobs = -1)
mlp = MLPClassifier(random_state = 42)


ada_boost_param_grid = {
  "n_estimators": [50, 100, 200],
  "learning_rate": [0.1, 0.5, 1]
}

run_grid_search(ada_boost, ada_boost_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

# "min_samples_split" was removed from this grid: it is a scikit-learn tree
# parameter that XGBoost does not define, so it only multiplied the number of
# fits by three (81 instead of 27 candidates) without tuning anything.
# NOTE(review): "min_child_weight" is the closest XGBoost knob if a
# split-size constraint is wanted.
xgboost_param_grid = {
  "n_estimators": [50, 100, 200],
  "learning_rate": [0.1, 0.5, 1],
  "max_depth": [2, 5, 10]
}

run_grid_search(xgboost, xgboost_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

bagging_param_grid = {
  "n_estimators": [50, 100, 200],
  "max_samples": [0.3, 0.6, 1.0],
  "max_features": [0.3, 0.6, 1.0]
}

run_grid_search(bagging, bagging_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

mlp_param_grid = {
  "hidden_layer_sizes": [(100,), (100, 100), (100, 100, 100)],
  "activation": ["identity", "logistic", "tanh", "relu"],
}

run_grid_search(mlp, mlp_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

# Busca de parâmetros para os modelos: Adaptive Boosting, Extreme Gradient Boosting, Bagging e MLP | Classificação multiclasse

In [None]:
# Same five columns that top the multiclass importance ranking (listed here in a different order).
multiclass_feature_names = ["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes"]

X_train_multiclass = X_train[multiclass_feature_names]
X_test_multiclass = X_test[multiclass_feature_names]

In [None]:
# Base estimators for the multiclass task; every tuned hyper-parameter comes from the grids below.
ada_boost = AdaBoostClassifier(random_state = 42, algorithm = "SAMME")
xgboost = xgb.XGBClassifier(random_state = 42)
bagging = BaggingClassifier(random_state = 42, n_jobs = -1)
mlp = MLPClassifier(random_state = 42)


ada_boost_param_grid = {
  "n_estimators": [50, 100, 200],
  "learning_rate": [0.1, 0.5, 1]
}

run_grid_search(ada_boost, ada_boost_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

# "min_samples_split" was removed from this grid: it is a scikit-learn tree
# parameter that XGBoost does not define, so it only multiplied the number of
# fits by three (81 instead of 27 candidates) without tuning anything.
# NOTE(review): "min_child_weight" is the closest XGBoost knob if a
# split-size constraint is wanted.
xgboost_param_grid = {
  "n_estimators": [50, 100, 200],
  "learning_rate": [0.1, 0.5, 1],
  "max_depth": [2, 5, 10]
}

# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1 - confirm "attack_cat" is already integer-encoded in the
# pre-processed data.
run_grid_search(xgboost, xgboost_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

bagging_param_grid = {
  "n_estimators": [50, 100, 200],
  "max_samples": [0.3, 0.6, 1.0],
  "max_features": [0.3, 0.6, 1.0]
}

run_grid_search(bagging, bagging_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

mlp_param_grid = {
  "hidden_layer_sizes": [(100,), (100, 100), (100, 100, 100)],
  "activation": ["identity", "logistic", "tanh", "relu"],
}

run_grid_search(mlp, mlp_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])