# Bibliotecas usadas

In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv
import pickle as pkl

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, balanced_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


# Funções de utilidade

In [27]:
def run_model(model, X_train, X_test, y_train, y_test, verbose = False):
  model.fit(X_train, y_train)
  y_predicted = model.predict(X_test)

  train_accuracy = model.score(X_train, y_train)
  test_accuracy = accuracy_score(y_test, y_predicted)
  test_balanced_accuracy = balanced_accuracy_score(y_test, y_predicted)

  if verbose:
    print(f"Acurácia no treino {train_accuracy} \nAcurácia no teste: {test_accuracy} \nAcurácia balanceada no teste: {test_balanced_accuracy}")


def run_grid_search(model, param_grid, X_train, X_test, y_train, y_test, verbose = True, cv = 3, n_jobs = None):
  grid_search = GridSearchCV(model, param_grid, cv = cv, n_jobs = n_jobs, verbose = 3)
  grid_search.fit(X_train, y_train)

  if verbose:
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    print(grid_search.score(X_test, y_test))

# Carregamento da base pré-preprocessada

In [3]:
df = pkl.load(open("../data/files/Normalized_Data.pkl", "rb"))

# Separa-se matriz de características e vetor alvo. Separa-se também o conjunto de treino e teste

In [4]:
X = df.drop(columns = ["attack_cat", "Label"])
y = df[["attack_cat", "Label"]]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y[["Label", "attack_cat"]], test_size = 0.2, stratify = y, random_state = 42)

del df

# Busca-se os melhores parâmetros para o modelo Random Forest | Classificação binária

In [None]:
random_forest = RandomForestClassifier(random_state = 42, n_jobs = -1)

random_forest_param_grid = {
  "n_estimators": [50, 100, 200, 400],
  "max_depth": [None, 5, 10, 20],
  "min_samples_split": [2, 5, 10]
}

run_grid_search(random_forest, random_forest_param_grid, X_train, X_test, y_train["Label"], y_test["Label"], n_jobs = -1)

# Separação da base em K-Folds

In [7]:
k_folds = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Seleção de características para classificação binária

## A seleção de características se dará com base na média da importância das características dos modelos Random Forest

In [None]:
random_forest = RandomForestClassifier(random_state = 42, n_estimators = 400, n_jobs = -1)

feature_importance_random_forest = pd.DataFrame(data = {"Feature": X.columns, "Importance": np.zeros(X.shape[1])})


for i, j in k_folds.split(X):
  X_train_fold, X_test_fold = X.iloc[i], X.iloc[j]
  y_train_fold, y_test_fold = y.iloc[i], y.iloc[j]


  print("Random Forest")
  run_model(random_forest, X_train_fold, X_test_fold, y_train_fold["Label"], y_test_fold["Label"], verbose = True)
  print("\n")

  feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"] + random_forest.feature_importances_
  

feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"] / k_folds.n_splits

In [22]:
feature_importance_binary = pd.DataFrame({"Feature": X.columns,  
                                          "Random Forest Feature Importance": feature_importance_random_forest["Importance"]})

In [23]:
feature_importance_binary.sort_values(by = "Random Forest Feature Importance", ascending = False)

Unnamed: 0,Feature,Random Forest Feature Importance
2,dstip,0.171892
9,sttl,0.171047
36,ct_state_ttl,0.158517
7,sbytes,0.085213
0,srcip,0.080752
23,dmeansz,0.031187
22,smeansz,0.030895
8,dbytes,0.025733
14,Sload,0.024557
5,state,0.024217


# Busca-se os melhores parâmetros para o modelo Random Forest | Classificação multiclasse

In [13]:
random_forest = RandomForestClassifier(random_state = 42, n_jobs = -1)

random_forest_param_grid = {
  "n_estimators": [50, 100, 200, 400],
  "max_depth": [None, 2, 4, 10, 20],
  "min_samples_split": [2, 5, 10]
}

run_grid_search(random_forest, random_forest_param_grid, X_train, X_test, y_train["attack_cat"], y_test["attack_cat"], n_jobs = -1)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


# Seleção de características para classificação multiclasse

## A seleção de características se dará com base na média da importância das características dos modelos Random Forest

In [None]:
random_forest = RandomForestClassifier(random_state = 42, n_estimators = 400, n_jobs = -1)

feature_importance_random_forest = pd.DataFrame(data = {"Feature": X.columns, "Importance": np.zeros(X.shape[1])})


for i, j in k_folds.split(X):
  X_train_fold, X_test_fold = X.iloc[i], X.iloc[j]
  y_train_fold, y_test_fold = y.iloc[i], y.iloc[j]


  print("Random Forest")
  run_model(random_forest, X_train_fold, X_test_fold, y_train_fold["attack_cat"], y_test_fold["attack_cat"], verbose = True)
  print("\n")

  feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"] + random_forest.feature_importances_
  

feature_importance_random_forest["Importance"] = feature_importance_random_forest["Importance"] / k_folds.n_splits

In [25]:
feature_importance_multiclass = pd.DataFrame({"Feature": X.columns,  
                                              "Random Forest Feature Importance": feature_importance_random_forest["Importance"]})

In [26]:
feature_importance_multiclass.sort_values(by = "Random Forest Feature Importance", ascending = False)

Unnamed: 0,Feature,Random Forest Feature Importance
9,sttl,0.129761
2,dstip,0.125982
36,ct_state_ttl,0.120843
7,sbytes,0.11865
0,srcip,0.061702
22,smeansz,0.049278
14,Sload,0.03744
23,dmeansz,0.028542
8,dbytes,0.022531
10,dttl,0.020092


# Busca de melhores parâmetros para os modelos: Adaptive Boosting, Extreme Gradient Boosting, Bagging e MLP | Classificação binária

In [28]:
X_train_binary = X_train[["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]]
X_test_binary = X_test[["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]]

In [32]:
ada_boost = AdaBoostClassifier(random_state = 42, algorithm = "SAMME")
xgboost = xgb.XGBClassifier(random_state = 42)
bagging = BaggingClassifier(random_state = 42, n_jobs = -1)
mlp = MLPClassifier(random_state = 42)


ada_boost_param_grid = {
  "n_estimators": [100, 200, 400],
  "learning_rate": [0.1, 0.5, 1, 1.5]
}

run_grid_search(ada_boost, ada_boost_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

xgboost_param_grid = {
  "n_estimators": [100, 200, 400],
  "learning_rate": [0.1, 0.5, 1, 1.5],
  "max_depth": [None, 2, 5, 10]
}

run_grid_search(xgboost, xgboost_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

bagging_param_grid = {
  "n_estimators": [100, 200, 400],
  "max_samples": [0.3, 0.6, 1.0],
  "max_features": [0.3, 0.6, 1.0]
}

run_grid_search(bagging, bagging_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

mlp_param_grid = {
  "hidden_layer_sizes": [(100,), (100, 100), (100, 100, 100)],
  "activation": ["identity", "logistic", "tanh", "relu"],
}

run_grid_search(mlp, mlp_param_grid, X_train_binary, X_test_binary, y_train["Label"], y_test["Label"])

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV 1/3] END learning_rate=0.1, max_depth=None, n_estimators=100;, score=0.990 total time=   2.4s
[CV 2/3] END learning_rate=0.1, max_depth=None, n_estimators=100;, score=0.990 total time=   2.3s
[CV 3/3] END learning_rate=0.1, max_depth=None, n_estimators=100;, score=0.990 total time=   2.3s
[CV 1/3] END learning_rate=0.1, max_depth=None, n_estimators=200;, score=0.990 total time=   5.5s
[CV 2/3] END learning_rate=0.1, max_depth=None, n_estimators=200;, score=0.990 total time=   4.8s
[CV 3/3] END learning_rate=0.1, max_depth=None, n_estimators=200;, score=0.990 total time=   5.4s
[CV 1/3] END learning_rate=0.1, max_depth=None, n_estimators=400;, score=0.990 total time=  12.5s
[CV 2/3] END learning_rate=0.1, max_depth=None, n_estimators=400;, score=0.990 total time=  13.6s
[CV 3/3] END learning_rate=0.1, max_depth=None, n_estimators=400;, score=0.990 total time=  13.2s
[CV 1/3] END learning_rate=0.1, max_depth=2, n_estimator

# Busca de parâmetros para os modelos: Adaptive Boosting, Extreme Gradient Boosting, Bagging e MLP | Classificação multiclasse

In [33]:
X_train_multiclass = X_train[["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]]
X_test_multiclass = X_test[["dstip", "ct_state_ttl", "sttl", "srcip", "sbytes", "smeansz", "dmeansz", "Sload"]]

In [None]:
ada_boost = AdaBoostClassifier(random_state = 42, algorithm = "SAMME")
xgboost = xgb.XGBClassifier(random_state = 42)
bagging = BaggingClassifier(random_state = 42, n_jobs = -1)
mlp = MLPClassifier(random_state = 42)


ada_boost_param_grid = {
  "n_estimators": [100, 200, 400],
  "learning_rate": [0.1, 0.5, 1, 1.5]
}

run_grid_search(ada_boost, ada_boost_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

xgboost_param_grid = {
  "n_estimators": [100, 200, 400],
  "learning_rate": [0.1, 0.5, 1, 1.5],
  "max_depth": [None, 2, 5, 10]
}

run_grid_search(xgboost, xgboost_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

bagging_param_grid = {
  "n_estimators": [100, 200, 400],
  "max_samples": [0.3, 0.6, 1.0],
  "max_features": [0.3, 0.6, 1.0]
}

run_grid_search(bagging, bagging_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])

mlp_param_grid = {
  "hidden_layer_sizes": [(100,), (100, 100), (100, 100, 100)],
  "activation": ["identity", "logistic", "tanh", "relu"],
}

run_grid_search(mlp, mlp_param_grid, X_train_multiclass, X_test_multiclass, y_train["attack_cat"], y_test["attack_cat"])