In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

In [None]:
# Root folder of the pre-split CICIDS2017 CSV files. Kept in a single constant
# so the notebook can be pointed at another machine's copy by editing one line.
DATA_DIR = "/home/admin2/ML_NIC/datasets/cicids2017"

# One training CSV per attack type, plus the two benign captures
# (wedBenign / thurBenign). Each is read with pandas' default settings.
slowLoris_train = pd.read_csv(f"{DATA_DIR}/slowLorisTrain.csv")
slowHttp_train = pd.read_csv(f"{DATA_DIR}/slowHttpTrain.csv")
goldenEye_train = pd.read_csv(f"{DATA_DIR}/goldenEyeTrain.csv")
hulk_train = pd.read_csv(f"{DATA_DIR}/hulkTrain.csv")
bruteForce_train = pd.read_csv(f"{DATA_DIR}/bruteForceTrain.csv")
portScan_train = pd.read_csv(f"{DATA_DIR}/portScanTrain.csv")
wedBenign_train = pd.read_csv(f"{DATA_DIR}/wedBenignTrain.csv")
thurBenign_train = pd.read_csv(f"{DATA_DIR}/thurBenignTrain.csv")

In [None]:
# Root folder of the pre-split CICIDS2017 CSV files. Defined in this cell so it
# runs on its own; edit this one line to relocate the dataset.
DATA_DIR = "/home/admin2/ML_NIC/datasets/cicids2017"

# One held-out test CSV per attack type, plus the two benign captures
# (wedBenign / thurBenign). Each is read with pandas' default settings.
slowLoris_test = pd.read_csv(f"{DATA_DIR}/slowLorisTest.csv")
slowHttp_test = pd.read_csv(f"{DATA_DIR}/slowHttpTest.csv")
goldenEye_test = pd.read_csv(f"{DATA_DIR}/goldenEyeTest.csv")
hulk_test = pd.read_csv(f"{DATA_DIR}/hulkTest.csv")
bruteForce_test = pd.read_csv(f"{DATA_DIR}/bruteForceTest.csv")
portScan_test = pd.read_csv(f"{DATA_DIR}/portScanTest.csv")
wedBenign_test = pd.read_csv(f"{DATA_DIR}/wedBenignTest.csv")
thurBenign_test = pd.read_csv(f"{DATA_DIR}/thurBenignTest.csv")

In [None]:
# Stack the per-attack and benign frames into one training and one test frame.
# Row order follows the order of the lists below.
train_parts = [
    slowLoris_train,
    slowHttp_train,
    goldenEye_train,
    hulk_train,
    bruteForce_train,
    portScan_train,
    wedBenign_train,
    thurBenign_train,
]
test_parts = [
    bruteForce_test,
    slowLoris_test,
    slowHttp_test,
    goldenEye_test,
    hulk_test,
    portScan_test,
    wedBenign_test,
    thurBenign_test,
]

# ignore_index=True assigns a fresh 0..n-1 index. Without it, each CSV's own
# 0-based row labels are carried over and repeat across the combined frame,
# which makes any later label-based lookup or alignment ambiguous.
trainDf = pd.concat(train_parts, ignore_index=True)
testDf = pd.concat(test_parts, ignore_index=True)

In [None]:
# Build the training feature matrix: every column except the target.
X_train = trainDf.drop(columns=["label"]).to_numpy()

# Encode attack labels as integer class ids; a label's position in `attacks`
# is its id (benign -> 0, ..., i_portscan -> 6).
attacks = ['benign', 'dos_slowloris', 'dos_slowhttptest', 'dos_hulk', 'dos_goldeneye', 'wa_brute_force', 'i_portscan']
label_to_id = {attack_name: class_id for class_id, attack_name in enumerate(attacks)}

# An explicit map is used instead of Series.replace(list, range): replace leaves
# any label missing from `attacks` as a string and relies on pandas implicitly
# downcasting the object column to integers, which recent pandas deprecates.
train_label_ids = trainDf["label"].map(label_to_id)

# Fail fast with a readable message if the data contains a label we cannot encode.
# The mask is converted to a plain array so repeated row labels cannot affect it.
unmapped_train = train_label_ids.isna().to_numpy()
if unmapped_train.any():
    unknown_train_labels = sorted(set(map(str, trainDf["label"].to_numpy()[unmapped_train])))
    raise ValueError(f"Unknown label(s) in training data: {unknown_train_labels}")

y_train = train_label_ids.astype("int64").to_numpy()


In [None]:
# Hyperparameter grid assembled from the values used in the baseline authors' code.
# NOTE(review): every min_samples_leaf here is >= 2, so a node needs at least
# 4 samples before it can split; min_samples_split=2 and =3 therefore
# presumably give identical trees. Confirm before trimming the grid.
params = dict(
    min_samples_leaf=list(range(2, 7)),
    min_samples_split=[2, 3],
    min_impurity_decrease=[0.00001, 0.0003, 0.0004, 0.0005, 0.005],
)


In [None]:
# Base estimator: a decision tree with a fixed seed so repeated runs match.
clf = DecisionTreeClassifier(
    splitter='best',
    min_weight_fraction_leaf=0.0,
    random_state=99,
)

# Exhaustive search over `params` with 5-fold cross-validation, ranked by
# macro-averaged F1. refit=True retrains the winning configuration on the
# full training set once the search finishes.
grid = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring="f1_macro",
    cv=5,
    refit=True,
)
grid.fit(X_train, y_train)

In [None]:
# Take the refitted winner of the grid search and persist it next to the
# notebook as "model.pkl".
best_clf = grid.best_estimator_
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(best_clf, model_file)

In [None]:
### Evaluate classifier on test set
# Build the test feature matrix: every column except the target.
X_test = testDf.drop(columns=["label"]).to_numpy()

# Encode labels as integer class ids; a label's position in `attacks` is its id
# (benign -> 0, ..., i_portscan -> 6).
attacks = ['benign', 'dos_slowloris', 'dos_slowhttptest', 'dos_hulk', 'dos_goldeneye', 'wa_brute_force', 'i_portscan']
label_to_id = {attack_name: class_id for class_id, attack_name in enumerate(attacks)}

# An explicit map is used instead of Series.replace(list, range): replace leaves
# any label missing from `attacks` as a string and relies on pandas implicitly
# downcasting the object column to integers, which recent pandas deprecates.
test_label_ids = testDf["label"].map(label_to_id)

# Fail fast with a readable message if the data contains a label we cannot encode.
# The mask is converted to a plain array so repeated row labels cannot affect it.
unmapped_test = test_label_ids.isna().to_numpy()
if unmapped_test.any():
    unknown_test_labels = sorted(set(map(str, testDf["label"].to_numpy()[unmapped_test])))
    raise ValueError(f"Unknown label(s) in test data: {unknown_test_labels}")

y_test = test_label_ids.astype("int64").to_numpy()


In [None]:
# Predict a class id for every test row using the best tree found by the grid search.
y_pred = best_clf.predict(X_test)

In [None]:
# Score the test-set predictions: overall accuracy first, then the
# macro-averaged (unweighted mean over classes) F1, precision and recall.
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

macro_metrics = (
    ("F1 Score (Macro)", f1_score),
    ("Precision Score (Macro)", precision_score),
    ("Recall Score (Macro)", recall_score),
)
for metric_name, metric_fn in macro_metrics:
    metric_value = metric_fn(y_test, y_pred, average='macro')
    print(f"{metric_name}: {metric_value}")