In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.model_selection import StratifiedKFold

In [None]:
# Build the inputs shared by every hyperparameter search below.
data = pd.read_csv('final_data_balanced.csv')

# Target vector.
y = data.loc[:, 'label']

# Full feature matrix: every column except the two label columns.
X = data.drop(['label', 'detailed_label'], axis=1)

# Reduced feature matrix restricted to five selected columns.
X_reduced = data.loc[:, [
    'conn_state_RSTOS0',
    'orig_pkts',
    'orig_bytes',
    'resp_pkts',
    'conn_state_SF',
]]

# One seeded, shuffled 5-fold stratified splitter reused by all searches so
# every model is scored on the same folds.
stratified_cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

In [None]:
#Hyperparameter optimization on ML models on the balanced dataset
#Decision Tree
# Search space. scipy's randint(low, high) excludes `high`, so
# min_samples_split is drawn from 2..9 and min_samples_leaf from 1..8.
parameters_dist_tree = {
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}
# Seed the estimator as well as the search: the tree randomly permutes the
# candidate features at each split, so an unseeded tree gives scores that
# change from run to run even with a seeded search and seeded CV folds.
tree = DecisionTreeClassifier(random_state=42)
# n_iter is the number of parameter settings sampled from the search space.
tree_cv = RandomizedSearchCV(tree, parameters_dist_tree, n_iter=60, cv=stratified_cv, random_state=42)
tree_cv.fit(X, y)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(tree_cv.best_score_))

#Random Forest
# Search space. scipy's randint(low, high) excludes `high`, so n_estimators
# is drawn from 50..199, min_samples_split from 2..9, min_samples_leaf from 1..8.
parameter_dist_rf = {
    "n_estimators": randint(50, 200),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}
# Seed the forest: bootstrap sampling and per-split feature sub-sampling are
# random, so without a seed the tuned parameters and score are not reproducible
# even though the search and the CV folds are seeded.
forest = RandomForestClassifier(random_state=42)
forest_cv = RandomizedSearchCV(forest, parameter_dist_rf, n_iter=60, cv=stratified_cv, random_state=42)
forest_cv.fit(X, y)
print("Tuned Random Forest Parameters: {}".format(forest_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(forest_cv.best_score_))

#Gradient Boosting
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale]:
# learning_rate is drawn from [0.01, 0.11] and subsample from [0.5, 1.0].
# randint(low, high) excludes `high`.
parameter_dist_gb = {
    "n_estimators": randint(50, 150),
    "learning_rate": uniform(0.01, 0.1),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "subsample": uniform(0.5, 0.5)
}
# Seed the estimator: row sub-sampling (subsample < 1) and feature
# sub-sampling (max_features) are random, so an unseeded model makes the
# search results change between runs despite the seeded search and CV folds.
grad_boost = GradientBoostingClassifier(random_state=42)
grad_boost_cv = RandomizedSearchCV(grad_boost, parameter_dist_gb, n_iter=60, cv=stratified_cv, random_state=42)
grad_boost_cv.fit(X, y)
print("Tuned Gradient Boosting Parameters: {}".format(grad_boost_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(grad_boost_cv.best_score_))

#K-Nearest Neighbors
# Candidate settings for the randomized search; randint(1, 20) yields
# neighbour counts 1..19 because the upper bound is exclusive.
parameter_dist_knn = dict(
    n_neighbors=randint(1, 20),
    weights=["uniform", "distance"],
    p=[1, 2],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
)
knn = KNeighborsClassifier()
knn_cv = RandomizedSearchCV(
    estimator=knn,
    param_distributions=parameter_dist_knn,
    n_iter=60,
    cv=stratified_cv,
    random_state=42,
)
knn_cv.fit(X, y)
# Report the winning configuration and its mean cross-validated score.
print(f"Tuned K-Nearest Neighbors Parameters: {knn_cv.best_params_}")
print(f"Best score is {knn_cv.best_score_}")

#Histogram Gradient Boosting
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
# so learning_rate is drawn from [0.01, 0.11]. randint(low, high) excludes
# `high`, so max_bins is drawn from 100..254.
parameter_dist_hgb = {
    "learning_rate": uniform(0.01, 0.1),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_leaf_nodes": randint(10, 50),
    "min_samples_leaf": randint(1, 9),
    "l2_regularization": uniform(0.0, 1.0),
    "max_bins": randint(100, 255)
}
# Seed the estimator: binning sub-samples the data and early stopping (when
# it is active) uses a random validation split, so an unseeded model makes the
# search non-reproducible despite the seeded search and CV folds.
hist_grad_boost = HistGradientBoostingClassifier(random_state=42)
hist_grad_boost_cv = RandomizedSearchCV(hist_grad_boost, parameter_dist_hgb, n_iter=60, cv=stratified_cv, random_state=42)
hist_grad_boost_cv.fit(X, y)
print("Tuned HistGradientBoostingClassifier Parameters: {}".format(hist_grad_boost_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(hist_grad_boost_cv.best_score_))

In [None]:
#Hyperparameter optimization on ML models on the feature selected balanced dataset
#Decision Tree
# Search space. scipy's randint(low, high) excludes `high`, so
# min_samples_split is drawn from 2..9 and min_samples_leaf from 1..8.
parameters_dist_tree = {
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}
# Seed the estimator so the search is reproducible: the tree randomly
# permutes the candidate features at each split.
tree = DecisionTreeClassifier(random_state=42)
# n_iter is the number of parameter settings sampled from the search space.
tree_cv = RandomizedSearchCV(tree, parameters_dist_tree, n_iter=60, cv=stratified_cv, random_state=42)
# This cell tunes on the feature-selected data, so fit on X_reduced;
# fitting on X would only repeat the full-feature search.
tree_cv.fit(X_reduced, y)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(tree_cv.best_score_))

#Random Forest
# Search space. scipy's randint(low, high) excludes `high`, so n_estimators
# is drawn from 50..199, min_samples_split from 2..9, min_samples_leaf from 1..8.
parameter_dist_rf = {
    "n_estimators": randint(50, 200),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}
# Seed the forest so the search is reproducible: bootstrap sampling and
# per-split feature sub-sampling are random.
forest = RandomForestClassifier(random_state=42)
forest_cv = RandomizedSearchCV(forest, parameter_dist_rf, n_iter=60, cv=stratified_cv, random_state=42)
# This cell tunes on the feature-selected data, so fit on X_reduced;
# fitting on X would only repeat the full-feature search.
forest_cv.fit(X_reduced, y)
print("Tuned Random Forest Parameters: {}".format(forest_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(forest_cv.best_score_))

#Gradient Boosting
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale]:
# learning_rate is drawn from [0.01, 0.11] and subsample from [0.5, 1.0].
# randint(low, high) excludes `high`.
parameter_dist_gb = {
    "n_estimators": randint(50, 150),
    "learning_rate": uniform(0.01, 0.1),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 9),
    "subsample": uniform(0.5, 0.5)
}
# Seed the estimator so the search is reproducible: row sub-sampling
# (subsample < 1) and feature sub-sampling (max_features) are random.
grad_boost = GradientBoostingClassifier(random_state=42)
grad_boost_cv = RandomizedSearchCV(grad_boost, parameter_dist_gb, n_iter=60, cv=stratified_cv, random_state=42)
# This cell tunes on the feature-selected data, so fit on X_reduced;
# fitting on X would only repeat the full-feature search.
grad_boost_cv.fit(X_reduced, y)
print("Tuned Gradient Boosting Parameters: {}".format(grad_boost_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(grad_boost_cv.best_score_))

#K-Nearest Neighbors
# Search space. randint(1, 20) yields neighbour counts 1..19 because scipy's
# randint excludes the upper bound.
parameter_dist_knn = {
    "n_neighbors": randint(1, 20),
    "weights": ["uniform", "distance"],
    "p": [1, 2],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
}
knn = KNeighborsClassifier()
knn_cv = RandomizedSearchCV(knn, parameter_dist_knn, n_iter=60, cv=stratified_cv, random_state=42)
# This cell tunes on the feature-selected data, so fit on X_reduced;
# fitting on X would only repeat the full-feature search.
knn_cv.fit(X_reduced, y)
print("Tuned K-Nearest Neighbors Parameters: {}".format(knn_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(knn_cv.best_score_))

#Histogram Gradient Boosting
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale],
# so learning_rate is drawn from [0.01, 0.11]. randint(low, high) excludes
# `high`, so max_bins is drawn from 100..254.
parameter_dist_hgb = {
    "learning_rate": uniform(0.01, 0.1),
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_leaf_nodes": randint(10, 50),
    "min_samples_leaf": randint(1, 9),
    "l2_regularization": uniform(0.0, 1.0),
    "max_bins": randint(100, 255)
}
# Seed the estimator so the search is reproducible: binning sub-samples the
# data and early stopping (when it is active) uses a random validation split.
hist_grad_boost = HistGradientBoostingClassifier(random_state=42)
hist_grad_boost_cv = RandomizedSearchCV(hist_grad_boost, parameter_dist_hgb, n_iter=60, cv=stratified_cv, random_state=42)
# This cell tunes on the feature-selected data, so fit on X_reduced;
# fitting on X would only repeat the full-feature search.
hist_grad_boost_cv.fit(X_reduced, y)
print("Tuned HistGradientBoostingClassifier Parameters: {}".format(hist_grad_boost_cv.best_params_))
# best_score_ is the mean cross-validated score of the best setting.
print("Best score is {}".format(hist_grad_boost_cv.best_score_))