In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

import optuna
import joblib  # to save best model

from tqdm import tqdm
tqdm.pandas(desc="Progress")

from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn

from sklearn.metrics import (
    f1_score, accuracy_score, precision_score,
    recall_score, confusion_matrix, classification_report
)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def plot_confusion_matrix(y_test, y_pred, target_column):
    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    # Display class labels if you know them
    labels = df[target_column].astype('category').cat.categories.tolist()
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

def plot_feature_importances(model, feature_names):
    # Get feature importances
    importances = model.feature_importances_
    
    # Plot feature importances
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importances, y=feature_names, palette="viridis")
    plt.title("XGBoost Feature Importances")
    plt.xlabel("Importance Score")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.show()

def calculate_f1_score(y_true, y_pred):
    from sklearn.metrics import f1_score
    # Macro F1 Score: treats all classes equally
    f1_macro = f1_score(y_test, y_pred, average='macro')

    # Micro F1 Score: globally by counting total true positives/negatives
    f1_micro = f1_score(y_test, y_pred, average='micro')

    # Weighted F1 Score: accounts for support (number of true instances per class)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    print(f"F1 Score (Macro): {f1_macro:.4f}")
    print(f"F1 Score (Micro): {f1_micro:.4f}")
    print(f"F1 Score (Weighted): {f1_weighted:.4f}")


def f1_per_class(y_test, y_pred, class_labels = None):
    # Compute F1 score per class
    class_f1_scores = f1_score(y_test, y_pred, average=None)

    # Set default class labels if not provided
    if class_labels is None:
        class_labels = sorted(set(y_test))

    # Plot
    plt.figure(figsize=(8, 5))
    sns.barplot(x=class_labels, y=class_f1_scores, palette='rocket')
    plt.title("F1 Score per Class")
    plt.xlabel("Class")
    plt.ylabel("F1 Score")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()

In [3]:
df = pd.read_csv('./data/final_data.csv')

In [4]:
df.head()

Unnamed: 0,high_activity,packets_ratio,source_port,low_activity,bytes_ratio,destination_port,nat_destination_port,elapsed_time_(sec),nat_source_port,burst_transfer,bytes_per_sec,action
0,0,0.5,57222,0,1.119048,53,53,30,54587,0,5.9,allow
1,0,1.0,56258,0,0.504891,3389,3389,17,56258,0,280.470572,allow
2,0,0.5,6881,0,0.975207,50321,50321,1199,43265,0,0.198499,allow
3,0,1.0,50553,0,0.760847,3389,3389,17,50553,0,195.705871,allow
4,0,0.684211,50002,0,0.364781,443,443,16,45848,0,1584.874901,allow


In [5]:
df.rename(columns={'action': 'target'}, inplace=True)

In [6]:
# Encode the target variable
df['target'] = df['target'].astype('category').cat.codes

In [7]:
# separating data in to features and target variable
X = df.drop(columns=['target'])
y = df['target']

print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

Shape of X: (57170, 11), Shape of y: (57170,)


In [8]:
# splitting data into train and test sets and validating the split

# Initial split: 80% train_val, 20% test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Split train_val into 75% train and 25% val → overall: 60% train, 20% val, 20% test
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of total
    stratify=y_train_val,
    random_state=42
)
print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}, Shape of y_val: {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}")

Shape of X_train: (34302, 11), Shape of y_train: (34302,)
Shape of X_val: (11434, 11), Shape of y_val: (11434,)
Shape of X_test: (11434, 11), Shape of y_test: (11434,)


In [9]:
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()

(target
 0    22463
 2     6981
 1     4826
 3       32
 Name: count, dtype: int64,
 target
 0    7488
 2    2327
 1    1608
 3      11
 Name: count, dtype: int64,
 target
 0    7488
 2    2327
 1    1608
 3      11
 Name: count, dtype: int64)

In [26]:

def objective(trial):
    # 📦 Hyperparameters
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 10),
        "objective": "multi:softprob",
        "num_class": len(np.unique(y_train)),
        "eval_metric": "mlogloss",
        "random_state": 42,
    }
    
    # Model definition
    model = xgb.XGBClassifier(**params) # Direct XGBClassifier, not in Pipeline for easier callback handling

    # Compute sample weights dynamically for each trial
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    
    model.fit(X_train, y_train,
              sample_weight=sample_weights,
              eval_set=[(X_val, y_val)], # Use X_test, y_test as validation set for early stopping
            #   early_stopping_rounds=50,    # Stop if validation metric doesn't improve for 50 rounds
              verbose=False                # Suppress verbose output during tuning
             )

    # Predict
    y_pred = model.predict(X_test)
    f1_weighted = f1_score(y_test, y_pred, average='macro')

    return f1_weighted


In [11]:
# Optuna Optimization 
print("\nStarting Optuna hyperparameter tuning...")
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50, show_progress_bar=True) # Reduced trials for demonstration

[I 2025-07-26 12:23:38,012] A new study created in memory with name: no-name-bf6e694c-1639-4739-909b-91c982f9bdbd



Starting Optuna hyperparameter tuning...


Best trial: 0. Best value: 0.92501:   2%|▏         | 1/50 [00:06<05:37,  6.88s/it]

[I 2025-07-26 12:23:44,888] Trial 0 finished with value: 0.9250103078931113 and parameters: {'learning_rate': 0.11861663446573512, 'max_depth': 10, 'n_estimators': 393, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_lambda': 0.6750277604651747, 'reg_alpha': 8.675143843171858}. Best is trial 0 with value: 0.9250103078931113.


Best trial: 1. Best value: 0.932844:   4%|▍         | 2/50 [00:08<03:11,  3.99s/it]

[I 2025-07-26 12:23:46,852] Trial 1 finished with value: 0.9328437354342375 and parameters: {'learning_rate': 0.18432335340553055, 'max_depth': 8, 'n_estimators': 108, 'subsample': 0.9849549260809971, 'colsample_bytree': 0.9162213204002109, 'gamma': 1.0616955533913808, 'reg_lambda': 1.9000671753502962, 'reg_alpha': 1.915704647548995}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:   6%|▌         | 3/50 [00:13<03:30,  4.47s/it]

[I 2025-07-26 12:23:51,906] Trial 2 finished with value: 0.9251044266014667 and parameters: {'learning_rate': 0.09823025045826593, 'max_depth': 7, 'n_estimators': 273, 'subsample': 0.645614570099021, 'colsample_bytree': 0.8059264473611898, 'gamma': 0.6974693032602092, 'reg_lambda': 2.99223202049866, 'reg_alpha': 3.726982248607548}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:   8%|▊         | 4/50 [00:17<03:07,  4.08s/it]

[I 2025-07-26 12:23:55,395] Trial 3 finished with value: 0.9251044266014667 and parameters: {'learning_rate': 0.1422602954229404, 'max_depth': 9, 'n_estimators': 180, 'subsample': 0.7571172192068059, 'colsample_bytree': 0.7962072844310213, 'gamma': 0.23225206359998862, 'reg_lambda': 6.1146940338242395, 'reg_alpha': 1.7881888245041864}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:  10%|█         | 5/50 [00:26<04:29,  5.99s/it]

[I 2025-07-26 12:24:04,751] Trial 4 finished with value: 0.9250103078931113 and parameters: {'learning_rate': 0.02886496196573106, 'max_depth': 10, 'n_estimators': 487, 'subsample': 0.9041986740582306, 'colsample_bytree': 0.6523068845866853, 'gamma': 0.48836057003191935, 'reg_lambda': 6.873906962470353, 'reg_alpha': 4.457509688022053}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:  12%|█▏        | 6/50 [00:29<03:28,  4.73s/it]

[I 2025-07-26 12:24:07,056] Trial 5 finished with value: 0.8468049415408737 and parameters: {'learning_rate': 0.045391088104985856, 'max_depth': 6, 'n_estimators': 113, 'subsample': 0.954660201039391, 'colsample_bytree': 0.6293899908000085, 'gamma': 3.31261142176991, 'reg_lambda': 3.1859396532851685, 'reg_alpha': 5.248673409660327}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:  14%|█▍        | 7/50 [00:36<04:04,  5.69s/it]

[I 2025-07-26 12:24:14,712] Trial 6 finished with value: 0.8979889397608612 and parameters: {'learning_rate': 0.1685459810095511, 'max_depth': 4, 'n_estimators': 488, 'subsample': 0.8875664116805573, 'colsample_bytree': 0.9697494707820946, 'gamma': 4.474136752138244, 'reg_lambda': 6.019209790229743, 'reg_alpha': 9.226554926728857}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:  16%|█▌        | 8/50 [00:39<03:14,  4.63s/it]

[I 2025-07-26 12:24:17,062] Trial 7 finished with value: 0.803532884491502 and parameters: {'learning_rate': 0.03566282559505665, 'max_depth': 4, 'n_estimators': 118, 'subsample': 0.6626651653816322, 'colsample_bytree': 0.6943386448447411, 'gamma': 1.3567451588694794, 'reg_lambda': 8.3045013406041, 'reg_alpha': 3.631857934266534}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 1. Best value: 0.932844:  18%|█▊        | 9/50 [00:41<02:48,  4.10s/it]

[I 2025-07-26 12:24:20,003] Trial 8 finished with value: 0.9222949656747865 and parameters: {'learning_rate': 0.09147100780934041, 'max_depth': 7, 'n_estimators': 156, 'subsample': 0.9010984903770198, 'colsample_bytree': 0.5372753218398854, 'gamma': 4.9344346830025865, 'reg_lambda': 7.7452232160369086, 'reg_alpha': 2.0672852471883068}. Best is trial 1 with value: 0.9328437354342375.


Best trial: 9. Best value: 0.943062:  20%|██        | 10/50 [00:50<03:40,  5.52s/it]

[I 2025-07-26 12:24:28,696] Trial 9 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.011601413965844696, 'max_depth': 9, 'n_estimators': 383, 'subsample': 0.8645035840204937, 'colsample_bytree': 0.8856351733429728, 'gamma': 0.3702232586704518, 'reg_lambda': 3.648810712588299, 'reg_alpha': 1.2471036892987843}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  22%|██▏       | 11/50 [00:56<03:40,  5.67s/it]

[I 2025-07-26 12:24:34,699] Trial 10 finished with value: 0.9250103078931113 and parameters: {'learning_rate': 0.2647891979752343, 'max_depth': 6, 'n_estimators': 357, 'subsample': 0.5089809378074099, 'colsample_bytree': 0.8770690800880656, 'gamma': 2.096390561241048, 'reg_lambda': 9.60173038485966, 'reg_alpha': 0.21003240974484227}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  24%|██▍       | 12/50 [01:00<03:18,  5.22s/it]

[I 2025-07-26 12:24:38,910] Trial 11 finished with value: 0.9235550610749494 and parameters: {'learning_rate': 0.2259516608407133, 'max_depth': 8, 'n_estimators': 261, 'subsample': 0.9729048599868566, 'colsample_bytree': 0.9717595030924887, 'gamma': 1.7939314875484835, 'reg_lambda': 3.4909083380037913, 'reg_alpha': 0.25493817364552085}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  26%|██▌       | 13/50 [01:07<03:27,  5.60s/it]

[I 2025-07-26 12:24:45,385] Trial 12 finished with value: 0.914970294657546 and parameters: {'learning_rate': 0.1954938375473548, 'max_depth': 8, 'n_estimators': 388, 'subsample': 0.8229282203516205, 'colsample_bytree': 0.8864550600123169, 'gamma': 2.6388944651501602, 'reg_lambda': 0.8488634693259103, 'reg_alpha': 6.561543882227312}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  28%|██▊       | 14/50 [01:12<03:18,  5.52s/it]

[I 2025-07-26 12:24:50,712] Trial 13 finished with value: 0.925198591258138 and parameters: {'learning_rate': 0.2840673647065922, 'max_depth': 9, 'n_estimators': 333, 'subsample': 0.9980082658737177, 'colsample_bytree': 0.8743294455456313, 'gamma': 0.01784971403017588, 'reg_lambda': 4.408626878732123, 'reg_alpha': 2.207935048321146}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  30%|███       | 15/50 [01:16<02:58,  5.11s/it]

[I 2025-07-26 12:24:54,869] Trial 14 finished with value: 0.915222405814896 and parameters: {'learning_rate': 0.22197975031348205, 'max_depth': 8, 'n_estimators': 225, 'subsample': 0.8447964091399753, 'colsample_bytree': 0.9307689382336943, 'gamma': 1.3805173532628663, 'reg_lambda': 1.9340714199024371, 'reg_alpha': 1.791688896647217}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  32%|███▏      | 16/50 [01:24<03:15,  5.76s/it]

[I 2025-07-26 12:25:02,149] Trial 15 finished with value: 0.9250103078931113 and parameters: {'learning_rate': 0.1686837171934692, 'max_depth': 9, 'n_estimators': 442, 'subsample': 0.6924839515611934, 'colsample_bytree': 0.7756848803021621, 'gamma': 2.8128928815738257, 'reg_lambda': 2.0128693428974023, 'reg_alpha': 2.950302064704135}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  34%|███▍      | 17/50 [01:29<03:10,  5.76s/it]

[I 2025-07-26 12:25:07,906] Trial 16 finished with value: 0.907395361996223 and parameters: {'learning_rate': 0.07336112273050144, 'max_depth': 5, 'n_estimators': 316, 'subsample': 0.9314636609266265, 'colsample_bytree': 0.9993615722428796, 'gamma': 1.2090566742174598, 'reg_lambda': 4.795738837577374, 'reg_alpha': 6.2506721803585386}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  36%|███▌      | 18/50 [01:34<02:55,  5.50s/it]

[I 2025-07-26 12:25:12,794] Trial 17 finished with value: 0.8908471054500506 and parameters: {'learning_rate': 0.01023467946429813, 'max_depth': 10, 'n_estimators': 202, 'subsample': 0.8579514049132655, 'colsample_bytree': 0.8426133099911273, 'gamma': 3.5359723928317566, 'reg_lambda': 1.6040361919816453, 'reg_alpha': 1.0688620031868474}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  38%|███▊      | 19/50 [01:41<03:05,  6.00s/it]

[I 2025-07-26 12:25:19,956] Trial 18 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.13649978960074022, 'max_depth': 3, 'n_estimators': 424, 'subsample': 0.7696363894573159, 'colsample_bytree': 0.71804991678614, 'gamma': 0.9396731082894904, 'reg_lambda': 3.394959212375065, 'reg_alpha': 3.011675182017197}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  40%|████      | 20/50 [01:49<03:13,  6.45s/it]

[I 2025-07-26 12:25:27,472] Trial 19 finished with value: 0.906278209886072 and parameters: {'learning_rate': 0.13364053316728727, 'max_depth': 3, 'n_estimators': 428, 'subsample': 0.734673133881798, 'colsample_bytree': 0.7421908381743831, 'gamma': 1.8859411466215472, 'reg_lambda': 4.10822101209132, 'reg_alpha': 5.367586651001604}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  42%|████▏     | 21/50 [01:57<03:23,  7.02s/it]

[I 2025-07-26 12:25:35,821] Trial 20 finished with value: 0.9150493647697421 and parameters: {'learning_rate': 0.06669535907500572, 'max_depth': 3, 'n_estimators': 430, 'subsample': 0.6006818104614415, 'colsample_bytree': 0.7159288929998385, 'gamma': 0.3707731696901395, 'reg_lambda': 5.352909333291652, 'reg_alpha': 2.8221271868981166}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  44%|████▍     | 22/50 [02:04<03:12,  6.89s/it]

[I 2025-07-26 12:25:42,396] Trial 21 finished with value: 0.9251044266014667 and parameters: {'learning_rate': 0.19640198180091425, 'max_depth': 8, 'n_estimators': 383, 'subsample': 0.782303687907612, 'colsample_bytree': 0.9149706102735462, 'gamma': 1.019098835441976, 'reg_lambda': 2.629258480934584, 'reg_alpha': 1.1212335974008452}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  46%|████▌     | 23/50 [02:09<02:48,  6.26s/it]

[I 2025-07-26 12:25:47,187] Trial 22 finished with value: 0.9004000720001285 and parameters: {'learning_rate': 0.24039202510247548, 'max_depth': 7, 'n_estimators': 281, 'subsample': 0.7191922923603448, 'colsample_bytree': 0.8204115858567087, 'gamma': 1.5949851101775123, 'reg_lambda': 3.604677087159918, 'reg_alpha': 3.0918824991306773}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  48%|████▊     | 24/50 [02:15<02:40,  6.16s/it]

[I 2025-07-26 12:25:53,115] Trial 23 finished with value: 0.9329379481329648 and parameters: {'learning_rate': 0.19123445483937648, 'max_depth': 5, 'n_estimators': 343, 'subsample': 0.8805991604929908, 'colsample_bytree': 0.6683050342129456, 'gamma': 0.923141869919963, 'reg_lambda': 2.432933426459842, 'reg_alpha': 0.9805039540175801}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  50%|█████     | 25/50 [02:21<02:32,  6.11s/it]

[I 2025-07-26 12:25:59,110] Trial 24 finished with value: 0.9327495687437016 and parameters: {'learning_rate': 0.1140778881820452, 'max_depth': 5, 'n_estimators': 341, 'subsample': 0.8589969793121721, 'colsample_bytree': 0.6610679688214769, 'gamma': 2.3031739737789234, 'reg_lambda': 2.591812062950813, 'reg_alpha': 1.029742410533474}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  52%|█████▏    | 26/50 [02:27<02:25,  6.08s/it]

[I 2025-07-26 12:26:05,125] Trial 25 finished with value: 0.915222405814896 and parameters: {'learning_rate': 0.29979510835429424, 'max_depth': 4, 'n_estimators': 369, 'subsample': 0.7989612996457269, 'colsample_bytree': 0.6166275299131155, 'gamma': 0.7128207058165171, 'reg_lambda': 0.32645628384210923, 'reg_alpha': 4.198463463186748}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  54%|█████▍    | 27/50 [02:35<02:38,  6.90s/it]

[I 2025-07-26 12:26:13,932] Trial 26 finished with value: 0.9153165702757838 and parameters: {'learning_rate': 0.2503401775002799, 'max_depth': 5, 'n_estimators': 452, 'subsample': 0.7604984496062445, 'colsample_bytree': 0.7558603275558285, 'gamma': 0.14160508075427003, 'reg_lambda': 4.0585780610001825, 'reg_alpha': 0.9151209135369558}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  56%|█████▌    | 28/50 [02:41<02:23,  6.50s/it]

[I 2025-07-26 12:26:19,501] Trial 27 finished with value: 0.9329379481329648 and parameters: {'learning_rate': 0.21017087837735787, 'max_depth': 3, 'n_estimators': 304, 'subsample': 0.9307741879126271, 'colsample_bytree': 0.5036445160187465, 'gamma': 0.9485798028610363, 'reg_lambda': 5.212386030593326, 'reg_alpha': 0.1470558137025968}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  58%|█████▊    | 29/50 [02:48<02:21,  6.72s/it]

[I 2025-07-26 12:26:26,733] Trial 28 finished with value: 0.9251044266014667 and parameters: {'learning_rate': 0.154633024099922, 'max_depth': 6, 'n_estimators': 405, 'subsample': 0.8213190480367871, 'colsample_bytree': 0.7055120330282184, 'gamma': 0.5792599716981898, 'reg_lambda': 1.352574624775233, 'reg_alpha': 2.5353458755910334}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  60%|██████    | 30/50 [02:55<02:16,  6.80s/it]

[I 2025-07-26 12:26:33,729] Trial 29 finished with value: 0.914954429540161 and parameters: {'learning_rate': 0.1252166636418829, 'max_depth': 4, 'n_estimators': 407, 'subsample': 0.7973071774804307, 'colsample_bytree': 0.5886417935302463, 'gamma': 1.5676063661889863, 'reg_lambda': 2.4515264211617094, 'reg_alpha': 6.666745878514207}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  62%|██████▏   | 31/50 [03:03<02:15,  7.16s/it]

[I 2025-07-26 12:26:41,708] Trial 30 finished with value: 0.9327495687437016 and parameters: {'learning_rate': 0.10009769163316082, 'max_depth': 5, 'n_estimators': 454, 'subsample': 0.8325322407954311, 'colsample_bytree': 0.5730681784765594, 'gamma': 2.995197872145817, 'reg_lambda': 0.7923261571831706, 'reg_alpha': 1.4294096674841237}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  64%|██████▍   | 32/50 [03:09<02:00,  6.68s/it]

[I 2025-07-26 12:26:47,295] Trial 31 finished with value: 0.9153165702757838 and parameters: {'learning_rate': 0.2230313420698033, 'max_depth': 3, 'n_estimators': 305, 'subsample': 0.9334951689612037, 'colsample_bytree': 0.514864797928912, 'gamma': 0.9382411840194043, 'reg_lambda': 5.393337853578779, 'reg_alpha': 0.17337396411726713}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  66%|██████▌   | 33/50 [03:15<01:50,  6.50s/it]

[I 2025-07-26 12:26:53,359] Trial 32 finished with value: 0.9153165702757838 and parameters: {'learning_rate': 0.19792755392545267, 'max_depth': 3, 'n_estimators': 344, 'subsample': 0.873148172258375, 'colsample_bytree': 0.5683797617939496, 'gamma': 0.9003086257996423, 'reg_lambda': 4.6112408166079994, 'reg_alpha': 0.7297138952649042}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  68%|██████▊   | 34/50 [03:20<01:35,  5.99s/it]

[I 2025-07-26 12:26:58,149] Trial 33 finished with value: 0.9328437354342375 and parameters: {'learning_rate': 0.17569765750651428, 'max_depth': 4, 'n_estimators': 249, 'subsample': 0.9227931522018202, 'colsample_bytree': 0.7342838940485049, 'gamma': 0.5580727060103948, 'reg_lambda': 3.7483187190808147, 'reg_alpha': 1.459217677995671}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  70%|███████   | 35/50 [03:25<01:27,  5.82s/it]

[I 2025-07-26 12:27:03,569] Trial 34 finished with value: 0.9391087695630487 and parameters: {'learning_rate': 0.2097388200299059, 'max_depth': 3, 'n_estimators': 315, 'subsample': 0.9645657455199594, 'colsample_bytree': 0.6811071583589248, 'gamma': 1.1818811479058728, 'reg_lambda': 6.316764365773222, 'reg_alpha': 0.523856731463697}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  72%|███████▏  | 36/50 [03:32<01:24,  6.05s/it]

[I 2025-07-26 12:27:10,170] Trial 35 finished with value: 0.9235550610749494 and parameters: {'learning_rate': 0.1552915445991553, 'max_depth': 4, 'n_estimators': 366, 'subsample': 0.966302227468579, 'colsample_bytree': 0.6814172024521776, 'gamma': 0.2943851808953415, 'reg_lambda': 6.20565451558878, 'reg_alpha': 2.0600317662769956}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 9. Best value: 0.943062:  74%|███████▍  | 37/50 [03:40<01:25,  6.61s/it]

[I 2025-07-26 12:27:18,084] Trial 36 finished with value: 0.8688088993276499 and parameters: {'learning_rate': 0.05979322112324996, 'max_depth': 3, 'n_estimators': 412, 'subsample': 0.8895810572189262, 'colsample_bytree': 0.660769563406926, 'gamma': 1.302684263592739, 'reg_lambda': 7.20228509333312, 'reg_alpha': 8.151346783894185}. Best is trial 9 with value: 0.9430615860902998.


Best trial: 37. Best value: 0.943156:  76%|███████▌  | 38/50 [03:45<01:15,  6.33s/it]

[I 2025-07-26 12:27:23,768] Trial 37 finished with value: 0.943155752976619 and parameters: {'learning_rate': 0.18572632917675896, 'max_depth': 9, 'n_estimators': 322, 'subsample': 0.9507730314617947, 'colsample_bytree': 0.6172894699068588, 'gamma': 0.7225954499111111, 'reg_lambda': 2.8254431770418664, 'reg_alpha': 3.5896720736505423}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  78%|███████▊  | 39/50 [03:51<01:08,  6.23s/it]

[I 2025-07-26 12:27:29,768] Trial 38 finished with value: 0.925198591258138 and parameters: {'learning_rate': 0.14809473051704045, 'max_depth': 10, 'n_estimators': 288, 'subsample': 0.9519401875678404, 'colsample_bytree': 0.6183010231948016, 'gamma': 0.01333980638640675, 'reg_lambda': 3.0552918648937335, 'reg_alpha': 3.4132459513120903}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  80%|████████  | 40/50 [03:59<01:06,  6.64s/it]

[I 2025-07-26 12:27:37,347] Trial 39 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.25347936908189306, 'max_depth': 9, 'n_estimators': 467, 'subsample': 0.9918293306503908, 'colsample_bytree': 0.7796030282285746, 'gamma': 0.4403058465602697, 'reg_lambda': 5.874963444311765, 'reg_alpha': 4.828410638739404}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  82%|████████▏ | 41/50 [04:07<01:04,  7.14s/it]

[I 2025-07-26 12:27:45,647] Trial 40 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.25756614251527216, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.9910527011633248, 'colsample_bytree': 0.7907583006829646, 'gamma': 0.6783852774742457, 'reg_lambda': 5.60893412059348, 'reg_alpha': 4.264802276607025}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  84%|████████▍ | 42/50 [04:15<00:59,  7.39s/it]

[I 2025-07-26 12:27:53,626] Trial 41 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.2663967083915026, 'max_depth': 9, 'n_estimators': 493, 'subsample': 0.978108349152475, 'colsample_bytree': 0.7906857903470916, 'gamma': 0.4310661283643323, 'reg_lambda': 5.776576766741928, 'reg_alpha': 4.619338009054724}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  86%|████████▌ | 43/50 [04:23<00:53,  7.61s/it]

[I 2025-07-26 12:28:01,761] Trial 42 finished with value: 0.9250103078931113 and parameters: {'learning_rate': 0.24704145858752194, 'max_depth': 10, 'n_estimators': 475, 'subsample': 0.9125610018314334, 'colsample_bytree': 0.8286440843070175, 'gamma': 0.7292243742504104, 'reg_lambda': 6.9029983580461085, 'reg_alpha': 4.0265185145084095}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  88%|████████▊ | 44/50 [04:31<00:45,  7.62s/it]

[I 2025-07-26 12:28:09,403] Trial 43 finished with value: 0.9430615860902998 and parameters: {'learning_rate': 0.2700330286743068, 'max_depth': 9, 'n_estimators': 470, 'subsample': 0.99284791196678, 'colsample_bytree': 0.7594185264085779, 'gamma': 0.47467925383884657, 'reg_lambda': 3.1841570907125996, 'reg_alpha': 5.46620673026255}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  90%|█████████ | 45/50 [04:39<00:38,  7.74s/it]

[I 2025-07-26 12:28:17,420] Trial 44 finished with value: 0.9251044266014667 and parameters: {'learning_rate': 0.29925934348652117, 'max_depth': 9, 'n_estimators': 468, 'subsample': 0.9513281322710282, 'colsample_bytree': 0.8412597774415306, 'gamma': 0.2637613883004015, 'reg_lambda': 8.498636368661682, 'reg_alpha': 4.66244154913578}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  92%|█████████▏| 46/50 [04:47<00:31,  7.84s/it]

[I 2025-07-26 12:28:25,495] Trial 45 finished with value: 0.9327495687437016 and parameters: {'learning_rate': 0.28104071652914125, 'max_depth': 9, 'n_estimators': 498, 'subsample': 0.9989747445591426, 'colsample_bytree': 0.7992388457464253, 'gamma': 0.6784474479652929, 'reg_lambda': 4.334759315652603, 'reg_alpha': 3.577957573791237}. Best is trial 37 with value: 0.943155752976619.


Best trial: 37. Best value: 0.943156:  94%|█████████▍| 47/50 [04:55<00:23,  7.78s/it]

[I 2025-07-26 12:28:33,149] Trial 46 finished with value: 0.8939811495448422 and parameters: {'learning_rate': 0.23639744934929519, 'max_depth': 8, 'n_estimators': 424, 'subsample': 0.6429311967311929, 'colsample_bytree': 0.8664053044761729, 'gamma': 3.944179129308928, 'reg_lambda': 3.681585260316309, 'reg_alpha': 5.668234196674583}. Best is trial 37 with value: 0.943155752976619.


Best trial: 47. Best value: 0.948617:  96%|█████████▌| 48/50 [05:02<00:15,  7.79s/it]

[I 2025-07-26 12:28:40,962] Trial 47 finished with value: 0.9486165123981306 and parameters: {'learning_rate': 0.08901379915169082, 'max_depth': 7, 'n_estimators': 449, 'subsample': 0.9468333362489876, 'colsample_bytree': 0.9031483836902796, 'gamma': 1.5485601338104753, 'reg_lambda': 4.983849338092982, 'reg_alpha': 4.943670816615743}. Best is trial 47 with value: 0.9486165123981306.


Best trial: 47. Best value: 0.948617:  98%|█████████▊| 49/50 [05:10<00:07,  7.77s/it]

[I 2025-07-26 12:28:48,679] Trial 48 finished with value: 0.9234608945954541 and parameters: {'learning_rate': 0.08872527917837253, 'max_depth': 7, 'n_estimators': 396, 'subsample': 0.9128350977395951, 'colsample_bytree': 0.911202969961542, 'gamma': 1.5168665510700026, 'reg_lambda': 4.750954549675857, 'reg_alpha': 4.955261367011411}. Best is trial 47 with value: 0.9486165123981306.


Best trial: 47. Best value: 0.948617: 100%|██████████| 50/50 [05:18<00:00,  6.36s/it]

[I 2025-07-26 12:28:56,150] Trial 49 finished with value: 0.888069424624257 and parameters: {'learning_rate': 0.048024962433257996, 'max_depth': 10, 'n_estimators': 376, 'subsample': 0.7699259306689287, 'colsample_bytree': 0.9460388522724476, 'gamma': 1.9873455783493728, 'reg_lambda': 4.945645852929802, 'reg_alpha': 7.597822372591748}. Best is trial 47 with value: 0.9486165123981306.





In [14]:
print("\n--- Optuna Optimization Results ---")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial number: {study.best_trial.number}")
print(f"Best F1-score: {study.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")


--- Optuna Optimization Results ---
Number of finished trials: 50
Best trial number: 47
Best F1-score: 0.9486
Best hyperparameters:
  learning_rate: 0.08901379915169082
  max_depth: 7
  n_estimators: 449
  subsample: 0.9468333362489876
  colsample_bytree: 0.9031483836902796
  gamma: 1.5485601338104753
  reg_lambda: 4.983849338092982
  reg_alpha: 4.943670816615743


In [13]:
# --- Optimization Results ---
try:
    fig_history = optuna.visualization.plot_optimization_history(study)
    fig_history.update_layout(title="Optuna Optimization History (F1-Score)")
    fig_history.show()

    fig_importance = optuna.visualization.plot_param_importances(study)
    fig_importance.update_layout(title="Hyperparameter Importances")
    fig_importance.show()

    fig_slice = optuna.visualization.plot_slice(study)
    fig_slice.update_layout(title="Slice Plot of Hyperparameters")
    fig_slice.show()
except Exception as e:
    print(f"\nCould not generate Optuna visualizations. Error: {e}")
    print("This might happen if Plotly is not installed or if running in an environment without a browser/display for interactive plots.")
    print("You can install plotly using: pip install plotly")



In [15]:
study.best_trial.params.items()

dict_items([('learning_rate', 0.08901379915169082), ('max_depth', 7), ('n_estimators', 449), ('subsample', 0.9468333362489876), ('colsample_bytree', 0.9031483836902796), ('gamma', 1.5485601338104753), ('reg_lambda', 4.983849338092982), ('reg_alpha', 4.943670816615743)])

## final Model

In [18]:
best_params = study.best_trial.params
print("\nBest Hyperparameters:")
for key, value in best_params.items():
    print(f"  {key}: {value}")  



Best Hyperparameters:
  learning_rate: 0.08901379915169082
  max_depth: 7
  n_estimators: 449
  subsample: 0.9468333362489876
  colsample_bytree: 0.9031483836902796
  gamma: 1.5485601338104753
  reg_lambda: 4.983849338092982
  reg_alpha: 4.943670816615743


In [19]:
# read the final data
print("\nReading final data for model training...")
df = pd.read_csv('./data/final_data.csv')

# renaming the target column to 'target' for consistency
df.rename(columns={'action': 'target'}, inplace=True)

# Encode the target variable
df['target'] = df['target'].astype('category').cat.codes

# separating data in to features and target variable
X = df.drop(columns=['target'])
y = df['target']

print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")


Reading final data for model training...
Shape of X: (57170, 11), Shape of y: (57170,)


In [20]:
# Initial split: 80% train_val, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=41
)

print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}")


Shape of X_train: (45736, 11), Shape of y_train: (45736,)
Shape of X_test: (11434, 11), Shape of y_test: (11434,)


In [22]:
# Model definition
final_model = xgb.XGBClassifier(**best_params)

# Compute sample weights dynamically for each trial
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

final_model.fit(X_train, y_train,
            sample_weight=sample_weights,
            verbose=True
            )


In [23]:
# predict
y_pred = final_model.predict(X_test)

In [42]:
def classification_report_and_confusion_matrix(y_true, y_pred, labels):
    # 📝 Classification report
    report_text = classification_report(y_true, y_pred, target_names=labels)
    ## save report to a text file
    with open("./data/classification_report.txt", "w") as f:
        f.write(report_text)
        
    # 🔄 Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig("./data/confusion_matrix.png")
    plt.close()
    # mlflow.log_artifact("./data/confusion_matrix.png")

    # 🎯 F1 Score per class
    f1_per_class = f1_score(y_true, y_pred, average='macro')
    plt.figure(figsize=(8, 5))
    # sns.barplot(x=labels, y=f1_per_class, palette="rocket")
    sns.barplot(x=labels, y=f1_per_class, hue=labels, palette="rocket", legend=False)
    plt.title("F1 Score per Class")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig("./data/f1_per_class.png")
    plt.close()
    # mlflow.log_artifact("./data/f1_per_class.png")

    return f1_per_class



In [43]:
labels = df['target'].astype('category').cat.categories.tolist()
labels = [str(label) for label in labels]  # Convert to string for better readability

In [44]:
classification_report_and_confusion_matrix(y_test, y_pred, labels=labels)

0.8731127837523902

In [45]:
## saving the model
joblib.dump(final_model, './data/final_model.pkl')


['./data/final_model.pkl']

In [46]:
df.columns

Index(['high_activity', 'packets_ratio', 'source_port', 'low_activity',
       'bytes_ratio', 'destination_port', 'nat_destination_port',
       'elapsed_time_(sec)', 'nat_source_port', 'burst_transfer',
       'bytes_per_sec', 'target'],
      dtype='object')

In [48]:
final_model.get_booster().feature_names


['high_activity',
 'packets_ratio',
 'source_port',
 'low_activity',
 'bytes_ratio',
 'destination_port',
 'nat_destination_port',
 'elapsed_time_(sec)',
 'nat_source_port',
 'burst_transfer',
 'bytes_per_sec']