In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import os
import sys
from sklearn.model_selection import GridSearchCV

project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
sys.path.append(project_root)

from Datasets.BoT_IoT.BoT_IoT_config import BoT_IoT_Config as Dataset_Config

DATASET_NAME = "BoT_IoT"

COLS_TO_NORM = Dataset_Config.COLS_TO_NORM
CATEGORICAL_COLS = Dataset_Config.CATEGORICAL_COLS

DROP_COLS = Dataset_Config.DROP_COLS + ['pkSeqID', 'stime', 'ltime']

SOURCE_IP_COL_NAME = Dataset_Config.SOURCE_IP_COL_NAME
DESTINATION_IP_COL_NAME = Dataset_Config.DESTINATION_IP_COL_NAME

ATTACK_CLASS_COL_NAME = Dataset_Config.ATTACK_CLASS_COL_NAME
IS_ATTACK_COL_NAME = Dataset_Config.IS_ATTACK_COL_NAME

csv_file_name = "all_raw"

data = pd.read_csv(os.path.join(project_root, "Datasets", f"{DATASET_NAME}/All/{csv_file_name}.csv"))


In [2]:
# Preprocess the dataset
from sklearn.discriminant_analysis import StandardScaler

# Normalize numerical columns
scaler = StandardScaler()
print(data[COLS_TO_NORM].describe()) # Check if there's any too large value

# Check for numeric issues in the columns before normalization
def check_numeric_issues(df, cols_to_norm):
    for col in cols_to_norm:
        try:
            # Try to coerce to numeric
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
        except Exception as e:
            print(f"❌ Column '{col}' failed with error: {e}")
            print(f"  - Sample values: {df[col].dropna().unique()[:5]}")
            print(f"  - Data type: {df[col].dtype}")
            continue

    print("\n✅ All other columns processed successfully.")

check_numeric_issues(data, COLS_TO_NORM)

data[COLS_TO_NORM] = scaler.fit_transform(data[COLS_TO_NORM])

data = pd.get_dummies(data, columns = CATEGORICAL_COLS) # One Hot Encoding for categorical data
converted_categorical_cols = [col for col in data.columns if col.startswith(tuple(CATEGORICAL_COLS))]
feature_cols = COLS_TO_NORM + converted_categorical_cols

print('Feature Columns:', feature_cols)
num_features = len(feature_cols)

# Save the scaler for future use
print("Data after normalization:")
X = data.drop(columns=DROP_COLS + [SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ATTACK_CLASS_COL_NAME, IS_ATTACK_COL_NAME])
y = data[ATTACK_CLASS_COL_NAME]


               pkts         bytes           dur          mean        stddev  \
count  3.668522e+06  3.668522e+06  3.668522e+06  3.668522e+06  3.668522e+06   
mean   7.725963e+00  8.690501e+02  2.033479e+01  2.231063e+00  8.871499e-01   
std    1.155876e+02  1.122667e+05  2.148764e+01  1.517728e+00  8.037139e-01   
min    1.000000e+00  6.000000e+01  0.000000e+00  0.000000e+00  0.000000e+00   
25%    5.000000e+00  4.200000e+02  1.256256e+01  1.819670e-01  3.001900e-02   
50%    7.000000e+00  6.000000e+02  1.550852e+01  2.690125e+00  7.938960e-01   
75%    9.000000e+00  7.700000e+02  2.709986e+01  3.565203e+00  1.745296e+00   
max    7.005700e+04  7.183334e+07  2.771485e+03  4.981882e+00  2.496763e+00   

                sum           min           max         spkts         dpkts  \
count  3.668522e+06  3.668522e+06  3.668522e+06  3.668522e+06  3.668522e+06   
mean   7.721635e+00  1.017540e+00  3.020015e+00  7.314146e+00  4.118173e-01   
std    7.616199e+00  1.483688e+00  1.860877e+00  7.

In [None]:

# Split the dataset into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
}

# # Perform Grid Search with cross-validation
# grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
#                            param_grid=param_grid,
#                            scoring='accuracy',
#                            cv=3,
#                            n_jobs=-1,
#                            verbose=2)

# grid_search.fit(X_train, y_train)

# Get the best parameters and train the model with them
# best_params = grid_search.best_params_
# print("Best Parameters:", best_params)

best_params = {
    'n_estimators': 200,
    'max_depth': 20
}

rf = RandomForestClassifier(random_state=42, **best_params)
rf.fit(X_train, y_train)

# Test the model
y_test_pred = rf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Classification Report:
                precision    recall  f1-score   support

          DDoS       1.00      1.00      1.00    288931
           DoS       1.00      1.00      1.00    247760
        Normal       0.97      1.00      0.99        72
Reconnaissance       1.00      1.00      1.00     13508
         Theft       1.00      0.75      0.86         8

      accuracy                           1.00    550279
     macro avg       0.99      0.95      0.97    550279
  weighted avg       1.00      1.00      1.00    550279



In [6]:
import numpy as np

# Perturb the test set by adding noise
def perturb_data(X, noise_level=0.01):
    noise = noise_level * np.random.randn(*X.shape)
    return X + noise
X_test_perturbed = perturb_data(X_test.copy(), noise_level=5)

# Test the model on the perturbed data
y_test_perturbed_pred = rf.predict(X_test_perturbed)
print("Test Classification Report on Perturbed Data:")
print(classification_report(y_test, y_test_perturbed_pred))

Test Classification Report on Perturbed Data:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

          DDoS       0.56      0.21      0.31    288931
           DoS       0.48      0.46      0.47    247760
        Normal       0.01      0.18      0.01        72
Reconnaissance       0.04      0.61      0.08     13508
         Theft       0.00      0.00      0.00         8

      accuracy                           0.34    550279
     macro avg       0.22      0.29      0.17    550279
  weighted avg       0.51      0.34      0.38    550279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
