In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import os
import sys
from sklearn.model_selection import GridSearchCV

project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
sys.path.append(project_root)

from Datasets.CIC_IDS_2017.CIC_IDS_2017_config import CIC_IDS_2017_Config as Dataset_Config

DATASET_NAME = "CIC_IDS_2017"

COLS_TO_NORM = Dataset_Config.COLS_TO_NORM
CATEGORICAL_COLS = Dataset_Config.CATEGORICAL_COLS

DROP_COLS = Dataset_Config.DROP_COLS + ['Flow ID', 'Timestamp']

SOURCE_IP_COL_NAME = Dataset_Config.SOURCE_IP_COL_NAME
DESTINATION_IP_COL_NAME = Dataset_Config.DESTINATION_IP_COL_NAME

ATTACK_CLASS_COL_NAME = Dataset_Config.ATTACK_CLASS_COL_NAME

csv_file_name = "all_raw"

data = pd.read_csv(os.path.join(project_root, "Datasets", f"{DATASET_NAME}/All/{csv_file_name}.csv"))


In [3]:
# Preprocess the dataset
import numpy as np
from sklearn.discriminant_analysis import StandardScaler

# Data Cleaning
data.reset_index(drop=True, inplace=True)
data.replace([np.inf, -np.inf], np.nan,inplace = True)
data.fillna(0,inplace = True)

# Normalize numerical columns
scaler = StandardScaler()
print(data[COLS_TO_NORM].describe()) # Check if there's any too large value

# Check for numeric issues in the columns before normalization
def check_numeric_issues(df, cols_to_norm):
    for col in cols_to_norm:
        try:
            # Try to coerce to numeric
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
        except Exception as e:
            print(f"❌ Column '{col}' failed with error: {e}")
            print(f"  - Sample values: {df[col].dropna().unique()[:5]}")
            print(f"  - Data type: {df[col].dtype}")
            continue

    print("\n✅ All other columns processed successfully.")

check_numeric_issues(data, COLS_TO_NORM)

data[COLS_TO_NORM] = scaler.fit_transform(data[COLS_TO_NORM])

data = pd.get_dummies(data, columns = CATEGORICAL_COLS) # One Hot Encoding for categorical data
converted_categorical_cols = [col for col in data.columns if col.startswith(tuple(CATEGORICAL_COLS))]
feature_cols = COLS_TO_NORM + converted_categorical_cols

print('Feature Columns:', feature_cols)
num_features = len(feature_cols)

# Save the scaler for future use
print("Data after normalization:")
X = data.drop(columns=DROP_COLS + [SOURCE_IP_COL_NAME, DESTINATION_IP_COL_NAME, ATTACK_CLASS_COL_NAME])
y = data[ATTACK_CLASS_COL_NAME]


       Bwd Packet Length Min  Subflow Fwd Packets  \
count           2.830743e+06         2.830743e+06   
mean            4.104958e+01         9.361160e+00   
std             6.886260e+01         7.496728e+02   
min             0.000000e+00         1.000000e+00   
25%             0.000000e+00         2.000000e+00   
50%             0.000000e+00         2.000000e+00   
75%             7.700000e+01         5.000000e+00   
max             2.896000e+03         2.197590e+05   

       Total Length of Fwd Packets  Fwd Packet Length Mean  \
count                 2.830743e+06            2.830743e+06   
mean                  5.493024e+02            5.820194e+01   
std                   9.993589e+03            1.860912e+02   
min                   0.000000e+00            0.000000e+00   
25%                   1.200000e+01            6.000000e+00   
50%                   6.200000e+01            3.400000e+01   
75%                   1.870000e+02            5.000000e+01   
max                   1.29

In [4]:

# Split the dataset into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
}

# # Perform Grid Search with cross-validation
# grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
#                            param_grid=param_grid,
#                            scoring='accuracy',
#                            cv=3,
#                            n_jobs=-1,
#                            verbose=2)

# grid_search.fit(X_train, y_train)

# Get the best parameters and train the model with them
# best_params = grid_search.best_params_
# print("Best Parameters:", best_params)

best_params = {
    'n_estimators': 200,
    'max_depth': 20
}

rf = RandomForestClassifier(random_state=42, **best_params)
rf.fit(X_train, y_train)

# Test the model
y_test_pred = rf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    341134
                       Bot       1.00      0.99      1.00       333
                      DDoS       1.00      1.00      1.00     19194
             DoS GoldenEye       1.00      1.00      1.00      1536
                  DoS Hulk       1.00      1.00      1.00     34466
          DoS Slowhttptest       1.00      1.00      1.00       807
             DoS slowloris       1.00      0.99      1.00       823
               FTP-Patator       1.00      1.00      1.00      1228
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       1.00      0.83      0.91         6
                  PortScan       1.00      1.00      1.00     23857
               SSH-Patator       1.00      1.00      1.00       894
  Web Attack - Brute Force       0.84      0.92      0.87       224
Web Attack - Sql Injection       0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import numpy as np

# Perturb the test set by adding noise
def perturb_data(X, noise_level=0.01):
    noise = noise_level * np.random.randn(*X.shape)
    return X + noise
X_test_perturbed = perturb_data(X_test.copy(), noise_level=1)

# Test the model on the perturbed data
y_test_perturbed_pred = rf.predict(X_test_perturbed)
print("Test Classification Report on Perturbed Data:")
print(classification_report(y_test, y_test_perturbed_pred))

Test Classification Report on Perturbed Data:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                            precision    recall  f1-score   support

                    BENIGN       0.81      1.00      0.89    341134
                       Bot       0.00      0.00      0.00       333
                      DDoS       0.99      0.01      0.02     19194
             DoS GoldenEye       0.00      0.00      0.00      1536
                  DoS Hulk       0.96      0.06      0.11     34466
          DoS Slowhttptest       0.00      0.00      0.00       807
             DoS slowloris       1.00      0.00      0.01       823
               FTP-Patator       0.00      0.00      0.00      1228
                Heartbleed       0.00      0.00      0.00         2
              Infiltration       0.00      0.00      0.00         6
                  PortScan       0.89      0.00      0.00     23857
               SSH-Patator       0.00      0.00      0.00       894
  Web Attack - Brute Force       0.00      0.00      0.00       224
Web Attack - Sql Injection       0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
