In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import captum.attr as c
import time
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os
import joblib

# Alternative classifiers tried in an attempt to improve the overall model's performance
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Load the cleaned full dataset (path is relative to the notebook's working directory).
cleaned_data_path = '../Cleaned_full_data.csv'
data = pd.read_csv(cleaned_data_path)



In [3]:
# Start from a fresh 0..n-1 row index.
data = data.reset_index(drop=True)

# Missing 'ct_ftp_cmd' values are treated as 0.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(0)

# Normalise the category names: remove all whitespace, then fold 'Backdoors' into 'Backdoor'.
data['attack_cat'] = (
    data['attack_cat']
    .str.replace(r'\s+', '', regex=True)
    .str.replace('Backdoors', 'Backdoor')
)

# Raw categorical columns are removed here; pre-computed encodings are read from CSV below.
raw_categorical_cols = ['proto', 'dsport', 'service', 'state', 'srcip', 'sport', 'dstip']
# Columns set aside in `temp` so they are not part of the remaining `data` frame.
held_out_cols = ['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat']

temp = data[held_out_cols]
data = data.drop(columns=raw_categorical_cols + held_out_cols)

# Pre-computed encoded frames, read in the same order as the dropped columns above.
# NOTE: the fifth file name is misspelled on disk ('scrip'; presumably the 'srcip'
# encoding — confirm), so the path must stay exactly as written.
encoded_csv_paths = [
    '../Full_proto_encoded.csv',
    '../Full_dsport_encoded.csv',
    '../Full_service_encoded.csv',
    '../Full_state_encoded.csv',
    '../Full_scrip_encoded.csv',
    '../Full_sport_encoded.csv',
    '../Full_dstip_encoded.csv',
]
ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7 = (
    pd.read_csv(csv_path) for csv_path in encoded_csv_paths
)

In [4]:
# MinMax separates Normal data well and reduces noise. Please see the KMeans t-SNE evaluation in Archive.
# NOTE(review): the scaler is fitted on every row, before the train/test split performed in a
# later cell, so test-set minima/maxima influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)
data = pd.DataFrame(scaled_data, columns=data.columns)

# The frames are joined side by side, which pairs rows up by index label. A frame whose row
# index differs from the scaled frame's would be silently padded with NaN, so fail loudly.
concat_parts = [data, temp, ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7]
misaligned_positions = [
    position
    for position, part in enumerate(concat_parts)
    if not part.index.equals(data.index)
]
if misaligned_positions:
    raise ValueError(
        "Cannot concatenate column-wise: frames at positions "
        f"{misaligned_positions} do not share the scaled frame's row index "
        f"(row counts: {[len(part) for part in concat_parts]})"
    )
data = pd.concat(concat_parts, axis=1)

In [5]:
# The multi-class target is 'attack_cat'; both target columns ('label', 'attack_cat')
# are excluded from the feature matrix.
y = data['attack_cat']
X = data.drop(columns=['label', 'attack_cat'])

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified, so rare attack categories may end up
# unevenly represented between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Re-attach the labels to a copy of the training features so that the 'Normal'
# rows of the training set can be downsampled as one frame.
train_df = X_train.assign(attack_cat=y_train)

In [7]:
# Partition the training frame into 'Normal' rows and every other category.
is_normal = train_df['attack_cat'] == 'Normal'
normal_samples = train_df[is_normal]
non_normal_samples = train_df[~is_normal]

# Keep a random 30% of the 'Normal' rows (fixed seed).
normal_downsampled = normal_samples.sample(frac=0.3, random_state=42)

# Recombine, shuffle with a fixed seed, and renumber the rows.
train_df_downsampled = (
    pd.concat([normal_downsampled, non_normal_samples], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# Pull the target and the features apart again.
y_train = train_df_downsampled['attack_cat']
X_train = train_df_downsampled.drop(columns=['attack_cat'])

# Encode class names as integers: fit on the training labels, reuse the mapping for test.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Print the class counts and the number of distinct classes for each split.
for split_name, encoded_labels in (('Train:', y_train), ('Test:', y_test)):
    class_counts = pd.Series(encoded_labels).value_counts()
    print(split_name, class_counts)
    print(len(class_counts))

Train: 6    532480
5    172371
3     35793
4     19462
2     13038
7     11137
0      2127
1      1852
8      1191
9       133
Name: count, dtype: int64
10
Test: 6    443831
5     43110
3      8732
4      4784
2      3315
7      2850
0       550
1       477
8       320
9        41
Name: count, dtype: int64
10


In [8]:
# Candidate classifiers. Exactly one assignment to `rf` should be active; the name `rf`
# is kept for every candidate because later cells refer to it.

# Random Forest
#rf = RandomForestClassifier(random_state=42)

# XGBoost
#rf = XGBClassifier(eval_metric='mlogloss', random_state=42)

# ExtraTrees
rf = ExtraTreesClassifier(n_estimators=200, random_state=42)

# CatBoost
#rf = CatBoostClassifier(random_state=42)

# Train on the downsampled training set and predict the untouched test set.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Pass the full list of encoded class ids explicitly: with `target_names` alone, the report
# raises a size-mismatch error whenever a class appears in neither y_test nor y_pred.
all_class_ids = np.arange(len(label_encoder.classes_))
print('\nClassification Report (Test Set):')
print(
    classification_report(
        y_test,
        y_pred,
        labels=all_class_ids,
        target_names=label_encoder.classes_,
    )
)


Classification Report (Test Set):
                precision    recall  f1-score   support

      Analysis       0.43      0.10      0.17       550
      Backdoor       0.46      0.03      0.05       477
           DoS       0.30      0.26      0.28      3315
      Exploits       0.60      0.82      0.69      8732
       Fuzzers       0.65      0.82      0.73      4784
       Generic       1.00      0.98      0.99     43110
        Normal       1.00      1.00      1.00    443831
Reconnaissance       0.95      0.74      0.83      2850
     Shellcode       0.70      0.71      0.70       320
         Worms       0.33      0.02      0.05        41

      accuracy                           0.98    508010
     macro avg       0.64      0.55      0.55    508010
  weighted avg       0.98      0.98      0.98    508010



In [18]:
# Wall-clock one full inference pass over the test set with a high-resolution timer.
start = time.perf_counter()
y_pred = rf.predict(X_test)
end = time.perf_counter()
elapsed = end - start

# Rough operation count: (samples) x (trees) x (mean tree depth).
# This is an approximation of node visits, not a measurement of true floating-point ops.
n_samples = X_test.shape[0]
n_trees = len(rf.estimators_)
tree_depths = [tree.get_depth() for tree in rf.estimators_]
avg_depth = np.mean(tree_depths)

flops = n_samples * n_trees * avg_depth
gflops = flops / (elapsed * 1e9)

print(f"Time taken: {end - start:.9f} seconds")
print(f"Approx. GFLOPs: {gflops:.6f}")



Time taken: 12.792065000 seconds
Approx. GFLOPs: 0.742433


In [13]:
# Per-label accuracy: of the test rows whose true class is `label`, the fraction that was
# predicted as `label` (this is the per-class recall).
accuracy_per_label = {}
for i, label in enumerate(label_encoder.classes_):
    true_label_indices = np.where(y_test == i)[0]
    total = len(true_label_indices)
    if total == 0:
        # Class absent from the test split: report NaN explicitly instead of dividing 0 by 0.
        accuracy_per_label[label] = float('nan')
        continue
    y_pred_for_label = y_pred[true_label_indices]
    correct = np.sum(y_pred_for_label == i)
    accuracy_per_label[label] = correct / total
print("\nAccuracy per label:")
for label, acc in accuracy_per_label.items():
    print(f"{label}: {acc:.4f}")


Accuracy per label:
Analysis: 0.1036
Backdoor: 0.0252
DoS: 0.2582
Exploits: 0.8170
Fuzzers: 0.8225
Generic: 0.9808
Normal: 0.9955
Reconnaissance: 0.7393
Shellcode: 0.7094
Worms: 0.0244


In [11]:
#model_filename = './saved_models/SecondaryMulti.joblib'
#joblib.dump(rf, model_filename)