In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import captum.attr as c
import time
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os
import joblib


#Tried XGBoost to increase the performance of the overall model
from xgboost import XGBClassifier

#Assessing Gflops and Runtime
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


In [29]:
data = pd.read_csv('../Cleaned_full_data.csv')



In [30]:
# Reset retained index.
data = data.reset_index(drop=True)
# Set NA to 0.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(0)
data['attack_cat'] = data['attack_cat'].str.replace(r'\s+', '', regex=True)
data['attack_cat'] = data['attack_cat'].str.replace('Backdoors', 'Backdoor')

In [31]:
data = data.drop(columns=['proto', 'dsport', 'service', 'state', 'srcip', 'sport', 'dstip'])

In [32]:
temp = data[['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat']]
data = data.drop(columns=['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat'])

In [33]:
ohe1 = pd.read_csv('../Full_proto_encoded.csv')
ohe2 = pd.read_csv('../Full_dsport_encoded.csv')
ohe3 = pd.read_csv('../Full_service_encoded.csv')
ohe4 = pd.read_csv('../Full_state_encoded.csv')
# Spelling error.
ohe5 = pd.read_csv('../Full_scrip_encoded.csv')
#------------------------------------------#
ohe6 = pd.read_csv('../Full_sport_encoded.csv')
ohe7 = pd.read_csv('../Full_dstip_encoded.csv')

In [34]:
# MinMax seperates Normal data well and reduces noise. Please see Kmeans TSNE evaluation in Archive.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)
data = pd.DataFrame(scaled_data, columns=data.columns)
data = pd.concat([data, temp, ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7], axis=1)

In [35]:
# Assuming 'data' is your DataFrame and 'attack_cat' is the target variable
X = data.drop(columns=['label', 'attack_cat'])
y = data['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Removing Label Encoder for CatBoost
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
path = './saved_models/label_encoder.joblib'
joblib.dump(label_encoder, path)

print('Train:', pd.Series(y_train).value_counts())
print(len(pd.Series(y_train).value_counts()))
print('Test:', pd.Series(y_test).value_counts())
print(len(pd.Series(y_test).value_counts()))

Train: 6    1774933
5     172371
3      35793
4      19462
2      13038
7      11137
0       2127
1       1852
8       1191
9        133
Name: count, dtype: int64
10
Test: 6    443831
5     43110
3      8732
4      4784
2      3315
7      2850
0       550
1       477
8       320
9        41
Name: count, dtype: int64
10


In [43]:
# Random Forest
#rf = RandomForestClassifier(random_state=42)

# XGBoost 
rf = XGBClassifier(eval_metric='mlogloss', random_state=42)

# ExtraTrees
#rf = ExtraTreesClassifier(n_estimators=200, random_state=42)

#Catboost
#rf = CatBoostClassifier(random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print('\nClassification Report (Test Set):')
#print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print(classification_report(y_test, y_pred))

# Estimate FLOPs
n_samples = X_test.shape[0]
n_trees = rf.get_booster().num_boosted_rounds()
avg_depth = rf.get_params().get("max_depth") or 6 

print("n_samples:", n_samples)
print("n_trees:", n_trees)
print("avg_depth:", avg_depth)

flops = n_samples * n_trees * avg_depth
gflops = flops / ((end - start) * 1e9)

print(f"Time taken: {end - start:.6f} seconds")
print(f"Estimated GFLOPS: {gflops:.6f}")



Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.65      0.06      0.11       550
           1       0.76      0.08      0.14       477
           2       0.44      0.29      0.35      3315
           3       0.64      0.87      0.74      8732
           4       0.77      0.63      0.69      4784
           5       1.00      0.99      0.99     43110
           6       1.00      1.00      1.00    443831
           7       0.92      0.79      0.85      2850
           8       0.74      0.86      0.80       320
           9       0.70      0.34      0.46        41

    accuracy                           0.98    508010
   macro avg       0.76      0.59      0.61    508010
weighted avg       0.98      0.98      0.98    508010

n_samples: 508010
n_trees: 100
avg_depth: 6
Time taken: 1.214368 seconds
Estimated GFLOPS: 0.251000


In [44]:
#Label Encoded Version
accuracy_per_label = {}
for i, label in enumerate(label_encoder.classes_):
    true_label_indices = np.where(y_test == i)[0]
    y_pred_for_label = y_pred[true_label_indices]
    correct = np.sum(y_pred_for_label == i)
    total = len(true_label_indices)
    accuracy_per_label[label] = correct / total
print("\nAccuracy per label:")
for label, acc in accuracy_per_label.items():
    print(f"{label}: {acc:.4f}")


##Non Label Encoded Version
#accuracy_per_label = {}
#labels = np.unique(y_test)  # Get unique class labels directly
#
#for label in labels:
#    true_label_indices = np.where(y_test == label)[0]
#    y_pred_for_label = y_pred[true_label_indices]
#    correct = np.sum(y_pred_for_label == label)
#    total = len(true_label_indices)
#    accuracy_per_label[label] = correct / total

print("\nAccuracy per label:")
for label, acc in accuracy_per_label.items():
    print(f"{label}: {acc:.4f}")


Accuracy per label:
Analysis: 0.0600
Backdoor: 0.0776
DoS: 0.2935
Exploits: 0.8715
Fuzzers: 0.6269
Generic: 0.9858
Normal: 0.9982
Reconnaissance: 0.7923
Shellcode: 0.8625
Worms: 0.3415

Accuracy per label:
Analysis: 0.0600
Backdoor: 0.0776
DoS: 0.2935
Exploits: 0.8715
Fuzzers: 0.6269
Generic: 0.9858
Normal: 0.9982
Reconnaissance: 0.7923
Shellcode: 0.8625
Worms: 0.3415


In [12]:
#model_filename = './saved_models/SecondaryMulti.joblib'
#joblib.dump(rf, model_filename)

['./saved_models/SecondaryMulti.joblib']