In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import captum.attr as c
import time
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os
import joblib

In [8]:
# Read '../Cleaned_full_data.csv' into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='../Cleaned_full_data.csv')



In [9]:
# Throw away the index carried over from the CSV/frame and renumber rows from 0.
data = data.reset_index(drop=True)

# Missing ct_ftp_cmd values are treated as 0.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(value=0)

# Normalise the category names in two passes:
#   1. remove every run of whitespace (regex), so padded variants collapse to one spelling;
#   2. fold the plural 'Backdoors' into 'Backdoor'.
data['attack_cat'] = (
    data['attack_cat']
    .str.replace(r'\s+', '', regex=True)
    .str.replace('Backdoors', 'Backdoor')
)

In [10]:
# Remove the raw protocol/port/service/state/address columns.
# NOTE(review): presumably these are replaced by the Full_*_encoded.csv frames loaded
# in a later cell -- confirm the encoded files cover the same rows.
data = data.drop(
    ['proto', 'dsport', 'service', 'state', 'srcip', 'sport', 'dstip'],
    axis='columns',
)

In [11]:
# Set aside the columns that must not go through the scaler (the two targets, 'label'
# and 'attack_cat', plus two flag-style columns), then remove exactly those columns
# from the frame that will be scaled.
temp = data.loc[:, ['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat']]
data = data.drop(columns=temp.columns)

In [12]:
# Load the seven pre-encoded frames, in the same order as before.
# The fifth file name is misspelled on disk ('scrip' instead of 'srcip'); the path is
# kept as-is so it keeps matching the existing file.
ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7 = map(
    pd.read_csv,
    [
        '../Full_proto_encoded.csv',
        '../Full_dsport_encoded.csv',
        '../Full_service_encoded.csv',
        '../Full_state_encoded.csv',
        '../Full_scrip_encoded.csv',
        '../Full_sport_encoded.csv',
        '../Full_dstip_encoded.csv',
    ],
)

In [13]:
# MinMax separates Normal data well and reduces noise. Please see the KMeans t-SNE
# evaluation in Archive.
# NOTE(review): the scaler is fitted on every row here, i.e. before the train/test
# split that happens in the following cell.
scaler = MinMaxScaler().fit(data)
scaled_data = scaler.transform(data)

# Rewrap the scaled array with the original column names, then append (column-wise)
# the unscaled columns held in `temp` and the seven pre-encoded frames.
data = pd.DataFrame(scaled_data, columns=data.columns)
data = pd.concat(
    [data, temp, ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7],
    axis='columns',
)

In [14]:
# Features are every column except the two targets; the multi-class target is 'attack_cat'.
X = data.drop(columns=['label', 'attack_cat'])
y = data['attack_cat']

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified, so the share of each (rare) class in the
# test set is left to chance -- consider `stratify=y` if results are re-generated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Map category names to integer ids; the mapping is learned from the training labels
# only and then applied to the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Persist the encoder so predictions can be mapped back to category names later.
# The target directory is created first: joblib.dump does not create missing
# directories, so a fresh checkout would otherwise fail here.
path = './saved_models/label_encoder.joblib'
os.makedirs(os.path.dirname(path), exist_ok=True)
joblib.dump(label_encoder, path)

# Class distribution (and number of distinct classes) in each split.
train_counts = pd.Series(y_train).value_counts()
test_counts = pd.Series(y_test).value_counts()
print('Train:', train_counts)
print(len(train_counts))
print('Test:', test_counts)
print(len(test_counts))

Train: 6    1774933
5     172371
3      35793
4      19462
2      13038
7      11137
0       2127
1       1852
8       1191
9        133
Name: count, dtype: int64
10
Test: 6    443831
5     43110
3      8732
4      4784
2      3315
7      2850
0       550
1       477
8       320
9        41
Name: count, dtype: int64
10


In [15]:
# Train a random forest with default hyper-parameters (fixed seed) on the training
# split, then predict the integer class ids of the held-out split.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Per-class precision/recall/F1 on the test split, with the encoder's class names
# shown in place of the integer ids.
print(
    '\nClassification Report (Test Set):',
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
    sep='\n',
)


Classification Report (Test Set):
                precision    recall  f1-score   support

      Analysis       0.69      0.08      0.14       550
      Backdoor       0.74      0.06      0.12       477
           DoS       0.30      0.22      0.26      3315
      Exploits       0.61      0.82      0.70      8732
       Fuzzers       0.79      0.70      0.74      4784
       Generic       1.00      0.98      0.99     43110
        Normal       1.00      1.00      1.00    443831
Reconnaissance       0.94      0.77      0.85      2850
     Shellcode       0.69      0.77      0.73       320
         Worms       0.33      0.02      0.05        41

      accuracy                           0.98    508010
     macro avg       0.71      0.54      0.56    508010
  weighted avg       0.98      0.98      0.98    508010



In [16]:
# For each class: among the test rows whose true class is that class, the fraction
# the model also predicted as that class. (This is the same quantity as per-class recall.)
accuracy_per_label = {}
for class_id, class_name in enumerate(label_encoder.classes_):
    in_class = (y_test == class_id)                  # rows that truly belong to this class
    hits = np.sum(y_pred[in_class] == class_id)      # of those, how many were predicted correctly
    accuracy_per_label[class_name] = hits / np.sum(in_class)

print("\nAccuracy per label:")
for class_name, score in accuracy_per_label.items():
    print(f"{class_name}: {score:.4f}")


Accuracy per label:
Analysis: 0.0800
Backdoor: 0.0650
DoS: 0.2244
Exploits: 0.8221
Fuzzers: 0.7032
Generic: 0.9835
Normal: 0.9987
Reconnaissance: 0.7723
Shellcode: 0.7688
Worms: 0.0244


In [17]:
# Persist the fitted forest. The dump call is kept as the cell's last expression so
# that its return value (the list of written file names) is displayed.
model_filename = './saved_models/SecondaryMulti.joblib'
joblib.dump(value=rf, filename=model_filename)

['./saved_models/SecondaryMulti.joblib']