In [1]:
import pandas as pd
import numpy as np
import glob

path = r"C:\Users\vacha\Two-Stage-NIDS\MachineLearningCSV\MachineLearningCVE\*.csv"
files = glob.glob(path)

df = pd.concat(
    (pd.read_csv(f, low_memory=False) for f in files),
    ignore_index=True
)

df.columns = df.columns.str.strip()

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

df.drop_duplicates(inplace=True)

# Remove BENIGN for multiclass
df = df[df["Label"] != "BENIGN"]

print("Multiclass Dataset Shape:", df.shape)
print(df["Label"].value_counts())


Multiclass Dataset Shape: (425878, 79)
Label
DoS Hulk                      172849
DDoS                          128016
PortScan                       90819
DoS GoldenEye                  10286
FTP-Patator                     5933
DoS slowloris                   5385
DoS Slowhttptest                5228
SSH-Patator                     3219
Bot                             1953
Web Attack � Brute Force        1470
Web Attack � XSS                 652
Infiltration                      36
Web Attack � Sql Injection        21
Heartbleed                        11
Name: count, dtype: int64


In [2]:
dos_labels = [
    "DoS Hulk",
    "DoS GoldenEye",
    "DoS slowloris",
    "DoS Slowhttptest"
]

df["Label"] = df["Label"].replace(dos_labels, "DoS")


In [3]:
multiclass_labels = [
    "DoS",
    "PortScan",
    "FTP-Patator",
    "SSH-Patator"
]

df_multi = df[df["Label"].isin(multiclass_labels)].copy()


In [9]:
# ---------------- FEATURE ENGINEERING FOR MULTICLASS ----------------

# Total packets
df_multi["Total Packets"] = (
    df_multi["Total Fwd Packets"] +
    df_multi["Total Backward Packets"]
)

# Total bytes
df_multi["Total Bytes"] = (
    df_multi["Total Length of Fwd Packets"] +
    df_multi["Total Length of Bwd Packets"]
)

df_multi["Packets/s"] = df_multi["Flow Packets/s"]
df_multi["Bytes/s"] = df_multi["Flow Bytes/s"]

df_multi["Mean Packet Length"] = (
    df_multi["Total Bytes"] /
    df_multi["Total Packets"].replace(0, 1)
)

df_multi["Fwd Packets"] = df_multi["Total Fwd Packets"]
df_multi["Bwd Packets"] = df_multi["Total Backward Packets"]
df_multi["Fwd Bytes"] = df_multi["Total Length of Fwd Packets"]
df_multi["Bwd Bytes"] = df_multi["Total Length of Bwd Packets"]

df_multi["SYN Count"] = df_multi["SYN Flag Count"]
df_multi["ACK Count"] = df_multi["ACK Flag Count"]
df_multi["FIN Count"] = df_multi["FIN Flag Count"]
df_multi["RST Count"] = df_multi["RST Flag Count"]

df_multi["Std Packet Length"] = df_multi["Fwd Packet Length Std"]
df_multi["Max Packet Length"] = df_multi["Fwd Packet Length Max"]
df_multi["Min Packet Length"] = df_multi["Fwd Packet Length Min"]

df_multi["IAT Mean"] = df_multi["Flow IAT Mean"]
df_multi["IAT Std"] = df_multi["Flow IAT Std"]
df_multi["IAT Max"] = df_multi["Flow IAT Max"]
df_multi["IAT Min"] = df_multi["Flow IAT Min"]

# Clean again
df_multi.replace([np.inf, -np.inf], 0, inplace=True)
df_multi.fillna(0, inplace=True)

print("Feature Engineering Done.")


Feature Engineering Done.


In [10]:
final_features = [
    "Flow Duration", "Total Packets", "Total Bytes",
    "Packets/s", "Bytes/s",
    "Fwd Packets", "Bwd Packets",
    "Fwd Bytes", "Bwd Bytes",
    "Mean Packet Length", "Std Packet Length",
    "Max Packet Length", "Min Packet Length",
    "SYN Count", "ACK Count", "FIN Count", "RST Count",
    "IAT Mean", "IAT Std", "IAT Max", "IAT Min",
    "Destination Port"
]

X = df_multi[final_features].values
y = df_multi["Label"].values


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Classes:", le.classes_)


Classes: ['DoS' 'FTP-Patator' 'PortScan' 'SSH-Patator']


In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []
all_y_true = []
all_y_pred = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded)):

    print(f"\n===== Fold {fold+1} =====")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

    # Scale per fold (NO DATA LEAKAGE)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(len(le.classes_), activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=30,
        batch_size=1024,
        callbacks=[early_stop],
        verbose=0
    )

    y_pred = np.argmax(model.predict(X_test), axis=1)

    acc = accuracy_score(y_test, y_pred)
    fold_accuracies.append(acc)

    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

    print("Fold Accuracy:", acc)

print("\n===== Cross Validation Results =====")
print("Average Accuracy:", np.mean(fold_accuracies))
print("Std Deviation:", np.std(fold_accuracies))

print("\n===== Confusion Matrix =====")
print(confusion_matrix(all_y_true, all_y_pred))

print("\n===== Classification Report =====")
print(classification_report(all_y_true, all_y_pred, target_names=le.classes_))



===== Fold 1 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1836/1836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Fold Accuracy: 0.9977870080348631

===== Fold 2 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1836/1836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Fold Accuracy: 0.9975146397930001

===== Fold 3 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1836/1836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Fold Accuracy: 0.9977529620046303

===== Fold 4 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1836/1836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Fold Accuracy: 0.9982636524581234

===== Fold 5 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1836/1836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Fold Accuracy: 0.9975997140084776

===== Cross Validation Results =====
Average Accuracy: 0.997783595259819
Std Deviation: 0.00025982852604265484

===== Confusion Matrix =====
[[193540    117     90      1]
 [    17   5912      4      0]
 [   156      0  90663      0]
 [   233     14     19   2953]]

===== Classification Report =====
              precision    recall  f1-score   support

         DoS       1.00      1.00      1.00    193748
 FTP-Patator       0.98      1.00      0.99      5933
    PortScan       1.00      1.00      1.00     90819
 SSH-Patator       1.00      0.92      0.96      3219

    accuracy                           1.00    293719
   macro avg       0.99      0.98      0.99    293719
weighted avg       1.00      1.00      1.00    293719



In [13]:
from sklearn.preprocessing import StandardScaler
import joblib

# Scale entire dataset
scaler_final = StandardScaler()
X_scaled = scaler_final.fit_transform(X)

final_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(le.classes_), activation='softmax')
])

final_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

final_model.fit(
    X_scaled,
    y_encoded,
    epochs=25,
    batch_size=1024,
    verbose=1
)


Epoch 1/25


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9688 - loss: 0.1121
Epoch 2/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9962 - loss: 0.0186
Epoch 3/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9974 - loss: 0.0123
Epoch 4/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9978 - loss: 0.0112
Epoch 5/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9980 - loss: 0.0091
Epoch 6/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9981 - loss: 0.0081
Epoch 7/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9983 - loss: 0.0070
Epoch 8/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9985 - loss: 0.0064
Epoch 9/25
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1670416e2c0>

In [14]:
final_model.save("multiclass_model.h5")
joblib.dump(scaler_final, "multiclass_scaler.pkl")
joblib.dump(final_features, "multiclass_features.pkl")
joblib.dump(le, "multiclass_label_encoder.pkl")

print("Multiclass model saved successfully.")




Multiclass model saved successfully.
