In [2]:
import pandas as pd
import numpy as np
import glob

# Load all CSV files
path = r"C:\Users\vacha\Two-Stage-NIDS\MachineLearningCSV\MachineLearningCVE\*.csv"
files = glob.glob(path)

df = pd.concat((pd.read_csv(f, low_memory=False) for f in files),
               ignore_index=True)

df.columns = df.columns.str.strip()

# Clean infinities
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

print("Dataset Loaded:", df.shape)


Dataset Loaded: (2830743, 79)


In [21]:
# Total packets
df["Total Packets"] = (
    df["Total Fwd Packets"] +
    df["Total Backward Packets"]
)

# Total bytes
df["Total Bytes"] = (
    df["Total Length of Fwd Packets"] +
    df["Total Length of Bwd Packets"]
)

# Packets/s and Bytes/s (already exist but ensure clean)
df["Packets/s"] = df["Flow Packets/s"]
df["Bytes/s"] = df["Flow Bytes/s"]

# Mean packet length (combined)
df["Mean Packet Length"] = (
    df["Total Bytes"] /
    df["Total Packets"].replace(0, 1)
)

# Directional
df["Fwd Packets"] = df["Total Fwd Packets"]
df["Bwd Packets"] = df["Total Backward Packets"]
df["Fwd Bytes"] = df["Total Length of Fwd Packets"]
df["Bwd Bytes"] = df["Total Length of Bwd Packets"]

# TCP Flags
df["SYN Count"] = df["SYN Flag Count"]
df["ACK Count"] = df["ACK Flag Count"]
df["FIN Count"] = df["FIN Flag Count"]
df["RST Count"] = df["RST Flag Count"]

# Keep existing from dataset
df["Flow Duration"] = df["Flow Duration"]
df["Destination Port"] = df["Destination Port"]

# For Std / Max / Min packet length,
# use forward stats as approximation (since combined not available)
df["Std Packet Length"] = df["Fwd Packet Length Std"]
df["Max Packet Length"] = df["Fwd Packet Length Max"]
df["Min Packet Length"] = df["Fwd Packet Length Min"]

df["IAT Mean"] = df["Flow IAT Mean"]
df["IAT Std"] = df["Flow IAT Std"]
df["IAT Max"] = df["Flow IAT Max"]
df["IAT Min"] = df["Flow IAT Min"]




In [22]:
final_features = [
    "Flow Duration",
    "Total Packets",
    "Total Bytes",
    "Packets/s",
    "Bytes/s",

    "Fwd Packets",
    "Bwd Packets",
    "Fwd Bytes",
    "Bwd Bytes",

    "Mean Packet Length",
    "Std Packet Length",
    "Max Packet Length",
    "Min Packet Length",

    "SYN Count",
    "ACK Count",
    "FIN Count",
    "RST Count",

    "IAT Mean",
    "IAT Std",
    "IAT Max",
    "IAT Min",

    "Destination Port"
]

df_model = df[final_features + ["Label"]].copy()

print(df_model.head())


   Flow Duration  Total Packets  Total Bytes     Packets/s       Bytes/s  \
0              3              2           12  666666.66670  4.000000e+06   
1            109              2           12   18348.62385  1.100917e+05   
2             52              2           12   38461.53846  2.307692e+05   
3             34              2           12   58823.52941  3.529412e+05   
4              3              2           12  666666.66670  4.000000e+06   

   Fwd Packets  Bwd Packets  Fwd Bytes  Bwd Bytes  Mean Packet Length  ...  \
0            2            0         12          0                 6.0  ...   
1            1            1          6          6                 6.0  ...   
2            1            1          6          6                 6.0  ...   
3            1            1          6          6                 6.0  ...   
4            2            0         12          0                 6.0  ...   

   SYN Count  ACK Count  FIN Count  RST Count  IAT Mean  IAT Std  IAT Max 

In [23]:
df_model["BinaryLabel"] = df_model["Label"].apply(
    lambda x: 0 if x == "BENIGN" else 1
)


In [24]:
from sklearn.utils import resample

benign = df_model[df_model["BinaryLabel"] == 0]
attack = df_model[df_model["BinaryLabel"] == 1]

benign_down = resample(
    benign,
    replace=False,
    n_samples=len(attack),
    random_state=42
)

df_balanced = pd.concat([benign_down, attack])

print(df_balanced["BinaryLabel"].value_counts())


BinaryLabel
0    557646
1    557646
Name: count, dtype: int64


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
import joblib

X = df_balanced[final_features]
y = df_balanced["BinaryLabel"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2,
    stratify=y,
    random_state=42
)

model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

class_weight = {0: 1, 1: 1.5}

model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=1024,
    callbacks=[early_stop],
    class_weight=class_weight, 
    verbose=1
)

loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9338 - loss: 0.1925 - val_accuracy: 0.9503 - val_loss: 0.1250
Epoch 2/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.9597 - loss: 0.1317 - val_accuracy: 0.9671 - val_loss: 0.1019
Epoch 3/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9667 - loss: 0.1111 - val_accuracy: 0.9716 - val_loss: 0.0826
Epoch 4/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9718 - loss: 0.0924 - val_accuracy: 0.9730 - val_loss: 0.0805
Epoch 5/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9774 - loss: 0.0755 - val_accuracy: 0.9766 - val_loss: 0.0737
Epoch 6/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.9814 - loss: 0.0637 - val_accuracy: 0.9721 - val_loss: 0.0725
Epoch 7/30
[1m698/69

In [41]:
X = df_balanced[final_features].values
y = df_balanced["BinaryLabel"].values


In [42]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []
all_y_true = []
all_y_pred = []
all_original_labels = []   # ← IMPORTANT

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"\n===== Fold {fold+1} =====")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Original multiclass labels for diagnosis
    original_test_labels = df_balanced["Label"].iloc[test_index].values


    # Scale per fold
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=30,
        batch_size=1024,
        callbacks=[early_stop],
        verbose=0
    )

    probs = model.predict(X_test).ravel()

    threshold = 0.30
    y_pred = (probs > threshold).astype(int)

    acc = accuracy_score(y_test, y_pred)
    fold_accuracies.append(acc)

    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    all_original_labels.extend(original_test_labels)

    print("Fold Accuracy:", acc)


# ==========================
# FINAL CROSS-VALIDATION RESULTS
# ==========================

print("\n===== Cross Validation Results =====")
print("Average Accuracy:", np.mean(fold_accuracies))
print("Std Deviation:", np.std(fold_accuracies))

print("\n===== Confusion Matrix =====")
print(confusion_matrix(all_y_true, all_y_pred))

print("\n===== Classification Report =====")
print(classification_report(all_y_true, all_y_pred))


# ==========================
# FALSE NEGATIVE DIAGNOSIS
# ==========================

results_df = pd.DataFrame({
    "True_Binary": all_y_true,
    "Pred_Binary": all_y_pred,
    "Original_Label": all_original_labels
})

false_negatives = results_df[
    (results_df["True_Binary"] == 1) &
    (results_df["Pred_Binary"] == 0)
]

print("\n===== Missed Attack Types =====")
print(false_negatives["Original_Label"].value_counts())



===== Fold 1 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6971/6971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Fold Accuracy: 0.9411814811327944

===== Fold 2 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6971/6971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Fold Accuracy: 0.9373215158321341

===== Fold 3 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6971/6971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Fold Accuracy: 0.9460633557191404

===== Fold 4 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6971/6971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Fold Accuracy: 0.9384823678146491

===== Fold 5 =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6971/6971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Fold Accuracy: 0.9384958172313927

===== Cross Validation Results =====
Average Accuracy: 0.940308907546022
Std Deviation: 0.0031439819202066163

===== Confusion Matrix =====
[[526788  30858]
 [ 35715 521931]]

===== Classification Report =====
              precision    recall  f1-score   support

           0       0.94      0.94      0.94    557646
           1       0.94      0.94      0.94    557646

    accuracy                           0.94   1115292
   macro avg       0.94      0.94      0.94   1115292
weighted avg       0.94      0.94      0.94   1115292


===== Missed Attack Types =====
Original_Label
DoS Hulk                      23070
DoS GoldenEye                  7410
SSH-Patator                    1636
Web Attack � Brute Force       1374
Bot                             777
Web Attack � XSS                629
DoS slowloris                   256
DoS Slowhttptest                230
DDoS           

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
import joblib

X = df_balanced[final_features]
y = df_balanced["BinaryLabel"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2,
    stratify=y,
    random_state=42
)

model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

class_weight = {0: 1, 1: 1.5}

model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=1024,
    callbacks=[early_stop],
    class_weight=class_weight, 
    verbose=1
)

loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9332 - loss: 0.1945 - val_accuracy: 0.9561 - val_loss: 0.1188
Epoch 2/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.9575 - loss: 0.1356 - val_accuracy: 0.9613 - val_loss: 0.1042
Epoch 3/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.9659 - loss: 0.1114 - val_accuracy: 0.9746 - val_loss: 0.0784
Epoch 4/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.9719 - loss: 0.0911 - val_accuracy: 0.9831 - val_loss: 0.0683
Epoch 5/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.9771 - loss: 0.0772 - val_accuracy: 0.9765 - val_loss: 0.0596
Epoch 6/30
[1m698/698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.9807 - loss: 0.0661 - val_accuracy: 0.9732 - val_loss: 0.1016
Epoch 7/30
[1m698/698

In [45]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import numpy as np

# =========================
# 1. Prepare Data
# =========================

X = df_balanced[final_features].values
y = df_balanced["BinaryLabel"].values

# Scale entire dataset
scaler_final = StandardScaler()
X_scaled = scaler_final.fit_transform(X)

# =========================
# 2. Build Final Model
# =========================

final_model = Sequential([
    Dense(256, activation='relu', input_shape=(X_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

final_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# =========================
# 3. Training Setup
# =========================

early_stop = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

class_weight = {0: 1, 1: 1.5}  # Give slightly higher importance to ATTACK

# =========================
# 4. Train On FULL Dataset
# =========================

final_model.fit(
    X_scaled,
    y,
    epochs=30,
    batch_size=1024,
    callbacks=[early_stop],
    class_weight=class_weight,
    verbose=1
)

# =========================
# 5. Save Everything
# =========================

final_model.save("binary_model_final.h5")
joblib.dump(scaler_final, "binary_scaler_final.pkl")
joblib.dump(final_features, "binary_features_final.pkl")

print("✅ Final Binary Model Saved Successfully")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.9405 - loss: 0.1753
Epoch 2/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9648 - loss: 0.1150
Epoch 3/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9722 - loss: 0.0900
Epoch 4/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9781 - loss: 0.0736
Epoch 5/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9813 - loss: 0.0651
Epoch 6/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9822 - loss: 0.0625
Epoch 7/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9821 - loss: 0.0610
Epoch 8/30
[1m1090/1090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9842 - loss: 0.0546
Epoch 9/



✅ Final Binary Model Saved Successfully
