In [1]:
import os
import pandas as pd

# ===== CONFIG =====
DATA_DIR = r"D:\Grad\Dataset"              # where your *_train*.csv are
OUTPUT_DIR = r"D:\Grad\mixed_dataset"
SAMPLES_PER_CLASS = 20000                  # keep like before; can change later
RANDOM_SEED = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)

all_samples = []
class_counts = {}

def infer_class_name(filename: str) -> str:
    name = filename
    for suffix in ["_train.pcap.csv", "_train.csv", ".pcap.csv", ".csv"]:
        if name.lower().endswith(suffix):
            name = name[: -len(suffix)]
            break
    # IMPORTANT: keep same label style you used in training BEFORE
    # e.g. "Benign_train.csv", "Recon-Port_Scan_train.csv"
    # If your previous labels were exactly class_name + "_train.csv", keep it that way:
    if not name.lower().endswith("_train"):
        # if file naming is different, do nothing
        pass
    return name + ".csv"

for file in os.listdir(DATA_DIR):
    f_lower = file.lower()

    # Accept train CSVs (covers both *_train.csv and *_train.pcap.csv)
    if not (f_lower.endswith(".csv") and "_train" in f_lower):
        continue

    file_path = os.path.join(DATA_DIR, file)
    class_name = infer_class_name(file)

    print(f"Loading: {file}  --> label: {class_name}")

    df = pd.read_csv(file_path)

    # Sample N rows per class (if file has fewer rows, take all)
    if len(df) > SAMPLES_PER_CLASS:
        df = df.sample(SAMPLES_PER_CLASS, random_state=RANDOM_SEED)

    # Add label column
    df["Label"] = class_name

    all_samples.append(df)
    class_counts[class_name] = len(df)

if not all_samples:
    raise SystemExit("No train CSV files found. Make sure filenames contain '_train' and end with .csv")

final_df = pd.concat(all_samples, ignore_index=True)

# ✅ IMPORTANT: Shuffle rows to create a "mixed stream"
final_df = final_df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

out_path = os.path.join(OUTPUT_DIR, "mixed_train_balanced.csv")
final_df.to_csv(out_path, index=False)

print("\n✅ Mixed+balanced training dataset saved to:")
print(out_path)

print("\nClass counts used:")
for k, v in sorted(class_counts.items()):
    print(f"{k}: {v}")

print("\nTotal rows:", len(final_df))
print("Columns:", final_df.shape[1])


Loading: ARP_Spoofing_train.csv.csv  --> label: ARP_Spoofing_train.csv.csv
Loading: Benign_train.csv.csv  --> label: Benign_train.csv.csv
Loading: MQTT-DDoS-Connect_Flood_train.csv.csv  --> label: MQTT-DDoS-Connect_Flood_train.csv.csv
Loading: MQTT-DDoS-Publish_Flood_train.csv.csv  --> label: MQTT-DDoS-Publish_Flood_train.csv.csv
Loading: MQTT-DoS-Connect_Flood_train.csv.csv  --> label: MQTT-DoS-Connect_Flood_train.csv.csv
Loading: MQTT-DoS-Publish_Flood_train.csv.csv  --> label: MQTT-DoS-Publish_Flood_train.csv.csv
Loading: MQTT-Malformed_Data_train.csv.csv  --> label: MQTT-Malformed_Data_train.csv.csv
Loading: Recon-OS_Scan_train.csv.csv  --> label: Recon-OS_Scan_train.csv.csv
Loading: Recon-Ping_Sweep_train.csv.csv  --> label: Recon-Ping_Sweep_train.csv.csv
Loading: Recon-Port_Scan_train.csv.csv  --> label: Recon-Port_Scan_train.csv.csv

✅ Mixed+balanced training dataset saved to:
D:\Grad\mixed_dataset\mixed_train_balanced.csv

Class counts used:
ARP_Spoofing_train.csv.csv: 16047
Be

In [2]:
import pandas as pd
import numpy as np
import os

INPUT_PATH = r"D:\Grad\derived_dataset\train_balanced.csv"
OUTPUT_PATH = r"D:\Grad\derived_dataset\train_balanced_clean.csv"

df = pd.read_csv(INPUT_PATH)

print("Loaded:", INPUT_PATH)
print("Shape:", df.shape)

# ----- 1) Drop leakage / identifier columns if they exist -----
leak_cols = ["Flow ID", "Src IP", "Dst IP", "Timestamp"]
existing_leaks = [c for c in leak_cols if c in df.columns]
if existing_leaks:
    df = df.drop(columns=existing_leaks)
    print("Dropped leakage columns:", existing_leaks)
else:
    print("No leakage columns found among:", leak_cols)

# ----- 2) Ensure Label exists -----
if "Label" not in df.columns:
    raise ValueError("Label column not found. Make sure your dataset has a 'Label' column.")

# ----- 3) Replace infinities with NaN -----
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# ----- 4) Handle missing values -----
# Strategy: fill numeric NaNs with median (robust), non-numeric NaNs with mode
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols = [c for c in df.columns if c not in num_cols]

# Don't impute labels
if "Label" in num_cols:
    num_cols.remove("Label")

for c in num_cols:
    if df[c].isna().any():
        med = df[c].median()
        df[c] = df[c].fillna(med)

for c in obj_cols:
    if c == "Label":
        continue
    if df[c].isna().any():
        mode_val = df[c].mode(dropna=True)
        df[c] = df[c].fillna(mode_val.iloc[0] if len(mode_val) else "")

# ----- 5) Drop duplicate rows (optional but safe) -----
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped duplicates: {before - after}")

# ----- 6) Save cleaned dataset -----
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

print("✅ Saved cleaned dataset to:", OUTPUT_PATH)
print("Final shape:", df.shape)
print("\nLabel distribution:")
print(df["Label"].value_counts())


Loaded: D:\Grad\derived_dataset\train_balanced.csv
Shape: (151522, 46)
No leakage columns found among: ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
Dropped duplicates: 974
✅ Saved cleaned dataset to: D:\Grad\derived_dataset\train_balanced_clean.csv
Final shape: (150548, 46)

Label distribution:
Label
Benign_train.csv                     20000
MQTT-DDoS-Connect_Flood_train.csv    20000
MQTT-DoS-Publish_Flood_train.csv     20000
MQTT-DDoS-Publish_Flood_train.csv    20000
Recon-Port_Scan_train.csv            19695
Recon-OS_Scan_train.csv              16163
ARP_Spoofing_train.csv               16047
MQTT-DoS-Connect_Flood_train.csv     12773
MQTT-Malformed_Data_train.csv         5130
Recon-Ping_Sweep_train.csv             740
Name: count, dtype: int64


In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# ===== PATHS =====
INPUT_PATH = r"D:\Grad\derived_dataset\train_balanced_clean.csv"
OUTPUT_DIR = r"D:\Grad\derived_dataset"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----- Load cleaned data -----
df = pd.read_csv(INPUT_PATH)
print("Loaded:", INPUT_PATH)
print("Shape:", df.shape)

# ----- Separate features and labels -----
X = df.drop(columns=["Label"])
y = df["Label"]

print("Features shape:", X.shape)
print("Labels shape:", y.shape)

# ----- Encode labels -----
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label mapping
label_map = pd.DataFrame({
    "Label": label_encoder.classes_,
    "Encoded": range(len(label_encoder.classes_))
})
label_map_path = os.path.join(OUTPUT_DIR, "label_mapping.csv")
label_map.to_csv(label_map_path, index=False)

print("\nLabel mapping:")
print(label_map)

# ----- Scale features -----
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler
scaler_path = os.path.join(OUTPUT_DIR, "scaler.joblib")
joblib.dump(scaler, scaler_path)

# ----- Save processed arrays -----
np.save(os.path.join(OUTPUT_DIR, "X_scaled.npy"), X_scaled)
np.save(os.path.join(OUTPUT_DIR, "y_encoded.npy"), y_encoded)

print("\n✅ Saved:")
print("- X_scaled.npy")
print("- y_encoded.npy")
print("- label_mapping.csv")
print("- scaler.joblib")


Loaded: D:\Grad\derived_dataset\train_balanced_clean.csv
Shape: (150548, 46)
Features shape: (150548, 45)
Labels shape: (150548,)

Label mapping:
                               Label  Encoded
0             ARP_Spoofing_train.csv        0
1                   Benign_train.csv        1
2  MQTT-DDoS-Connect_Flood_train.csv        2
3  MQTT-DDoS-Publish_Flood_train.csv        3
4   MQTT-DoS-Connect_Flood_train.csv        4
5   MQTT-DoS-Publish_Flood_train.csv        5
6      MQTT-Malformed_Data_train.csv        6
7            Recon-OS_Scan_train.csv        7
8         Recon-Ping_Sweep_train.csv        8
9          Recon-Port_Scan_train.csv        9

✅ Saved:
- X_scaled.npy
- y_encoded.npy
- label_mapping.csv
- scaler.joblib


In [4]:
import numpy as np
import os
from collections import Counter

INPUT_DIR = r"D:\Grad\derived_dataset"
OUTPUT_DIR = r"D:\Grad\derived_dataset"

X_path = os.path.join(INPUT_DIR, "X_scaled.npy")
y_path = os.path.join(INPUT_DIR, "y_encoded.npy")

X = np.load(X_path)
y = np.load(y_path)

print("Loaded X:", X.shape)
print("Loaded y:", y.shape)

# ===== SEQUENCE SETTINGS =====
WINDOW_SIZE = 20
STRIDE = 5

def majority_vote(labels):
    return Counter(labels).most_common(1)[0][0]

X_seq = []
y_seq = []

num_samples = X.shape[0]

for start in range(0, num_samples - WINDOW_SIZE + 1, STRIDE):
    end = start + WINDOW_SIZE

    window_X = X[start:end]
    window_y = y[start:end]

    X_seq.append(window_X)
    y_seq.append(majority_vote(window_y))

X_seq = np.array(X_seq, dtype=np.float32)
y_seq = np.array(y_seq, dtype=np.int64)

print("\n✅ Sequence dataset created")
print("X_seq shape:", X_seq.shape)
print("y_seq shape:", y_seq.shape)

# Save
np.save(os.path.join(OUTPUT_DIR, "X_seq.npy"), X_seq)
np.save(os.path.join(OUTPUT_DIR, "y_seq.npy"), y_seq)

print("\n✅ Saved:")
print("- X_seq.npy")
print("- y_seq.npy")


Loaded X: (150548, 45)
Loaded y: (150548,)

✅ Sequence dataset created
X_seq shape: (30106, 20, 45)
y_seq shape: (30106,)

✅ Saved:
- X_seq.npy
- y_seq.npy


In [5]:
import numpy as np
import os
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras import layers, models

DATA_DIR = r"D:\Grad\derived_dataset"
MODEL_DIR = r"D:\Grad\model_output"
os.makedirs(MODEL_DIR, exist_ok=True)

# ---- Load sequences ----
X = np.load(os.path.join(DATA_DIR, "X_seq.npy"))
y = np.load(os.path.join(DATA_DIR, "y_seq.npy"))

print("X:", X.shape)
print("y:", y.shape)

num_classes = int(np.max(y)) + 1
print("num_classes:", num_classes)

# ---- Train/Val split (keep test separate if you have it later) ----
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, y_train.shape)
print("Val:", X_val.shape, y_val.shape)

# ---- Class weights (helps imbalance) ----
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, weights)}
print("Class weights:", class_weight)

# ---- Build CNN-LSTM model ----
timesteps = X.shape[1]   # 20
features = X.shape[2]    # 45

model = models.Sequential([
    layers.Input(shape=(timesteps, features)),

    layers.Conv1D(64, kernel_size=3, padding="same", activation="relu"),
    layers.MaxPooling1D(pool_size=2),

    layers.Conv1D(128, kernel_size=3, padding="same", activation="relu"),
    layers.MaxPooling1D(pool_size=2),

    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.3),

    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),

    layers.Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# ---- Callbacks ----
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=5, restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(MODEL_DIR, "cnn_lstm_best.keras"),
        monitor="val_loss", save_best_only=True
    )
]

# ---- Train ----
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=128,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# ---- Save final model ----
final_model_path = os.path.join(MODEL_DIR, "cnn_lstm_final.keras")
model.save(final_model_path)
print("✅ Saved model to:", final_model_path)

# ---- Save training history ----
hist_path = os.path.join(MODEL_DIR, "training_history.csv")
pd.DataFrame(history.history).to_csv(hist_path, index=False)
print("✅ Saved history to:", hist_path)


X: (30106, 20, 45)
y: (30106,)
num_classes: 10
Train: (24084, 20, 45) (24084,)
Val: (6022, 20, 45) (6022,)
Class weights: {0: 0.9385814497272019, 1: 0.752625, 2: 0.752625, 3: 0.752625, 4: 1.1782778864970647, 5: 0.752625, 6: 2.933495736906212, 7: 0.931322505800464, 8: 20.410169491525423, 9: 0.7648142267386472}


Epoch 1/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8847 - loss: 0.4292 - val_accuracy: 0.9714 - val_loss: 0.0854
Epoch 2/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9731 - loss: 0.1148 - val_accuracy: 0.9701 - val_loss: 0.0832
Epoch 3/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9796 - loss: 0.0915 - val_accuracy: 0.9841 - val_loss: 0.0467
Epoch 4/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9839 - loss: 0.0678 - val_accuracy: 0.9816 - val_loss: 0.0547
Epoch 5/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9863 - loss: 0.0576 - val_accuracy: 0.9890 - val_loss: 0.0355
Epoch 6/30
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9890 - loss: 0.0473 - val_accuracy: 0.9890 - val_loss: 0.0331
Epoch 7/30
[1m189/189[0m 

In [6]:
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf

DATA_DIR = r"D:\Grad\derived_dataset"
MODEL_PATH = r"D:\Grad\model_output\cnn_lstm_final.keras"
LABEL_MAP_PATH = os.path.join(DATA_DIR, "label_mapping.csv")

# Load data
X = np.load(os.path.join(DATA_DIR, "X_seq.npy"))
y = np.load(os.path.join(DATA_DIR, "y_seq.npy"))

# Recreate the same split (must match Step 4)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Load label names
label_map = pd.read_csv(LABEL_MAP_PATH)
# label_map: Label, Encoded
label_names = label_map.sort_values("Encoded")["Label"].tolist()

# Load model
model = tf.keras.models.load_model(MODEL_PATH)

# Predict
y_prob = model.predict(X_val, batch_size=256, verbose=0)
y_pred = np.argmax(y_prob, axis=1)

print("✅ Classification report (validation):")
print(classification_report(y_val, y_pred, target_names=label_names, digits=4))

print("\n✅ Confusion matrix (validation):")
cm = confusion_matrix(y_val, y_pred)
print(cm)

# Save results
out_dir = r"D:\Grad\model_output"
os.makedirs(out_dir, exist_ok=True)

pd.DataFrame(cm, index=label_names, columns=label_names).to_csv(
    os.path.join(out_dir, "confusion_matrix_val.csv")
)
print("\n✅ Saved confusion matrix to model_output/confusion_matrix_val.csv")


✅ Classification report (validation):
                                   precision    recall  f1-score   support

           ARP_Spoofing_train.csv     0.9717    0.9611    0.9663       642
                 Benign_train.csv     1.0000    1.0000    1.0000       800
MQTT-DDoS-Connect_Flood_train.csv     0.9963    1.0000    0.9981       800
MQTT-DDoS-Publish_Flood_train.csv     1.0000    1.0000    1.0000       800
 MQTT-DoS-Connect_Flood_train.csv     1.0000    0.9941    0.9971       511
 MQTT-DoS-Publish_Flood_train.csv     1.0000    1.0000    1.0000       800
    MQTT-Malformed_Data_train.csv     0.9752    0.9610    0.9681       205
          Recon-OS_Scan_train.csv     0.9953    0.9861    0.9907       646
       Recon-Ping_Sweep_train.csv     0.5714    0.9333    0.7089        30
        Recon-Port_Scan_train.csv     1.0000    0.9962    0.9981       788

                         accuracy                         0.9917      6022
                        macro avg     0.9510    0.9832    0.