<a href="https://colab.research.google.com/github/Vidit122/Mini-Project/blob/main/FinalCombined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, QuantileTransformer

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Reshape, Conv2D, DepthwiseConv2D,
    BatchNormalization, ReLU, GlobalAveragePooling2D,
    Bidirectional, GRU, Dense, Dropout, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the dataset CSV into a single DataFrame.
df = pd.read_csv("5G_NIDD_FULL_9CLASS_MIXED.csv", low_memory=False)

# Sanity check (printed once): overall size, the first ten column names,
# and how many rows each raw label has.
print(df.shape)
print(df.columns[:10])
print(df["Label"].value_counts())

(1693627, 51)
Index(['Max', 'AckDat', 'DstLoss', 'dDSb', 'Sum', 'Mean', 'SrcTCPBase', 'sDSb',
       'dTtl', 'TotBytes'],
      dtype='object')
Label
Benign         477737
UDPFlood1      467717
UDPFlood2      286197
Goldeneye1      93803
Goldeneye2      93650
SYNFlood1       44636
Torshammer1     38120
Torshammer2     31669
Slowloris1      31015
ICMPFlood1      18279
ICMPFlood2      14452
SYNFlood2       14108
Slowloris2      12656
TCPConnect2     11653
TCPConnect1     11645
SYNScan2        11526
SYNScan1        11450
UDPScan2        10305
UDPScan1        10043
SSH1             1608
SSH2             1358
Name: count, dtype: int64
(1693627, 51)
Index(['Max', 'AckDat', 'DstLoss', 'dDSb', 'Sum', 'Mean', 'SrcTCPBase', 'sDSb',
       'dTtl', 'TotBytes'],
      dtype='object')
Label
Benign         477737
UDPFlood1      467717
UDPFlood2      286197
Goldeneye1      93803
Goldeneye2      93650
SYNFlood1       44636
Torshammer1     38120
Torshammer2     31669
Slowloris1      31015
ICMPFlood1    

In [None]:
# ALWAYS start from df
X = df.drop(columns=["Label"])
y = df["Label"]

# Force numeric: values that cannot be parsed become NaN, +/-inf become NaN,
# and every NaN is then replaced by its column mean. Each step rebinds X
# instead of mutating in place, so re-running the cell gives the same result.
X = X.apply(pd.to_numeric, errors="coerce")
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean())

print("X shape BEFORE slicing:", X.shape)
# The preprocessing cell keeps the first 36 columns and reshapes to (-1, 36, 1),
# so stop here with a clear message if there are not enough feature columns.
assert X.shape[1] >= 36, f"need at least 36 feature columns, got {X.shape[1]}"

X shape BEFORE slicing: (1693627, 50)


In [None]:
# X = X.iloc[:, :36]
# print("X shape AFTER slicing:", X.shape)

In [None]:
# Raw dataset labels grouped under the merged class each one collapses into.
_label_groups = {
    "HTTP_Flood": ("Goldeneye1", "Goldeneye2", "Torshammer1", "Torshammer2"),
    "Slowrate_DoS": ("Slowloris1", "Slowloris2"),
    "UDP_Flood": ("UDPFlood1", "UDPFlood2"),
    "SYN_Flood": ("SYNFlood1", "SYNFlood2"),
    "ICMP_Flood": ("ICMPFlood1", "ICMPFlood2"),
    "UDP_Scan": ("UDPScan1", "UDPScan2"),
    "SYN_Scan": ("SYNScan1", "SYNScan2"),
    "TCP_Connect_Scan": ("TCPConnect1", "TCPConnect2"),
    "Benign": ("Benign",),  # already correct, maps to itself
}

# Flatten to {raw label -> merged class}.
label_map = {
    raw_label: merged_class
    for merged_class, raw_labels in _label_groups.items()
    for raw_label in raw_labels
}

# SSH is not used: map it to None so those rows can be dropped afterwards.
label_map.update(dict.fromkeys(("SSH1", "SSH2")))

In [None]:
# Collapse raw labels into the merged classes.
# A plain .map() returns NaN for any value that is not a key of label_map, so
# running it a second time would turn the already-merged names into NaN and
# the dropna below would delete those rows. Labels that are already a merged
# class name are therefore kept unchanged, which makes this cell safe to re-run.
merged_classes = {name for name in label_map.values() if name is not None}
already_merged = df["Label"].isin(merged_classes)
df["Label"] = df["Label"].where(already_merged, df["Label"].map(label_map))

# Remove rows mapped to None (SSH etc.)
df = df.dropna(subset=["Label"])

In [None]:
# Show the distinct class names (alphabetical) and how many there are.
labels_present = df["Label"].unique()
print(sorted(labels_present))
print("Number of classes:", df["Label"].nunique())

['Benign', 'HTTP_Flood', 'ICMP_Flood', 'SYN_Flood', 'SYN_Scan', 'Slowrate_DoS', 'TCP_Connect_Scan', 'UDP_Flood', 'UDP_Scan']
Number of classes: 9


In [None]:
# Features & labels
# EXACT 36 features (fixed, no filtering later). The slice is taken first so
# the numeric conversion below only touches the columns that are kept.
X = df.drop(columns=["Label"]).iloc[:, :36]
y = df["Label"]
assert X.shape[1] == 36, f"expected 36 feature columns, got {X.shape[1]}"

# Convert to numeric (unparseable values -> NaN) and turn +/-inf into NaN.
# This is done once; imputation happens after the split (see below).
# NOTE(review): a column with no numeric values at all becomes all-NaN here,
# has no mean to impute with, and ends up as a constant 0 after nan_to_num.
# Confirm that the first 36 columns are all genuinely numeric.
X = X.apply(pd.to_numeric, errors="coerce")
X = X.replace([np.inf, -np.inf], np.nan)

# Encode labels: class name -> integer index -> one-hot row of width 9.
le = LabelEncoder()
y_enc = le.fit_transform(y)
y_onehot = tf.keras.utils.to_categorical(y_enc, 9)

# Train-test split, stratified on the integer labels so both splits keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42, stratify=y_enc
)

# Fill NaNs BEFORE QT, using column means computed on the training split only
# so that no statistic of the test split leaks into preprocessing.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Quantile transform (paper style): fitted on train, applied to both splits.
qt = QuantileTransformer(
    n_quantiles=1000,
    output_distribution="normal",
    random_state=42
)
X_train = qt.fit_transform(X_train)
X_test  = qt.transform(X_test)

# Replace anything non-finite that is still left with 0.
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test  = np.nan_to_num(X_test,  nan=0.0, posinf=0.0, neginf=0.0)

# Reshape to (samples, 36, 1), the input shape the model is built with.
X_train = X_train.reshape(-1, 36, 1)
X_test  = X_test.reshape(-1, 36, 1)


  return fnb._ureduce(a,


In [None]:
# Sanity checks on the model inputs. Each count covers BOTH splits, so a
# problem in either one shows up under the "X" label.
nan_count = np.isnan(X_train).sum() + np.isnan(X_test).sum()
inf_count = np.isinf(X_train).sum() + np.isinf(X_test).sum()
print("NaNs in X:", nan_count)
print("Infs in X:", inf_count)
# Every one-hot row must sum to exactly 1.
print("y unique sums:", np.unique(y_train.sum(axis=1)))

NaNs in X: 0
Infs in X: 0
y unique sums: [1.]


In [None]:
def MobileNetV1_BiGRU(n_features: int = 36, n_classes: int = 9):
    """Build the two-branch CNN + bidirectional-GRU classifier (uncompiled).

    Both branches read the same input of shape ``(n_features, 1)``:

    * CNN branch: the input is reshaped to ``(n_features, 1, 1)`` and passed
      through a 3x3 convolution followed by two depthwise-separable stages
      (3x3 depthwise conv, then 1x1 pointwise conv to 64 and 128 channels),
      each convolution followed by batch normalisation and ReLU, and finally
      global average pooling.
    * GRU branch: two stacked bidirectional GRU layers with 128 units per
      direction; only the last layer's final output is kept.

    The branch outputs are concatenated and passed through Dense(256),
    Dense(128), Dropout(0.5) and a softmax output layer.

    Args:
        n_features: Length of each input sample. Defaults to 36.
        n_classes: Number of units in the softmax output. Defaults to 9.

    Returns:
        A ``Model`` mapping ``(batch, n_features, 1)`` inputs to
        ``(batch, n_classes)`` class probabilities.
    """
    inp = Input(shape=(n_features, 1))

    # CNN branch
    # NOTE: the reshaped "image" is only 1 pixel wide, so with "same" padding
    # the outer columns of every 3x3 kernel only ever see zero padding.
    x = Reshape((n_features, 1, 1))(inp)
    x = Conv2D(32, (3,3), padding="same")(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)

    # Depthwise-separable stage 1 -> 64 channels
    x = DepthwiseConv2D((3,3), padding="same")(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(64, (1,1), padding="same")(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)

    # Depthwise-separable stage 2 -> 128 channels
    x = DepthwiseConv2D((3,3), padding="same")(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(128, (1,1), padding="same")(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)

    cnn_out = GlobalAveragePooling2D()(x)

    # GRU branch (conceptual, limited by data)
    # The input already has shape (n_features, 1), so it is fed in directly;
    # reshaping it to that same shape first would be an identity operation.
    r = Bidirectional(GRU(128, return_sequences=True))(inp)
    r = Bidirectional(GRU(128))(r)

    # Projection
    merged = Concatenate()([cnn_out, r])
    merged = Dense(256, activation="relu")(merged)
    merged = Dense(128, activation="relu")(merged)
    merged = Dropout(0.5)(merged)

    out = Dense(n_classes, activation="softmax")(merged)
    return Model(inp, out)


In [None]:
# Build a fresh, untrained model.
model = MobileNetV1_BiGRU()
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    clipnorm=1.0   # prevents gradient explosions
)

model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Keep the returned History so the per-epoch metrics can be inspected later
# instead of the object only appearing as the cell's trailing repr.
# validation_split holds out the last 10% of the arrays passed in; Keras
# selects those samples before any shuffling.
history = model.fit(
    X_train, y_train,
    epochs=10,                      # originally = 50
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 23ms/step - accuracy: 0.7019 - loss: 0.6132 - val_accuracy: 0.7161 - val_loss: 0.5255
Epoch 2/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 23ms/step - accuracy: 0.7157 - loss: 0.5325 - val_accuracy: 0.7156 - val_loss: 0.5190
Epoch 3/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 24ms/step - accuracy: 0.7173 - loss: 0.5243 - val_accuracy: 0.7170 - val_loss: 0.5160
Epoch 4/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 24ms/step - accuracy: 0.7177 - loss: 0.5214 - val_accuracy: 0.7171 - val_loss: 0.5130
Epoch 5/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 23ms/step - accuracy: 0.7171 - loss: 0.5213 - val_accuracy: 0.7171 - val_loss: 0.5173
Epoch 6/10
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 23ms/step - accuracy: 0.7171 - loss: 0.5199 - val_accuracy: 0.7153 - val

<keras.src.callbacks.history.History at 0x78d57cd8c440>

In [None]:
# Rebuild the model from scratch: `model` is rebound to a new, untrained
# network, so this is an independent 20-epoch run and not a continuation of
# any training done on a previous `model`.
model = MobileNetV1_BiGRU()
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    clipnorm=1.0   # prevents gradient explosions
)

model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Keep the returned History so the per-epoch metrics can be inspected later.
# validation_split holds out the last 10% of the arrays passed in; Keras
# selects those samples before any shuffling.
history = model.fit(
    X_train, y_train,
    epochs=20,                      # originally = 50
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

Epoch 1/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 23ms/step - accuracy: 0.7027 - loss: 0.6116 - val_accuracy: 0.7168 - val_loss: 0.5252
Epoch 2/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 23ms/step - accuracy: 0.7169 - loss: 0.5296 - val_accuracy: 0.7169 - val_loss: 0.5229
Epoch 3/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m436s[0m 23ms/step - accuracy: 0.7161 - loss: 0.5240 - val_accuracy: 0.7171 - val_loss: 0.5132
Epoch 4/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 23ms/step - accuracy: 0.7168 - loss: 0.5203 - val_accuracy: 0.7164 - val_loss: 0.5128
Epoch 5/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 23ms/step - accuracy: 0.7172 - loss: 0.5180 - val_accuracy: 0.7174 - val_loss: 0.5130
Epoch 6/20
[1m19020/19020[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 23ms/step - accuracy: 0.7171 - loss: 0.5180 - val_accuracy: 0.7164 - val

In [None]:
# Score the model on the held-out split. evaluate() returns the loss followed
# by the metrics given to compile(), which here is accuracy only.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_acc = test_metrics
print("Test Accuracy:", test_acc)


[1m10567/10567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 7ms/step - accuracy: 0.7124 - loss: 0.5219
Test Accuracy: 0.7127165794372559


In [None]:
# Per-class precision / recall / F1 on the held-out split.
# (classification_report and numpy come from the notebook's import cell.)

# Predicted class probabilities, one row per test sample.
y_prob = model.predict(X_test)

# Collapse probabilities and one-hot targets to integer class indices.
y_pred = np.argmax(y_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

# le.classes_ gives the class names in the same order as the encoded indices.
print(classification_report(y_true, y_pred, target_names=le.classes_))

[1m10567/10567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 5ms/step
                  precision    recall  f1-score   support

          Benign       0.50      0.36      0.42     95548
      HTTP_Flood       0.94      0.79      0.86     51448
      ICMP_Flood       1.00      0.04      0.07      6546
       SYN_Flood       0.85      0.19      0.31     11749
        SYN_Scan       1.00      0.87      0.93      4595
    Slowrate_DoS       0.56      0.30      0.39      8734
TCP_Connect_Scan       1.00      0.87      0.93      4660
       UDP_Flood       0.72      0.99      0.83    150783
        UDP_Scan       1.00      0.77      0.87      4070

        accuracy                           0.71    338133
       macro avg       0.84      0.58      0.62    338133
    weighted avg       0.71      0.71      0.68    338133

