<a href="https://colab.research.google.com/github/Vidit122/Mini-Project/blob/main/BIGRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**DATA PREPROCESSING**

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Reshape, Conv2D, DepthwiseConv2D,
    BatchNormalization, ReLU, GlobalAveragePooling2D,
    Bidirectional, GRU, Dense, Dropout, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the dataset CSV. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids
# mixed-type columns.
df = pd.read_csv("5G_NIDD_FULL_9CLASS_MIXED.csv", low_memory=False)

# Quick overview of the raw data: (rows, columns), the first ten column
# names, and the number of rows per raw label.
print(df.shape)
print(df.columns[:10])
print(df["Label"].value_counts())

(1693627, 51)
Index(['Max', 'AckDat', 'DstLoss', 'dDSb', 'Sum', 'Mean', 'SrcTCPBase', 'sDSb',
       'dTtl', 'TotBytes'],
      dtype='object')
Label
Benign         477737
UDPFlood1      467717
UDPFlood2      286197
Goldeneye1      93803
Goldeneye2      93650
SYNFlood1       44636
Torshammer1     38120
Torshammer2     31669
Slowloris1      31015
ICMPFlood1      18279
ICMPFlood2      14452
SYNFlood2       14108
Slowloris2      12656
TCPConnect2     11653
TCPConnect1     11645
SYNScan2        11526
SYNScan1        11450
UDPScan2        10305
UDPScan1        10043
SSH1             1608
SSH2             1358
Name: count, dtype: int64
(1693627, 51)
Index(['Max', 'AckDat', 'DstLoss', 'dDSb', 'Sum', 'Mean', 'SrcTCPBase', 'sDSb',
       'dTtl', 'TotBytes'],
      dtype='object')
Label
Benign         477737
UDPFlood1      467717
UDPFlood2      286197
Goldeneye1      93803
Goldeneye2      93650
SYNFlood1       44636
Torshammer1     38120
Torshammer2     31669
Slowloris1      31015
ICMPFlood1    

In [None]:
# Always rebuild the label vector and feature matrix directly from df.
y = df["Label"]

# Force numeric: entries that cannot be parsed become NaN, +/-inf are treated
# as missing as well, and every missing value is then replaced by the mean of
# its column.
X = (
    df.drop(columns=["Label"])
    .apply(pd.to_numeric, errors="coerce")
    .replace([np.inf, -np.inf], np.nan)
)
X = X.fillna(X.mean())

# MUST be > 36 columns
print("X shape BEFORE slicing:", X.shape)

X shape BEFORE slicing: (1693627, 50)


In [None]:
# X = X.iloc[:, :36]
# print("X shape AFTER slicing:", X.shape)


X shape AFTER slicing: (1693627, 36)


In [None]:
# Lookup table from the raw dataset labels (e.g. "Goldeneye1", "Goldeneye2")
# to the merged traffic class each one belongs to. Every entry below is
# (merged class, raw labels folded into it); a merged class of None marks raw
# labels that are not used (SSH).
label_map = {
    raw_label: traffic_class
    for traffic_class, raw_labels in (
        # HTTP floods
        ("HTTP_Flood", ("Goldeneye1", "Goldeneye2", "Torshammer1", "Torshammer2")),
        # Slow DoS
        ("Slowrate_DoS", ("Slowloris1", "Slowloris2")),
        # UDP flood
        ("UDP_Flood", ("UDPFlood1", "UDPFlood2")),
        # SYN flood
        ("SYN_Flood", ("SYNFlood1", "SYNFlood2")),
        # ICMP flood
        ("ICMP_Flood", ("ICMPFlood1", "ICMPFlood2")),
        # Scans
        ("UDP_Scan", ("UDPScan1", "UDPScan2")),
        ("SYN_Scan", ("SYNScan1", "SYNScan2")),
        ("TCP_Connect_Scan", ("TCPConnect1", "TCPConnect2")),
        # Benign (already correct)
        ("Benign", ("Benign",)),
        # SSH (DROP - not used)
        (None, ("SSH1", "SSH2")),
    )
    for raw_label in raw_labels
}

In [None]:
# Merge the raw labels into their traffic classes.
# Labels that already are a merged class name are kept unchanged, so running
# this cell a second time is a no-op. A plain `.map(label_map)` would turn
# every already-merged name except "Benign" into NaN and drop those rows.
merged_class_names = {name for name in label_map.values() if name is not None}
df["Label"] = df["Label"].map(
    lambda raw: raw if raw in merged_class_names else label_map.get(raw)
)

# Remove rows mapped to None (SSH, and any label missing from label_map)
df = df.dropna(subset=["Label"])

In [None]:
# Sanity check after merging: list the distinct class names in sorted order
# and report how many there are.
label_column = df["Label"]
print(sorted(label_column.unique()))
print("Number of classes:", label_column.nunique())

['Benign', 'HTTP_Flood', 'ICMP_Flood', 'SYN_Flood', 'SYN_Scan', 'Slowrate_DoS', 'TCP_Connect_Scan', 'UDP_Flood', 'UDP_Scan']
Number of classes: 9


In [None]:
# Features & labels
X = df.drop(columns=["Label"])
y = df["Label"]

# Convert to numeric (unparseable entries become NaN) and treat +/-inf as
# missing. A single pass is sufficient.
X = X.apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)

# EXACT 36 features (fixed, no filtering later)
X = X.iloc[:, :36]

# Encode labels; the one-hot width follows the classes actually present
# instead of a hard-coded count.
le = LabelEncoder()
y_enc = le.fit_transform(y)
y_onehot = tf.keras.utils.to_categorical(y_enc, len(le.classes_))

# Train-test split (stratified on the integer class ids)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42, stratify=y_enc
)

# Fill NaNs BEFORE QT, using column means computed on the TRAINING rows only
# so that no statistic of the test rows leaks into preprocessing.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Quantile transform (paper style), fitted on the training rows only
qt = QuantileTransformer(
    n_quantiles=1000,
    output_distribution="normal",
    random_state=42
)
X_train = qt.fit_transform(X_train)
X_test  = qt.transform(X_test)

# A column with no numeric value at all has a NaN mean, so it is still NaN
# after imputation and after the quantile transform; zero out whatever is left.
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test  = np.nan_to_num(X_test,  nan=0.0, posinf=0.0, neginf=0.0)

# Reshape to (samples, 36, 1), the input shape the model is built for
X_train = X_train.reshape(-1, 36, 1)
X_test  = X_test.reshape(-1, 36, 1)

  return fnb._ureduce(a,


In [None]:
# Sanity checks on the preprocessed arrays. NaNs and Infs are counted over
# BOTH splits (train + test), and every one-hot target row must sum to 1.
print("NaNs in X:", np.isnan(X_train).sum() + np.isnan(X_test).sum())
print("Infs in X:", np.isinf(X_train).sum() + np.isinf(X_test).sum())
print("y unique sums:", np.unique(y_train.sum(axis=1)))

NaNs in X: 0
Infs in X: 0
y unique sums: [1.]


In [None]:
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.utils import to_categorical
# import numpy as np

# X = df.drop(columns=["Label"])
# y = df["Label"]

# le = LabelEncoder()
# y_encoded = le.fit_transform(y)

# NUM_CLASSES = len(np.unique(y_encoded))
# y_onehot = to_categorical(y_encoded, NUM_CLASSES)

# print("Final NUM_CLASSES:", NUM_CLASSES)

Final NUM_CLASSES: 9


In [None]:
# # ALWAYS start from df
# X = df.drop(columns=["Label"])
# y = df["Label"]

# # Force numeric
# X = X.apply(pd.to_numeric, errors="coerce")
# X.replace([np.inf, -np.inf], np.nan, inplace=True)
# X.fillna(X.mean(), inplace=True)

# print("X shape BEFORE slicing:", X.shape)
#  # MUST be > 36 columns

X shape BEFORE slicing: (1690661, 50)


In [None]:
# # Replace inf → nan
# X.replace([np.inf, -np.inf], np.nan, inplace=True)

# # Fill nan with column mean
# X.fillna(X.mean(), inplace=True)

# # Final check
# print("NaNs:", X.isna().sum().sum())
# print("Infs:", np.isinf(X.values).sum())

NaNs: 10143966
Infs: 0


In [None]:
# # MUST be done on X BEFORE split & scaling
# X = X.iloc[:, :36]
# print("X shape after slicing:", X.shape)

X shape after slicing: (1690661, 36)


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X,
#     y_onehot,
#     test_size=0.2,
#     random_state=42,
#     stratify=y_encoded
# )

In [None]:
# from sklearn.feature_selection import VarianceThreshold

# vt = VarianceThreshold(threshold=0.0)

# X_train = vt.fit_transform(X_train)
# X_test  = vt.transform(X_test)

# print("After variance filter:", X_train.shape)


  self.variances_ = np.nanvar(X, axis=0)
  self.variances_ = np.nanmin(compare_arr, axis=0)


After variance filter: (1352528, 29)


In [None]:
# from sklearn.preprocessing import QuantileTransformer

# qt = QuantileTransformer(
#     n_quantiles=min(100, X_train.shape[0]),
#     output_distribution="normal",
#     subsample=100000,   # MUST be int
#     random_state=42
# )

# X_train = qt.fit_transform(X_train)
# X_test  = qt.transform(X_test)


In [None]:
# assert not np.isnan(X_train).any()
# assert not np.isinf(X_train).any()

In [None]:
# import numpy as np

# TARGET_FEATURES = 36

# if X_train.shape[1] < TARGET_FEATURES:
#     pad_width = TARGET_FEATURES - X_train.shape[1]
#     X_train = np.pad(X_train, ((0, 0), (0, pad_width)), mode="constant")
#     X_test  = np.pad(X_test,  ((0, 0), (0, pad_width)), mode="constant")

# print("After padding:", X_train.shape)


After padding: (1352528, 36)


In [None]:
# print("X_train shape before reshape:", X_train.shape)
# print("X_test shape before reshape:", X_test.shape)

X_train shape before reshape: (1352528, 36)
X_test shape before reshape: (338133, 36)


In [None]:
# X_train = X_train.reshape(-1, 36, 1)
# X_test  = X_test.reshape(-1, 36, 1)


# print("Train shape:", X_train.shape)
# print("Test shape:", X_test.shape)

Train shape: (1352528, 36, 1)
Test shape: (338133, 36, 1)


In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

# X_train_2d = X_train.reshape(X_train.shape[0], X_train.shape[1])
# X_test_2d  = X_test.reshape(X_test.shape[0], X_test.shape[1])

# X_train_scaled = scaler.fit_transform(X_train_2d)
# X_test_scaled  = scaler.transform(X_test_2d)


In [None]:
# import numpy as np

# X_train_scaled[np.isinf(X_train_scaled)] = np.nan
# X_test_scaled[np.isinf(X_test_scaled)] = np.nan

# X_train_scaled = np.nan_to_num(X_train_scaled)
# X_test_scaled = np.nan_to_num(X_test_scaled)

# print("NaNs:", np.isnan(X_train_scaled).sum())
# print("Infs:", np.isinf(X_test_scaled).sum())

In [None]:
# Sanity checks on the preprocessed arrays. NaNs and Infs are counted over
# BOTH splits (train + test), and every one-hot target row must sum to 1.
print("NaNs in X:", np.isnan(X_train).sum() + np.isnan(X_test).sum())
print("Infs in X:", np.isinf(X_train).sum() + np.isinf(X_test).sum())
print("y unique sums:", np.unique(y_train.sum(axis=1)))


NaNs in X: 0
Infs in X: 0
y unique sums: [1.]


In [None]:
# X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
# X_test_scaled  = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

In [None]:
# from sklearn.utils.class_weight import compute_class_weight
# import numpy as np

# class_weights = compute_class_weight(
#     class_weight="balanced",
#     classes=np.unique(y_encoded),
#     y=y_encoded
# )

# class_weights = dict(enumerate(class_weights))
# print(class_weights)

{0: np.float64(0.39321053680628093), 1: np.float64(0.7302509785424706), 2: np.float64(5.739244820574447), 3: np.float64(3.1977941955301343), 4: np.float64(8.175975897554936), 5: np.float64(4.301509519411559), 6: np.float64(8.062976316517393), 7: np.float64(0.2491679717079431), 8: np.float64(9.231925605574121)}


**BIGRU MODEL**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Bidirectional, GRU,
    Dense, Dropout
)


In [None]:
def BiGRU_Model(num_features, num_classes):
    """Build the (uncompiled) stacked bidirectional-GRU classifier.

    Args:
        num_features: Number of features per sample. The model takes inputs
            of shape ``(num_features, 1)``.
        num_classes: Number of target classes, i.e. the width of the softmax
            output layer.

    Returns:
        A Keras ``Model`` mapping inputs of shape ``(batch, num_features, 1)``
        to class probabilities of shape ``(batch, num_classes)``.
    """
    # 1. Input: sized from the argument rather than a hard-coded 36
    inputs = Input(shape=(num_features, 1))

    # 2. Reshape EXACTLY as paper: one time step carrying all features.
    # NOTE(review): with a sequence length of 1 the recurrent layers only ever
    # see a single step - confirm this matches the intended architecture.
    x = Reshape((1, num_features))(inputs)

    # 3. Bi-GRU stack: the first layer keeps the sequence axis for the second,
    # the second returns only its final output.
    x = Bidirectional(GRU(128, return_sequences=True))(x)
    x = Bidirectional(GRU(128, return_sequences=False))(x)

    # -------- Projection layers --------
    x = Dense(128, activation="relu")(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.5)(x)

    # -------- Output layer --------
    outputs = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs, outputs)
    return model


In [None]:
# Instantiate the Bi-GRU classifier for 36 input features and 9 classes.
model = BiGRU_Model(num_features=36, num_classes=9)

# Adam at a learning rate of 1e-3; clipnorm=1.0 clips each gradient so that
# its norm does not exceed 1.0.
optimizer = Adam(learning_rate=1e-3, clipnorm=1.0)

# Disabled alternative loss, kept for reference:
# def focal_loss(alpha=0.25, gamma=2.0):
#     def loss(y_true, y_pred):
#         y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
#         ce = -y_true * tf.math.log(y_pred)
#         weight = alpha * tf.pow(1 - y_pred, gamma)
#         return tf.reduce_mean(weight * ce)
#     return loss

# Categorical cross-entropy on the one-hot targets, tracking accuracy.
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=optimizer,
)

# Print the layer-by-layer overview of the compiled model.
model.summary()


In [None]:
# Train the model, holding out 10% of the TRAINING data for per-epoch
# validation. Do not also pass validation_data=(X_test, y_test): Keras gives
# validation_data precedence over validation_split, so the split would be
# silently ignored and the test set would be used for monitoring instead of
# staying untouched until the final evaluation.
history = model.fit(
    X_train, y_train,
    epochs=15,                      # originally = 50
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


Epoch 1/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 12ms/step - accuracy: 0.6913 - loss: 0.0094 - val_accuracy: 0.7136 - val_loss: 0.0060
Epoch 2/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 12ms/step - accuracy: 0.7104 - loss: 0.0063 - val_accuracy: 0.7132 - val_loss: 0.0057
Epoch 3/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 11ms/step - accuracy: 0.7140 - loss: 0.0060 - val_accuracy: 0.7161 - val_loss: 0.0057
Epoch 4/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 12ms/step - accuracy: 0.7146 - loss: 0.0059 - val_accuracy: 0.7163 - val_loss: 0.0055
Epoch 5/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 12ms/step - accuracy: 0.7164 - loss: 0.0058 - val_accuracy: 0.7165 - val_loss: 0.0055
Epoch 6/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 12ms/step - accuracy: 0.7153 - loss: 0.0058 - val_accuracy: 0.7174 - val

In [None]:
# Call fit again on the same `model` object: training resumes from its current
# weights for 10 more epochs. 10% of the TRAINING data is held out for
# validation. validation_data=(X_test, y_test) is deliberately not passed,
# because Keras would let it override validation_split and expose the test set
# during training.
model.fit(
    X_train, y_train,
    epochs=10,                      # originally = 50
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

Epoch 1/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 12ms/step - accuracy: 0.7071 - loss: 0.5979 - val_accuracy: 0.7165 - val_loss: 0.5262
Epoch 2/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 12ms/step - accuracy: 0.7156 - loss: 0.5337 - val_accuracy: 0.7130 - val_loss: 0.5305
Epoch 3/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 12ms/step - accuracy: 0.7163 - loss: 0.5281 - val_accuracy: 0.7176 - val_loss: 0.5209
Epoch 4/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 12ms/step - accuracy: 0.7164 - loss: 0.5248 - val_accuracy: 0.7173 - val_loss: 0.5172
Epoch 5/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 12ms/step - accuracy: 0.7173 - loss: 0.5221 - val_accuracy: 0.7169 - val_loss: 0.5221
Epoch 6/10
[1m21134/21134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 12ms/step - accuracy: 0.7177 - loss: 0.5209 - val_accuracy: 0.7174 - val

<keras.src.callbacks.history.History at 0x7e949f7044a0>

In [None]:
# Score the model on the test split. evaluate() returns the loss followed by
# the compiled metrics, which is unpacked here as (loss, accuracy).
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print("Test Accuracy:", test_acc)


[1m10567/10567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5ms/step - accuracy: 0.7177 - loss: 0.5133
Test Accuracy: 0.7180991172790527


In [None]:
# Number of samples per class over the whole (train + test) label vector.
# `y_enc` is the integer-encoded label array produced by the LabelEncoder
# `le`. The class ids are translated back through the encoder so that each
# count is paired with its own class name.
labels, counts = np.unique(y_enc, return_counts=True)
for l, c in zip(le.inverse_transform(labels), counts):
    print(l, c)

Benign 477737
HTTP_Flood 257242
ICMP_Flood 32731
SYN_Flood 58744
SYN_Scan 22976
Slowrate_DoS 43671
TCP_Connect_Scan 23298
UDP_Flood 753914
UDP_Scan 20348


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Predicted class probabilities for every test sample.
class_probabilities = model.predict(X_test)

# Collapse probabilities and one-hot targets to integer class ids.
y_pred = np.argmax(class_probabilities, axis=1)
y_true = np.argmax(y_test, axis=1)

# Per-class precision / recall / F1, labelled with the encoder's class names.
report = classification_report(y_true, y_pred, target_names=le.classes_)
print(report)

[1m10567/10567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step
                  precision    recall  f1-score   support

          Benign       0.50      0.36      0.42     95548
      HTTP_Flood       0.93      0.84      0.88     51448
      ICMP_Flood       0.79      0.05      0.09      6546
       SYN_Flood       0.98      0.17      0.29     11749
        SYN_Scan       1.00      0.87      0.93      4595
    Slowrate_DoS       0.89      0.31      0.45      8734
TCP_Connect_Scan       1.00      0.87      0.93      4660
       UDP_Flood       0.72      0.99      0.83    150783
        UDP_Scan       0.99      0.77      0.87      4070

        accuracy                           0.72    338133
       macro avg       0.87      0.58      0.63    338133
    weighted avg       0.72      0.72      0.68    338133

