<a href="https://colab.research.google.com/github/PIYAL-DATTA/Multi-attack/blob/main/NSL_KDD_ViT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall

In [None]:
# Fetch the "hassan06/nslkdd" dataset via kagglehub; `path` is the local
# directory the files are joined against below.
path = kagglehub.dataset_download("hassan06/nslkdd")

print("Path to dataset files:", path)

# Locations of the train and test files inside the downloaded directory.
train_path = os.path.join(path, "KDDTrain+.txt")
test_path  = os.path.join(path, "KDDTest+.txt")

print("Path to Train dataset files:", train_path)
print("Path to Test dataset files:", test_path)

# 43 column names: the per-connection feature columns followed by the raw
# attack `label` and the `difficulty_level` column.
column_names = [
    'duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent',
    'hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
    'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate',
    'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
    'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
    'dst_host_srv_rerror_rate','label','difficulty_level'
]

# Column names are supplied explicitly via `names=`, i.e. the files are read
# as having no header row.
train_df = pd.read_csv(train_path, names=column_names)
test_df = pd.read_csv(test_path, names=column_names)
# Stack both files into a single frame with a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)
# Bare expression: displayed as the notebook cell's output.
df

Path to dataset files: /kaggle/input/nslkdd
Path to Train dataset files: /kaggle/input/nslkdd/KDDTrain+.txt
Path to Test dataset files: /kaggle/input/nslkdd/KDDTest+.txt


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal,21
148513,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal,21
148514,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07,back,15
148515,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.00,0.00,0.00,normal,21


In [None]:
def map_attack(label):
    """Collapse a raw NSL-KDD attack label into a coarse integer class.

    Args:
        label: Raw label string from the dataset's ``label`` column.

    Returns:
        0 for ``'normal'``, 1 for a DoS attack, and 2 for everything else
        (Probe, R2L, U2R and any label that is not listed below).
    """
    dos = ('back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
           'mailbomb', 'apache2', 'processtable', 'udpstorm')
    probe = ('satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint')
    r2l = ('guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop',
           'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop',
           'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named')
    u2r = ('buffer_overflow', 'loadmodule', 'rootkit', 'perl', 'sqlattack',
           'xterm', 'ps')

    if label == 'normal':
        return 0

    # Probe, R2L and U2R deliberately share code 2 with unrecognised labels.
    families = ((dos, 1), (probe, 2), (r2l, 2), (u2r, 2))
    for members, code in families:
        if label in members:
            return code
    return 2
# Class 2 represents a BotNet attack.
# 'Probe' is the early stage; 'R2L' and 'U2R' are the attack phase.

# Add an integer `attack_class` column by running map_attack on every row's
# raw `label` value.
df['attack_class'] = df['label'].apply(map_attack)
# Bare expression: displayed as the notebook cell's output.
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level,attack_class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20,0
1,0,udp,other,SF,146,0,0,0,0,0,...,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal,21,0
148513,0,tcp,http,SF,317,938,0,0,0,0,...,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal,21,0
148514,0,tcp,http,SF,54540,8314,0,0,0,2,...,0.00,0.00,0.00,0.00,0.00,0.07,0.07,back,15,1
148515,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.01,0.00,0.00,0.00,0.00,0.00,0.00,normal,21,0


# Vision Transformer (ViT)
Applied directly to tabular NSL-KDD data

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

def prepare_tabular_data(df):
    """Turn the labelled NSL-KDD frame into model-ready arrays.

    The categorical columns are label-encoded, the target and bookkeeping
    columns are removed from the feature matrix, the features are min-max
    scaled, and the target is one-hot encoded.

    Args:
        df: DataFrame that contains an ``attack_class`` column. It is not
            modified; all encoding happens on an internal copy.

    Returns:
        A tuple ``(X, y_categorical, label_encoders)`` where ``X`` is the
        scaled feature array, ``y_categorical`` is the one-hot target and
        ``label_encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``.

    Raises:
        KeyError: If ``df`` has no ``attack_class`` column.
    """
    # Work on a copy: encoding in place would overwrite the caller's string
    # columns (protocol_type, service, flag) with integers.
    df = df.copy()

    # Handle categorical features
    label_encoders = {}
    categorical_cols = ['protocol_type', 'service', 'flag', 'attack_class']

    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le

    # Separate features and target; columns that are absent are skipped.
    X = df.drop(['attack_class', 'difficulty_level','label'], axis=1, errors='ignore')
    y = df['attack_class']

    # Normalize features
    # NOTE(review): the scaler is fitted on every row passed in, so a
    # train/test split performed afterwards lets test rows influence the
    # scaling; consider fitting on the training split only.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Convert target to one-hot encoding. `y` was label-encoded above, so its
    # values are the contiguous integers 0..num_classes-1.
    num_classes = len(np.unique(y))
    y_categorical = tf.keras.utils.to_categorical(y, num_classes=num_classes)

    return X, y_categorical, label_encoders

class TabularPatchEncoder(layers.Layer):
    """Embed each scalar feature as a token and add a learned position.

    Every feature value is projected to ``projection_dim`` by one shared
    ``Dense`` layer, then a learned embedding indexed by the feature's
    position is added.

    Args:
        num_features: Number of feature columns (tokens) per sample.
        projection_dim: Width of each token embedding.
        **kwargs: Standard Keras ``Layer`` keyword arguments, forwarded to
            the base class (for example ``name``).
    """

    def __init__(self, num_features, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_features = num_features
        # Kept so get_config() can reproduce the constructor arguments.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_features, output_dim=projection_dim
        )

    def call(self, inputs):
        """Return token embeddings of shape [batch, features, projection_dim]."""
        positions = tf.range(start=0, limit=self.num_features, delta=1)
        # Expand inputs to match position embeddings
        inputs = tf.expand_dims(inputs, axis=-1)  # [batch, features] -> [batch, features, 1]
        projected = self.projection(inputs)  # [batch, features, projection_dim]
        positions = self.position_embedding(positions)  # [features, projection_dim]
        encoded = projected + positions  # broadcast addition
        return encoded

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "num_features": self.num_features,
            "projection_dim": self.projection_dim,
        })
        return config

def create_tabular_vit(num_features, projection_dim, transformer_layers,
                      num_heads, transformer_units, mlp_head_units, num_classes):
    """Assemble a Transformer-encoder classifier over per-feature tokens.

    Args:
        num_features: Number of input feature columns.
        projection_dim: Token embedding width; also used as the attention
            ``key_dim``.
        transformer_layers: Number of encoder blocks to stack.
        num_heads: Attention heads per block.
        transformer_units: Hidden widths of each block's feed-forward MLP.
            The last width should equal ``projection_dim`` so that the
            residual addition lines up.
        mlp_head_units: Hidden widths of the classification-head MLP.
        num_classes: Size of the softmax output.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(num_features,)`` inputs
        to ``num_classes`` class probabilities.
    """
    model_input = layers.Input(shape=(num_features,))

    # One token per feature, with a learned positional embedding added.
    tokens = TabularPatchEncoder(num_features, projection_dim)(model_input)

    for _ in range(transformer_layers):
        # Pre-norm self-attention followed by a residual connection.
        normed_tokens = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(normed_tokens, normed_tokens)
        after_attention = layers.Add()([attended, tokens])

        # Pre-norm feed-forward MLP followed by a residual connection.
        feed_forward = layers.LayerNormalization(epsilon=1e-6)(after_attention)
        feed_forward = mlp(feed_forward, hidden_units=transformer_units, dropout_rate=0.1)
        tokens = layers.Add()([feed_forward, after_attention])

    # Classification head: normalise, average over tokens, regularise.
    pooled = layers.LayerNormalization(epsilon=1e-6)(tokens)
    pooled = layers.GlobalAveragePooling1D()(pooled)
    pooled = layers.Dropout(0.5)(pooled)

    head_features = mlp(pooled, hidden_units=mlp_head_units, dropout_rate=0.5)

    # The output layer applies softmax, so these are probabilities.
    class_probabilities = layers.Dense(num_classes, activation='softmax')(head_features)

    return models.Model(inputs=model_input, outputs=class_probabilities)

def mlp(x, hidden_units, dropout_rate):
    """Apply a stack of GELU ``Dense`` layers, each followed by dropout.

    Args:
        x: Input tensor.
        hidden_units: Iterable of layer widths, applied in order. When it is
            empty, ``x`` is returned unchanged.
        dropout_rate: Dropout rate used after every dense layer.

    Returns:
        The output of the last dropout layer.
    """
    hidden = x
    for width in hidden_units:
        activated = layers.Dense(width, activation=tf.nn.gelu)(hidden)
        hidden = layers.Dropout(dropout_rate)(activated)
    return hidden

# Main execution
def main():
    """Train the tabular ViT on the notebook-level ``df`` and print metrics.

    The data is prepared, split 80/20 with a fixed seed, the model is built
    and trained for 30 epochs, and a classification report plus the weighted
    F1-score are printed for the held-out 20%.
    """
    # `df` is the NSL-KDD frame loaded earlier in the notebook.
    features, targets, label_encoders = prepare_tabular_data(df)

    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.2, random_state=42)

    # ViT hyper-parameters, gathered in one place.
    embed_dim = 64
    vit_settings = dict(
        num_features=features.shape[1],
        projection_dim=embed_dim,
        transformer_layers=6,
        num_heads=4,
        transformer_units=[embed_dim * 2, embed_dim],
        mlp_head_units=[256, 128],
        num_classes=targets.shape[1],
    )
    vit_model = create_tabular_vit(**vit_settings)

    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    tracked_metrics = [
        "accuracy",
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
    vit_model.compile(
        optimizer=adam,
        loss="categorical_crossentropy",
        metrics=tracked_metrics,
    )

    # The held-out split doubles as the per-epoch validation set.
    vit_model.fit(
        train_x, train_y,
        batch_size=64,
        epochs=30,
        validation_data=(test_x, test_y),
        verbose=1,
    )

    # Convert probabilities / one-hot rows back to class indices.
    predicted_probs = vit_model.predict(test_x)
    predicted_labels = np.argmax(predicted_probs, axis=1)
    actual_labels = np.argmax(test_y, axis=1)

    print("\nClassification Report:")
    report = classification_report(
        actual_labels, predicted_labels, digits=4, zero_division=0)
    print(report)

    f1 = f1_score(actual_labels, predicted_labels, average='weighted')
    print(f"\nWeighted F1-Score: {f1:.4f}")

# Run the training pipeline only when this code executes as the main module
# (which includes a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Epoch 1/30
[1m1857/1857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 30ms/step - accuracy: 0.7989 - loss: 0.5289 - precision: 0.8375 - recall: 0.7494 - val_accuracy: 0.9223 - val_loss: 0.2101 - val_precision: 0.9380 - val_recall: 0.9131
Epoch 2/30
[1m1857/1857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 18ms/step - accuracy: 0.9359 - loss: 0.1889 - precision: 0.9449 - recall: 0.9280 - val_accuracy: 0.9417 - val_loss: 0.1707 - val_precision: 0.9448 - val_recall: 0.9392
Epoch 3/30
[1m1857/1857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 17ms/step - accuracy: 0.9487 - loss: 0.1445 - precision: 0.9541 - recall: 0.9440 - val_accuracy: 0.9468 - val_loss: 0.1324 - val_precision: 0.9495 - val_recall: 0.9450
Epoch 4/30
[1m1857/1857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - accuracy: 0.9518 - loss: 0.1356 - precision: 0.9554 - recall: 0.9487 - val_accuracy: 0.9617 - val_loss: 0.1002 - val_precision: 0.9627 - val_recall: 0.9601
Epoch 5/