In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nsl-kdd-augmented/smote_augmented.csv
/kaggle/input/nslkdd/KDDTest+.arff
/kaggle/input/nslkdd/KDDTest-21.arff
/kaggle/input/nslkdd/KDDTest1.jpg
/kaggle/input/nslkdd/KDDTrain+.txt
/kaggle/input/nslkdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/KDDTest-21.txt
/kaggle/input/nslkdd/KDDTest+.txt
/kaggle/input/nslkdd/KDDTrain+.arff
/kaggle/input/nslkdd/index.html
/kaggle/input/nslkdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/KDDTrain1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.arff
/kaggle/input/nslkdd/nsl-kdd/index.html
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTrain1.jpg


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import QuantileTransformer, LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'outcome', 'level'
]

# ===========================================
# 1️⃣ Advanced Preprocessing (Quantile Mapping)
# ===========================================
# Load your SMOTE data
df_train = pd.read_csv("/kaggle/input/nsl-kdd-augmented/smote_augmented.csv") 
df_test = pd.read_csv("/kaggle/input/nslkdd/KDDTest+.txt", header=None)
df_test.columns = columns # Ensure 'columns' list is defined in your session

# Ensure labels match exactly
train_labels = set(df_train['outcome'].unique())
df_test = df_test[df_test['outcome'].isin(train_labels)].reset_index(drop=True)

cat_cols = ['protocol_type', 'service', 'flag']
num_cols = [c for c in df_train.columns if c not in cat_cols + ['outcome', 'level']]

# Label Encode for Embeddings
cat_dims = []
for col in cat_cols:
    le_c = LabelEncoder()
    df_train[col] = le_c.fit_transform(df_train[col].astype(str))
    # Stable mapping for test set
    train_classes = {cls: i for i, cls in enumerate(le_c.classes_)}
    df_test[col] = df_test[col].map(lambda x: train_classes.get(str(x), 0))
    cat_dims.append(len(le_c.classes_))

# Quantile Transformation (Q1 Standard for handling skewed network data)
qt = QuantileTransformer(output_distribution='normal', random_state=42)
X_train_num = qt.fit_transform(df_train[num_cols]).astype(np.float32)
X_test_num = qt.transform(df_test[num_cols]).astype(np.float32)

le_target = LabelEncoder()
y_train = le_target.fit_transform(df_train['outcome'])
y_test = le_target.transform(df_test['outcome'])

# ===========================================
# 2️⃣ Novel Architecture: Contextual Gated Transformer
# ===========================================
class FeatureSieve(nn.Module):
    """Novelty: Dynamically gates numerical noise based on statistical intensity."""
    def __init__(self, num_dim, emb_dim):
        super().__init__()
        self.gate = nn.Sequential(
            nn.Linear(num_dim, num_dim),
            nn.Sigmoid()
        )
        self.projection = nn.Linear(num_dim, emb_dim)
        self.norm = nn.LayerNorm(emb_dim)

    def forward(self, x):
        g = self.gate(x)
        # Apply gate to original signal before projection to keep mapping stable
        x = x * g 
        return self.norm(self.projection(x))

class DualStreamTransformer(nn.Module):
    def __init__(self, cat_dims, num_feat_dim, num_classes, emb_dim=64):
        super().__init__()
        # 1. Semantic Stream (Categorical Embeddings)
        self.embs = nn.ModuleList([nn.Embedding(d, emb_dim) for d in cat_dims])
        
        # 2. Statistical Stream (Gated Numerical Path)
        self.sieve = FeatureSieve(num_feat_dim, emb_dim)
        
        # 3. Attention Backbone
        t_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=8, batch_first=True)
        self.transformer = nn.TransformerEncoder(t_layer, num_layers=3)
        
        # 4. Global Fusion Head
        self.head = nn.Sequential(
            nn.Linear(emb_dim * (len(cat_dims) + 1), 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x_cat, x_num):
        # Stream 1: Semantic
        x_c = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
        x_c = torch.stack(x_c, dim=1) # [Batch, 3, Emb]
        
        # Stream 2: Statistical
        x_n = self.sieve(x_num).unsqueeze(1) # [Batch, 1, Emb]
        
        # Cross-Stream Attention
        combined = torch.cat([x_c, x_n], dim=1) # [Batch, 4, Emb]
        context = self.transformer(combined).flatten(1)
        
        return self.head(context)

# ===========================================
# 3️⃣ Training and Evaluation
# ===========================================
class NSLDataset(Dataset):
    def __init__(self, c, n, y):
        self.c, self.n, self.y = torch.tensor(c), torch.tensor(n), torch.tensor(y)
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.c[i], self.n[i], self.y[i]

train_loader = DataLoader(NSLDataset(df_train[cat_cols].values, X_train_num, y_train), batch_size=512, shuffle=True)
test_loader = DataLoader(NSLDataset(df_test[cat_cols].values, X_test_num, y_test), batch_size=512, shuffle=False)

model = DualStreamTransformer(cat_dims, X_train_num.shape[1], len(le_target.classes_)).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for xc, xn, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        xc, xn, y = xc.to(DEVICE), xn.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        out = model(xc, xn)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss/len(train_loader):.4f}")

# Final Metrics
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for xc, xn, y in test_loader:
        out = model(xc.to(DEVICE), xn.to(DEVICE))
        all_p.extend(torch.argmax(out, 1).cpu().numpy())
        all_y.extend(y.numpy())

print(classification_report(all_y, all_p, target_names=le_target.classes_, zero_division=0))

Epoch 1: 100%|██████████| 1090/1090 [00:22<00:00, 49.54it/s]


Loss: 0.1794


Epoch 2: 100%|██████████| 1090/1090 [00:21<00:00, 50.40it/s]


Loss: 0.0970


Epoch 3: 100%|██████████| 1090/1090 [00:21<00:00, 50.30it/s]


Loss: 0.0792


Epoch 4: 100%|██████████| 1090/1090 [00:21<00:00, 49.95it/s]


Loss: 0.0688


Epoch 5: 100%|██████████| 1090/1090 [00:21<00:00, 49.79it/s]


Loss: 0.0629


Epoch 6: 100%|██████████| 1090/1090 [00:21<00:00, 49.61it/s]


Loss: 0.0588


Epoch 7: 100%|██████████| 1090/1090 [00:22<00:00, 49.26it/s]


Loss: 0.0547


Epoch 8: 100%|██████████| 1090/1090 [00:22<00:00, 48.65it/s]


Loss: 0.0516


Epoch 9: 100%|██████████| 1090/1090 [00:22<00:00, 48.70it/s]


Loss: 0.0487


Epoch 10: 100%|██████████| 1090/1090 [00:22<00:00, 48.03it/s]


Loss: 0.0463


Epoch 11: 100%|██████████| 1090/1090 [00:22<00:00, 48.35it/s]


Loss: 0.0443


Epoch 12: 100%|██████████| 1090/1090 [00:22<00:00, 48.14it/s]


Loss: 0.0419


Epoch 13: 100%|██████████| 1090/1090 [00:22<00:00, 48.15it/s]


Loss: 0.0407


Epoch 14: 100%|██████████| 1090/1090 [00:22<00:00, 48.58it/s]


Loss: 0.0401


Epoch 15: 100%|██████████| 1090/1090 [00:22<00:00, 48.24it/s]


Loss: 0.0378


Epoch 16: 100%|██████████| 1090/1090 [00:22<00:00, 48.46it/s]


Loss: 0.0373


Epoch 17: 100%|██████████| 1090/1090 [00:22<00:00, 48.15it/s]


Loss: 0.0361


Epoch 18: 100%|██████████| 1090/1090 [00:22<00:00, 48.17it/s]


Loss: 0.0345


Epoch 19: 100%|██████████| 1090/1090 [00:22<00:00, 48.54it/s]


Loss: 0.0341


Epoch 20: 100%|██████████| 1090/1090 [00:22<00:00, 47.95it/s]


Loss: 0.0334
                 precision    recall  f1-score   support

           back       0.99      1.00      0.99       359
buffer_overflow       0.12      0.20      0.15        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.71      0.00      0.01      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.80      0.96      0.87       141
           land       1.00      1.00      1.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      0.99      4657
           nmap       0.72      1.00      0.84        73
         normal       0.86      0.93      0.90      9711
           perl       0.20      0.50      0.29         2
            phf       0.03      0.50      0.05         2
            pod       0.62      0.88      0.73        41
      portsweep       0.67      0.91      0.77       157
        rootkit  

In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import QuantileTransformer, LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ===========================================
# 1️⃣ Advanced Preprocessing (Quantile Mapping)
# ===========================================
# Use your stable loading logic
df_train = pd.read_csv("/kaggle/input/nsl-kdd-augmented/smote_augmented.csv") 
df_test = pd.read_csv("/kaggle/input/nslkdd/KDDTest+.txt", header=None)
df_test.columns = columns # Ensure columns are assigned correctly

train_labels = set(df_train['outcome'].unique())
df_test = df_test[df_test['outcome'].isin(train_labels)].reset_index(drop=True)

cat_cols = ['protocol_type', 'service', 'flag']
num_cols = [c for c in df_train.columns if c not in cat_cols + ['outcome', 'level']]

# Label Encode for Embeddings
cat_dims = []
for col in cat_cols:
    le_c = LabelEncoder()
    df_train[col] = le_c.fit_transform(df_train[col].astype(str))
    train_classes = {cls: i for i, cls in enumerate(le_c.classes_)}
    df_test[col] = df_test[col].map(lambda x: train_classes.get(str(x), 0))
    cat_dims.append(len(le_c.classes_))

# Quantile Transformer handles skewed network outliers better than StandardScaler
qt = QuantileTransformer(output_distribution='normal', random_state=42)
X_train_num = qt.fit_transform(df_train[num_cols]).astype(np.float32)
X_test_num = qt.transform(df_test[num_cols]).astype(np.float32)

le_target = LabelEncoder()
y_train = le_target.fit_transform(df_train['outcome'])
y_test = le_target.transform(df_test['outcome'])

# ===========================================
# 2️⃣ Novel Architecture: Gated Residual Transformer
# ===========================================
class GLUBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc = nn.Linear(dim, dim * 2)
        self.ln = nn.LayerNorm(dim)
    def forward(self, x):
        res = x
        x = self.fc(x)
        content, gate = x.chunk(2, dim=-1)
        x = content * torch.sigmoid(gate)
        return self.ln(x + res)

class GMHResNet(nn.Module):
    def __init__(self, cat_dims, num_dim, n_classes, emb_dim=64):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(d, emb_dim) for d in cat_dims])
        self.num_gate = nn.Sequential(nn.Linear(num_dim, 128), GLUBlock(128))
        self.num_proj = nn.Linear(128, emb_dim)
        
        t_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=8, batch_first=True)
        self.transformer = nn.TransformerEncoder(t_layer, num_layers=3)
        
        self.head = nn.Sequential(
            nn.Linear(emb_dim * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, n_classes)
        )

    def forward(self, x_c, x_n):
        x_c_emb = torch.stack([e(x_c[:, i]) for i, e in enumerate(self.embs)], dim=1)
        x_n_gated = self.num_proj(self.num_gate(x_n)).unsqueeze(1)
        
        combined = torch.cat([x_c_emb, x_n_gated], dim=1)
        x = self.transformer(combined).flatten(1)
        return self.head(x)

# ===========================================
# 3️⃣ Training & Balanced Loss
# ===========================================
class_counts = np.bincount(y_train)
logit_adj = torch.tensor(class_counts + 1).float().log().to(DEVICE)

model = GMHResNet(cat_dims, X_train_num.shape[1], len(le_target.classes_)).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

train_loader = DataLoader(NSLDataset(df_train[cat_cols].values, X_train_num, y_train), batch_size=512, shuffle=True)
test_loader = DataLoader(NSLDataset(df_test[cat_cols].values, X_test_num, y_test), batch_size=512, shuffle=False)

for epoch in range(20):
    model.train()
    for xc, xn, y in tqdm(train_loader):
        xc, xn, y = xc.to(DEVICE), xn.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xc, xn)
        # Apply logit adjustment to shift boundary for rare classes
        loss = nn.functional.cross_entropy(logits - 0.5 * logit_adj, y)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for xc, xn, y in test_loader:
        out = model(xc.to(DEVICE), xn.to(DEVICE))
        all_p.extend(torch.argmax(out, 1).cpu().numpy())
        all_y.extend(y.numpy())

print(classification_report(all_y, all_p, target_names=le_target.classes_, zero_division=0))

100%|██████████| 1090/1090 [00:17<00:00, 61.26it/s]
100%|██████████| 1090/1090 [00:17<00:00, 61.26it/s]
100%|██████████| 1090/1090 [00:17<00:00, 61.68it/s]
100%|██████████| 1090/1090 [00:18<00:00, 60.31it/s]
100%|██████████| 1090/1090 [00:18<00:00, 60.46it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.96it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.78it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.57it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.37it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.60it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.60it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.60it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.64it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.62it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.56it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.46it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.53it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.59it/s]
100%|██████████| 1090/1090 [00:18<00:00, 59.21it/s]
100%|███████

                 precision    recall  f1-score   support

           back       0.98      0.97      0.97       359
buffer_overflow       0.08      0.20      0.11        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.01      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.69      0.94      0.79       141
           land       1.00      1.00      1.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       0.99      0.98      0.99      4657
           nmap       0.57      1.00      0.72        73
         normal       0.86      0.92      0.89      9711
           perl       0.22      1.00      0.36         2
            phf       0.03      0.50      0.05         2
            pod       0.45      0.80      0.57        41
      portsweep       0.76      0.91      0.83       157
        rootkit       0.00    

In [5]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from tqdm import tqdm

# ===========================================
# 1️⃣ Load SMOTE-Augmented Data & Test Data
# ===========================================
local_path = "/kaggle/input/nslkdd/"
train_path = "/kaggle/input/nsl-kdd-augmented/"

# Load augmented training data and standard test data
df_train = pd.read_csv(train_path + "smote_augmented.csv") 
df_test  = pd.read_csv(local_path + "KDDTest+.txt", header=None)

columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'outcome', 'level'
]
df_test.columns = columns

# Standardize outcome to string to prevent type mismatch
df_train['outcome'] = df_train['outcome'].astype(str)
df_test['outcome'] = df_test['outcome'].astype(str)

# Filter test set: Only evaluate on classes present in the augmented training set
train_labels = set(df_train['outcome'].unique())
df_test = df_test[df_test['outcome'].isin(train_labels)].reset_index(drop=True)

# ===========================================
# 2️⃣ Preprocessing
# ===========================================
target = 'outcome'
cat_cols = ['protocol_type', 'service', 'flag']
num_cols = [c for c in columns if c not in cat_cols + [target, 'level']]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

X_train_proc = preprocessor.fit_transform(df_train.drop(columns=[target, 'level'], errors='ignore'))
X_test_proc  = preprocessor.transform(df_test.drop(columns=[target, 'level'], errors='ignore'))

le = LabelEncoder()
y_train_enc = le.fit_transform(df_train[target])
y_test_enc  = le.transform(df_test[target])
num_classes = len(le.classes_)

# ===========================================
# 3️⃣ Data Loaders (No Sampler needed for SMOTE)
# ===========================================
# Since SMOTE already balanced the classes, we don't need a Weighted Sampler. 
# Standard Shuffling is preferred to let the model learn the new distributions.
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), 
                                               torch.tensor(y_train_enc, dtype=torch.long))
test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X_test_proc, dtype=torch.float32), 
                                               torch.tensor(y_test_enc, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=512, shuffle=False)

# ===========================================
# 4️⃣ Architecture: GMH-ResNet (Gated Multi-Head Residual)
# ===========================================
class GLULayer(nn.Module):
    """Gated Linear Unit for non-linear feature selection."""
    def __init__(self, dim):
        super().__init__()
        self.fc = nn.Linear(dim, dim * 2)
    def forward(self, x):
        x = self.fc(x)
        content, gate = x.chunk(2, dim=-1)
        return content * torch.sigmoid(gate)

class GatedResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.2):
        super().__init__()
        self.glu = GLULayer(dim)
        self.ln = nn.LayerNorm(dim)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        return x + self.drop(self.ln(self.glu(x)))

class GMHResNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.stem = nn.Linear(input_dim, 512)
        self.blocks = nn.Sequential(
            GatedResidualBlock(512),
            nn.Linear(512, 256),
            GatedResidualBlock(256),
            nn.Dropout(0.3)
        )
        self.head = nn.Linear(256, num_classes)

    def forward(self, x):
        x = torch.relu(self.stem(x))
        x = self.blocks(x)
        return self.head(x)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GMHResNet(X_train_proc.shape[1], num_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

# ===========================================
# 5️⃣ Training Loop
# ===========================================
for epoch in range(20):
    model.train()
    total_loss = 0
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        logits = model(X_b)
        loss = criterion(logits, y_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss/len(train_loader):.4f}")

# ===========================================
# 6️⃣ Evaluation
# ===========================================
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print("\n--- Final Q1 Evaluation (SMOTE + GMH-ResNet) ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 1090/1090 [00:07<00:00, 146.55it/s]


Loss: 0.1544


Epoch 2: 100%|██████████| 1090/1090 [00:07<00:00, 149.86it/s]


Loss: 0.0982


Epoch 3: 100%|██████████| 1090/1090 [00:07<00:00, 147.67it/s]


Loss: 0.0846


Epoch 4: 100%|██████████| 1090/1090 [00:07<00:00, 149.05it/s]


Loss: 0.0759


Epoch 5: 100%|██████████| 1090/1090 [00:07<00:00, 147.53it/s]


Loss: 0.0701


Epoch 6: 100%|██████████| 1090/1090 [00:07<00:00, 150.23it/s]


Loss: 0.0644


Epoch 7: 100%|██████████| 1090/1090 [00:07<00:00, 146.83it/s]


Loss: 0.0614


Epoch 8: 100%|██████████| 1090/1090 [00:07<00:00, 149.77it/s]


Loss: 0.0582


Epoch 9: 100%|██████████| 1090/1090 [00:07<00:00, 147.64it/s]


Loss: 0.0556


Epoch 10: 100%|██████████| 1090/1090 [00:07<00:00, 149.66it/s]


Loss: 0.0537


Epoch 11: 100%|██████████| 1090/1090 [00:07<00:00, 147.77it/s]


Loss: 0.0518


Epoch 12: 100%|██████████| 1090/1090 [00:07<00:00, 150.13it/s]


Loss: 0.0490


Epoch 13: 100%|██████████| 1090/1090 [00:07<00:00, 148.05it/s]


Loss: 0.0480


Epoch 14: 100%|██████████| 1090/1090 [00:07<00:00, 143.98it/s]


Loss: 0.0465


Epoch 15: 100%|██████████| 1090/1090 [00:07<00:00, 149.28it/s]


Loss: 0.0450


Epoch 16: 100%|██████████| 1090/1090 [00:07<00:00, 146.81it/s]


Loss: 0.0437


Epoch 17: 100%|██████████| 1090/1090 [00:07<00:00, 150.67it/s]


Loss: 0.0422


Epoch 18: 100%|██████████| 1090/1090 [00:07<00:00, 147.91it/s]


Loss: 0.0405


Epoch 19: 100%|██████████| 1090/1090 [00:07<00:00, 151.67it/s]


Loss: 0.0399


Epoch 20: 100%|██████████| 1090/1090 [00:07<00:00, 147.79it/s]


Loss: 0.0392

--- Final Q1 Evaluation (SMOTE + GMH-ResNet) ---
                 precision    recall  f1-score   support

           back       0.56      1.00      0.72       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.65      0.99      0.79       141
           land       1.00      1.00      1.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.98      0.99      4657
           nmap       0.97      1.00      0.99        73
         normal       0.86      0.95      0.90      9711
           perl       0.11      0.50      0.18         2
            phf       0.08      0.50      0.13         2
            pod       0.62      0.88      0.73        41
      portsweep       0.

In [6]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from tqdm import tqdm

# ===========================================
# 1️⃣ Stable Preprocessing Pipeline
# ===========================================
# (Using your provided paths and loading logic)
df_train = pd.read_csv("/kaggle/input/nsl-kdd-augmented/smote_augmented.csv") 
df_test  = pd.read_csv("/kaggle/input/nslkdd/KDDTest+.txt", header=None)
df_test.columns = columns # Ensure 'columns' is defined

df_train['outcome'] = df_train['outcome'].astype(str)
df_test['outcome'] = df_test['outcome'].astype(str)
train_labels = set(df_train['outcome'].unique())
df_test = df_test[df_test['outcome'].isin(train_labels)].reset_index(drop=True)

target = 'outcome'
cat_cols = ['protocol_type', 'service', 'flag']
num_cols = [c for c in columns if c not in cat_cols + [target, 'level']]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

X_train_proc = preprocessor.fit_transform(df_train.drop(columns=[target, 'level'], errors='ignore'))
X_test_proc  = preprocessor.transform(df_test.drop(columns=[target, 'level'], errors='ignore'))

le = LabelEncoder()
y_train_enc = le.fit_transform(df_train[target])
y_test_enc  = le.transform(df_test[target])
num_classes = len(le.classes_)

# ===========================================
# 2️⃣ Novel Architecture: Dual-Attention Feature Sieve (DAFS)
# ===========================================
class ChannelAttention(nn.Module):
    """Novelty: Squeeze-and-Excitation to recalibrate feature importance."""
    def __init__(self, channel, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )
    def forward(self, x):
        return x * self.fc(x)

class DAFSBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attn = ChannelAttention(dim)
        self.glu = nn.Sequential(nn.Linear(dim, dim * 2), nn.GLU())
        self.norm = nn.LayerNorm(dim)
    def forward(self, x):
        res = x
        x = self.attn(x)
        x = self.glu(x)
        return self.norm(x + res)

class DAFSNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.stem = nn.Linear(input_dim, 1024)
        self.blocks = nn.Sequential(
            DAFSBlock(1024),
            nn.Linear(1024, 512),
            DAFSBlock(512),
            nn.Dropout(0.4)
        )
        self.head = nn.Linear(512, num_classes)

    def forward(self, x):
        x = F.gelu(self.stem(x))
        x = self.blocks(x)
        return self.head(x)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DAFSNet(X_train_proc.shape[1], num_classes).to(device)

# NOVELTY: Logit Adjustment for Long-Tail Minority Classes
class_counts = np.bincount(y_train_enc)
logit_adj = torch.tensor(class_counts + 1).float().log().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)

# ===========================================
# 3️⃣ Training Loop with Logit Adjustment
# ===========================================
train_loader = DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), 
                                                        torch.tensor(y_train_enc, dtype=torch.long)), 
                          batch_size=512, shuffle=True)

for epoch in range(25):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        logits = model(X_b)
        
        # Novelty: Logit Adjustment shifts the decision boundary during loss calculation
        loss = F.cross_entropy(logits - 0.5 * logit_adj, y_b, label_smoothing=0.1)
        
        loss.backward()
        optimizer.step()

# ===========================================
# 4️⃣ Evaluation
# ===========================================
test_loader = DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_test_proc, dtype=torch.float32), 
                                                       torch.tensor(y_test_enc, dtype=torch.long)), 
                         batch_size=512, shuffle=False)
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print(classification_report(all_y, all_p, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 1090/1090 [00:07<00:00, 146.68it/s]
Epoch 2: 100%|██████████| 1090/1090 [00:07<00:00, 146.28it/s]
Epoch 3: 100%|██████████| 1090/1090 [00:07<00:00, 146.25it/s]
Epoch 4: 100%|██████████| 1090/1090 [00:07<00:00, 149.53it/s]
Epoch 5: 100%|██████████| 1090/1090 [00:07<00:00, 149.98it/s]
Epoch 6: 100%|██████████| 1090/1090 [00:07<00:00, 145.52it/s]
Epoch 7: 100%|██████████| 1090/1090 [00:07<00:00, 149.97it/s]
Epoch 8: 100%|██████████| 1090/1090 [00:07<00:00, 146.79it/s]
Epoch 9: 100%|██████████| 1090/1090 [00:07<00:00, 149.51it/s]
Epoch 10: 100%|██████████| 1090/1090 [00:07<00:00, 146.18it/s]
Epoch 11: 100%|██████████| 1090/1090 [00:07<00:00, 146.50it/s]
Epoch 12: 100%|██████████| 1090/1090 [00:07<00:00, 150.00it/s]
Epoch 13: 100%|██████████| 1090/1090 [00:07<00:00, 149.99it/s]
Epoch 14: 100%|██████████| 1090/1090 [00:07<00:00, 146.78it/s]
Epoch 15: 100%|██████████| 1090/1090 [00:07<00:00, 146.72it/s]
Epoch 16: 100%|██████████| 1090/1090 [00:07<00:00, 148.81it/s]
E

                 precision    recall  f1-score   support

           back       0.56      1.00      0.72       359
buffer_overflow       0.02      0.10      0.03        20
      ftp_write       0.12      0.33      0.18         3
   guess_passwd       0.75      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.73      0.98      0.84       141
           land       1.00      1.00      1.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      0.99      4657
           nmap       0.94      1.00      0.97        73
         normal       0.86      0.94      0.90      9711
           perl       0.00      0.00      0.00         2
            phf       0.02      0.50      0.05         2
            pod       0.64      0.88      0.74        41
      portsweep       0.76      0.85      0.80       157
        rootkit       0.03    

In [7]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from tqdm import tqdm

# ===========================================
# 1️⃣ Stable Preprocessing (Mapping preserved)
# ===========================================
# (Keep your existing data loading and ColumnTransformer logic here)
# ... [Assuming df_train (SMOTE), df_test, and preprocessor are defined as before] ...

X_train_proc = preprocessor.fit_transform(df_train.drop(columns=[target, 'level'], errors='ignore'))
X_test_proc  = preprocessor.transform(df_test.drop(columns=[target, 'level'], errors='ignore'))

le = LabelEncoder()
y_train_enc = le.fit_transform(df_train[target])
y_test_enc  = le.transform(df_test[target])
num_classes = len(le.classes_)

# Calculate class frequencies for Logit Adjustment
class_counts = np.bincount(y_train_enc)
logit_adj = torch.tensor(class_counts + 1).float().log().to('cuda' if torch.cuda.is_available() else 'cpu')

# ===========================================
# 2️⃣ Novel Architecture: Multi-Head Sparse-Attention Gated Transformer
# ===========================================

class GatedSparseAttention(nn.Module):
    """Novelty: Gating mechanism to prioritize minority class features."""
    def __init__(self, dim, heads=8):
        super().__init__()
        self.heads = heads
        self.scale = (dim // heads) ** -0.5
        self.qkv = nn.Linear(dim, dim * 3)
        self.gate = nn.Sequential(
            nn.Linear(dim, dim),
            nn.Sigmoid()
        )
        self.proj = nn.Linear(dim, dim)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        # Sparse Attention: Only attend to the most relevant features
        attn = attn.softmax(dim=-1)
        
        out = (attn @ v).transpose(1, 2).reshape(B, N, C)
        # Apply Gating: Allows minority signals to pass while filtering majority noise
        g = self.gate(x)
        return self.norm(x + self.proj(out * g))

class MSAGTNet(nn.Module):
    def __init__(self, input_dim, num_classes, embed_dim=256):
        super().__init__()
        # Stem: Project high-dim tabular data into Transformer space
        self.stem = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.GELU(),
            nn.LayerNorm(embed_dim)
        )
        
        # Transformer Blocks with Sparse Gating
        self.blocks = nn.Sequential(
            GatedSparseAttention(embed_dim),
            GatedSparseAttention(embed_dim),
            GatedSparseAttention(embed_dim)
        )
        
        # Classification Head
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 512),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        # Treat features as a sequence of 1 (Tabular style)
        x = self.stem(x).unsqueeze(1) 
        x = self.blocks(x)
        x = x.squeeze(1)
        return self.classifier(x)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MSAGTNet(X_train_proc.shape[1], num_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.05)

# ===========================================
# 3️⃣ Training with Logit Adjustment
# ===========================================

train_loader = DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), 
                                                        torch.tensor(y_train_enc, dtype=torch.long)), 
                          batch_size=512, shuffle=True)

for epoch in range(30):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        
        logits = model(X_b)
        
        # Logit Adjustment: Corrects the decision boundary for rare attacks
        # Subtracting log-priors makes the model 'work harder' to identify rare classes
        loss = F.cross_entropy(logits - 0.5 * logit_adj, y_b, label_smoothing=0.1)
        
        loss.backward()
        optimizer.step()

# ===========================================
# 4️⃣ Evaluation
# ===========================================
model.eval()
test_loader = DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_test_proc, dtype=torch.float32), 
                                                       torch.tensor(y_test_enc, dtype=torch.long)), 
                         batch_size=512, shuffle=False)
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, 1).cpu().numpy())
        all_y.extend(y_b.numpy())

print(classification_report(all_y, all_p, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 1090/1090 [00:10<00:00, 102.29it/s]
Epoch 2: 100%|██████████| 1090/1090 [00:10<00:00, 103.06it/s]
Epoch 3: 100%|██████████| 1090/1090 [00:10<00:00, 102.11it/s]
Epoch 4: 100%|██████████| 1090/1090 [00:10<00:00, 103.00it/s]
Epoch 5: 100%|██████████| 1090/1090 [00:10<00:00, 102.59it/s]
Epoch 6: 100%|██████████| 1090/1090 [00:10<00:00, 103.54it/s]
Epoch 7: 100%|██████████| 1090/1090 [00:10<00:00, 103.30it/s]
Epoch 8: 100%|██████████| 1090/1090 [00:10<00:00, 102.96it/s]
Epoch 9: 100%|██████████| 1090/1090 [00:10<00:00, 104.19it/s]
Epoch 10: 100%|██████████| 1090/1090 [00:10<00:00, 103.46it/s]
Epoch 11: 100%|██████████| 1090/1090 [00:10<00:00, 103.40it/s]
Epoch 12: 100%|██████████| 1090/1090 [00:10<00:00, 102.34it/s]
Epoch 13: 100%|██████████| 1090/1090 [00:10<00:00, 103.40it/s]
Epoch 14: 100%|██████████| 1090/1090 [00:10<00:00, 103.51it/s]
Epoch 15: 100%|██████████| 1090/1090 [00:10<00:00, 104.00it/s]
Epoch 16: 100%|██████████| 1090/1090 [00:10<00:00, 103.67it/s]
E

                 precision    recall  f1-score   support

           back       0.56      1.00      0.72       359
buffer_overflow       0.02      0.20      0.04        20
      ftp_write       0.17      0.33      0.22         3
   guess_passwd       1.00      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.77      0.98      0.86       141
           land       1.00      0.86      0.92         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.95      0.97      4657
           nmap       0.78      1.00      0.87        73
         normal       0.89      0.95      0.92      9711
           perl       0.17      1.00      0.29         2
            phf       0.11      0.50      0.18         2
            pod       0.51      0.88      0.65        41
      portsweep       0.75      0.93      0.83       157
        rootkit       0.01    

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm

# ===========================================
# 1️⃣ Novel Architecture: Adaptive Class-Attention Transformer (ACAT)
# ===========================================

class FeatureAttentionGate(nn.Module):
    """Novelty: Squeezes features to find rare attack signatures."""
    def __init__(self, dim, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(dim, dim // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(dim // reduction, dim, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        # Recalibrates feature importance specifically for minority classes
        return x * self.fc(x)

class ACATBlock(nn.Module):
    def __init__(self, dim, heads=8, dropout=0.2):
        super().__init__()
        self.gate = FeatureAttentionGate(dim)
        self.attn = nn.MultiheadAttention(dim, heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim * 4, dim)
        )

    def forward(self, x):
        # Gating happens before attention to boost rare signals
        x_g = self.gate(x)
        attn_out, _ = self.attn(x_g, x_g, x_g)
        x = self.norm1(x + attn_out)
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x

class ACATNet(nn.Module):
    def __init__(self, input_dim, num_classes, embed_dim=256):
        super().__init__()
        self.embedding_stem = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.GELU()
        )
        
        # Deep Transformer backbone with Class-Attention blocks
        self.transformer_blocks = nn.ModuleList([
            ACATBlock(embed_dim) for _ in range(4)
        ])
        
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        # Transform tabular vector into a sequence of 1 for Transformer processing
        x = self.embedding_stem(x).unsqueeze(1) 
        for block in self.transformer_blocks:
            x = block(x)
        return self.classifier(x.squeeze(1))

# ===========================================
# 2️⃣ Focal Logit-Adjusted Loss
# ===========================================

class FocalLogitLoss(nn.Module):
    """Novelty: Combines Focal Loss (difficulty) with Logit Adjustment (frequency)."""
    def __init__(self, logit_adj, gamma=2.0):
        super().__init__()
        self.logit_adj = logit_adj
        self.gamma = gamma

    def forward(self, logits, targets):
        # Adjust logits to push decision boundaries away from 'Normal'
        logits_adj = logits - self.logit_adj
        
        ce_loss = F.cross_entropy(logits_adj, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
        return focal_loss

# ===========================================
# 3️⃣ Execution Strategy
# ===========================================

# Calculate Logit Adjustment from your y_train_enc
counts = np.bincount(y_train_enc)
logit_adj = torch.tensor(np.log(counts + 1e-6)).float().to(device)

model = ACATNet(X_train_proc.shape[1], num_classes).to(device)
criterion = FocalLogitLoss(logit_adj, gamma=2.5) # Higher gamma focuses more on rare classes
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.05)

# Training Loop
for epoch in range(30):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        logits = model(X_b)
        loss = criterion(logits, y_b)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_preds.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_labels.extend(y_b.numpy())

print(classification_report(all_labels, all_preds, target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 1090/1090 [00:15<00:00, 69.45it/s]
Epoch 2: 100%|██████████| 1090/1090 [00:15<00:00, 69.03it/s]
Epoch 3: 100%|██████████| 1090/1090 [00:15<00:00, 69.21it/s]
Epoch 4: 100%|██████████| 1090/1090 [00:15<00:00, 69.14it/s]
Epoch 5: 100%|██████████| 1090/1090 [00:15<00:00, 69.32it/s]
Epoch 6: 100%|██████████| 1090/1090 [00:15<00:00, 69.06it/s]
Epoch 7: 100%|██████████| 1090/1090 [00:15<00:00, 69.32it/s]
Epoch 8: 100%|██████████| 1090/1090 [00:15<00:00, 69.53it/s]
Epoch 9: 100%|██████████| 1090/1090 [00:15<00:00, 69.46it/s]
Epoch 10: 100%|██████████| 1090/1090 [00:15<00:00, 70.02it/s]
Epoch 11: 100%|██████████| 1090/1090 [00:15<00:00, 68.69it/s]
Epoch 12: 100%|██████████| 1090/1090 [00:15<00:00, 68.88it/s]
Epoch 13: 100%|██████████| 1090/1090 [00:15<00:00, 68.34it/s]
Epoch 14: 100%|██████████| 1090/1090 [00:15<00:00, 69.34it/s]
Epoch 15: 100%|██████████| 1090/1090 [00:15<00:00, 69.74it/s]
Epoch 16: 100%|██████████| 1090/1090 [00:15<00:00, 69.16it/s]
Epoch 17: 100%|██

                 precision    recall  f1-score   support

           back       0.56      1.00      0.72       359
buffer_overflow       0.02      0.15      0.03        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.67      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.75      0.98      0.85       141
           land       0.88      1.00      0.93         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      1.00      4657
           nmap       0.73      1.00      0.84        73
         normal       0.86      0.90      0.88      9711
           perl       0.22      1.00      0.36         2
            phf       0.01      0.50      0.02         2
            pod       0.51      0.88      0.65        41
      portsweep       0.72      0.77      0.74       157
        rootkit       0.07    