In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nsl-kdd-augmented/smote_augmented.csv
/kaggle/input/nslkdd/KDDTest+.arff
/kaggle/input/nslkdd/KDDTest-21.arff
/kaggle/input/nslkdd/KDDTest1.jpg
/kaggle/input/nslkdd/KDDTrain+.txt
/kaggle/input/nslkdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/KDDTest-21.txt
/kaggle/input/nslkdd/KDDTest+.txt
/kaggle/input/nslkdd/KDDTrain+.arff
/kaggle/input/nslkdd/index.html
/kaggle/input/nslkdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/KDDTrain1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.arff
/kaggle/input/nslkdd/nsl-kdd/index.html
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTrain1.jpg


In [16]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

# ===========================================
# 1️⃣ Load & Stable Mapping (From your working code)
# ===========================================
local_path = "/kaggle/input/nslkdd/" 
columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
           'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
           'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
           'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
           'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
           'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
           'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
           'outcome', 'level']

df_train = pd.read_csv(local_path + "KDDTrain+.txt", header=None, names=columns)
df_test  = pd.read_csv(local_path + "KDDTest+.txt", header=None, names=columns)

df_train['outcome'] = df_train['outcome'].astype(str)
df_test['outcome'] = df_test['outcome'].astype(str)
train_labels = set(df_train['outcome'].unique())
df_test = df_test[df_test['outcome'].isin(train_labels)].reset_index(drop=True)

# ===========================================
# 2️⃣ Hybrid Preprocessing for Transformer
# ===========================================
cat_cols = ['protocol_type', 'service', 'flag']
num_cols = [c for c in columns if c not in cat_cols + ['outcome', 'level']]

# Label Encode Categorical for Embeddings
cat_dims = []
for col in cat_cols:
    le_cat = LabelEncoder()
    df_train[col] = le_cat.fit_transform(df_train[col])
    df_test[col] = df_test[col].map(lambda s: s if s in le_cat.classes_ else le_cat.classes_[0])
    df_test[col] = le_cat.transform(df_test[col])
    cat_dims.append(len(le_cat.classes_))

# Scale Numerical
scaler = StandardScaler()
X_train_num = scaler.fit_transform(df_train[num_cols]).astype(np.float32)
X_test_num  = scaler.transform(df_test[num_cols]).astype(np.float32)

le = LabelEncoder()
y_train_enc = le.fit_transform(df_train['outcome'])
y_test_enc  = le.transform(df_test['outcome'])
num_classes = len(le.classes_)

# ===========================================
# 3️⃣ Novel Architecture: Gated-Transformer Fusion
# ===========================================
class GatedLinearUnit(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, input_dim * 2)
    def forward(self, x):
        x = self.fc(x)
        x, gate = x.chunk(2, dim=-1)
        return x * torch.sigmoid(gate)

class GTFModel(nn.Module):
    def __init__(self, cat_dims, num_feat_dim, num_classes, emb_dim=32):
        super().__init__()
        # Categorical Path (Transformer)
        self.embs = nn.ModuleList([nn.Embedding(d, emb_dim) for d in cat_dims])
        layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=8, batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers=2)
        
        # Numerical Path (Gated)
        self.num_gate = nn.Sequential(
            nn.Linear(num_feat_dim, 128),
            GatedLinearUnit(128),
            nn.LayerNorm(128)
        )
        
        # Fusion
        self.classifier = nn.Sequential(
            nn.Linear(len(cat_dims)*emb_dim + 128, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x_cat, x_num):
        x_c = torch.stack([emb(x_cat[:, i]) for i, emb in enumerate(self.embs)], dim=1)
        x_c = self.transformer(x_c).flatten(1)
        x_n = self.num_gate(x_num)
        return self.classifier(torch.cat([x_c, x_n], dim=1))

# ===========================================
# 4️⃣ Balanced Data Loading
# ===========================================
class HybridDS(Dataset):
    def __init__(self, c, n, y):
        self.c, self.n, self.y = torch.tensor(c), torch.tensor(n), torch.tensor(y)
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.c[i], self.n[i], self.y[i]

class_counts = np.bincount(y_train_enc)
class_weights = 1.0 / np.sqrt(class_counts + 1)
sampler = WeightedRandomSampler(class_weights[y_train_enc], len(y_train_enc))

train_loader = DataLoader(HybridDS(df_train[cat_cols].values, X_train_num, y_train_enc), batch_size=256, sampler=sampler)
test_loader  = DataLoader(HybridDS(df_test[cat_cols].values, X_test_num, y_test_enc), batch_size=256, shuffle=False)

# ===========================================
# 5️⃣ Training Loop
# ===========================================
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GTFModel(cat_dims, X_train_num.shape[1], num_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Use standard weighted CE (proven stable in your code)
loss_weights = torch.tensor(class_weights / class_weights.sum() * num_classes, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=loss_weights)

for epoch in range(15):
    model.train()
    for xc, xn, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        xc, xn, y = xc.to(device), xn.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xc, xn), y)
        loss.backward()
        optimizer.step()

# ===========================================
# 6️⃣ Final Evaluation
# ===========================================
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for xc, xn, y in test_loader:
        out = model(xc.to(device), xn.to(device))
        all_p.extend(torch.argmax(out, 1).cpu().numpy())
        all_y.extend(y.numpy())

print("\n--- Final Q1 Results ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 493/493 [00:04<00:00, 113.66it/s]
Epoch 2: 100%|██████████| 493/493 [00:04<00:00, 121.91it/s]
Epoch 3: 100%|██████████| 493/493 [00:04<00:00, 119.88it/s]
Epoch 4: 100%|██████████| 493/493 [00:04<00:00, 115.13it/s]
Epoch 5: 100%|██████████| 493/493 [00:04<00:00, 119.92it/s]
Epoch 6: 100%|██████████| 493/493 [00:04<00:00, 116.88it/s]
Epoch 7: 100%|██████████| 493/493 [00:04<00:00, 120.01it/s]
Epoch 8: 100%|██████████| 493/493 [00:04<00:00, 119.99it/s]
Epoch 9: 100%|██████████| 493/493 [00:04<00:00, 115.92it/s]
Epoch 10: 100%|██████████| 493/493 [00:04<00:00, 118.75it/s]
Epoch 11: 100%|██████████| 493/493 [00:04<00:00, 122.19it/s]
Epoch 12: 100%|██████████| 493/493 [00:04<00:00, 116.25it/s]
Epoch 13: 100%|██████████| 493/493 [00:04<00:00, 120.65it/s]
Epoch 14: 100%|██████████| 493/493 [00:04<00:00, 122.44it/s]
Epoch 15: 100%|██████████| 493/493 [00:04<00:00, 114.97it/s]



--- Final Q1 Results ---
                 precision    recall  f1-score   support

           back       0.99      0.97      0.98       359
buffer_overflow       0.71      0.50      0.59        20
      ftp_write       0.01      0.33      0.01         3
   guess_passwd       1.00      0.18      0.31      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.86      0.98      0.91       141
           land       1.00      1.00      1.00         7
     loadmodule       0.29      1.00      0.44         2
       multihop       0.01      0.11      0.02        18
        neptune       1.00      0.99      1.00      4657
           nmap       0.99      0.99      0.99        73
         normal       0.86      0.92      0.89      9711
           perl       0.50      0.50      0.50         2
            phf       0.50      0.50      0.50         2
            pod       0.72      0.95      0.82        41
      portsweep       0.66      0.89      0.76       157
    