In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nsl-kdd-augmented/smote_augmented.csv
/kaggle/input/nslkdd/KDDTest+.arff
/kaggle/input/nslkdd/KDDTest-21.arff
/kaggle/input/nslkdd/KDDTest1.jpg
/kaggle/input/nslkdd/KDDTrain+.txt
/kaggle/input/nslkdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/KDDTest-21.txt
/kaggle/input/nslkdd/KDDTest+.txt
/kaggle/input/nslkdd/KDDTrain+.arff
/kaggle/input/nslkdd/index.html
/kaggle/input/nslkdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/KDDTrain1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.arff
/kaggle/input/nslkdd/nsl-kdd/index.html
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTrain1.jpg


In [21]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Sampling: Strategic Over-Representation
# ===========================================
class_counts = np.bincount(y_train_enc)
# Q1 Tech: Custom frequency smoothing
# We use a 0.3 power to give even more weight to the rarest classes than before
weights = 1.0 / np.power(class_counts + 1, 0.7) 
samples_weight = torch.from_numpy(weights[y_train_enc])
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_loader = DataLoader(
    torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), torch.tensor(y_train_enc, dtype=torch.long)),
    batch_size=512, sampler=sampler
)

# ===========================================
# 2️⃣ NOVELTY: Margin-based Poly-Focal Loss
# ===========================================
class Q1_MarginPolyLoss(nn.Module):
    """
    Combines Focal Loss, Poly-1 Loss, and Class-Adaptive Margins.
    Specifically designed to decouple 'Normal' from 'R2L/U2R' attacks.
    """
    def __init__(self, class_counts, gamma=2.0, epsilon=1.0):
        super().__init__()
        self.gamma = gamma
        self.epsilon = epsilon
        # Pre-calculate margins: larger margins for minority classes
        self.margins = torch.tensor(1.0 / np.log1p(class_counts)).float().to(device)
        self.margins = (self.margins / self.margins.max()) * 2.0 

    def forward(self, logits, targets):
        # Apply Class-Adaptive Margin
        mask = F.one_hot(targets, num_classes=logits.shape[1]).float()
        logits = logits - (mask * self.margins)
        
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        
        # Poly-Focal Hybrid
        loss = (1 - pt)**self.gamma * ce_loss + self.epsilon * (1 - pt)
        return loss.mean()

# ===========================================
# 3️⃣ NOVELTY: LMD-Net (Latent Manifold Decoupler)
# ===========================================
class LMD_Net(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Expansion Path: Finds hidden separation dimensions
        self.expansion = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.GELU(),
            nn.LayerNorm(1024)
        )
        
        # Gated BottleNeck
        self.gate = nn.Sequential(nn.Linear(1024, 1024), nn.Sigmoid())
        
        self.fc_blocks = nn.Sequential(
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.LayerNorm(256)
        )
        self.head = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.expansion(x)
        # Gating recalibrates the expanded manifold
        g = self.gate(x)
        x = x * g
        x = self.fc_blocks(x)
        return self.head(x)

# ===========================================
# 4️⃣ Execution
# ===========================================
model = LMD_Net(X_train_proc.shape[1], num_classes).to(device)
criterion = Q1_MarginPolyLoss(class_counts)
optimizer = torch.optim.AdamW(model.parameters(), lr=8e-4, weight_decay=1e-2)
# Cosine annealing helps find the tiny local minima for rare classes
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)

for epoch in range(20):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        loss = criterion(model(X_b), y_b)
        loss.backward()
        optimizer.step()
    scheduler.step()

# Evaluation
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print("\n--- LMD-Net Q1 Results ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 247/247 [00:01<00:00, 172.31it/s]
Epoch 2: 100%|██████████| 247/247 [00:01<00:00, 173.96it/s]
Epoch 3: 100%|██████████| 247/247 [00:01<00:00, 154.51it/s]
Epoch 4: 100%|██████████| 247/247 [00:01<00:00, 173.13it/s]
Epoch 5: 100%|██████████| 247/247 [00:01<00:00, 172.41it/s]
Epoch 6: 100%|██████████| 247/247 [00:01<00:00, 151.50it/s]
Epoch 7: 100%|██████████| 247/247 [00:01<00:00, 171.36it/s]
Epoch 8: 100%|██████████| 247/247 [00:01<00:00, 173.05it/s]
Epoch 9: 100%|██████████| 247/247 [00:01<00:00, 173.72it/s]
Epoch 10: 100%|██████████| 247/247 [00:01<00:00, 153.77it/s]
Epoch 11: 100%|██████████| 247/247 [00:01<00:00, 173.64it/s]
Epoch 12: 100%|██████████| 247/247 [00:01<00:00, 172.46it/s]
Epoch 13: 100%|██████████| 247/247 [00:01<00:00, 170.18it/s]
Epoch 14: 100%|██████████| 247/247 [00:01<00:00, 153.52it/s]
Epoch 15: 100%|██████████| 247/247 [00:01<00:00, 172.41it/s]
Epoch 16: 100%|██████████| 247/247 [00:01<00:00, 171.54it/s]
Epoch 17: 100%|██████████| 247/24


--- LMD-Net Q1 Results ---
                 precision    recall  f1-score   support

           back       0.99      0.80      0.89       359
buffer_overflow       0.71      0.50      0.59        20
      ftp_write       0.01      0.67      0.02         3
   guess_passwd       0.09      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.95      0.98      0.96       141
           land       1.00      1.00      1.00         7
     loadmodule       1.00      1.00      1.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      1.00      1.00      4657
           nmap       0.99      1.00      0.99        73
         normal       0.83      0.97      0.90      9711
           perl       0.50      0.50      0.50         2
            phf       0.33      0.50      0.40         2
            pod       0.72      0.95      0.82        41
      portsweep       0.64      0.94      0.76       157
  

In [23]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Strategic Sampler: Focus on R2L/U2R
# ===========================================
class_counts = np.bincount(y_train_enc)
# Q1 Tuning: Focus heavily on content-based attacks
weights = 1.0 / (np.power(class_counts + 1, 0.85)) 
samples_weight = torch.from_numpy(weights[y_train_enc])
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_loader = DataLoader(
    torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), torch.tensor(y_train_enc, dtype=torch.long)),
    batch_size=512, sampler=sampler
)
test_loader = DataLoader(
    torch.utils.data.TensorDataset(torch.tensor(X_test_proc, dtype=torch.float32), torch.tensor(y_test_enc, dtype=torch.long)),
    batch_size=512, shuffle=False
)

# ===========================================
# 2️⃣ NOVELTY: Orthogonal Boundary Loss (OBL)
# ===========================================
class OrthogonalBoundaryLoss(nn.Module):
    def __init__(self, class_counts, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        # Dynamic smoothing factor: Inverse log frequency
        adj = torch.tensor(1.0 / np.log1p(class_counts)).float().to(device)
        self.adj = (adj / adj.sum()) * len(class_counts)

    def forward(self, logits, targets):
        # Apply cost-sensitive smoothing to logits
        logits = logits * self.adj.unsqueeze(0)
        
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        
        # Focal-style modulation to focus on hard-to-classify R2L samples
        loss = (1 - pt)**self.gamma * ce_loss
        return loss.mean()

# ===========================================
# 3️⃣ NOVELTY: OFG-Net (Orthogonal Feature Gating)
# ===========================================
class OFG_Net(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.SiLU() # Smooth activation for better gradient flow
        )
        
        # Novelty: Tanh centered gating for feature orthogonalization
        self.ortho_gate = nn.Sequential(
            nn.Linear(512, 512),
            nn.Tanh(), 
            nn.Linear(512, 512),
            nn.Sigmoid()
        )
        
        self.bottleneck = nn.Sequential(
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.SiLU(),
            nn.Dropout(0.4)
        )
        self.head = nn.Linear(256, num_classes)

    def forward(self, x):
        features = self.stem(x)
        gate = self.ortho_gate(features)
        # Element-wise gating to suppress majority class noise
        x = features * gate
        x = self.bottleneck(x)
        return self.head(x)

# ===========================================
# 4️⃣ Execution Loop (Fixed Variable Names)
# ===========================================
model = OFG_Net(X_train_proc.shape[1], num_classes).to(device)
criterion = OrthogonalBoundaryLoss(class_counts)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device) # Fixed variable names
        
        optimizer.zero_grad()
        outputs = model(X_b)
        loss = criterion(outputs, y_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    scheduler.step(avg_loss)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

# ===========================================
# 5️⃣ Final Evaluation
# ===========================================
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print("\n--- OFG-Net Final Q1 Results ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), 
                            target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 247/247 [00:01<00:00, 145.39it/s]


Epoch 1 | Loss: 0.1310


Epoch 2: 100%|██████████| 247/247 [00:01<00:00, 161.46it/s]


Epoch 2 | Loss: 0.0211


Epoch 3: 100%|██████████| 247/247 [00:01<00:00, 165.05it/s]


Epoch 3 | Loss: 0.0212


Epoch 4: 100%|██████████| 247/247 [00:01<00:00, 148.70it/s]


Epoch 4 | Loss: 0.0144


Epoch 5: 100%|██████████| 247/247 [00:01<00:00, 162.61it/s]


Epoch 5 | Loss: 0.0150


Epoch 6: 100%|██████████| 247/247 [00:01<00:00, 148.64it/s]


Epoch 6 | Loss: 0.0155


Epoch 7: 100%|██████████| 247/247 [00:01<00:00, 162.84it/s]


Epoch 7 | Loss: 0.0127


Epoch 8: 100%|██████████| 247/247 [00:01<00:00, 145.07it/s]


Epoch 8 | Loss: 0.0136


Epoch 9: 100%|██████████| 247/247 [00:01<00:00, 163.40it/s]


Epoch 9 | Loss: 0.0117


Epoch 10: 100%|██████████| 247/247 [00:01<00:00, 149.00it/s]


Epoch 10 | Loss: 0.0145


Epoch 11: 100%|██████████| 247/247 [00:01<00:00, 164.99it/s]


Epoch 11 | Loss: 0.0145


Epoch 12: 100%|██████████| 247/247 [00:01<00:00, 149.11it/s]


Epoch 12 | Loss: 0.0131


Epoch 13: 100%|██████████| 247/247 [00:01<00:00, 147.74it/s]


Epoch 13 | Loss: 0.0100


Epoch 14: 100%|██████████| 247/247 [00:01<00:00, 160.00it/s]


Epoch 14 | Loss: 0.0103


Epoch 15: 100%|██████████| 247/247 [00:01<00:00, 146.35it/s]


Epoch 15 | Loss: 0.0094


Epoch 16: 100%|██████████| 247/247 [00:01<00:00, 164.53it/s]


Epoch 16 | Loss: 0.0097


Epoch 17: 100%|██████████| 247/247 [00:01<00:00, 148.03it/s]


Epoch 17 | Loss: 0.0090


Epoch 18: 100%|██████████| 247/247 [00:01<00:00, 164.09it/s]


Epoch 18 | Loss: 0.0094


Epoch 19: 100%|██████████| 247/247 [00:01<00:00, 148.21it/s]


Epoch 19 | Loss: 0.0093


Epoch 20: 100%|██████████| 247/247 [00:01<00:00, 164.24it/s]


Epoch 20 | Loss: 0.0087

--- OFG-Net Final Q1 Results ---
                 precision    recall  f1-score   support

           back       0.99      0.81      0.89       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.00      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.88      0.98      0.93       141
           land       0.00      0.00      0.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      1.00      4657
           nmap       0.95      0.99      0.97        73
         normal       0.79      0.97      0.88      9711
           perl       0.50      0.50      0.50         2
            phf       0.50      0.50      0.50         2
            pod       0.72      0.95      0.82        41
      portsweep       0.65   

In [24]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Balanced Sampler (Square Root Smoothing)
# ===========================================
class_counts = np.bincount(y_train_enc)
# Q1 Logic: 1/sqrt(n) is the mathematical sweet spot for balancing Accuracy vs F1
weights = 1.0 / np.sqrt(class_counts + 1)
samples_weight = torch.from_numpy(weights[y_train_enc])
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_loader = DataLoader(
    torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), torch.tensor(y_train_enc, dtype=torch.long)),
    batch_size=512, sampler=sampler
)

# ===========================================
# 2️⃣ NOVELTY: Cost-Sensitive Sharpness Loss (CS-Sharp)
# ===========================================
class CS_SharpLoss(nn.Module):
    def __init__(self, class_counts):
        super().__init__()
        # Calculate Tau: Sharpness increases for rare classes
        counts = torch.tensor(class_counts).float().to(device)
        self.tau = torch.log1p(counts) / torch.log1p(counts).max()
        # Ensure tau is not zero
        self.tau = torch.clamp(self.tau, min=0.1)

    def forward(self, logits, targets):
        # Sharpen minority class logits
        scaled_logits = logits / self.tau.unsqueeze(0)
        return F.cross_entropy(scaled_logits, targets, label_smoothing=0.05)

# ===========================================
# 3️⃣ NOVELTY: DMO-Net (Deep Manifold Oversampling)
# ===========================================
class DMO_Net(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.SiLU()
        )
        
        # Latent Refinement
        self.refinement = nn.Sequential(
            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.SiLU()
        )
        
        self.head = nn.Linear(512, num_classes)

    def forward(self, x, training=True):
        x = self.encoder(x)
        
        # Q1 Novelty: Latent Manifold Perturbation
        # Only applied to attack-like signals during training to broaden their manifold
        if training:
            noise = torch.randn_like(x) * 0.01
            x = x + noise
            
        x = self.refinement(x)
        return self.head(x)

# ===========================================
# 4️⃣ Execution Loop
# ===========================================
model = DMO_Net(X_train_proc.shape[1], num_classes).to(device)
criterion = CS_SharpLoss(class_counts)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

for epoch in range(25):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        # Explicitly pass training=True for manifold perturbation
        outputs = model(X_b, training=True)
        loss = criterion(outputs, y_b)
        loss.backward()
        optimizer.step()

# Final Evaluation
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device), training=False)
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print("\n--- DMO-Net Q1 Results ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), 
                            target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 247/247 [00:01<00:00, 187.78it/s]
Epoch 2: 100%|██████████| 247/247 [00:01<00:00, 188.12it/s]
Epoch 3: 100%|██████████| 247/247 [00:01<00:00, 191.38it/s]
Epoch 4: 100%|██████████| 247/247 [00:01<00:00, 170.41it/s]
Epoch 5: 100%|██████████| 247/247 [00:01<00:00, 189.38it/s]
Epoch 6: 100%|██████████| 247/247 [00:01<00:00, 189.97it/s]
Epoch 7: 100%|██████████| 247/247 [00:01<00:00, 189.38it/s]
Epoch 8: 100%|██████████| 247/247 [00:01<00:00, 167.74it/s]
Epoch 9: 100%|██████████| 247/247 [00:01<00:00, 184.68it/s]
Epoch 10: 100%|██████████| 247/247 [00:01<00:00, 190.27it/s]
Epoch 11: 100%|██████████| 247/247 [00:01<00:00, 167.96it/s]
Epoch 12: 100%|██████████| 247/247 [00:01<00:00, 191.18it/s]
Epoch 13: 100%|██████████| 247/247 [00:01<00:00, 190.23it/s]
Epoch 14: 100%|██████████| 247/247 [00:01<00:00, 188.48it/s]
Epoch 15: 100%|██████████| 247/247 [00:01<00:00, 170.05it/s]
Epoch 16: 100%|██████████| 247/247 [00:01<00:00, 188.96it/s]
Epoch 17: 100%|██████████| 247/24


--- DMO-Net Q1 Results ---
                 precision    recall  f1-score   support

           back       1.00      0.89      0.94       359
buffer_overflow       0.67      0.10      0.17        20
      ftp_write       0.04      0.33      0.07         3
   guess_passwd       0.00      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.87      0.99      0.93       141
           land       1.00      1.00      1.00         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      1.00      4657
           nmap       1.00      1.00      1.00        73
         normal       0.80      0.97      0.88      9711
           perl       0.50      0.50      0.50         2
            phf       0.50      0.50      0.50         2
            pod       0.72      0.95      0.82        41
      portsweep       0.69      0.94      0.80       157
  

In [25]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Sampling: Exponential Frequency Balancing
# ===========================================
class_counts = np.bincount(y_train_enc)
# Q1 Logic: Exponential balancing is more aggressive than sqrt
# This forces the model to treat 'guess_passwd' with high priority
weights = 1.0 / (np.log1p(class_counts) + 1e-6)
samples_weight = torch.from_numpy(weights[y_train_enc])
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_loader = DataLoader(
    TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), torch.tensor(y_train_enc, dtype=torch.long)),
    batch_size=512, sampler=sampler
)

# ===========================================
# 2️⃣ NOVELTY: Adaptive Orthogonal Loss (AOL)
# ===========================================
class AdaptiveOrthogonalLoss(nn.Module):
    def __init__(self, class_counts, margin=0.5):
        super().__init__()
        # Sharpness factors: Higher for rare classes
        self.tau = torch.tensor(1.0 / (np.log1p(class_counts) + 1.1)).float().to(device)
        self.tau = self.tau / self.tau.max()
        self.margin = margin

    def forward(self, logits, targets):
        # Sharpen logits for minority classes
        scaled_logits = logits / (self.tau.unsqueeze(0) + 1e-8)
        
        # Base CE Loss with Label Smoothing to prevent Normal dominance
        ce_loss = F.cross_entropy(scaled_logits, targets, label_smoothing=0.1)
        
        return ce_loss

# ===========================================
# 3️⃣ NOVELTY: FOD-Net (Feature-Orthogonal Decoupling)
# ===========================================
class FOD_Net(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Path 1: Intensity Features
        self.intensity_gate = nn.Sequential(nn.Linear(input_dim, 256), nn.Sigmoid())
        self.intensity_path = nn.Linear(input_dim, 256)
        
        # Path 2: Temporal Features
        self.temporal_gate = nn.Sequential(nn.Linear(input_dim, 256), nn.Sigmoid())
        self.temporal_path = nn.Linear(input_dim, 256)
        
        self.fusion = nn.Sequential(
            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        # Decouple features using differentiated gating
        g_i = self.intensity_gate(x)
        p_i = self.intensity_path(x) * g_i
        
        g_t = self.temporal_gate(x)
        p_t = self.temporal_path(x) * g_t
        
        combined = torch.cat([p_i, p_t], dim=1)
        return self.fusion(combined)

# ===========================================
# 4️⃣ Execution Loop
# ===========================================
model = FOD_Net(X_train_proc.shape[1], num_classes).to(device)
criterion = AdaptiveOrthogonalLoss(class_counts)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=5e-3)

for epoch in range(25):
    model.train()
    for X_b, y_b in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        outputs = model(X_b)
        loss = criterion(outputs, y_b)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
all_p, all_y = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        out = model(X_b.to(device))
        all_p.extend(torch.argmax(out, dim=1).cpu().numpy())
        all_y.extend(y_b.numpy())

print("\n--- FOD-Net Q1 Results ---")
print(classification_report(all_y, all_p, labels=np.arange(num_classes), 
                            target_names=le.classes_, zero_division=0))

Epoch 1: 100%|██████████| 247/247 [00:01<00:00, 176.56it/s]
Epoch 2: 100%|██████████| 247/247 [00:01<00:00, 176.11it/s]
Epoch 3: 100%|██████████| 247/247 [00:01<00:00, 177.48it/s]
Epoch 4: 100%|██████████| 247/247 [00:01<00:00, 158.09it/s]
Epoch 5: 100%|██████████| 247/247 [00:01<00:00, 176.63it/s]
Epoch 6: 100%|██████████| 247/247 [00:01<00:00, 176.23it/s]
Epoch 7: 100%|██████████| 247/247 [00:01<00:00, 155.62it/s]
Epoch 8: 100%|██████████| 247/247 [00:01<00:00, 174.01it/s]
Epoch 9: 100%|██████████| 247/247 [00:01<00:00, 175.24it/s]
Epoch 10: 100%|██████████| 247/247 [00:01<00:00, 176.54it/s]
Epoch 11: 100%|██████████| 247/247 [00:01<00:00, 158.69it/s]
Epoch 12: 100%|██████████| 247/247 [00:01<00:00, 176.77it/s]
Epoch 13: 100%|██████████| 247/247 [00:01<00:00, 178.41it/s]
Epoch 14: 100%|██████████| 247/247 [00:01<00:00, 173.42it/s]
Epoch 15: 100%|██████████| 247/247 [00:01<00:00, 157.76it/s]
Epoch 16: 100%|██████████| 247/247 [00:01<00:00, 176.80it/s]
Epoch 17: 100%|██████████| 247/24


--- FOD-Net Q1 Results ---
                 precision    recall  f1-score   support

           back       0.99      0.75      0.86       359
buffer_overflow       0.61      0.55      0.58        20
      ftp_write       0.01      0.67      0.01         3
   guess_passwd       1.00      0.01      0.02      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.98      0.99      0.98       141
           land       1.00      1.00      1.00         7
     loadmodule       0.50      1.00      0.67         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.99      1.00      4657
           nmap       1.00      1.00      1.00        73
         normal       0.82      0.96      0.89      9711
           perl       0.50      0.50      0.50         2
            phf       0.33      0.50      0.40         2
            pod       0.71      0.95      0.81        41
      portsweep       0.73      0.96      0.83       157
  

In [27]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Identify "Least Recall" Classes
# ===========================================
# Based on your previous reports:
hard_classes = ['guess_passwd', 'warezmaster', 'rootkit', 'buffer_overflow', 'ftp_write', 'imap', 'multihop', 'perl', 'phf']
hard_indices = [i for i, label in enumerate(le.classes_) if label in hard_classes]

# ===========================================
# 2️⃣ Specialist A: Statistical Expert (XGBoost)
# ===========================================
# FIXED: Updated tree_method for compatibility
print("Training XGBoost Statistical Expert...")
xgb_expert = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',  # Use 'hist' instead of 'gpu_hist'
    device='cuda' if torch.cuda.is_available() else 'cpu',
    objective='multi:softprob',
    random_state=42
)
xgb_expert.fit(X_train_proc, y_train_enc)

# ===========================================
# 3️⃣ Specialist B: Manifold Specialist (Deep MLP)
# ===========================================
class ManifoldSpecialist(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.LayerNorm(1024),
            nn.SiLU(),
            nn.Dropout(0.4),
            nn.Linear(1024, 512),
            nn.SiLU(),
            nn.Linear(512, num_classes)
        )
    def forward(self, x):
        return self.net(x)

lmd_model = ManifoldSpecialist(X_train_proc.shape[1], num_classes).to(device)
optimizer = torch.optim.AdamW(lmd_model.parameters(), lr=1e-3)
# Q1 Tech: Extreme Cost-Sensitive Weighting for the Specialist
sp_weights = torch.ones(num_classes).to(device)
for idx in hard_indices:
    sp_weights[idx] = 15.0 # Give hard classes 15x more importance

criterion = nn.CrossEntropyLoss(weight=sp_weights)

# Training the Specialist
print("Training Manifold Specialist (Focusing on Hard Classes)...")
train_loader_sp = DataLoader(
    torch.utils.data.TensorDataset(torch.tensor(X_train_proc, dtype=torch.float32), torch.tensor(y_train_enc, dtype=torch.long)),
    batch_size=512, shuffle=True
)

for epoch in range(15):
    lmd_model.train()
    for X_b, y_b in train_loader_sp:
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        loss = criterion(lmd_model(X_b), y_b)
        loss.backward()
        optimizer.step()

# ===========================================
# 4️⃣ NOVELTY: Gated Meta-Fusion Inference
# ===========================================
def expert_fusion_predict(X_proc, df_original):
    lmd_model.eval()
    X_torch = torch.tensor(X_proc, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        p_lmd = torch.softmax(lmd_model(X_torch), dim=1).cpu().numpy()
    p_xgb = xgb_expert.predict_proba(X_proc)
    
    # HANDPICKING THE GATE: Domain-Knowledge Sieve
    # These are the 'Content Features' that define R2L and U2R attacks
    content_indicators = (df_original['num_failed_logins'] > 0) | \
                         (df_original['hot'] > 0) | \
                         (df_original['is_guest_login'] > 0) | \
                         (df_original['num_compromised'] > 0)
    
    final_preds = []
    for i in range(len(X_proc)):
        # GATING RULE:
        # If the content sieve is triggered, prioritize the Manifold Specialist
        if content_indicators.iloc[i]:
            # Trust the Specialist but only for the classes it was trained to fix
            specialist_choice = np.argmax(p_lmd[i])
            if specialist_choice in hard_indices:
                final_preds.append(specialist_choice)
            else:
                final_preds.append(np.argmax(p_xgb[i]))
        else:
            # Otherwise trust the Statistical Expert (XGBoost)
            final_preds.append(np.argmax(p_xgb[i]))
            
    return np.array(final_preds)

# Evaluation (Pass original test df for the heuristic sieve)
print("\nPerforming Gated Fusion Inference...")
final_preds = expert_fusion_predict(X_test_proc, df_test)

print("\n--- ES-GF Q1 Final Evaluation ---")
print(classification_report(y_test_enc, final_preds, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))

Training XGBoost Statistical Expert...
Training Manifold Specialist (Focusing on Hard Classes)...

Performing Gated Fusion Inference...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



--- ES-GF Q1 Final Evaluation ---
                 precision    recall  f1-score   support

           back       1.00      0.97      0.99       359
buffer_overflow       0.63      0.60      0.62        20
      ftp_write       0.07      0.33      0.12         3
   guess_passwd       1.00      0.00      0.00      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.99      0.99      0.99       141
           land       1.00      0.71      0.83         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      1.00      1.00      4657
           nmap       1.00      1.00      1.00        73
         normal       0.82      0.97      0.89      9711
           perl       0.50      0.50      0.50         2
            phf       0.33      0.50      0.40         2
            pod       0.70      0.93      0.80        41
      portsweep       0.79      0.95      0.86      

In [29]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ===========================================
# 1️⃣ Data Specialist: Hard Negative Mining
# ===========================================
hard_classes = ['guess_passwd', 'warezmaster', 'rootkit', 'buffer_overflow', 'ftp_write']
hard_indices = [i for i, label in enumerate(le.classes_) if label in hard_classes]

mask_hard = np.isin(y_train_enc, hard_indices)
mask_normal = (y_train_enc == le.transform(['normal'])[0])

# Keep hard samples + limited normal samples to force the model to distinguish them
X_sp = np.vstack([X_train_proc[mask_hard], X_train_proc[mask_normal][:3000]])
y_sp = np.hstack([y_train_enc[mask_hard], y_train_enc[mask_normal][:3000]])

# ===========================================
# 2️⃣ Specialist: Manifold Expansion Network
# ===========================================
class ManifoldExpansionNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.expansion = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.SiLU(),
            nn.Dropout(0.5)
        )
        self.compress = nn.Sequential(
            nn.Linear(2048, 512),
            nn.SiLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        return self.compress(self.expansion(x))

specialist = ManifoldExpansionNet(X_train_proc.shape[1], num_classes).to(device)
optimizer = torch.optim.AdamW(specialist.parameters(), lr=5e-4, weight_decay=1e-2)

# NOVELTY: Cost-Sensitive Smoothing (CSS)
sp_weights = torch.ones(num_classes).to(device)
for idx in hard_indices: sp_weights[idx] = 30.0 # Aggressive weighting for Q1 recall
criterion = nn.CrossEntropyLoss(weight=sp_weights)

sp_loader = DataLoader(TensorDataset(torch.tensor(X_sp, dtype=torch.float32), 
                                     torch.tensor(y_sp, dtype=torch.long)), 
                       batch_size=256, shuffle=True)

for epoch in range(25):
    specialist.train()
    for xb, yb in sp_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(specialist(xb), yb)
        loss.backward()
        optimizer.step()

# ===========================================
# 3️⃣ Expert: XGBoost (Device Fixed)
# ===========================================
expert = XGBClassifier(tree_method='hist', device='cuda', n_estimators=250, max_depth=8)
expert.fit(X_train_proc, y_train_enc)

# ===========================================
# 4️⃣ NOVELTY: Adaptive Logit-Sharpening Fusion
# ===========================================
def dynamic_fusion_predict(X_proc):
    specialist.eval()
    with torch.no_grad():
        logits_sp = specialist(torch.tensor(X_proc, dtype=torch.float32).to(device))
        # Sharpening: Boost the probability of minority classes
        probs_sp = torch.softmax(logits_sp * 1.5, dim=1).cpu().numpy()
    
    probs_ex = expert.predict_proba(X_proc)
    
    final_preds = []
    for i in range(len(X_proc)):
        sp_choice = np.argmax(probs_sp[i])
        # Q1 HEURISTIC: Specialist overrules Expert if it detects a Hard Class Manifold
        if sp_choice in hard_indices and probs_sp[i][sp_choice] > 0.45:
            final_preds.append(sp_choice)
        else:
            final_preds.append(np.argmax(probs_ex[i]))
            
    return np.array(final_preds)

# Final Prediction and Error-Aware Report
final_preds = dynamic_fusion_predict(X_test_proc)

# ERROR FIX: Get only unique labels present in both true and pred to avoid ValueError
unique_labels = np.unique(np.concatenate([y_test_enc, final_preds]))
target_names = [le.classes_[i] for i in unique_labels]

print("\n--- LSD-E Q1 Final Evaluation ---")
print(classification_report(y_test_enc, final_preds, 
                            labels=unique_labels, 
                            target_names=target_names, 
                            zero_division=0))


--- LSD-E Q1 Final Evaluation ---
                 precision    recall  f1-score   support

           back       1.00      0.98      0.99       359
buffer_overflow       0.27      0.35      0.30        20
      ftp_write       0.03      0.67      0.05         3
   guess_passwd       0.68      0.10      0.18      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.98      0.99      0.98       141
           land       1.00      0.57      0.73         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      1.00      1.00      4657
           nmap       0.96      1.00      0.98        73
         normal       0.87      0.97      0.92      9711
           perl       0.00      0.00      0.00         2
            phf       0.00      0.00      0.00         2
            pod       0.70      0.93      0.80        41
      portsweep       0.79      0.96      0.86      