In [2]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter

In [3]:
!pip install imblearn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
# Load the processed dataset and keep only the rows without missing values.
df = pd.read_csv("../dataset/processed_dataset.csv", low_memory=False).dropna()

# Show the column index as the cell output.
df.columns


Index(['Unnamed: 0', 'arp.opcode', 'arp.hw.size', 'icmp.checksum',
       'icmp.seq_le', 'http.content_length', 'http.response', 'tcp.ack',
       'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq',
       'tcp.srcport', 'udp.stream', 'udp.time_delta', 'dns.qry.name',
       'dns.qry.qu', 'dns.retransmission', 'dns.retransmit_request',
       'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len',
       'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver',
       'Attack_label', 'Attack_type', 'http.request.method_0',
       'http.request.method_0.0', 'http.request.method_GET',
       'http.request.method_OPTIONS', 'http.request.method_POST',
       'http.request.method_PROPFIND', 'http.request.method_PUT',
       'http.request.method_SEARCH', 'http.request.method_TRACE',
       'http.referer_() { _; } >_[$($()

In [17]:
# Standardise the feature columns and extract the multi-class target.
scaler = StandardScaler()

# `df` is reassigned here, so on a second execution of this cell the two columns
# are already gone; errors="ignore" keeps the cell re-runnable instead of
# raising KeyError.
df = df.drop(columns=["Attack_label", "Attack_class"], errors="ignore")

# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/validation/test split performed further down — confirm that using
# validation/test statistics here is acceptable.
# NOTE(review): 'Unnamed: 0' is still among the scaled features — confirm it is
# a real feature and not a leftover row index.
X_scaled = scaler.fit_transform(df.drop(columns=["Attack_type"]))
y = df["Attack_type"]
print("Feature scaling complete!")

Feature scaling complete!


In [None]:
# Project the scaled features with PCA, keeping as many components as needed
# to explain 99% of the variance.
pca = PCA(n_components=0.99)  # Increased from 0.95 to 0.99
X_pca = pca.fit_transform(X_scaled)

# The PCA was fitted on the output of `scaler`, so the saved PCA can only be
# reused on new data together with that same scaler: persist both.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca,'pca.pkl')
print(f"Original features: {X_scaled.shape[1]}, Reduced features: {X_pca.shape[1]}")

Original features: 97, Reduced features: 64


In [19]:
# Two stratified splits with the same settings: hold out 20% of all rows as the
# test set, then hold out 20% of what remains as the validation set.
_split_opts = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, stratify=y, **_split_opts
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **_split_opts
)
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Testing set: {X_test.shape}")


Training set: (1299563, 64), Validation set: (324891, 64), Testing set: (406114, 64)


In [20]:
# Oversample the training split with SMOTE (sampling_strategy='auto', fixed seed).
# NOTE(review): X_train_resampled / y_train_resampled are assigned again by the
# later targeted-SMOTE cell, so when the notebook is run top to bottom this
# result is overwritten before it is used.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled training set: {X_train_resampled.shape}")

Resampled training set: (12862388, 64)


In [21]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

device = torch.device(_device_name)
print(f"Using device: {device}")

Using device: mps


In [26]:
# Encode the class labels as integer ids, oversample a few weak classes with
# SMOTE, and persist the label encoder that produced the ids.

# Convert labels to string for encoding
y_train_str = y_train.astype(str)
y_val_str = y_val.astype(str)
y_test_str = y_test.astype(str)

# Fit on the union of all three splits so that every class receives an id,
# even one that is missing from an individual split.
combined_labels = np.concatenate([y_train_str, y_val_str, y_test_str])
label_encoder = LabelEncoder()
label_encoder.fit(combined_labels)

# Transform labels to integers
y_train_encoded = label_encoder.transform(y_train_str)
y_val_encoded = label_encoder.transform(y_val_str)
y_test_encoded = label_encoder.transform(y_test_str)

# Two-way mapping between label name and encoded id
label_to_id = {name: idx for idx, name in enumerate(label_encoder.classes_)}
id_to_label = {idx: name for name, idx in label_to_id.items()}

# Select underperforming classes to boost
underperforming = ['XSS', 'MITM', 'Password', 'Port_Scanning']

# Target number of training samples for each boosted class
smote_strategy = {
    label_to_id[label]: 50000 for label in underperforming
}

# Apply SMOTE only to the selected classes; all other classes keep their counts
smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)

# Check counts
print("New class distribution after targeted SMOTE:")
print({id_to_label[k]: v for k, v in Counter(y_train_resampled).items()})

# Save the very encoder that produced the ids above. Re-fitting a second encoder
# on the raw `df['Attack_type']` column (without the str cast) could yield a
# different class -> id mapping than the one the model is trained on.
joblib.dump(label_encoder, 'label_encoder.pkl')



New class distribution after targeted SMOTE:
{'Normal': 918742, 'DDoS_ICMP': 74516, 'DDoS_HTTP': 31895, 'Uploading': 23832, 'Password': 50000, 'SQL_injection': 32623, 'DDoS_UDP': 77803, 'Backdoor': 15382, 'DDoS_TCP': 32040, 'Port_Scanning': 50000, 'Vulnerability_scanner': 32018, 'Ransomware': 6203, 'XSS': 50000, 'MITM': 50000}


['label_encoder.pkl']

In [None]:
# try:
#     label_encoder = joblib.load("label_encoder.pkl")
#     y_train_tensor = torch.load("y_train_tensor.pt")
#     y_val_tensor = torch.load("y_val_tensor.pt")
#     y_test_tensor = torch.load("y_test_tensor.pt")
# except:
#     # Create and fit label encoder
#     label_encoder = LabelEncoder()
#     y_train_resampled_str = y_train_resampled.astype(str)
#     y_val_str = y_val.astype(str)
#     y_test_str = y_test.astype(str)
    
#     y_train_encoded = label_encoder.fit_transform(y_train_resampled_str)
#     y_val_encoded = label_encoder.transform(y_val_str)
#     y_test_encoded = label_encoder.transform(y_test_str)
    
#     # Convert to tensors
#     y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
#     y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)
#     y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)
    
#     # Save for future use
#     joblib.dump(label_encoder, "label_encoder.pkl")
#     torch.save(y_train_tensor, "y_train_tensor.pt")
#     torch.save(y_val_tensor, "y_val_tensor.pt")
#     torch.save(y_test_tensor, "y_test_tensor.pt")

In [27]:
# Report the shapes of the resampled training features and labels.
for _caption, _array in (
    ("X_train_resampled shape:", X_train_resampled),
    ("y_train_resampled shape:", y_train_resampled),
):
    print(_caption, _array.shape)


X_train_resampled shape: (1445054, 64)
y_train_resampled shape: (1445054,)


In [28]:
# Build the label tensors. The labels are integer class ids and are passed to
# nn.CrossEntropyLoss as class indices, which requires an integer (long) dtype;
# a 1-D float target is rejected by the loss.
y_train_tensor = torch.tensor(y_train_resampled, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [29]:
# Write the three label tensors to disk, one file per split.
for _path, _tensor in (
    ("y_train_tensor.pt", y_train_tensor),
    ("y_val_tensor.pt", y_val_tensor),
    ("y_test_tensor.pt", y_test_tensor),
):
    torch.save(_tensor, _path)

In [30]:
# Convert the feature arrays of each split into float32 tensors.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(_features, dtype=torch.float32)
    for _features in (X_train_resampled, X_val, X_test)
)

In [31]:
# Write the three feature tensors to disk, one file per split.
for _path, _tensor in (
    ("X_train_tensor.pt", X_train_tensor),
    ("X_val_tensor.pt", X_val_tensor),
    ("X_test_tensor.pt", X_test_tensor),
):
    torch.save(_tensor, _path)

In [32]:
# Report the shapes of the training tensors (features, then labels).
for _caption, _tensor in (
    ("X_train_resampled shape:", X_train_tensor),
    ("y_train_resampled shape:", y_train_tensor),
):
    print(_caption, _tensor.shape)


X_train_resampled shape: torch.Size([1445054, 64])
y_train_resampled shape: torch.Size([1445054])


In [33]:
# 'balanced' class weights computed from the resampled training labels,
# converted to a float32 tensor and moved to the training device.
unique_classes = np.unique(y_train_resampled)
class_weights = compute_class_weight(
    'balanced',
    classes=unique_classes,
    y=y_train_resampled,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [34]:
batch_size = 512  # Increased from 128 to 512

# Pair each split's feature tensor with its label tensor.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(_features, _targets)
    for _features, _targets in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)


In [35]:
# All loaders share batch size and worker count; only the training loader shuffles.
_loader_opts = {"batch_size": batch_size, "num_workers": 0}

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_dataset, **_loader_opts)
test_loader = DataLoader(test_dataset, **_loader_opts)

In [36]:
# Number of mini-batches whose gradients are accumulated before each optimizer
# step in the training loop.
accumulation_steps = 4


In [37]:
class ImprovedDNN(nn.Module):
    """Fully connected classifier with four hidden blocks and a linear output layer.

    Each hidden block applies Linear -> BatchNorm1d -> LeakyReLU(0.1) -> Dropout,
    with hidden widths 512, 256, 128 and 64. The output layer returns raw
    logits; no softmax is applied.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability shared by all hidden blocks.
    """

    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super().__init__()

        # Hidden blocks: each linear layer is followed by its own batch norm.
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)

        # Output layer producing one logit per class.
        self.fc5 = nn.Linear(64, num_classes)

        # Shared regularisation and activation modules.
        self.dropout = nn.Dropout(dropout_rate)
        self.act = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the class logits for a batch `x` of input feature vectors."""
        hidden_blocks = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        for linear, norm in hidden_blocks:
            x = self.dropout(self.act(norm(linear(x))))
        return self.fc5(x)

In [39]:
# Network dimensions are taken from the resampled training data.
input_dim = X_train_resampled.shape[1]
num_classes = np.unique(y_train_resampled).size
print(input_dim,num_classes)

# Model, class-weighted loss, optimizer and LR scheduler.
model = ImprovedDNN(input_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)
# The scheduler halves the learning rate when the monitored value (mode='min')
# has not improved for 3 consecutive scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)


64 14


In [43]:
# Train the classifier with gradient accumulation, validate after every epoch,
# checkpoint the model with the best validation accuracy, and stop early once
# validation accuracy has failed to improve for `patience` epochs.
num_epochs = 1
best_val_acc = 0
patience = 1
patience_counter = 0

for epoch in range(num_epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    optimizer.zero_grad()  # Reset gradients at the beginning of epoch

    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as tepoch:
        for i, (X_batch, y_batch) in enumerate(tepoch):
            X_batch = X_batch.to(device)
            # CrossEntropyLoss takes integer class indices; cast so the loop
            # also works when the label tensors were stored with a float dtype.
            y_batch = y_batch.long().to(device)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Scale the loss so the gradients summed over `accumulation_steps`
            # batches correspond to their mean.
            (loss / accumulation_steps).backward()

            # Only step and zero_grad after accumulation_steps
            if (i + 1) % accumulation_steps == 0:
                # Clip the accumulated gradient before applying it
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                optimizer.zero_grad()

            batch_loss = loss.item()
            running_loss += batch_loss
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

            tepoch.set_postfix(loss=batch_loss, accuracy=correct/total)

    # Step optimizer for the gradients left over from an incomplete group
    if len(train_loader) % accumulation_steps != 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    train_acc = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {running_loss/len(train_loader):.4f} - Train Accuracy: {train_acc:.4f}")

    # Validation phase
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0

    with torch.no_grad():
        # Dedicated loop names: reusing `X_val` / `y_val` here would overwrite
        # the validation split created by train_test_split with the last batch.
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.long().to(device)
            outputs = model(X_val_batch)
            loss = criterion(outputs, y_val_batch)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            val_total += y_val_batch.size(0)
            val_correct += (predicted == y_val_batch).sum().item()

    val_acc = val_correct / val_total
    val_loss = val_loss / len(val_loader)

    print(f"Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_acc:.4f}")

    # Learning rate scheduler monitors the mean validation loss
    scheduler.step(val_loss)

    # Checkpointing / early stopping on validation accuracy
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model, 'best_model.pth')
        print(f"Full model saved with validation accuracy: {best_val_acc:.4f}")
        patience_counter = 0

    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

# Load the best model for evaluation.
# NOTE(review): weights_only=False unpickles a full module object, which can
# execute arbitrary code; only load checkpoints written by this notebook.
model = torch.load('best_model.pth',weights_only = False)

Epoch 1/1: 100%|██████████| 2823/2823 [00:18<00:00, 156.24batch/s, accuracy=0.991, loss=0.0494]


Epoch [1/1] - Loss: 0.0661 - Train Accuracy: 0.9915
Validation Loss: 0.0496 - Validation Accuracy: 0.9944
Full model saved with validation accuracy: 0.9944


In [44]:
# Put the reloaded model into evaluation mode. As the last expression of the
# cell, the module returned by eval() is displayed as the cell output.
model.eval()

ImprovedDNN(
  (fc1): Linear(in_features=64, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc5): Linear(in_features=64, out_features=14, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (act): LeakyReLU(negative_slope=0.1)
)

In [22]:
def evaluate(model, test_loader, device, label_encoder):
    """Evaluate `model` on `test_loader` and report per-class metrics.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding (features, targets) batches, where the
            targets are encoded class ids.
        device: Device the batches are moved to before the forward pass.
        label_encoder: Fitted encoder whose `classes_` maps ids to class names.

    Returns:
        A tuple `(cm, report)`: the confusion matrix, with rows and columns in
        the order of `label_encoder.classes_`, and the classification report
        as a dictionary.
    """
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        # Local batch names, distinct from the notebook-level X_test / y_test.
        for features, targets in tqdm(test_loader, desc="Evaluating"):
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    class_names = label_encoder.classes_

    # int() makes the lookup work even if the ids arrive with a float dtype
    pred_classes = [class_names[int(i)] for i in all_preds]
    target_classes = [class_names[int(i)] for i in all_targets]

    print("\n📊 Classification Report:")
    print(classification_report(target_classes, pred_classes))

    # Pin the label order so that row/column i always corresponds to
    # class_names[i], even if a class is absent from both targets and predictions.
    cm = confusion_matrix(target_classes, pred_classes, labels=class_names)
    return cm, classification_report(target_classes, pred_classes, output_dict=True)


In [23]:
# Run the held-out test evaluation.
print("Evaluating model on test set...")
cm, report = evaluate(
    model=model,
    test_loader=test_loader,
    device=device,
    label_encoder=label_encoder,
)
print("Test evaluation complete!")

# Persist the model weights and the report dictionary.
torch.save(obj=model.state_dict(), f='final_model.pt')
joblib.dump(value=report, filename='classification_report.pkl')
print("Model and evaluation results saved to disk.")

Evaluating model on test set...


Evaluating: 100%|██████████| 794/794 [00:01<00:00, 456.27it/s]



📊 Classification Report:
                       precision    recall  f1-score   support

             Backdoor       1.00      0.98      0.99      4807
            DDoS_HTTP       0.96      0.89      0.93      9967
            DDoS_ICMP       1.00      1.00      1.00     23286
             DDoS_TCP       0.98      0.92      0.95     10012
             DDoS_UDP       1.00      1.00      1.00     24313
                 MITM       1.00      0.86      0.92         7
               Normal       1.00      1.00      1.00    287107
             Password       1.00      1.00      1.00     10016
        Port_Scanning       0.82      1.00      0.90      3996
           Ransomware       1.00      0.97      0.98      1939
        SQL_injection       1.00      1.00      1.00     10195
            Uploading       1.00      1.00      1.00      7447
Vulnerability_scanner       1.00      1.00      1.00     10006
                  XSS       0.72      0.88      0.79      3016

             accuracy      