In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_auc_score, average_precision_score
import time
from deepod.models.dsvdd import DeepSVDD
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer, Normalizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
train_data = pd.read_csv("./NB15/train_data.csv")
test_data = pd.read_csv("./NB15/test_data.csv")

X_train = train_data.drop(columns=['attack_cat','label'])
y_train = train_data['label']
X_test = test_data.drop(columns=['attack_cat','label'])
y_test = test_data['label']

X_train.replace({True: 1, False: 0}, inplace=True)
X_test.replace({True: 1, False: 0}, inplace=True)

columns_to_normalize = ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
                        'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
                        'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
                        'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
                        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
                        'is_sm_ips_ports']

scaler = PowerTransformer()

X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# X_train_normalized = scaler.fit_transform(X_train)
# X_test_normalized = scaler.transform(X_test)

# 创建 iForest 模型并训练
start_time = time.time()
iforest = IsolationForest(contamination=0.2)
iforest.fit(X_train)
iforest_time = time.time() - start_time

# 使用 iForest 检测异常值并计算评估指标
iforest_scores = iforest.decision_function(X_test)
iforest_auc = roc_auc_score(y_test, -iforest_scores)
iforest_auprc = average_precision_score(y_test, -iforest_scores)

# 创建 LOF 模型并训练
start_time = time.time()
lof = LocalOutlierFactor(contamination=0.2)
lof.fit(X_train)
lof_time = time.time() - start_time

# 使用 LOF 检测异常值并计算评估指标
lof_scores = -lof.fit_predict(X_test)
lof_auc = roc_auc_score(y_test, lof_scores)
lof_auprc = average_precision_score(y_test, lof_scores)

print("iForest - AUROC:", iforest_auc)
print("iForest - AUPRC:", iforest_auprc)
print("iForest - Time:", iforest_time)
print("LOF - AUROC:", lof_auc)
print("LOF - AUPRC:", lof_auprc)
print("LOF - Time:", lof_time)



iForest - AUROC: 0.7723691489901032
iForest - AUPRC: 0.3522226460423786
iForest - Time: 4.514748811721802
LOF - AUROC: 0.5613005113863193
LOF - AUPRC: 0.20389021714464825
LOF - Time: 24.905144453048706


In [3]:
class DeepSVDDModel(nn.Module):
    def __init__(self, input_dim, rep_dim):
        super(DeepSVDDModel, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, rep_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(rep_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return z, x_recon

class DeepSVDD:
    def __init__(self, rep_dim=128, lr=1e-3, device='cpu'):
        self.rep_dim = rep_dim
        self.lr = lr
        self.device = device

    def fit(self, X_train):
        train_loader = DataLoader(TensorDataset(torch.Tensor(X_train)), batch_size=64, shuffle=True)
        self.model = DeepSVDDModel(input_dim=X_train.shape[1], rep_dim=self.rep_dim).to(self.device)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        start_time = time.time()
        for epoch in range(100):
            epoch_loss = 0.0
            for batch_x in train_loader:
                batch_x = batch_x[0].to(self.device)
                optimizer.zero_grad()
                _, x_recon = self.model(batch_x)
                loss = criterion(batch_x, x_recon)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            if epoch % 10 == 0:
                print(f"Epoch [{epoch+1}/100], Loss: {epoch_loss/len(train_loader):.4f}")
        # print(f"Training Time: {time.time() - start_time:.2f}s")

    def decision_function(self, X):
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.Tensor(X).to(self.device)
            z, _ = self.model(X_tensor)
            dist = torch.sum((z - torch.mean(z, dim=0)) ** 2, dim=1)
        return dist.cpu().numpy()


X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values

# 训练 DeepSVDD 模型
start_time = time.time()
deep_svdd = DeepSVDD(rep_dim=32, lr=1e-3, device='cpu')
deep_svdd.fit(X_train_np)
dsvdd_time = time.time() - start_time

# 在测试集上计算异常分数
scores = deep_svdd.decision_function(X_test_np)

print("DSVDD - AUROC:", roc_auc_score(y_test_np, y_score=-scores))  # 注意要将分数取负号
print("DSVDD - AUPRC:", average_precision_score(y_test_np, y_score=-scores))
print("DSVDD - Time:", dsvdd_time)

Epoch [1/100], Loss: 173403.7567
Epoch [11/100], Loss: 156.0690
Epoch [21/100], Loss: 147.9480
Epoch [31/100], Loss: 69.1753
Epoch [41/100], Loss: 57.4499
Epoch [51/100], Loss: 89.2137
Epoch [61/100], Loss: 70.4890
Epoch [71/100], Loss: 52.5665
Epoch [81/100], Loss: 91.1125
Epoch [91/100], Loss: 52.3805
DSVDD - AUROC: 0.49870464081200283
DSVDD - AUPRC: 0.17851105887810353
DSVDD - Time: 485.87465953826904
