## 数据处理
读取数据第一步，在数据处理文件中，已经将所有信号都补全到150位，长度不足处用-1进行了填充。

In [30]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader

# 加载数据
def load_data(folder, label):
    all_data = []
    all_labels = []
    for filename in os.listdir(folder):
        if filename.endswith(".csv"):
            filepath = os.path.join(folder, filename)
            df = pd.read_csv(filepath, usecols=["(V)", "state"])  # 仅读取 V 和 state 列
            all_data.append(df.values)  # 直接读取为numpy数组
            all_labels.append(np.full(len(df), label))  # 为每个样本生成标签
    data = np.stack(all_data, axis=0)  # 形状：(样本数, 150, 2)
    labels = np.concatenate(all_labels, axis=0)  # 形状：(样本数 * 150,)
    return data, labels

# 加载正常数据（训练数据）
normal_data, normal_labels = load_data("padsignal", label=0)  # 正常数据标签为0

# 加载异常数据（验证数据）
anomaly_data, anomaly_labels = load_data("abpadsignal", label=1)  # 异常数据标签为1
# 假设所有CSV文件已保存在 processed_crcslice 文件夹中
# input_folder = "padsignal"

# # 读取所有CSV文件并合并为数据集
# all_data = []
# for filename in os.listdir(input_folder):
#     if filename.endswith(".csv"):
#         filepath = os.path.join(input_folder, filename)
#         df = pd.read_csv(filepath, usecols=["(V)", "state"])
#         all_data.append(df.values)  # 直接读取为numpy数组

# data = np.stack(all_data, axis=0)  # 形状：(样本数, 150, 2)

# 编码 state 列
label_encoder = LabelEncoder()
state_column = normal_data[:, :, 1].flatten()  # 提取所有state值
label_encoder.fit(state_column)
normal_data[:, :, 1] = label_encoder.transform(state_column).reshape(normal_data.shape[0], -1)

# # 标准化时间（s）和 V 列（排除填充值-1）
# mask = (data[:, :, 0] != -1) & (data[:, :, 1] != -1)  # 掩码非填充区域
# scaler = StandardScaler()
# data[:, :, 0] = scaler.fit_transform(data[:, :, 0][mask].reshape(-1, 1)).reshape(data[:, :, 0].shape)
# data[:, :, 1] = scaler.fit_transform(data[:, :, 1][mask].reshape(-1, 1)).reshape(data[:, :, 1].shape)

# 将 data 转换为 float32 类型（确保所有值为数值类型）
normal_data = normal_data.astype(np.float32)  # 形状：(样本数, 150, 2)

# 转换为PyTorch张量并生成掩码
data_tensor = torch.FloatTensor(normal_data)  # 形状：(样本数, 150, 2)
mask_tensor = torch.BoolTensor((normal_data[:, :, 0] != -1) & (normal_data[:, :, 1] != label_encoder.transform(["padding"])[0]))  # 形状：(样本数, 150)

定义数据集和数据加载器

In [31]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, mask):
        self.data = data  # 形状：(样本数, 150, 2)
        self.mask = mask  # 形状：(样本数, 150)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.mask[idx]

# 划分训练集和验证集（80-20比例）
train_size = int(0.7 * len(data_tensor))
val_size = int(0.15 * len(data_tensor))
test_size = len(data_tensor) - train_size - val_size
train_data, val_data, test_data = data_tensor[:train_size], data_tensor[:train_size+val_size], data_tensor[train_size+val_size:]
train_mask, val_mask, test_mask = mask_tensor[:train_size], mask_tensor[:train_size+val_size], mask_tensor[train_size+val_size:]

# 创建DataLoader
batch_size = 32
train_dataset = TimeSeriesDataset(train_data, train_mask)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TimeSeriesDataset(val_data, val_mask)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

定义LSTM编码器-解码器模型

In [32]:
class EncDecAD(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.encoder = torch.nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = torch.nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, input_dim)
        
    def forward(self, x):
        # 编码器：输入正向序列，输出最后隐藏状态
        _, (h_n, _) = self.encoder(x)
        # 解码器：初始状态为编码器最终状态，输入反向序列
        reversed_x = torch.flip(x, dims=[1])
        output, _ = self.decoder(reversed_x, (h_n, torch.zeros_like(h_n)))
        recon = self.linear(output)
        return torch.flip(recon, dims=[1])  # 输出与输入同顺序

定义掩码损失函数

In [33]:
def masked_mse_loss(pred, target, mask):
    # pred: (batch_size, seq_len, input_dim)
    # target: (batch_size, seq_len, input_dim)
    # mask: (batch_size, seq_len)
    loss = (pred - target) ** 2
    loss = loss.mean(dim=2)  # 对特征维度取平均
    loss = loss * mask  # 应用掩码
    return loss.sum() / mask.sum()  # 仅计算有效区域的损失

# 初始化模型和优化器
model = EncDecAD(input_dim=2, hidden_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [34]:
num_epochs = 2
best_val_loss = float("inf")

for epoch in range(num_epochs):
    # 训练阶段
    model.train()
    train_loss = 0.0
    for batch_x, batch_mask in train_loader:
        optimizer.zero_grad()
        recon = model(batch_x)
        loss = masked_mse_loss(recon, batch_x, batch_mask)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # 验证阶段
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_mask in val_loader:
            recon = model(batch_x)
            loss = masked_mse_loss(recon, batch_x, batch_mask)
            val_loss += loss.item()
    
    # 打印损失
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    
    # 早停和保存最佳模型
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved best model.")

Epoch 1/2 | Train Loss: 0.0929 | Val Loss: 0.0010
Saved best model.
Epoch 2/2 | Train Loss: 0.0002 | Val Loss: 0.0000
Saved best model.


验证模型

In [35]:
# 预处理异常数据
anomaly_data[:, :, 1] = label_encoder.transform(anomaly_data[:, :, 1].flatten()).reshape(anomaly_data.shape[0], -1)
anomaly_data = anomaly_data.astype(np.float32)  # 形状：(样本数, 150, 2)

# 转换为PyTorch张量并生成掩码
anomaly_data_tensor = torch.FloatTensor(anomaly_data)  # 形状：(样本数, 150, 2)
anomaly_mask_tensor = torch.BoolTensor((anomaly_data[:, :, 0] != -1) & (anomaly_data[:, :, 1] != label_encoder.transform(["padding"])[0]))  # 形状：(样本数, 150)

In [36]:
# 创建包含异常数据和正常数据的混合数据集
mixed_data_tensor = torch.cat((anomaly_data_tensor, val_data), dim=0)
mixed_mask_tensor = torch.cat((anomaly_mask_tensor, val_mask), dim=0)
mixed_labels = [1] * len(anomaly_data_tensor) + [0] * len(val_data)  # 异常数据标签为1，正常数据标签为0

mixed_data_tensor = torch.cat((test_data, anomaly_data_tensor), dim=0)
mixed_mask_tensor = torch.cat((test_mask, anomaly_mask_tensor), dim=0)
mixed_labels = [0] * len(test_data) + [1] * len(anomaly_data_tensor)  # 正常数据标签为0，异常数据标签为1



In [41]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# 创建验证集 DataLoader
test1_dataset = TimeSeriesDataset(mixed_data_tensor, mixed_mask_tensor)
test_loader = DataLoader(test1_dataset, batch_size=batch_size, shuffle=False)

# 验证模型性能
def evaluate_model(model, test_loader, threshold, true_labels):
    model.eval()
    y_true = true_labels
    y_pred = []
    with torch.no_grad():
        for batch_x, batch_mask in test_loader:
            recon = model(batch_x)
            error = (recon - batch_x) ** 2
            error = error.mean(dim=2)  # 对特征维度取平均
            error = error * batch_mask  # 应用掩码
            anomaly_score = error.sum(dim=1) / batch_mask.sum(dim=1)  # 每个样本的平均重建误差
            # print(anomaly_score)
            y_pred.extend((anomaly_score > threshold).cpu().numpy().tolist())
            # y_true.extend([1] * len(batch_x))  # 异常数据的标签为1
    
    # 计算评估指标
    # print(y_true)
    # print(y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return accuracy, precision, recall, f1

# 加载最佳模型
model.load_state_dict(torch.load("best_model1.pth"))

# # 设定阈值（可根据验证集调整）
# threshold = 0.0001  # 示例值，需实际调优
# evaluate_model(model, test_loader, threshold)
# # 加载最佳模型
# model.load_state_dict(torch.load("best_model.pth"))
# 寻找最优阈值
best_threshold = 0
best_precision = 0
thresholds = np.linspace(0.000001, 0.0001, 100)  # 示例阈值范围，需实际调优
for threshold in thresholds:
    accuracy, precision, recall, f1 = evaluate_model(model, test_loader, threshold, mixed_labels)
    if precision > best_precision:
        best_precision = precision
        best_threshold = threshold
    print("Threshold: {:.6f} | Precision: {:.4f}".format(threshold, precision))

print(f"Best Threshold: {best_threshold:.4f}, Best Precision: {best_precision:.4f}")

# # 设定阈值（可根据验证集调整）
# threshold = 0.1  # 示例值，需实际调优
# anomalies = detect_anomaly(model, val_data, val_mask, threshold)

# print(anomalies)

# 使用最佳阈值进行最终评估
accuracy, precision, recall, f1 = evaluate_model(model, test_loader, best_threshold, mixed_labels)
print(f"Final Evaluation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

Threshold: 0.000001 | Precision: 0.0857
Threshold: 0.000002 | Precision: 0.0857
Threshold: 0.000003 | Precision: 0.0857
Threshold: 0.000004 | Precision: 0.0974
Threshold: 0.000005 | Precision: 0.2627
Threshold: 0.000006 | Precision: 0.2932
Threshold: 0.000007 | Precision: 0.3078
Threshold: 0.000008 | Precision: 0.3171
Threshold: 0.000009 | Precision: 0.3221
Threshold: 0.000010 | Precision: 0.3167
Threshold: 0.000011 | Precision: 0.3180
Threshold: 0.000012 | Precision: 0.3243
Threshold: 0.000013 | Precision: 0.3254
Threshold: 0.000014 | Precision: 0.3330
Threshold: 0.000015 | Precision: 0.3375
Threshold: 0.000016 | Precision: 0.3396
Threshold: 0.000017 | Precision: 0.3447
Threshold: 0.000018 | Precision: 0.3511
Threshold: 0.000019 | Precision: 0.3577
Threshold: 0.000020 | Precision: 0.3666
Threshold: 0.000021 | Precision: 0.3705
Threshold: 0.000022 | Precision: 0.3783
Threshold: 0.000023 | Precision: 0.3807
Threshold: 0.000024 | Precision: 0.3495
Threshold: 0.000025 | Precision: 0.3140
