In [1]:
# 1. 导入依赖 & 读取数据
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 载入明细和标签，字段名与 featureprocessing-Copy1.ipynb 保持一致
df = pd.read_csv('model1_data.csv', encoding='gbk', low_memory=False)
dflabel = pd.read_csv('model1_label.csv', encoding='gbk')

# 重命名字段并合并标签
df.rename(columns={
    '卡号': 'card_id',
    '机构名称': 'org_id',
    '结算日期时间': 'settle_time',
    '明细项目交易费用': 'fee',
}, inplace=True)

dflabel.rename(columns={'卡号': 'card_id', '标签': 'label'}, inplace=True)

df = pd.merge(df, dflabel, on='card_id', how='inner')

# 将 settle_time 转为时间，方便排序
df['settle_time'] = pd.to_datetime(df['settle_time'])

print(df[['card_id', 'settle_time', '明细项目名称']].head())
len(df)

                                card_id         settle_time       明细项目名称
0  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48        尼可地尔片
1  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48  头孢克洛缓释片(II)
2  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48      乳果糖口服溶液
3  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48        艾司唑仑片
4  dccc6fc4-d367-420f-846d-ef5ece5cc1d2 2023-03-13 09:43:49    盐酸地尔硫卓缓释片


1107985

In [2]:
# 2. 构造账户级 item 序列，并建立 item 编码（含 train/val/test 划分）

from sklearn.model_selection import train_test_split

# 去除缺失项目名称
_df = df.dropna(subset=['明细项目名称']).copy()

# 按 card_id 和时间排序
_df = _df.sort_values(['card_id', 'settle_time'])

# 每个账户的项目序列
card_items = (
    _df.groupby('card_id')['明细项目名称']
    .apply(list)
)

print('账户数:', len(card_items))
print(card_items.head())

# 建立 item 词表：0 作为 PAD，其余从 1 开始
all_items = sorted(set(_df['明细项目名称'].dropna().tolist()))
item2id = {item: idx + 1 for idx, item in enumerate(all_items)}  # 1..V
id2item = {idx: item for item, idx in item2id.items()}
PAD_ID = 0
vocab_size = len(item2id) + 1  # 加上 PAD

print('vocab_size (含 PAD):', vocab_size)

# 编码为 item id 序列
sequences = []
card_ids_seq = []

for card_id, items in card_items.items():
    if len(items) < 2:
        continue
    seq = [item2id[x] for x in items if x in item2id]
    if len(seq) >= 2:
        sequences.append(seq)
        card_ids_seq.append(card_id)

print('有效序列数:', len(sequences))

# 按账户维度划分 train/val/test（8/1/1，可按需调整）
indices = list(range(len(sequences)))
train_idx, temp_idx = train_test_split(indices, test_size=0.2, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_seqs = [sequences[i] for i in train_idx]
val_seqs = [sequences[i] for i in val_idx]
test_seqs = [sequences[i] for i in test_idx]

train_card_ids = [card_ids_seq[i] for i in train_idx]
val_card_ids = [card_ids_seq[i] for i in val_idx]
test_card_ids = [card_ids_seq[i] for i in test_idx]

len(train_seqs), len(val_seqs), len(test_seqs)

账户数: 8917
card_id
00022092-02fc-45e0-83f2-c51a0d02f2d0    [拉坦前列素滴眼液, 马来酸噻吗洛尔滴眼液, 马来酸噻吗洛尔滴眼液, 芪苈强心胶囊, 参松养...
000e9b7e-6a96-4eda-947b-425e964e1212    [银丹心脑通软胶囊, 莫匹罗星软膏, 双氯芬酸二乙胺乳胶剂, 仙灵骨葆胶囊, 麝香保心丸, ...
000f8286-aa23-42d7-8510-2fab100bcc7b    [莫匹罗星软膏, 氨酚羟考酮片, 金水宝片, 丁丙诺啡透皮贴剂, 胞磷胆碱钠片, 非那雄胺片...
00117f6c-e739-4913-b453-85a118a47123    [宣肺止嗽合剂, 左氧氟沙星片, 宣肺止嗽合剂, 左氧氟沙星片, 宣肺止嗽合剂, 左氧氟沙星...
001c5c03-1db7-4303-934e-21decf219ab1    [参松养心胶囊, 麝香保心丸, 利伐沙班片, 维生素B2片, 双歧杆菌三联活菌胶囊, 胰激肽...
Name: 明细项目名称, dtype: object
vocab_size (含 PAD): 4120
有效序列数: 8893


(7114, 889, 890)

In [3]:
# 3. 定义 Dataset / DataLoader，用于 next-item prediction 训练（带 train/val/test）

class SeqDataset(Dataset):
    def __init__(self, sequences, max_len=100):
        self.max_len = max_len
        self.sequences = []
        for seq in sequences:
            if len(seq) > max_len:
                seq = seq[-max_len:]
            if len(seq) >= 2:
                self.sequences.append(seq)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        # 输入为前 n-1，目标为后 n-1
        return torch.tensor(seq[:-1], dtype=torch.long), torch.tensor(seq[1:], dtype=torch.long)


def collate_fn(batch):
    # 动态 padding
    inputs, targets = zip(*batch)
    lengths = [len(x) for x in inputs]
    max_len = max(lengths)

    padded_inputs = torch.full((len(inputs), max_len), PAD_ID, dtype=torch.long)
    padded_targets = torch.full((len(inputs), max_len), PAD_ID, dtype=torch.long)

    for i, (inp, tgt) in enumerate(zip(inputs, targets)):
        L = len(inp)
        padded_inputs[i, :L] = inp
        padded_targets[i, :L] = tgt

    return padded_inputs, padded_targets, torch.tensor(lengths, dtype=torch.long)


train_dataset = SeqDataset(train_seqs, max_len=100)
val_dataset = SeqDataset(val_seqs, max_len=100)
test_dataset = SeqDataset(test_seqs, max_len=100)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, collate_fn=collate_fn)

len(train_dataset), len(val_dataset), len(test_dataset)

(7114, 889, 890)

In [9]:
# 4. 定义 SASRec 模型（简化版 Transformer 序列推荐）

class SASRec(nn.Module):
    def __init__(self, vocab_size, max_len=100, embed_dim=128, num_heads=4,
                 num_layers=4, ff_dim=256, dropout=0.1):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_ID)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        # x: [B, T]
        B, T = x.size()
        positions = torch.arange(T, device=x.device).unsqueeze(0).expand(B, T)

        seq_emb = self.embedding(x) + self.pos_embedding(positions)  # [B, T, E]

        # padding mask: True 表示需要 mask
        pad_mask = (x == PAD_ID)  # [B, T]

        # subsequent mask: 保证只能看到当前位置及之前的 token
        subsequent_mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)

        out = self.encoder(
            seq_emb,
            mask=subsequent_mask,
            src_key_padding_mask=pad_mask,
        )  # [B, T, E]

        logits = self.fc(out)  # [B, T, V]
        return logits


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SASRec(vocab_size=vocab_size, max_len=100, embed_dim=128).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model

SASRec(
  (embedding): Embedding(4120, 128, padding_idx=0)
  (pos_embedding): Embedding(100, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=4120, bias=True)
)

In [11]:
# 5. 训练 SASRec（next item 重构），并在验证集上监控过拟合 + Early Stopping

import copy

num_epochs = 200  # 最大轮数，可根据资源调整
patience = 29     # 连续多少个 epoch val_loss 不下降则早停
best_val_loss = float('inf')
best_state = None

for epoch in range(1, num_epochs + 1):
    # ---------- 训练 ----------
    model.train()
    total_loss = 0.0
    total_steps = 0

    for batch in train_loader:
        inputs, targets, lengths = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)   # [B, T, V]

        B, T, V = logits.shape
        loss = criterion(logits.view(B * T, V), targets.view(B * T))

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * B
        total_steps += B

    train_avg_loss = total_loss / max(total_steps, 1)

    # ---------- 验证 ----------
    model.eval()
    val_total_loss = 0.0
    val_total_steps = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets, lengths = batch
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            B, T, V = logits.shape
            loss = criterion(logits.view(B * T, V), targets.view(B * T))

            val_total_loss += loss.item() * B
            val_total_steps += B

    val_avg_loss = val_total_loss / max(val_total_steps, 1)

    print(f"Epoch {epoch} | train_loss = {train_avg_loss:.4f} | val_loss = {val_avg_loss:.4f}")

    # ---------- Early Stopping & 保存最优模型 ----------
    if val_avg_loss < best_val_loss:
        best_val_loss = val_avg_loss
        best_state = copy.deepcopy(model.state_dict())
        torch.save({'model_state_dict': best_state, 'item2id': item2id}, 'sasrec_model_best.pt')
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= patience:
            print(f"Early stopping at epoch {epoch} (no val improvement for {patience} epochs).")
            break

# 训练结束后，加载最佳验证损失对应的权重
if best_state is not None:
    model.load_state_dict(best_state)

'finished'

Epoch 1 | train_loss = 4.8054 | val_loss = 5.3426
Epoch 2 | train_loss = 4.7795 | val_loss = 5.3451
Epoch 3 | train_loss = 4.7546 | val_loss = 5.3515
Epoch 4 | train_loss = 4.7317 | val_loss = 5.3559
Epoch 5 | train_loss = 4.7083 | val_loss = 5.3563
Epoch 6 | train_loss = 4.6853 | val_loss = 5.3606
Epoch 7 | train_loss = 4.6642 | val_loss = 5.3671
Epoch 8 | train_loss = 4.6414 | val_loss = 5.3716
Epoch 9 | train_loss = 4.6201 | val_loss = 5.3835
Epoch 10 | train_loss = 4.5984 | val_loss = 5.3814
Epoch 11 | train_loss = 4.5798 | val_loss = 5.3936
Epoch 12 | train_loss = 4.5595 | val_loss = 5.3967
Epoch 13 | train_loss = 4.5429 | val_loss = 5.4052
Epoch 14 | train_loss = 4.5211 | val_loss = 5.4157
Epoch 15 | train_loss = 4.5035 | val_loss = 5.4204
Epoch 16 | train_loss = 4.4895 | val_loss = 5.4327
Epoch 17 | train_loss = 4.4705 | val_loss = 5.4360
Epoch 18 | train_loss = 4.4559 | val_loss = 5.4476
Epoch 19 | train_loss = 4.4376 | val_loss = 5.4539
Epoch 20 | train_loss = 4.4201 | val_los

'finished'

In [12]:
# 6. 利用重构误差进行无监督异常检测（账户级），在测试集上评估

import math
import numpy as np

model.eval()

card_step_losses = {}
step_losses_all = []
step_recalls_all = []
step_ndcgs_all = []

with torch.no_grad():
    for seq, card_id in zip(test_seqs, test_card_ids):
        if len(seq) < 2:
            continue
        # 截断到与训练相同的 max_len
        if len(seq) > 100:
            seq_use = seq[-100:]
        else:
            seq_use = seq

        inp = torch.tensor(seq_use[:-1], dtype=torch.long, device=device).unsqueeze(0)  # [1, T]
        tgt = torch.tensor(seq_use[1:], dtype=torch.long, device=device).unsqueeze(0)  # [1, T]

        logits = model(inp)  # [1, T, V]
        log_probs = torch.log_softmax(logits, dim=-1)  # [1, T, V]

        T = tgt.size(1)
        for t in range(T):
            true_id = tgt[0, t].item()
            if true_id == PAD_ID:
                continue

            lp = log_probs[0, t, true_id].item()
            loss_t = -lp
            step_losses_all.append(loss_t)

            # 排名：按概率排序，计算 Recall@10 和 NDCG@10
            probs_t = log_probs[0, t].exp().cpu().numpy()  # 转回概率
            ranked_ids = np.argsort(-probs_t)

            rank = np.where(ranked_ids == true_id)[0]
            if len(rank) > 0:
                rank = int(rank[0]) + 1  # 从 1 开始
            else:
                rank = None

            if rank is not None and rank <= 10:
                step_recalls_all.append(1.0)
                step_ndcgs_all.append(1.0 / math.log2(rank + 1))
            else:
                step_recalls_all.append(0.0)
                step_ndcgs_all.append(0.0)

            card_step_losses.setdefault(card_id, []).append(loss_t)

# 账户级异常分数：平均 loss（越大越异常）
card_ids_scored = []
card_scores = []
card_steps = []

for cid, losses in card_step_losses.items():
    card_ids_scored.append(cid)
    card_scores.append(float(np.mean(losses)))
    card_steps.append(len(losses))

len(card_ids_scored), np.mean(card_scores)

(890, 5.432594548070851)

In [13]:
# 7. 账户级结果整理与评估（AUC、PR-AUC、Precision、Recall、F1）

from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score

result_df = pd.DataFrame({
    'card_id': card_ids_scored,
    'sasrec_score': card_scores,   # 平均重构 loss，越大越异常
    'num_steps': card_steps,
})

# 简单策略：使用 99 分位数作为异常阈值（可根据需要调整）
threshold = result_df['sasrec_score'].quantile(0.8)
result_df['sasrec_label'] = (result_df['sasrec_score'] >= threshold).astype(int)  # 1=异常, 0=正常

# 账户级真实标签：按 card_id 聚合明细标签，这里用 max 规则（账户内只要有一条是 1，就认为账户为 1）
card_label = (
    df.groupby('card_id')['label']
    .max()
    .reindex(result_df['card_id'])  # 按 result_df 对齐
)

# 转成 numpy 数组
y_true = card_label.values.astype(int)

# sasrec_score 本身就是“越大越异常”，可以直接作为异常分数
anomaly_prob = result_df['sasrec_score'].values

# AUC-ROC
auc = roc_auc_score(y_true, anomaly_prob)

# PR-AUC（Average Precision）
pr_auc = average_precision_score(y_true, anomaly_prob)

# 二值预测：使用 sasrec_label（1=异常，0=正常）
y_pred = result_df['sasrec_label'].values

precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

metrics = {
    'AUC': auc,
    'PR_AUC': pr_auc,
    'Precision': precision,
    'Recall': recall,
    'F1': f1,
}

metrics

{'AUC': 0.3961035701737403,
 'PR_AUC': 0.18319172815778018,
 'Precision': 0.1404494382022472,
 'Recall': 0.12315270935960591,
 'F1': 0.13123359580052493}

In [14]:
# 8. 序列级指标：Avg_Loss、Recall@10、NDCG@10

import numpy as np
import math

avg_loss = float(np.mean(step_losses_all)) if step_losses_all else float('nan')
recall_at_10 = float(np.mean(step_recalls_all)) if step_recalls_all else float('nan')
ndcg_at_10 = float(np.mean(step_ndcgs_all)) if step_ndcgs_all else float('nan')

seq_metrics = {
    'Avg_Loss': avg_loss,
    'Recall@10': recall_at_10,
    'NDCG@10': ndcg_at_10,
}

seq_metrics

{'Avg_Loss': 5.3694311141630715,
 'Recall@10': 0.2855103971322834,
 'NDCG@10': 0.16067002431333097}