In [1]:
# 1. Imports and data loading
import pandas as pd

# Read the transaction detail table and the per-card label table.
# Column names are kept consistent with featureprocessing-Copy1.ipynb.
df = pd.read_csv('model1_data.csv', encoding='gbk', low_memory=False)
dflabel = pd.read_csv('model1_label.csv', encoding='gbk')

# Give the label table its English column names before joining on card_id.
dflabel = dflabel.rename(columns={'卡号': 'card_id', '标签': 'label'})

# Rename the detail columns used downstream, then attach the label to every
# detail row (inner join: rows without a matching label are dropped).
df = (
    df.rename(columns={
        '卡号': 'card_id',
        '机构名称': 'org_id',
        '结算日期时间': 'settle_time',
        '明细项目交易费用': 'fee',
    })
    .merge(dflabel, on='card_id', how='inner')
)

# Parse settle_time as datetime so rows can be ordered chronologically.
df['settle_time'] = pd.to_datetime(df['settle_time'])

print(df.loc[:, ['card_id', 'settle_time', '明细项目名称']].head())
len(df)

                                card_id         settle_time       明细项目名称
0  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48        尼可地尔片
1  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48  头孢克洛缓释片(II)
2  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48      乳果糖口服溶液
3  ff88846b-56ec-4d2f-a2fc-aca3c116c865 2023-03-29 09:18:48        艾司唑仑片
4  dccc6fc4-d367-420f-846d-ef5ece5cc1d2 2023-03-13 09:43:49    盐酸地尔硫卓缓释片


1107985

In [2]:
# 2. Build per-card item sequences and count first-order transitions
import numpy as np
from collections import Counter, defaultdict

# Drop rows without an item name (on a copy, so df itself is untouched).
_df = df.dropna(subset=['明细项目名称']).copy()

# Order rows by card, then by settlement time.
_df = _df.sort_values(['card_id', 'settle_time'])

# One list of item names per card, in the sorted row order.
card_items = _df.groupby('card_id')['明细项目名称'].apply(list)

print('账户数:', len(card_items))
print(card_items.head())

# Global first-order transition counts.
trans_counts = Counter()      # (item_i, item_j) -> times item_j directly follows item_i
from_counts = Counter()       # item_i -> times item_i appears as the predecessor

for items in card_items:
    # zip over consecutive pairs yields nothing for sequences shorter than 2,
    # and items[:-1] is exactly the set of positions that have a successor.
    trans_counts.update(zip(items, items[1:]))
    from_counts.update(items[:-1])

len(trans_counts), len(from_counts)

账户数: 8917
card_id
00022092-02fc-45e0-83f2-c51a0d02f2d0    [拉坦前列素滴眼液, 马来酸噻吗洛尔滴眼液, 马来酸噻吗洛尔滴眼液, 芪苈强心胶囊, 参松养...
000e9b7e-6a96-4eda-947b-425e964e1212    [银丹心脑通软胶囊, 莫匹罗星软膏, 双氯芬酸二乙胺乳胶剂, 仙灵骨葆胶囊, 麝香保心丸, ...
000f8286-aa23-42d7-8510-2fab100bcc7b    [莫匹罗星软膏, 氨酚羟考酮片, 金水宝片, 丁丙诺啡透皮贴剂, 胞磷胆碱钠片, 非那雄胺片...
00117f6c-e739-4913-b453-85a118a47123    [宣肺止嗽合剂, 左氧氟沙星片, 宣肺止嗽合剂, 左氧氟沙星片, 宣肺止嗽合剂, 左氧氟沙星...
001c5c03-1db7-4303-934e-21decf219ab1    [参松养心胶囊, 麝香保心丸, 利伐沙班片, 维生素B2片, 双歧杆菌三联活菌胶囊, 胰激肽...
Name: 明细项目名称, dtype: object


(263190, 4110)

In [3]:
# 3. Transition probabilities and a per-card Markov anomaly score
import math

# p(b|a) = count(a->b) / count(a)
trans_prob = {
    pair: count / from_counts[pair[0]]
    for pair, count in trans_counts.items()
}

# Floor probability substituted for any transition missing from trans_prob,
# so that log(0) is never evaluated.
MIN_PROB = 1e-8

card_scores = []   # mean negative log-likelihood per card (higher = more anomalous)
card_steps = []    # number of transitions that were scored
card_ids = []

for card_id, items in card_items.items():
    # One loss per consecutive (previous, next) pair in the card's sequence.
    step_losses = [
        -math.log(trans_prob.get(pair, MIN_PROB))
        for pair in zip(items, items[1:])
    ]
    if not step_losses:
        # Fewer than two items: there is no transition to score.
        continue
    card_ids.append(card_id)
    card_steps.append(len(step_losses))
    card_scores.append(float(np.mean(step_losses)))

len(card_ids), np.mean(card_scores)

(8893, 4.92348885739929)

In [7]:
# 4. Assemble results, derive a binary anomaly label, and export

result_df = pd.DataFrame({
    'card_id': card_ids,
    'markov_score': card_scores,   # mean negative log transition probability; higher = more anomalous
    'num_steps': card_steps,
})

# Quantile of markov_score used as the anomaly cut-off (tunable).
# NOTE: this is the 0.8 quantile, i.e. roughly the top 20% of cards are
# flagged -- not the 99th percentile. Raise it (e.g. 0.99) for a stricter rule.
ANOMALY_QUANTILE = 0.8
threshold = result_df['markov_score'].quantile(ANOMALY_QUANTILE)

result_df['markov_label'] = (result_df['markov_score'] >= threshold).astype(int)  # 1 = anomalous, 0 = normal

# Most anomalous cards first.
result_df = result_df.sort_values('markov_score', ascending=False)

# Persist the scored cards.
result_df.to_csv('markov_results.csv', index=False, encoding='utf-8')

threshold, result_df.head(10)

(5.291790148222901,
                                    card_id  markov_score  num_steps  \
 7863  e24cbbc5-25ba-47db-a017-1512953e8e4b      6.763885          1   
 4403  81b9fa94-b033-465b-a690-5b8aa9653e89      6.585250          2   
 8030  e7124ecf-b13b-459e-982d-6692ad65802b      6.563940          3   
 8659  f8d8f53d-56c5-4493-a6d0-2ad76297e4b1      6.474662          1   
 4272  7df572b9-1716-4c63-a7c4-e0a595fe2a5f      6.412784          3   
 5748  a6cf9085-763c-425d-948d-1b21adc1eaa9      6.365301          4   
 3384  63d4d8fe-672e-4fad-bdc5-0c8d87f5c8de      6.311119          6   
 6192  b3d17606-0bf2-4322-af7c-a59195424fba      6.258737          3   
 1421  2a3d04ce-4476-4759-a8f9-d12960add7a9      6.180301          9   
 7669  dcd1fd22-6a1e-474b-abdb-782edb1ad483      6.147415          3   
 
       markov_label  
 7863             1  
 4403             1  
 8030             1  
 8659             1  
 4272             1  
 5748             1  
 3384             1  
 6192     

In [8]:
# 5. Evaluation metrics: AUC, PR-AUC, Precision, Recall, F1

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_score,
    recall_score,
    f1_score,
)

# Card-level ground truth: max over the card's detail rows (a card counts as 1
# if any of its rows is labelled 1), re-ordered to match result_df row by row.
card_label = df.groupby('card_id')['label'].max().reindex(result_df['card_id'])

# Arrays fed to the scorers.
y_true = card_label.values.astype(int)
anomaly_prob = result_df['markov_score'].values   # higher = more anomalous, used directly as the ranking score
y_pred = result_df['markov_label'].values         # binary label from the threshold step (1 = anomalous, 0 = normal)

# Ranking metrics on the continuous score.
auc = roc_auc_score(y_true, anomaly_prob)
pr_auc = average_precision_score(y_true, anomaly_prob)   # PR-AUC (average precision)

# Threshold metrics on the binary prediction; zero_division=0 is passed to each scorer.
precision, recall, f1 = (
    scorer(y_true, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

metrics = {
    'AUC': auc,
    'PR_AUC': pr_auc,
    'Precision': precision,
    'Recall': recall,
    'F1': f1,
}

metrics

{'AUC': 0.4039592790354521,
 'PR_AUC': 0.15754908927594946,
 'Precision': 0.10567734682405847,
 'Recall': 0.10549943883277217,
 'F1': 0.10558831788823364}

In [9]:
# 6. Sequence-level Avg_Loss, Recall@10 and NDCG@10 (next-item prediction view)

import numpy as np
import math

from collections import defaultdict

# For each predecessor a, its observed successors ordered by p(b|a), highest first.
candidates_by_from = defaultdict(list)  # a -> list of (b, p)
for (a, b), p in trans_prob.items():
    candidates_by_from[a].append((b, p))

for a in candidates_by_from:
    candidates_by_from[a].sort(key=lambda x: x[1], reverse=True)  # descending probability

# 1-based rank of each successor within its predecessor's candidate list.
# Built once so every step below is a single dict lookup instead of a linear
# scan over the candidate list.
rank_of = {
    (a, b_cand): idx
    for a, cand_list in candidates_by_from.items()
    for idx, (b_cand, _) in enumerate(cand_list, start=1)
}

step_losses = []   # -log p for every transition step
step_recalls = []  # per-step recall@10 (0 or 1)
step_ndcgs = []    # per-step ndcg@10

for items in card_items:
    # Consecutive (previous, next) pairs; sequences shorter than 2 yield none.
    for a, b_true in zip(items, items[1:]):
        # Probability and loss of the observed transition.
        p = trans_prob.get((a, b_true), MIN_PROB)
        loss = -math.log(p)
        step_losses.append(loss)

        # Rank of the true next item among a's candidates (None if never observed).
        rank = rank_of.get((a, b_true))

        if rank is not None and rank <= 10:
            # Hit within the top 10.
            step_recalls.append(1.0)
            # Single relevant item, so NDCG reduces to DCG = 1 / log2(rank + 1).
            step_ndcgs.append(1.0 / math.log2(rank + 1))
        else:
            step_recalls.append(0.0)
            step_ndcgs.append(0.0)

# Aggregate over all steps (NaN when there were no steps at all).
avg_loss = float(np.mean(step_losses)) if step_losses else float('nan')
recall_at_10 = float(np.mean(step_recalls)) if step_recalls else float('nan')
ndcg_at_10 = float(np.mean(step_ndcgs)) if step_ndcgs else float('nan')

seq_metrics = {
    'Avg_Loss': avg_loss,      # mean -log p over all transition steps
    'Recall@10': recall_at_10,
    'NDCG@10': ndcg_at_10,
}

seq_metrics

{'Avg_Loss': 4.922502239857984,
 'Recall@10': 0.2799448042269208,
 'NDCG@10': 0.1569723509274994}