In [1]:
#import modules
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.nn as dglnn
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score, f1_score
from sklearn.manifold import TSNE
from tqdm import trange
import matplotlib.pyplot as plt
import numpy as np
import random
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the raw records and the per-card labels (both files are read as GBK)
df = pd.read_csv(
    'model1_data.csv',
    encoding='gbk',
    low_memory=False,
)
dflabel = pd.read_csv(
    'model1_label.csv',
    encoding='gbk',
)

In [3]:
# Give the columns used later short ASCII names, then attach each card's label.
record_column_names = {
    '卡号': 'card_id',
    '机构名称': 'org_id',
    '结算日期时间': 'settle_time',
    '明细项目交易费用': 'fee',
}
label_column_names = {'卡号': 'card_id', '标签': 'label'}

df = df.rename(columns=record_column_names)
dflabel = dflabel.rename(columns=label_column_names)

# Inner join: only records whose card_id also appears in the label file are kept.
df = df.merge(dflabel, on='card_id', how='inner')

In [4]:
# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None".
df.info()

# Preview the first merged record. The name (姓名) and national-ID (身份证号)
# fields are left out so personal data does not end up in the saved notebook
# output; `errors='ignore'` keeps this working if those columns are absent.
print(df.loc[0].drop(labels=['姓名', '身份证号'], errors='ignore'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107985 entries, 0 to 1107984
Data columns (total 29 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Unnamed: 0_x  1107985 non-null  int64  
 1   卡组编号          1107985 non-null  object 
 2   卡群编号          1107985 non-null  object 
 3   流水号           1107985 non-null  int64  
 4   card_id       1107985 non-null  object 
 5   姓名            1107985 non-null  object 
 6   身份证号          1107985 non-null  object 
 7   机构代码          1107985 non-null  object 
 8   org_id        1107985 non-null  object 
 9   科室代码          1106256 non-null  object 
 10  科室名称          1106256 non-null  object 
 11  医师工号          1107985 non-null  object 
 12  医师姓名          1107985 non-null  object 
 13  费用类别          1107985 non-null  int64  
 14  明细项目编码        1107985 non-null  object 
 15  明细项目名称        1102479 non-null  object 
 16  明细项目单价        1107985 non-null  float64
 17  明细项目数量        1107985 non-n

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Load the tokenizer and the model from the same checkpoint name,
# and switch the model to evaluation mode right away.
model_name = "PULSE-7bv5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()

# Embedding extraction helper
def get_embedding(text):
    """Return one embedding vector for ``text`` as a float32 NumPy array.

    The text is tokenised (truncated to at most 32 tokens), run through the
    module-level ``model`` without gradient tracking, and the last hidden
    state is averaged over the token axis.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: the token-averaged last hidden state with singleton
        axes squeezed out, as float32 on the CPU.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=32)

    # Put the inputs on the device the model reports; otherwise a model that
    # is not on the CPU fails with a device-mismatch error.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the mean of the last hidden state over the tokens as the embedding.
    pooled = outputs.last_hidden_state.mean(dim=1).squeeze()

    # `.numpy()` only accepts CPU tensors of NumPy-supported dtypes, so cast
    # and move first. For a float32 CPU tensor both calls are no-ops.
    return pooled.float().cpu().numpy()
    
# Distinct item names, with missing names removed first.
unique_items = pd.unique(df['明细项目名称'].dropna())


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.85s/it]


In [7]:
# Number of distinct item names that will be embedded.
unique_items.size

4119

In [None]:
# 4. Build the item-name -> embedding-vector mapping
item_emb_dict = {}
# name -> error text, so skipped items are visible instead of silently lost
failed_items = {}
for name in tqdm(unique_items, desc="Embedding items"):
    try:
        item_emb_dict[name] = get_embedding(name)
    except Exception as exc:
        # `Exception`, not a bare `except`: a bare clause also swallows
        # KeyboardInterrupt, which made this long loop impossible to stop.
        failed_items[name] = repr(exc)

if failed_items:
    print(
        f"{len(failed_items)} of {len(unique_items)} items could not be embedded; "
        f"first few: {list(failed_items.items())[:5]}"
    )

Embedding items:   8%|▊         | 340/4119 [04:30<28:59,  2.17it/s]  

In [None]:
# 5. Attach the item embedding to every record. Items missing from
#    item_emb_dict map to NaN and are dropped below.
df['item_emb'] = df['明细项目名称'].map(item_emb_dict)

# 6. Collect, per card, the list of its records' item embeddings.
#    No pooling happens here: each card keeps one vector per record.
# NOTE(review): no cell visible in this notebook creates a `card_idx` column,
# so a fresh run would raise KeyError. Fall back to `card_id` when it is
# missing — presumably `card_idx` is an integer encoding of `card_id`; confirm.
card_key = 'card_idx' if 'card_idx' in df.columns else 'card_id'
card_emb = (
    df.dropna(subset=['item_emb'])
    .groupby(card_key)['item_emb']
    .apply(list)
)


In [None]:
# Number of embedded records per card, followed by its summary statistics.
seq_lens = card_emb.apply(len)
length_summary = seq_lens.describe()
print(length_summary)
import matplotlib.pyplot as plt

# Histogram of per-card sequence lengths, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(seq_lens, bins=30)
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Count")
ax.set_title("Distribution of item sequence lengths")
ax.grid(True)
plt.show()

In [None]:
# Use the 98th percentile of the sequence lengths as the truncation length.
cutoff = seq_lens.quantile(q=0.98)
print(f"建议截断长度（98%分位）: {cutoff}")

In [None]:
# 2. Read the embedding width from the first vector of the first card, and
#    turn the percentile cutoff into an integer sequence length
#    (int() drops any fractional part).
first_card_vectors = card_emb.iloc[0]
embedding_dim = len(first_card_vectors[0])
max_len = int(cutoff)

# 3. Zero-padding helper
def pad_sequence(seq, max_len, dim):
    """Return ``seq`` as a fixed-size ``(max_len, dim)`` float32 matrix.

    Vectors beyond the first ``max_len`` are discarded; rows without a
    corresponding vector stay zero.

    Args:
        seq: Sequence of vectors, each assignable to a row of length ``dim``.
        max_len: Number of rows in the result.
        dim: Number of columns in the result.

    Returns:
        numpy.ndarray: the padded / truncated float32 matrix.
    """
    padded = np.zeros((max_len, dim), dtype=np.float32)
    # Iterating the matrix yields row views, so writing into a row updates
    # `padded` itself; zip stops at whichever of the two is shorter.
    for row, vec in zip(padded, seq[:max_len]):
        row[...] = vec
    return padded

# 4. Pad / truncate every card's sequence to one (max_len, embedding_dim) matrix
card_emb = card_emb.apply(lambda seq: pad_sequence(seq, max_len, embedding_dim))
print(card_emb.head())

# `Series.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None".
card_emb.info()

# Positional access: `card_emb[1]` was a lookup by index label and fails when
# no card carries the label 1. Every matrix has the same shape after padding,
# so the first one is representative.
print(card_emb.iloc[0].shape)

In [None]:
# Write the card features, card embeddings and organisation features to CSV
# (UTF-8, index column omitted).
# NOTE(review): `card_feats` and `org_feats` are not defined in any cell
# visible in this notebook — confirm where they come from before a fresh run.
card_feats.to_csv('card_feats.csv', index=False, encoding='utf-8')
# NOTE(review): after the padding cell each value of `card_emb` is a 2-D NumPy
# array, so the CSV can only hold its text rendering (which NumPy abbreviates
# with "..." for large arrays), and index=False drops the card key — confirm
# this file can actually be reloaded. The file name also has a space before
# ".csv"; confirm that is intentional.
card_emb.to_csv('card_emb .csv', index=False, encoding='utf-8')
org_feats.to_csv('org_feats.csv', index=False, encoding='utf-8')

In [None]:
import torch
import torch.nn as nn


# 1. Number the embedded item names in dictionary order and build both
#    directions of the mapping (row index <-> item name).
id2item = dict(enumerate(item_emb_dict))
item2id = {name: row for row, name in id2item.items()}

# 2. Stack the vectors in that same order, so row i belongs to id2item[i].
embedding_matrix = np.stack(list(item_emb_dict.values()))
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)

# 3. Wrap the matrix in a frozen lookup embedding layer.
lookup_table = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)

# Example: fetch the vector of one item by name.
item_name = '尼可地尔片'
item_id = item2id[item_name]
vector = lookup_table(torch.tensor(item_id))

In [None]:
import json
# Build the frozen nn.Embedding layer from the item embedding tensor.
embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)

# Persist the layer's parameters.
embedding_weights = embedding_layer.state_dict()
torch.save(embedding_weights, 'item_embedding.pt')

# Persist the item-name -> row-index mapping as readable UTF-8 JSON.
with open('item2id.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(item2id, ensure_ascii=False, indent=2))

In [None]:
# Step 3: reload the saved artefacts for later use

# Load the item-name -> row-index mapping
with open('item2id.json', 'r', encoding='utf-8') as f:
    item2id = json.load(f)

# Read the checkpoint first and take the table shape from it, so the embedding
# width is never a hard-coded guess. `map_location='cpu'` lets the file load
# on a machine without the device it was saved from.
state_dict = torch.load('item_embedding.pt', map_location='cpu')
num_embeddings, embedding_dim = state_dict['weight'].shape
if num_embeddings != len(item2id):
    raise ValueError(
        f"item_embedding.pt holds {num_embeddings} rows but item2id.json "
        f"holds {len(item2id)} items; the two files do not belong together"
    )

# Rebuild the nn.Embedding layer and load the saved parameters
embedding_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
embedding_layer.load_state_dict(state_dict)
# A freshly constructed nn.Embedding is trainable; the layer was saved from a
# frozen (`freeze=True`) table, so switch gradient tracking off again.
embedding_layer.weight.requires_grad_(False)
embedding_layer.eval()

# Example: fetch the embedding of one item by name
item_name = "尼可地尔片"
item_id = item2id[item_name]
vector = embedding_layer(torch.tensor(item_id))


In [None]:
# Display the embedding looked up in the previous cell.
vector

In [None]:
# 6. Collect, per card, the list of item names on its records.
#    Records without an item name are dropped; nothing is pooled or averaged.
# NOTE(review): no cell visible in this notebook creates a `card_idx` column,
# so a fresh run would raise KeyError. Fall back to `card_id` when it is
# missing — presumably `card_idx` is an integer encoding of `card_id`; confirm.
card_key = 'card_idx' if 'card_idx' in df.columns else 'card_id'
card_item = (
    df.dropna(subset=['明细项目名称'])
    .groupby(card_key)['明细项目名称']
    .apply(list)
)

In [None]:
# Preview the first few per-card item-name lists.
card_item.head()

In [None]:
# Write the per-card item-name lists to CSV (UTF-8).
# NOTE(review): index=False omits the group key, so the file does not say
# which card each row belongs to, and every list is stored as its text
# rendering — confirm that whatever reads this file expects that.
card_item.to_csv('card_item.csv', index=False, encoding='utf-8')