In [2]:
# Cell 0: 环境初始化

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from pypinyin import lazy_pinyin
import pandas as pd
import numpy as np
import torch.optim as optim
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os
import json

# Choose the compute device string: 'cuda' when a CUDA device is usable, otherwise 'cpu'.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Report the runtime environment (user-facing messages are kept verbatim).
for report_line in ("环境信息", f"PyTorch: {torch.__version__}", f"设备: {device}"):
    print(report_line)

# GPU details are only queried when the CUDA device was selected above.
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {gpu_total_bytes / 1024**3:.1f} GB")

print("\n环境配置完成！\n")

环境信息
PyTorch: 2.0.0+cu118
设备: cpu

环境配置完成！



In [6]:
import subprocess
import os

# Source /etc/network_turbo inside a bash subshell, list the resulting environment,
# and keep only the lines mentioning "proxy" so they can be copied into this kernel.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Each captured line is expected to look like NAME=value; split on the first '=' only
# so values that themselves contain '=' stay intact.
for env_line in output.splitlines():
    env_name, separator, env_value = env_line.partition('=')
    if separator:
        os.environ[env_name] = env_value

In [3]:
import os

# Switch the kernel's working directory; later cells open "Data_数据" and
# 'data_30k.pkl' through relative paths, so they resolve against this directory.
# NOTE(review): the target is relative, so re-running this cell from inside
# autodl-tmp presumably fails — confirm, or switch to an absolute path.
%cd autodl-tmp

/root/autodl-tmp


In [4]:
# Cell 4: 加载数据集（采样版）
import pandas as pd
import os
from datasets import Dataset, DatasetDict
import random



# Per-department CSV exports live at <data_dir>/<department>/<file name>.
data_dir = "Data_数据"
dept_files = {
    "Oncology_肿瘤科": "肿瘤科5-10000.csv",
    "IM_内科": "内科5000-33000.csv",
    "OAGD_妇产科": "妇产科6-28000.csv",
}

# Read every department file in full (no nrows cap) and keep only the Q/A columns.
all_data = []
for dept, filename in dept_files.items():
    csv_path = os.path.join(data_dir, dept, filename)
    df = pd.read_csv(csv_path, encoding='gb18030')
    qa_columns = df[['ask', 'answer']]
    all_data.append(qa_columns)
    print(f"{dept}: {len(df)} 条")

combined_df = pd.concat(all_data, ignore_index=True)
print(f"原始总计: {len(combined_df)} 条")

# Down-sample to SAMPLE_SIZE rows with a fixed seed when the corpus is larger than that.
SAMPLE_SIZE = 80000
exceeds_sample_size = len(combined_df) > SAMPLE_SIZE
if exceeds_sample_size:
    sampled_df = combined_df.sample(n=SAMPLE_SIZE, random_state=42)
    combined_df = sampled_df.reset_index(drop=True)
    print(f"随机采样: {SAMPLE_SIZE} 条")

# Wrap the frame as the single 'train' split of a DatasetDict.
train_split = Dataset.from_pandas(combined_df, preserve_index=False)
medical_qa = DatasetDict({'train': train_split})

print(f"最终使用: {len(medical_qa['train'])} 条")

Oncology_肿瘤科: 75553 条
IM_内科: 220606 条
OAGD_妇产科: 183751 条
原始总计: 479910 条
随机采样: 80000 条
最终使用: 80000 条


In [9]:
# Cell 5: 数据准备（保存版）
import pickle
import os
from sklearn.model_selection import train_test_split
from pypinyin import lazy_pinyin
from tqdm import tqdm

# Build the train/val/test pairs once and cache them in data_30k.pkl; later runs reload the cache.
if not os.path.exists('data_30k.pkl'):
    print("处理数据...")

    # Number of leading Q/A pairs taken from the sampled dataset
    TOTAL = 30000

    queries = medical_qa['train']['ask'][:TOTAL]
    docs = medical_qa['train']['answer'][:TOTAL]

    # First hold out 10% as the test set ...
    train_val_q, test_q, train_val_d, test_d = train_test_split(
        queries, docs, test_size=0.1, random_state=42, shuffle=True
    )
    # ... then split the remainder into train/val (0.1111 = 3000/27000)
    train_q, val_q, train_d, val_d = train_test_split(
        train_val_q, train_val_d, test_size=0.1111, random_state=42, shuffle=True
    )

    # Character-level view: the raw text of each pair
    char_data = {
        'train': [{'query': q, 'document': d} for q, d in zip(train_q, train_d)],
        'val': [{'query': q, 'document': d} for q, d in zip(val_q, val_d)],
        'test': [{'query': q, 'document': d} for q, d in zip(test_q, test_d)]
    }

    # Pinyin-level view: the same pairs after romanization
    def to_pinyin(text):
        """Join the items returned by lazy_pinyin(text) with single spaces."""
        return ' '.join(lazy_pinyin(text))

    print("\n转换拼音（约2分钟）...")
    pinyin_data = {
        'train': [
            {'query': to_pinyin(q), 'document': to_pinyin(d)}
            for q, d in tqdm(zip(train_q, train_d), total=len(train_q), desc="训练集")
        ],
        'val': [
            {'query': to_pinyin(q), 'document': to_pinyin(d)}
            for q, d in tqdm(zip(val_q, val_d), total=len(val_q), desc="验证集")
        ],
        'test': [
            {'query': to_pinyin(q), 'document': to_pinyin(d)}
            for q, d in tqdm(zip(test_q, test_d), total=len(test_q), desc="测试集")
        ]
    }

    # Persist both views plus the raw split lists
    print("\n保存数据...")
    cache_payload = {
        'char_data': char_data,
        'pinyin_data': pinyin_data,
        'train_q': train_q,
        'val_q': val_q,
        'test_q': test_q,
        'train_d': train_d,
        'val_d': val_d,
        'test_d': test_d,
    }
    with open('data_30k.pkl', 'wb') as f:
        pickle.dump(cache_payload, f)

    print("数据已保存至 data_30k.pkl")

else:
    print("加载已保存数据...")
    with open('data_30k.pkl', 'rb') as f:
        data = pickle.load(f)

    # Restore every name the processing branch would have defined
    char_data, pinyin_data = data['char_data'], data['pinyin_data']
    train_q, val_q, test_q = data['train_q'], data['val_q'], data['test_q']
    train_d, val_d, test_d = data['train_d'], data['val_d'], data['test_d']

    print("加载完成（耗时<1秒）")

# Summary printed on both paths
print("\n" + "="*60)
print("数据概览:")
print(f"  字符级: 训练{len(char_data['train'])} / 验证{len(char_data['val'])} / 测试{len(char_data['test'])}")
print(f"  拼音级: 训练{len(pinyin_data['train'])} / 验证{len(pinyin_data['val'])} / 测试{len(pinyin_data['test'])}")
print("="*60)

处理数据...


NameError: name 'medical_qa' is not defined

In [6]:
# Cell 5.5: 检查数据重复和泄露
print("数据质量检查:")

# 1. Duplicate queries inside the training split
train_query_set = set(train_q)
n_train, n_train_unique = len(train_q), len(train_query_set)
print(f"训练集查询: {n_train} 条")
print(f"训练集唯一查询: {n_train_unique} 条")
print(f"训练集重复率: {(n_train - n_train_unique) / n_train * 100:.2f}%")

# 2. Duplicate queries inside the validation split
val_query_set = set(val_q)
n_val, n_val_unique = len(val_q), len(val_query_set)
print(f"\n验证集查询: {n_val} 条")
print(f"验证集唯一查询: {n_val_unique} 条")
print(f"验证集重复率: {(n_val - n_val_unique) / n_val * 100:.2f}%")

# 3. Queries present in both splits (ratio is relative to unique validation queries)
overlap = train_query_set & val_query_set
n_overlap = len(overlap)
print(f"\n训练集和验证集重叠: {n_overlap} 条")
print(f"重叠比例: {n_overlap / n_val_unique * 100:.2f}%")

# 4. Show up to three overlapping queries, each cut to its first 50 characters
if overlap:
    print("\n重叠样本示例:")
    for position, shared_query in enumerate(list(overlap)[:3], start=1):
        print(f"  {position}. {shared_query[:50]}...")

数据质量检查:
训练集查询: 24000 条
训练集唯一查询: 20690 条
训练集重复率: 13.79%

验证集查询: 3000 条
验证集唯一查询: 2617 条
验证集重复率: 12.77%

训练集和验证集重叠: 37 条
重叠比例: 1.41%

重叠样本示例:
  1. 我得了子宫内膜异位症。...
  2. 问题描述:...
  3. 一年多前确诊的食道癌，确诊后直接做的手术，当时术后也做了化疗巩固了一下，在一个月前复查的时候说是复发...


In [3]:
# Cell 6: 定义训练组件
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

MODEL_NAME = '/root/bert-base-chinese'

class PairDataset(Dataset):
    """Dataset of query/document pairs that tokenizes each pair when it is indexed.

    Every element of ``data`` is indexed with the keys 'query' and 'document'.
    ``__getitem__`` returns the tokenizer's 'input_ids' and 'attention_mask' for both
    texts, with the leading dimension removed via ``squeeze(0)``.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _tokenize(self, text):
        """Tokenize one text, padded/truncated to ``max_length``, requesting 'pt' tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        pair = self.data[idx]
        sample = {}
        # (output key prefix, field name inside the pair); query first, then document
        for prefix, field in (('query', 'query'), ('doc', 'document')):
            encoded = self._tokenize(pair[field])
            sample[f'{prefix}_input_ids'] = encoded['input_ids'].squeeze(0)
            sample[f'{prefix}_attention_mask'] = encoded['attention_mask'].squeeze(0)
        return sample

class EmbeddingModel(nn.Module):
    """BERT encoder that returns one mean-pooled, L2-normalized vector per input row."""

    def __init__(self, model_name=MODEL_NAME):
        super().__init__()
        self.encoder = BertModel.from_pretrained(model_name)

    def mean_pooling(self, token_embeddings, attention_mask):
        """Average ``token_embeddings`` over dim 1, weighting positions by ``attention_mask``."""
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = torch.sum(token_embeddings * mask, 1)
        # The clamp keeps the division finite if a row's mask sums to zero.
        mask_total = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / mask_total

    def forward(self, input_ids, attention_mask):
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        pooled = self.mean_pooling(hidden_states, attention_mask)
        return F.normalize(pooled, p=2, dim=1)

def cosine_loss(query_emb, doc_emb, temperature=None):
    """Loss for aligned (query, document) embedding batches.

    Row ``i`` of ``query_emb`` is paired with row ``i`` of ``doc_emb``.

    Args:
        query_emb: 2-D tensor of query embeddings, one row per pair.
        doc_emb: 2-D tensor of document embeddings with the same number of rows.
        temperature: ``None`` (default) keeps the original objective,
            ``1 - mean(cosine(query_i, doc_i)) + 1e-8``. That objective only
            pulls positive pairs together, so an encoder that maps every input
            to the same vector also drives it to ~0.
            A positive float switches to an in-batch-negative contrastive loss:
            cross-entropy over the row-wise cosine similarity matrix divided by
            ``temperature``, with the diagonal as the target. With a batch of a
            single pair there are no negatives and this loss is 0.

    Returns:
        A scalar tensor.

    Raises:
        ValueError: if ``temperature`` is given but not strictly positive.
    """
    if temperature is None:
        similarity = F.cosine_similarity(query_emb, doc_emb, dim=1)
        # Guard against values drifting slightly outside [-1, 1] numerically.
        similarity = torch.clamp(similarity, -1.0, 1.0)
        loss = 1 - similarity.mean()
        # Small epsilon kept from the original so the default value is unchanged.
        return loss + 1e-8

    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    # Normalise so the dot products below are cosine similarities.
    query_unit = F.normalize(query_emb, p=2, dim=1)
    doc_unit = F.normalize(doc_emb, p=2, dim=1)
    logits = query_unit @ doc_unit.t() / temperature
    # The matching document for query i sits on the diagonal.
    targets = torch.arange(logits.size(0), device=logits.device)
    return F.cross_entropy(logits, targets)

print("✓ 组件定义完成")

✓ 组件定义完成


In [8]:
# Cell 7: 创建 DataLoader（添加测试集）
from torch.utils.data import DataLoader

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Character-level datasets, one per split
train_char_dataset, val_char_dataset, test_char_dataset = (
    PairDataset(char_data[split], tokenizer) for split in ('train', 'val', 'test')
)

# Only the training loader shuffles; val/test keep the stored order
train_char_loader = DataLoader(train_char_dataset, batch_size=32, shuffle=True)
val_char_loader, test_char_loader = (
    DataLoader(held_out, batch_size=32, shuffle=False)
    for held_out in (val_char_dataset, test_char_dataset)
)

# Pinyin-level datasets and loaders, built with the same tokenizer and settings
train_pinyin_dataset, val_pinyin_dataset, test_pinyin_dataset = (
    PairDataset(pinyin_data[split], tokenizer) for split in ('train', 'val', 'test')
)

train_pinyin_loader = DataLoader(train_pinyin_dataset, batch_size=32, shuffle=True)
val_pinyin_loader, test_pinyin_loader = (
    DataLoader(held_out, batch_size=32, shuffle=False)
    for held_out in (val_pinyin_dataset, test_pinyin_dataset)
)

# Batch counts per split
print(f"字符级: 训练{len(train_char_loader)} / 验证{len(val_char_loader)} / 测试{len(test_char_loader)} batches")
print(f"拼音级: 训练{len(train_pinyin_loader)} / 验证{len(val_pinyin_loader)} / 测试{len(test_pinyin_loader)} batches")

NameError: name 'char_data' is not defined

In [9]:
# Cell 8: 训练函数（移除验证，最终版）
import torch.optim as optim
from tqdm import tqdm

def train_model(model, train_loader, epochs=5, lr=2e-5, name="模型", loss_fn=None):
    """Fine-tune ``model`` on paired batches and save a checkpoint after every epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` for both the
            query side and the document side of each batch.
        train_loader: sized iterable of batches with the keys 'query_input_ids',
            'query_attention_mask', 'doc_input_ids' and 'doc_attention_mask'.
        epochs: number of passes over ``train_loader``.
        lr: AdamW learning rate.
        name: label used in progress output and in the checkpoint file name
            ``{name}_epoch{N}.pt`` (written to the current working directory).
        loss_fn: callable ``(query_emb, doc_emb) -> scalar tensor``. Defaults to
            ``cosine_loss``, which only rewards similarity of the given pairs; a
            very small training loss under that default therefore does not by
            itself show that the embeddings discriminate between documents.

    Returns:
        ``(model, train_losses)`` where ``model`` has been moved to the training
        device and ``train_losses`` holds the mean batch loss of each epoch.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    if loss_fn is None:
        loss_fn = cosine_loss

    n_batches = len(train_loader)
    if n_batches == 0:
        # Previously this surfaced as a ZeroDivisionError after a no-op epoch.
        raise ValueError("train_loader is empty; nothing to train on")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    train_losses = []

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0

        pbar = tqdm(train_loader, desc=f"[{name}] Epoch {epoch+1}/{epochs}")
        for batch in pbar:
            query_ids = batch['query_input_ids'].to(device)
            query_mask = batch['query_attention_mask'].to(device)
            doc_ids = batch['doc_input_ids'].to(device)
            doc_mask = batch['doc_attention_mask'].to(device)

            # The same encoder embeds both sides of the pair.
            query_emb = model(query_ids, query_mask)
            doc_emb = model(doc_ids, doc_mask)
            loss = loss_fn(query_emb, doc_emb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the running sum and the display.
            loss_value = loss.item()
            total_train_loss += loss_value
            pbar.set_postfix({'loss': f'{loss_value:.4f}'})

        avg_train_loss = total_train_loss / n_batches
        train_losses.append(avg_train_loss)

        print(f"\n[{name}] Epoch {epoch+1}/{epochs}  训练loss: {avg_train_loss:.4f}")
        torch.save(model.state_dict(), f'{name}_epoch{epoch+1}.pt')
        print()

    return model, train_losses

print("训练函数定义完成")

训练函数定义完成


In [10]:
# Cell 9: 训练字符级模型
char_model = EmbeddingModel(MODEL_NAME)

# train_model returns the trained model together with the per-epoch mean training losses
char_train_kwargs = dict(epochs=3, lr=2e-5, name="字符级")
char_model, char_train_losses = train_model(
    char_model, train_char_loader, **char_train_kwargs
)

print("字符级模型训练完成")

Some weights of the model checkpoint at /root/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[字符级] Epoch 1/3: 100%|██████████| 750/750 [03:25<00:00,  3.64it/s, loss=0.0013]



[字符级] Epoch 1/3  训练loss: 0.0038



[字符级] Epoch 2/3: 100%|██████████| 750/750 [03:25<00:00,  3.66it/s, loss=0.0007]



[字符级] Epoch 2/3  训练loss: 0.0007



[字符级] Epoch 3/3: 100%|██████████| 750/750 [03:25<00:00,  3.65it/s, loss=0.0003]



[字符级] Epoch 3/3  训练loss: 0.0004

字符级模型训练完成


In [13]:
# 训练拼音级模型
pinyin_model = EmbeddingModel(MODEL_NAME)

# Same settings as the character-level run; only the loader and the label differ
pinyin_train_kwargs = dict(epochs=3, lr=2e-5, name="拼音级")
pinyin_model, pinyin_train_losses = train_model(
    pinyin_model, train_pinyin_loader, **pinyin_train_kwargs
)

print("拼音级模型训练完成")

Some weights of the model checkpoint at /root/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[拼音级] Epoch 1/3: 100%|██████████| 750/750 [04:28<00:00,  2.79it/s, loss=0.0011]



[拼音级] Epoch 1/3  训练loss: 0.0024



[拼音级] Epoch 2/3: 100%|██████████| 750/750 [04:28<00:00,  2.79it/s, loss=0.0027]



[拼音级] Epoch 2/3  训练loss: 0.0005



[拼音级] Epoch 3/3: 100%|██████████| 750/750 [04:27<00:00,  2.80it/s, loss=0.0002]



[拼音级] Epoch 3/3  训练loss: 0.0004

拼音级模型训练完成


In [4]:
# Cell 11: 定义评估函数
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pypinyin import lazy_pinyin

def _encode_texts(model, tokenizer, texts, device, batch_size, desc):
    """Embed ``texts`` in mini-batches and return the embeddings stacked row-wise as a NumPy array."""
    chunks = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc=desc):
            inputs = tokenizer(texts[start:start + batch_size], return_tensors='pt',
                               max_length=128, truncation=True, padding='max_length')
            emb = model(
                inputs['input_ids'].to(device),
                inputs['attention_mask'].to(device)
            )
            chunks.append(emb.cpu().numpy())
    return np.vstack(chunks)


def evaluate_retrieval(model, queries, documents, tokenizer, use_pinyin=False, k=10,
                       batch_size=32):
    """Score query -> document retrieval where ``documents[i]`` is the answer to ``queries[i]``.

    Args:
        model: trained model, called as ``model(input_ids, attention_mask)``.
        queries: sequence of query strings.
        documents: sequence of document strings, aligned with ``queries`` by index.
        tokenizer: tokenizer called on a list of strings with
            ``return_tensors='pt'``, ``max_length=128``, truncation and
            ``padding='max_length'``.
        use_pinyin: convert queries and documents with ``lazy_pinyin`` before encoding.
        k: cut-off for the ranked list.
        batch_size: number of texts encoded per forward pass.

    Returns:
        dict with ``'Recall@{k}'`` (share of queries whose own document is in the
        top ``k``) and ``'MRR'`` (mean reciprocal rank, counted as 0 when the
        document is outside the top ``k``).

    Raises:
        ValueError: if ``queries`` or ``documents`` is empty, or ``batch_size < 1``.
    """
    queries = list(queries)
    documents = list(documents)
    if not queries or not documents:
        raise ValueError("queries and documents must both be non-empty")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Run on whatever device the model already lives on. Picking the device from
    # torch.cuda.is_available() alone breaks for a CPU-loaded model on a CUDA
    # machine, because the inputs would land on a different device than the weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    model.eval()

    # Pinyin models need the same text transformation that was used for training.
    if use_pinyin:
        def to_pinyin(text):
            return ' '.join(lazy_pinyin(text))
        queries = [to_pinyin(q) for q in tqdm(queries, desc="  转换查询为拼音")]
        documents = [to_pinyin(d) for d in tqdm(documents, desc="  转换文档为拼音")]

    query_embeddings = _encode_texts(model, tokenizer, queries, device, batch_size, "  编码查询")
    doc_embeddings = _encode_texts(model, tokenizer, documents, device, batch_size, "  编码文档")

    # Similarity of every query against every document
    print("  计算相似度矩阵...")
    similarities = cosine_similarity(query_embeddings, doc_embeddings)

    # Recall@K and MRR, with the gold document of query i being document i
    print(f"  计算 Recall@{k} 和 MRR...")
    recall_at_k = []
    mrr_scores = []

    for i in range(len(queries)):
        top_k_indices = np.argsort(similarities[i])[::-1][:k]

        if i in top_k_indices:
            recall_at_k.append(1)
            rank = np.where(top_k_indices == i)[0][0] + 1
            mrr_scores.append(1.0 / rank)
        else:
            recall_at_k.append(0)
            mrr_scores.append(0)

    results = {
        f'Recall@{k}': np.mean(recall_at_k),
        'MRR': np.mean(mrr_scores)
    }

    return results

print("评估函数定义完成")

评估函数定义完成


In [None]:
# Cell 12: 在验证集上评估两个模型
print("="*60)
print("验证集评估")
print("="*60)

# ============ Make sure the required components exist ============
# Lets this cell run without re-executing the component-definition cells: if either
# name is missing, the model class and the tokenizer are (re)created here.
try:
    EmbeddingModel
    tokenizer
except NameError:
    print("重新定义 EmbeddingModel...")
    # torch itself is needed by the class body below and by the torch.load calls
    # that follow this block, so the fallback imports it as well.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from transformers import BertTokenizer, BertModel

    MODEL_NAME = '/root/bert-base-chinese'

    class EmbeddingModel(nn.Module):
        """BERT encoder that returns one mean-pooled, L2-normalized vector per input row."""

        def __init__(self, model_name=MODEL_NAME):
            super().__init__()
            self.encoder = BertModel.from_pretrained(model_name)

        def mean_pooling(self, token_embeddings, attention_mask):
            """Average token embeddings over dim 1, weighting positions by the attention mask."""
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            # The clamp keeps the division finite if a row's mask sums to zero.
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            return sum_embeddings / sum_mask

        def forward(self, input_ids, attention_mask):
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = self.mean_pooling(outputs.last_hidden_state, attention_mask)
            return F.normalize(embeddings, p=2, dim=1)

    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    print("✓ 组件定义完成\n")

# ============ Load the trained models (forced onto the CPU) ============
# map_location='cpu' remaps the saved tensors to the CPU when they are loaded.
char_state = torch.load('字符级_epoch3.pt', map_location='cpu')
char_model = EmbeddingModel(MODEL_NAME)
char_model.load_state_dict(char_state)
char_model.eval()  # evaluation mode
print("✓ 已加载字符级模型（CPU）")

pinyin_state = torch.load('拼音级_epoch3.pt', map_location='cpu')
pinyin_model = EmbeddingModel(MODEL_NAME)
pinyin_model.load_state_dict(pinyin_state)
pinyin_model.eval()  # evaluation mode
print("✓ 已加载拼音级模型（CPU）\n")

# ============ Validation slice shared by both models ============
# Both models are scored on exactly the same query/document lists so that their
# metrics are comparable. None keeps the full validation set; set an integer
# (e.g. 500) for a quicker run.
VAL_EVAL_LIMIT = None
val_q_test = val_q[:VAL_EVAL_LIMIT]
val_d_test = val_d[:VAL_EVAL_LIMIT]

# ============ Character-level model ============
print("-" * 60)
print("评估字符级模型（验证集）...")
print("-" * 60)
char_val_results = evaluate_retrieval(
    char_model,
    val_q_test,
    val_d_test,
    tokenizer,
    use_pinyin=False,
    k=10
)

print("\n字符级模型（验证集）:")
for metric, value in char_val_results.items():
    print(f"  {metric:<15}: {value:.4f}")

# ============ Pinyin-level model ============
print("\n" + "-" * 60)
print("评估拼音级模型（验证集）...")
print("-" * 60)
pinyin_val_results = evaluate_retrieval(
    pinyin_model,
    val_q_test,
    val_d_test,
    tokenizer,
    use_pinyin=True,
    k=10
)

print("\n拼音级模型（验证集）:")
for metric, value in pinyin_val_results.items():
    print(f"  {metric:<15}: {value:.4f}")

# ============ Side-by-side comparison ============
print("\n" + "="*60)
print("验证集对比:")
print("="*60)
for metric in char_val_results.keys():
    char_val = char_val_results[metric]
    pinyin_val = pinyin_val_results[metric]
    diff = pinyin_val - char_val
    # Relative change versus the character-level score; reported as 0 when that score is 0.
    diff_pct = (diff / char_val) * 100 if char_val > 0 else 0

    print(f"\n{metric}")
    print(f"  字符级: {char_val:.4f}")
    print(f"  拼音级: {pinyin_val:.4f}")
    print(f"  差异:   {diff:+.4f} ({diff_pct:+.1f}%)")

# Keep the results under these names for later visualisation
char_results = char_val_results
pinyin_results = pinyin_val_results

print("\n✓ 验证集评估完成")

验证集评估


Some weights of the model checkpoint at /root/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✓ 已加载字符级模型（CPU）


Some weights of the model checkpoint at /root/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import os
# NOTE(review): this value is discarded — it is neither assigned nor the cell's
# last expression, so it is presumably not shown in the output; confirm.
os.getcwd()
# Switch the kernel's working directory (relative target).
%cd autodl-tmp

/root/autodl-tmp
