In [None]:
# Cell 5: Build character-level, pinyin-level, and stroke-level data (extended version)
from pypinyin import lazy_pinyin
import json

print("生成三个版本的数据...")

# ============ 1. Character-level data (original text) ============
# One record per training row, keeping the raw question, answer and department.
# The split is iterated directly so each row is fetched once, instead of being
# re-indexed three times per record through range(len(...)).
# NOTE(review): assumes iterating medical_qa['train'] yields the same row
# mappings as integer indexing does (true for lists and Hugging Face Dataset
# objects) - confirm against the cell that loads medical_qa.
char_data = [
    {
        'query': row['ask'],
        'document': row['answer'],
        'department': row['department'],
    }
    for row in medical_qa['train']
]

# ============ 2. 拼音级数据 ============
def text_to_pinyin(text):
    """Convert text to a space-separated pinyin string.

    The text is passed to ``lazy_pinyin`` and the items it returns are
    joined with single spaces.
    """
    syllables = lazy_pinyin(text)
    separator = ' '
    return separator.join(syllables)

# Pinyin counterpart of every training row. The question and answer are
# converted with text_to_pinyin; the department is copied as-is and the
# unconverted question is kept under 'original_query'.
# The split is iterated directly so each row is fetched once rather than
# re-indexed four times per record through range(len(...)).
# NOTE(review): assumes iterating medical_qa['train'] yields the same row
# mappings as integer indexing does - confirm against where medical_qa is loaded.
pinyin_data = [
    {
        'query': text_to_pinyin(row['ask']),
        'document': text_to_pinyin(row['answer']),
        'department': row['department'],
        'original_query': row['ask'],
    }
    for row in medical_qa['train']
]

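# ============ 3. Stroke-level data (new) ============
# Simplified version: each Chinese character is mapped to a placeholder number
# derived from its Unicode code point, not to its real stroke count.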
def char_to_stroke_count(char):
    """Return a placeholder "stroke count" for a single character, as a string.

    This is not a real stroke count: the value is the character's Unicode
    code point modulo 30, plus one, so it always lies in "1".."30". A proper
    stroke lookup could be substituted here later.
    """
    # divmod's remainder is the same value as `ord(char) % 30`.
    _, bucket = divmod(ord(char), 30)
    return f"{bucket + 1}"

def text_to_stroke_repr(text):
    """Convert text into a space-separated stroke-token representation.

    Each character in the range U+4E00..U+9FFF becomes the token ``s<N>``,
    where ``<N>`` is the value returned by ``char_to_stroke_count``. Every
    other character is kept unchanged as its own token. Tokens are joined
    with single spaces.
    """
    tokens = [
        f"s{char_to_stroke_count(ch)}" if '\u4e00' <= ch <= '\u9fff' else ch
        for ch in text
    ]
    return ' '.join(tokens)

# Stroke-token counterpart of every training row. The question and answer are
# converted with text_to_stroke_repr; the department is copied as-is and the
# unconverted question is kept under 'original_query'.
# The split is iterated directly so each row is fetched once rather than
# re-indexed four times per record through range(len(...)).
# NOTE(review): assumes iterating medical_qa['train'] yields the same row
# mappings as integer indexing does - confirm against where medical_qa is loaded.
stroke_data = [
    {
        'query': text_to_stroke_repr(row['ask']),
        'document': text_to_stroke_repr(row['answer']),
        'department': row['department'],
        'original_query': row['ask'],
    }
    for row in medical_qa['train']
]

# Report how many records each representation produced.
print(
    f"✓ 字符级数据: {len(char_data)} 条",
    f"✓ 拼音级数据: {len(pinyin_data)} 条",
    f"✓ 笔画级数据: {len(stroke_data)} 条",
    sep="\n",
)

# Show one sample question side by side in every representation.
print("\n示例对比:")
# First 20 characters of the first training question.
example = medical_qa['train'][0]['ask'][:20]
print(
    f"原文: {example}",
    f"字符级: {example}",
    f"拼音级: {text_to_pinyin(example)}",
    f"笔画级: {text_to_stroke_repr(example)}",
    sep="\n",
)