In [None]:
import json
from typing import List, Dict, Optional

def build_system_message(event: Dict) -> Optional[Dict]:
    """Build the opening system message that summarises one event.

    Args:
        event: Event record. ``event_id``, ``summary``, ``impact_industries``
            and ``location`` must all be present and truthy.
            ``impact_industries`` and the optional ``explanation`` may each
            be a list or a single string.

    Returns:
        A ``{"role": "system", "content": ...}`` dict, or ``None`` when any
        required field is missing or empty.
    """
    required_fields = ['event_id', 'summary', 'impact_industries', 'location']
    if not all(event.get(field) for field in required_fields):
        return None

    # A bare string would otherwise be joined character by character.
    industries = event['impact_industries']
    if isinstance(industries, str):
        industries = [industries]

    content = [
        f"事件分析系统 | ID：{event['event_id']}",
        f"摘要：{event['summary']}",
        f"影响行业：{', '.join(str(item) for item in industries)}",
        f"发生地区：{event['location']}",
    ]

    # ``explanation`` may be absent, null or empty; none of those should raise.
    explanations = event.get('explanation') or []
    if isinstance(explanations, str):
        explanations = [explanations]
    if explanations:
        content.append("\n事件背景：")
        # Only the first three explanations are included.
        content.extend(f"- {exp}" for exp in explanations[:3])

    return {
        "role": "system",
        "content": "\n".join(content)
    }

def build_qa_pair(event: Dict, impact: Dict, interpretation: Dict) -> List[Dict]:
    """Build one user/assistant exchange for a single economic impact.

    Args:
        event: Event record; only ``source_meta.text_hash`` is read here.
        impact: Impact row supplying ``indicator``, ``time_horizon``,
            ``affected_areas``, ``direction``, ``magnitude`` and
            ``confidence``. Every key is optional.
        interpretation: Interpretability row supplying ``reasoning_path`` and
            ``causal_strength``. Every key is optional.

    Returns:
        A two-element list: the user question followed by the assistant
        answer, each as a ``{"role": ..., "content": ...}`` dict.
    """
    # An empty or null area list falls back to the placeholder instead of
    # raising; a bare string is treated as a single area.
    areas = impact.get('affected_areas') or ['未知地区']
    if isinstance(areas, str):
        areas = [areas]

    # User question
    user_content = [
        f"请分析 {impact.get('indicator', '未知指标')} 指标变化：",
        f"- 时间范围：{impact.get('time_horizon', '未指定')}",
        f"- 地区范围：{areas[0]}",
    ]

    # Assistant answer
    assistant_content = [
        f"预测方向：{impact.get('direction', '未知')}",
        f"变化幅度：{impact.get('magnitude', '未知')}",
    ]

    # Numeric fields are emitted only when they really hold a number, so a
    # null value skips the line rather than breaking the format spec.
    confidence = impact.get('confidence')
    if isinstance(confidence, (int, float)):
        assistant_content.append(f"置信水平：{confidence * 100:.1f}%")

    assistant_content.append(
        f"影响机制：{interpretation.get('reasoning_path', '未提供详细路径')}"
    )

    causal_strength = interpretation.get('causal_strength')
    if isinstance(causal_strength, (int, float)):
        assistant_content.append(f"因果强度：{causal_strength:.1f}")

    # Only the first eight characters of the hash are shown.
    text_hash = (event.get('source_meta') or {}).get('text_hash') or ''
    assistant_content.append(f"数据溯源：{str(text_hash)[:8]}")

    return [
        {"role": "user", "content": "\n".join(user_content)},
        {"role": "assistant", "content": "\n".join(assistant_content)},
    ]

def validate_conversation(messages: List[Dict]) -> bool:
    """Check that a conversation is a system message plus complete QA pairs.

    The conversation is accepted only when it has at least three messages,
    starts with a ``system`` message, continues with strictly alternating
    ``user``/``assistant`` messages, and its final message contains both the
    direction and the magnitude labels.
    """
    # Fewer than three messages cannot hold a system message and one QA pair.
    if len(messages) < 3:
        return False

    first_role, *turn_roles = (message['role'] for message in messages)
    if first_role != 'system':
        return False

    # Everything after the system message must be whole user/assistant pairs.
    if len(turn_roles) % 2:
        return False
    if turn_roles != ['user', 'assistant'] * (len(turn_roles) // 2):
        return False

    # The closing message must carry both key analysis labels.
    final_text = messages[-1]['content']
    return '预测方向' in final_text and '变化幅度' in final_text

def transform_entry(data_entry: Dict) -> Optional[Dict]:
    """Convert one raw data entry into a chat-style training record.

    Args:
        data_entry: Raw entry. ``event_table`` must hold at least one event;
            ``economic_impact_table`` and ``interpretability_table`` are
            optional.

    Returns:
        ``{"messages": [...]}`` on success, or ``None`` when the system
        message cannot be built, the conversation fails validation, or any
        error occurs during conversion (the error is printed).
    """
    try:
        # Extract the base data; null tables are treated like missing ones.
        event = data_entry['event_table'][0]
        impacts = data_entry.get('economic_impact_table') or []
        interpretations = data_entry.get('interpretability_table') or []

        # System message
        system_msg = build_system_message(event)
        if not system_msg:
            return None
        messages = [system_msg]

        # The interpretation is matched on the event id only, so it is the
        # same for every impact and is looked up once. ``.get`` keeps rows
        # without an ``event_id`` from raising KeyError.
        interpretation = next(
            (it for it in interpretations if it.get('event_id') == event['event_id']),
            {},
        )

        # One QA pair per impact
        for impact in impacts:
            messages.extend(build_qa_pair(event, impact, interpretation))

        # Validate before the provenance message is appended: the validator
        # expects the conversation to end on an assistant answer, so a
        # trailing system message would make every such entry fail.
        if not validate_conversation(messages):
            return None

        # Provenance information
        if source := event.get('source_meta'):
            text_hash = source.get('text_hash', '')
            origin = str(source.get('source', ''))[:50]
            messages.append({
                "role": "system",
                "content": f"数据溯源：{text_hash} | 来源：{origin}..."
            })

        return {"messages": messages}

    except Exception as e:
        # Best-effort conversion: report the problem and skip this entry.
        print(f"转换错误：{str(e)}")
        return None

def convert_json_to_jsonl(input_path: str, output_path: str) -> None:
    """Convert a JSON dataset file into a JSONL file of chat records.

    Reads the ``data`` list from ``input_path``, converts every entry with
    ``transform_entry``, writes each successful result as one JSON line to
    ``output_path`` and prints how many entries were kept.

    Args:
        input_path: Path of the UTF-8 JSON input file.
        output_path: Path of the JSONL file to write (overwritten).
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Read the list once so the summary below uses the same (possibly
    # defaulted) value instead of a second, unguarded lookup.
    entries = data.get('data', [])

    converted = []
    for entry in entries:
        if transformed := transform_entry(entry):
            converted.append(transformed)

    # Save the results, one JSON document per line.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in converted
        )

    print(f"转换完成，有效记录数：{len(converted)}/{len(entries)}")

# Usage example: run the conversion with fixed input and output paths when
# this code is executed as a script.
if __name__ == "__main__":
    convert_json_to_jsonl("enhanced_structured_events.json", "output_dataset.jsonl")

In [6]:
import json
from typing import List, Dict, Optional, Union
from datetime import datetime

def normalize_time(time_str: str) -> str:
    """Normalise a time value into a display string.

    Two formats are recognised:

    * Relative offsets such as ``"T+3d"``: ``T+``, a quantity, and a
      one-letter unit (``d``, ``w``, ``m`` or ``y``). The unit is replaced by
      its Chinese label.
    * ISO 8601 timestamps (a trailing ``Z`` is accepted), rendered as
      ``YYYY-MM-DD HH:MM:SS``.

    Anything else is returned unchanged, so no information is lost.

    Args:
        time_str: The raw time value. ``None`` yields an empty string and
            other non-string values are converted with ``str``.

    Returns:
        The normalised string, or the input itself when it is not recognised.
    """
    if not isinstance(time_str, str):
        return "" if time_str is None else str(time_str)

    if time_str.startswith("T+"):
        unit_map = {'d': '天', 'w': '周', 'm': '月', 'y': '年'}
        quantity, unit = time_str[2:-1], time_str[-1]
        if quantity and unit in unit_map:
            return f"{quantity}{unit_map[unit]}"
        # Unknown or missing unit: keep the raw text rather than silently
        # dropping its last character.
        return time_str

    try:
        dt = datetime.fromisoformat(time_str.replace('Z', '+00:00'))
    except ValueError:
        return time_str
    return dt.strftime("%Y-%m-%d %H:%M:%S")

def build_system_message(event: Dict, source_meta: Dict) -> Optional[Dict]:
    """Build the system message carrying the full event metadata.

    Args:
        event: Event record. ``event_id``, ``timestamp`` and ``event_type``
            must be present and truthy. ``location``, ``source``,
            ``summary``, ``description`` and ``explanation`` are optional.
        source_meta: Provenance record supplying ``original_time``,
            ``text_hash`` and ``raw_text_snippet``. May be empty or ``None``.

    Returns:
        A ``{"role": "system", "content": ...}`` dict, or ``None`` when a
        required event field is missing or empty.
    """
    required_fields = ['event_id', 'timestamp', 'event_type']
    if not all(event.get(f) for f in required_fields):
        return None

    source_meta = source_meta or {}

    # Null text fields get the same placeholder as missing ones; a ``None``
    # in the list would otherwise break the final ``join``.
    summary = event.get('summary')
    description = event.get('description')

    content = [
        f"# 事件元数据 | ID：{event['event_id']}",
        f"- 类型：{event['event_type']}",
        f"- 时间：{normalize_time(event['timestamp'])}",
        f"- 地区：{event.get('location', '未知地区')}",
        # ``source`` is not a required field, so it is read with a default
        # like the other optional fields.
        f"- 来源：{event.get('source', '未知来源')}",
        "\n## 核心摘要",
        '暂无摘要' if summary is None else str(summary),
        "\n## 详细描述",
        '暂无详细描述' if description is None else str(description),
    ]

    explanations = event.get('explanation')
    if isinstance(explanations, str):
        # A bare string would otherwise be enumerated character by character.
        explanations = [explanations]
    if explanations:
        content.append("\n## 事件背景")
        content.extend(f"{i+1}. {exp}" for i, exp in enumerate(explanations))

    # The snippet is capped at 200 characters.
    snippet = source_meta.get('raw_text_snippet') or ''
    content.append("\n## 原始数据指纹")
    content.extend([
        f"- 采集时间：{source_meta.get('original_time', '未知')}",
        f"- 文本哈希：{source_meta.get('text_hash', '')}",
        f"- 文本片段：{str(snippet)[:200]}"
    ])

    return {
        "role": "system",
        "content": "\n".join(content)
    }

def build_impact_analysis(impact: Dict, interpretation: Dict) -> Dict:
    """Merge an impact row and its interpretation into one analysis dict.

    Args:
        impact: Impact row supplying ``indicator``, ``direction``,
            ``magnitude``, ``confidence``, ``time_horizon`` and
            ``affected_areas``.
        interpretation: Interpretability row supplying ``causal_strength``,
            ``reasoning_path`` and ``impact_summary``.

    Returns:
        A dict holding only the fields whose value is not ``None``.
        ``confidence`` is rendered as a percentage string, ``time_horizon``
        is passed through ``normalize_time`` and ``affected_areas`` is always
        a list.
    """
    # Test the type rather than truthiness so a legitimate confidence of 0
    # is still reported.
    confidence = impact.get('confidence')
    confidence_text = (
        f"{confidence * 100:.1f}%"
        if isinstance(confidence, (int, float))
        else None
    )

    analysis = {
        "indicator": impact.get('indicator'),
        "direction": impact.get('direction'),
        "magnitude": impact.get('magnitude'),
        "confidence": confidence_text,
        # ``or`` also covers an explicit null, which ``.get`` defaults do not.
        "time_horizon": normalize_time(impact.get('time_horizon') or ''),
        "affected_areas": impact.get('affected_areas') or [],
        "causal_strength": interpretation.get('causal_strength'),
        "reasoning_path": interpretation.get('reasoning_path'),
        "impact_summary": interpretation.get('impact_summary')
    }
    return {k: v for k, v in analysis.items() if v is not None}

def transform_entry(data_entry: Dict) -> Optional[Dict]:
    """Convert one raw data entry into a full-field chat record.

    The record starts with a system message built from the first event and
    the entry-level ``source_meta``, followed by one user/assistant pair per
    row of ``economic_impact_table``.

    Args:
        data_entry: Raw entry. ``event_table`` must hold at least one event;
            the other tables and ``source_meta`` are optional.

    Returns:
        ``{"messages": [...]}`` on success, or ``None`` when the system
        message cannot be built or an error occurs (the error is printed).
    """
    try:
        event = data_entry['event_table'][0]
        source_meta = data_entry.get('source_meta') or {}
        # Index interpretations by event id. Rows without an id cannot be
        # matched to an impact, so they are skipped rather than failing the
        # whole entry.
        interpretations = {
            it['event_id']: it
            for it in data_entry.get('interpretability_table') or []
            if 'event_id' in it
        }

        # System message (full metadata)
        if not (system_msg := build_system_message(event, source_meta)):
            return None
        messages = [system_msg]

        # Economic impact analysis: one QA pair per impact row
        for impact in data_entry.get('economic_impact_table') or []:
            # The interpretation is optional, so a missing id just means
            # "no interpretation".
            interpretation = interpretations.get(impact.get('event_id'), {})
            analysis = build_impact_analysis(impact, interpretation)
            areas = analysis.get('affected_areas') or []

            # User question. ``indicator`` is looked up strictly: a missing
            # indicator raises KeyError, which is reported below as a
            # missing key field.
            user_content = [
                f"请分析 {analysis['indicator']} 指标变化：",
                f"时间范围：{analysis['time_horizon']}",
                f"影响地区：{', '.join(map(str, areas))}"
            ]

            # Assistant answer: structured fields first, then free text
            assistant_content = [
                f"{k}: {analysis[k]}"
                for k in ('direction', 'magnitude', 'confidence', 'causal_strength')
                if k in analysis
            ]
            assistant_content.extend([
                f"因果路径：{analysis.get('reasoning_path', '')}",
                f"影响总结：{analysis.get('impact_summary', '')}"
            ])

            messages.extend([
                {"role": "user", "content": "\n".join(user_content)},
                {"role": "assistant", "content": "\n".join(assistant_content)}
            ])

        return {"messages": messages}

    except KeyError as e:
        print(f"关键字段缺失：{str(e)}")
        return None
    except Exception as e:
        # Best-effort conversion: report the problem and skip this entry.
        print(f"转换异常：{str(e)}")
        return None

def validate_conversation(messages: List[Dict]) -> bool:
    """Validate the structure and key content of a converted conversation.

    A conversation is valid when it starts with a ``system`` message, is
    followed only by complete ``user``/``assistant`` pairs, the system
    message contains the metadata and summary headings, and the last
    assistant message contains the ``direction:`` and ``magnitude:`` fields.
    User messages are checked for role order only.

    Explicit checks are used instead of ``assert`` so the validation still
    runs when Python is started with ``-O``.

    Args:
        messages: The conversation as a list of ``{"role", "content"}`` dicts.

    Returns:
        ``True`` when valid; otherwise ``False`` after printing the reason.
    """
    system_markers = ('# 事件元数据', '## 核心摘要')
    assistant_markers = ('direction:', 'magnitude:')

    def fail(reason: str) -> bool:
        print(f"验证失败：{reason}")
        return False

    # Role order
    roles = [msg.get('role') for msg in messages]
    if not roles or roles[0] != 'system':
        return fail("必须以system消息开头")

    # Comparing against the expected sequence also rejects a dangling
    # trailing message, which pairwise zipping would silently ignore.
    turns = roles[1:]
    if turns != ['user', 'assistant'] * (len(turns) // 2):
        return fail("QA对顺序错误")

    # Content
    system_content = messages[0].get('content') or ''
    if not all(marker in system_content for marker in system_markers):
        return fail("系统消息不完整")

    # After the role check the final message is the last assistant answer,
    # unless there are no QA pairs at all.
    last_answer = (messages[-1].get('content') or '') if turns else ''
    if not all(marker in last_answer for marker in assistant_markers):
        return fail("关键分析字段缺失")

    return True

def convert_to_jsonl(input_path: str, output_path: str) -> None:
    """Convert a JSON dataset file into a validated JSONL file.

    Every entry of the input's ``data`` list is converted with
    ``transform_entry``; results that pass ``validate_conversation`` are
    written as one JSON line each. A summary with the success rate is printed
    at the end.

    Args:
        input_path: Path of the UTF-8 JSON input file.
        output_path: Path of the JSONL file to write (overwritten).
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f).get('data', [])

    success = 0
    with open(output_path, 'w', encoding='utf-8') as f:
        for entry in raw_data:
            converted = transform_entry(entry)
            if converted and validate_conversation(converted['messages']):
                f.write(json.dumps(converted, ensure_ascii=False) + '\n')
                success += 1

    # An empty dataset must not trigger a division by zero in the summary.
    total = len(raw_data)
    rate = success / total if total else 0.0
    print(f"转换完成 | 成功率：{success}/{total} ({rate:.1%})")

# Script entry point: run the full conversion with fixed input and output
# paths when this code is executed directly.
if __name__ == "__main__":
    convert_to_jsonl("enhanced_structured_events.json", "full_output.jsonl")

转换完成 | 成功率：275/275 (100.0%)
