In [1]:
import os
import json
from typing import List, Dict, Any

def validate_conversation(conversation: List[Dict]) -> bool:
    """
    Check that a ShareGPT-style conversation is well formed.

    Every message must be a dict with a "from" key holding one of
    "system", "human" or "gpt", and a "value" key holding a non-blank
    string. The conversation as a whole must contain at least one
    "human" message and at least one "gpt" message.

    :param conversation: messages to validate
    :return: True when every message is valid and both a "human" and a
        "gpt" message are present, otherwise False
    """
    valid_roles = {"system", "human", "gpt"}
    has_human = False
    has_gpt = False

    for msg in conversation:
        # Message structure: a dict carrying both required keys.
        if not isinstance(msg, dict):
            return False
        if "from" not in msg or "value" not in msg:
            return False
        role = msg["from"]
        value = msg["value"]

        # Role validity. The type check comes first because an unhashable
        # role (e.g. a list from malformed JSON) would make the set
        # membership test raise TypeError instead of yielding False.
        if not isinstance(role, str) or role not in valid_roles:
            return False

        # Content validity: must be a string with non-whitespace text.
        if not isinstance(value, str) or not value.strip():
            return False

        # Track which sides of the dialogue have been seen.
        if role == "human":
            has_human = True
        elif role == "gpt":
            has_gpt = True

    # A usable sample needs a question and an answer. The previous
    # expression `has_human and (has_gpt or has_human)` reduced to
    # `has_human` and never enforced the pairing.
    return has_human and has_gpt

def validate_sharegpt_item(item: Dict) -> bool:
    """
    Validate a single ShareGPT data entry.

    The entry must be a dict containing both a "conversations" and an
    "images" key, "conversations" must be a non-empty list, and that
    list must pass validate_conversation. Only the presence of "images"
    is checked; its value is not inspected.

    :param item: one entry from a ShareGPT JSON file
    :return: True when the entry passes every check, otherwise False
    """
    if not isinstance(item, dict):
        return False

    # Both top-level fields have to be present.
    for field in ("conversations", "images"):
        if field not in item:
            return False

    # The dialogue must be a list with at least one message.
    dialogue = item["conversations"]
    if isinstance(dialogue, list) and dialogue:
        # Per-message and role checks are delegated.
        return validate_conversation(dialogue)
    return False

def merge_sharegpt_data(input_dir: str, output_file: str) -> None:
    """
    Merge every ShareGPT-format JSON file found directly in a directory.

    Files are visited in sorted name order. Each file must contain a JSON
    list at its root; every entry that passes validate_sharegpt_item is
    appended to the merged result, which is then written to output_file
    as pretty-printed UTF-8 JSON with sorted keys. Per-file and overall
    statistics are printed to stdout.

    The output file itself is never read as input, so running the merge
    repeatedly with output_file inside input_dir does not duplicate
    entries from a previous run.

    :param input_dir: directory to scan for ``.json`` files (not recursive)
    :param output_file: path of the merged JSON file to write
    """
    merged_data = []
    file_count = 0
    valid_count = 0
    invalid_count = 0

    # Resolved once so each candidate can be compared against it cheaply.
    output_real = os.path.realpath(output_file)

    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith('.json'):
            continue

        file_path = os.path.join(input_dir, filename)

        # A previous run's output living in input_dir must not be merged
        # back into itself.
        if os.path.realpath(file_path) == output_real:
            print(f"⏭️ 已跳过输出文件 {filename}")
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The root element has to be a list of entries.
            if not isinstance(data, list):
                print(f"⚠️ 文件 {filename} 根元素不是列表，已跳过")
                continue

            file_count += 1
            file_valid = 0
            file_invalid = 0

            # Validate entry by entry, keeping only the valid ones.
            for item in data:
                if validate_sharegpt_item(item):
                    merged_data.append(item)
                    valid_count += 1
                    file_valid += 1
                else:
                    invalid_count += 1
                    file_invalid += 1

            print(f"✅ 已处理 {filename}: 有效 {file_valid} 条 | 无效 {file_invalid} 条")

        except Exception as e:
            # Deliberate best-effort boundary: one unreadable or malformed
            # file is reported and must not abort the whole merge.
            print(f"❌ 处理 {filename} 时发生错误: {e}")
            continue

    # Write the merged result.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2, sort_keys=True)

    # Summary statistics.
    print("\n📊 合并完成！")
    print(f"📂 处理文件总数: {file_count}")
    print(f"✔️ 有效数据条目: {valid_count}")
    print(f"✖️ 无效数据条目: {invalid_count}")
    print(f"💾 结果已保存到: {os.path.abspath(output_file)}")


In [None]:
# Merge all ShareGPT JSON files in the current directory into one filtered file.
merge_sharegpt_data(input_dir="./", output_file="./Filtered_demo.json")

✅ 已处理 demo.json: 有效 6 条 | 无效 0 条

📊 合并完成！
📂 处理文件总数: 1
✔️ 有效数据条目: 6
✖️ 无效数据条目: 0
💾 结果已保存到: /home/wangsj/learn/VSCodeDataProcess/githubrepo/data/VQA_Generation/Filtered_demo.json
