In [1]:
import os
import json

def merge_jsonl_files(input_dir, output_file):
    """
    Merge every ``.jsonl`` file located directly inside a directory into one JSONL file.

    Input files are processed in sorted file-name order so that the merged
    output is reproducible (``os.listdir`` returns entries in arbitrary order).
    Each non-blank line is parsed with ``json.loads`` and re-serialized on its
    own line with ``ensure_ascii=False``. Lines that are not valid JSON are
    reported on stdout and skipped; blank lines are skipped silently.

    :param input_dir: path of the directory that holds the JSONL files
        (sub-directories are not searched)
    :param output_file: path of the merged JSONL file to write; it is opened
        in ``'w'`` mode, so any existing content is overwritten. If this path
        is inside ``input_dir`` it is never read back as an input.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file_name in sorted(os.listdir(input_dir)):
            if not file_name.endswith('.jsonl'):
                continue
            file_path = os.path.join(input_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            # The output file already exists at this point (opened above); if it
            # sits inside input_dir, reading it would feed the merge its own output.
            if os.path.samefile(file_path, output_file):
                continue
            with open(file_path, 'r', encoding='utf-8') as infile:
                for line in infile:
                    record = line.strip()
                    if not record:
                        # Blank / whitespace-only lines are not data; do not
                        # report them as JSON errors.
                        continue
                    # Only the parse is guarded: a malformed line is skipped,
                    # anything else (e.g. a write failure) still propagates.
                    try:
                        json_obj = json.loads(record)
                    except json.JSONDecodeError as e:
                        print(f"文件 {file_name} 中有错误的JSON行: {e}")
                        continue
                    # Serialize first, then write once, so a record is never
                    # emitted without its trailing newline.
                    outfile.write(json.dumps(json_obj, ensure_ascii=False) + '\n')


In [5]:
# Merge the .jsonl files found in this dataset's data_filter directory into a single output file.
input_directory = '/data/nfs/data/Magpie-Qwen2-Pro-200K-Chinese-jsonl/data_filter'  # replace with the path of the directory that holds the JSONL files
output_file_path = '/data/nfs/data/Magpie-Qwen2-Pro-200K-Chinese-jsonl/Magpie-Qwen2-Pro-200K-Chinese-all-filter.jsonl'    # replace with the path where the merged JSONL file should be written

# The output path is opened for writing by merge_jsonl_files, so an existing file there is overwritten.
merge_jsonl_files(input_directory, output_file_path)
print("JSONL文件融合完成。")

JSONL文件融合完成。


In [None]:
# Merge the .jsonl files found in this dataset's data_filter directory into a single output file.
input_directory = '/data/nfs/data/Magpie-Qwen2-Pro-1M-v0.1-jsonl/data_filter'  # replace with the path of the directory that holds the JSONL files
output_file_path = '/data/nfs/data/Magpie-Qwen2-Pro-1M-v0.1-jsonl/Magpie-Qwen2-Pro-1M-v0.1-all-filter.jsonl'    # replace with the path where the merged JSONL file should be written

# The output path is opened for writing by merge_jsonl_files, so an existing file there is overwritten.
merge_jsonl_files(input_directory, output_file_path)
print("JSONL文件融合完成。")

In [4]:
# Merge the .jsonl files found in this dataset's data_filter directory into a single output file.
input_directory = '/data/nfs/data/Magpie-Llama-3.1-Pro-300K-Filtered-jsonl/data_filter'  # replace with the path of the directory that holds the JSONL files
output_file_path = '/data/nfs/data/Magpie-Llama-3.1-Pro-300K-Filtered-jsonl/Magpie-Llama-3.1-Pro-300K-all-filter.jsonl'    # replace with the path where the merged JSONL file should be written

# The output path is opened for writing by merge_jsonl_files, so an existing file there is overwritten.
merge_jsonl_files(input_directory, output_file_path)
print("JSONL文件融合完成。")

JSONL文件融合完成。
