In [4]:
import json
from tqdm import tqdm

# Transform dataset to a Axoltol supported format
def convert_to_sharegpt(json_file, output_file, id_prefix, start_id=0):
    with open(json_file, 'r') as file:
        data = json.load(file)

    with open(output_file, 'w') as file:
        for entry in data:
            conversation_id = f"{id_prefix}_{entry['id']+start_id}"
            instruction = entry['instruction']
            response = entry['response']
            conversations = [
                {"from": "human", "value": entry['instruction']},
                {"from": "gpt", "value": entry['response']}
            ]
            gen_input_configs = entry['gen_input_configs']
            gen_input_configs['pre_query_template'] = entry['pre_query_template']
            intent = entry['intent']
            knowledge = entry['knowledge']
            difficulty = entry['difficulty'] if entry['difficulty'] in ['very easy', 'easy', 'medium', 'hard', 'very hard'] else None
            difficulty_generator = entry['difficulty_generator']
            input_quality = entry['input_quality'] if entry['input_quality'] in ['very poor', 'poor', 'average', 'good', 'excellent'] else None
            quality_explanation = entry['quality_explanation']
            quality_generator = entry['quality_generator']
            task_category = entry['task_category'] if entry['task_category'] in ["Information seeking", "Reasoning", "Planning", "Editing", "Coding & Debugging", "Math", "Role playing", "Data analysis", "Creative writing", "Advice seeking", "Brainstorming", "Other"] else None
            other_task_category = entry['other_task_category']
            task_category_generator = entry['task_category_generator']
            llama_guard_2 = entry['llama_guard_2']
            instruct_reward = entry['instruct_reward']
            reward_model = entry['reward_model']
            language = entry['language']

            if entry['gen_input_configs']['input_generator'] != entry['gen_response_configs']['output_generator']:
                raise ValueError("Input and output generators must be the same")
            
            if id_prefix not in entry['gen_input_configs']['input_generator']:
                raise ValueError(f"Input generator must contain {id_prefix}")

            sharegpt_entry = {
                "conversation_id": conversation_id,
                "instruction": instruction,
                "response": response,
                "conversations": conversations,
                "gen_input_configs": gen_input_configs,
                "intent": intent,
                "knowledge": knowledge,
                "difficulty": difficulty,
                "difficulty_generator": difficulty_generator,
                "input_quality": input_quality,
                "quality_explanation": quality_explanation,
                "quality_generator": quality_generator,
                "task_category": task_category,
                "other_task_category": other_task_category,
                "task_category_generator": task_category_generator,
                "llama_guard_2": llama_guard_2,
                "instruct_reward": instruct_reward,
                "reward_model": reward_model,
                "language": language
            }

            file.write(json.dumps(sharegpt_entry) + '\n')
    
    print(f"Converted {len(data)} entries to {output_file}")
    return len(data)

In [None]:
#　入力ファイルを指定（7/25）
input_files = [
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Algebra/Magpie_Qwen2.5-3B-Instruct_100_1753420315_ins_res.json",
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Geometry/Magpie_Qwen2.5-3B-Instruct_100_1753423010_ins_res.json",
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Applied-Mathematics/Magpie_Qwen2.5-3B-Instruct_100_1753420927_ins_res.json",
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Discrete-Mathematics/Magpie_Qwen2.5-3B-Instruct_100_1753422329_ins_res.json",
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Calculus/Magpie_Qwen2.5-3B-Instruct_100_1753421625_ins_res.json",
    "/magpie/data/Qwen2.5-3B-Instruct_topp1_temp1_Number-Theory/Magpie_Qwen2.5-3B-Instruct_100_1753423757_ins_res.json",
]

# Convert each file to Axolotl format
idx = 0
id_prefix = "Qwen2.5-3B-Instruct-Math-Combined"
converted_files = []
for i in tqdm(range(len(input_files))):
    converted_file_name = f"{id_prefix}_sharegpt_shard{i}.jsonl"
    len_data = convert_to_sharegpt(input_files[i], converted_file_name, id_prefix, idx)
    idx += len_data
    converted_files.append(converted_file_name)

# Concatenate all files
output_file = f"{id_prefix}_sharegpt.jsonl"
with open(output_file, 'w') as outfile:
    for fname in tqdm(converted_files):
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

print(f"Concatenated {len(converted_files)} files to {output_file}")

  0%|          | 0/6 [00:00<?, ?it/s]


KeyError: 'intent'

In [17]:
input_files = [
    "Phi-3-medium-128k-instruct_topp1_temp1_1719007908/Magpie_Phi-3-medium-128k-instruct_100000_1719007908_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp1_temp1_1719008013/Magpie_Phi-3-medium-128k-instruct_100000_1719008013_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp1_temp1_1719008020/Magpie_Phi-3-medium-128k-instruct_100000_1719008020_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp1_temp1.1_1719008124/Magpie_Phi-3-medium-128k-instruct_100000_1719008124_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp1_temp1.1_1719046915/Magpie_Phi-3-medium-128k-instruct_100000_1719046915_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp1_temp1.1_1719046924/Magpie_Phi-3-medium-128k-instruct_100000_1719046924_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719046978/Magpie_Phi-3-medium-128k-instruct_100000_1719046978_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719046987/Magpie_Phi-3-medium-128k-instruct_100000_1719046987_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719100458/Magpie_Phi-3-medium-128k-instruct_100000_1719100458_ins_res_difficulty_quality_category_safety_reward_language.json",
    "Phi-3-medium-128k-instruct_topp0.99_temp1.25_1719100868/Magpie_Phi-3-medium-128k-instruct_100000_1719100868_ins_res_difficulty_quality_category_safety_reward_language.json"
]

# Convert each file to Axolotl format
idx = 0
id_prefix = "Phi-3-medium-128k-instruct"
converted_files = []
for i in tqdm(range(len(input_files))):
    converted_file_name = f"{id_prefix}_sharegpt_shard{i}.jsonl"
    len_data = convert_to_sharegpt(input_files[i], converted_file_name, id_prefix, idx)
    idx += len_data
    converted_files.append(converted_file_name)

# Concatenate all files
output_file = f"{id_prefix}_sharegpt.jsonl"
with open(output_file, 'w') as outfile:
    for fname in tqdm(converted_files):
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

print(f"Concatenated {len(converted_files)} files to {output_file}")

 10%|█         | 1/10 [00:06<00:57,  6.39s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard0.jsonl


 20%|██        | 2/10 [00:12<00:51,  6.47s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard1.jsonl


 30%|███       | 3/10 [00:19<00:46,  6.70s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard2.jsonl


 40%|████      | 4/10 [00:26<00:39,  6.64s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard3.jsonl


 50%|█████     | 5/10 [00:32<00:32,  6.54s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard4.jsonl


 60%|██████    | 6/10 [00:39<00:25,  6.47s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard5.jsonl


 70%|███████   | 7/10 [00:46<00:20,  6.76s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard6.jsonl


 80%|████████  | 8/10 [00:53<00:13,  6.82s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard7.jsonl


 90%|█████████ | 9/10 [00:59<00:06,  6.69s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard8.jsonl


100%|██████████| 10/10 [01:07<00:00,  6.72s/it]

Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard9.jsonl



100%|██████████| 10/10 [00:10<00:00,  1.02s/it]


Concatenated 10 files to Phi-3-medium-128k-instruct_sharegpt.jsonl
