In [None]:
import json
from tqdm import tqdm

def convert_to_target_format(data, add_server_name_in_tool_name=True):
    """
    Convert one raw record into the target format.

    Conversion rules:
    - assistant message carrying a ``function_call`` -> role ``tool_call``
    - ``function`` message -> role ``tool_response``
    - ``system`` messages are dropped
    - tool descriptions gathered from ``data['metadata']['mcp_servers']`` are
      serialized from a list into a JSON string

    Args:
        data: Record with ``metadata.mcp_servers`` (iterable of server dicts)
            and ``messages`` (iterable of message dicts with a ``role`` key).
        add_server_name_in_tool_name: When true, every tool name is prefixed
            with the server name (lower-cased, spaces replaced by ``-``).

    Returns:
        dict with ``tools`` (JSON string of the tool descriptions) and
        ``messages`` (list of ``{"role", "content"}`` dicts).

    Raises:
        KeyError: if ``metadata``/``mcp_servers``/``messages``/``role`` is
            missing, or an assistant message has empty content and no
            ``reasoning_content`` to fall back on.
        ValueError: if a message has a role other than system, assistant,
            function or user.
    """
    tool_descs = []
    for mcp_server in data['metadata']['mcp_servers']:
        # Normalized once per server; ``or`` guards against explicit nulls.
        server_name = (mcp_server.get('server_name') or '').lower().replace(' ', '-')
        server_response = mcp_server.get('remote_server_response') or {}
        for function in server_response.get('tools') or []:
            tool_name = function.get('name', '')
            if add_server_name_in_tool_name:
                tool_name = f"{server_name}-{tool_name}"
            tool_descs.append({
                "type": "function",
                "function": {
                    "name": tool_name,
                    "description": function.get('description', ''),
                    "parameters": function.get('input_schema', {}),
                },
            })

    converted_messages = []
    for msg in data['messages']:
        role = msg['role']

        if role == 'system':
            # Skipped: the system message is handled by the training framework.
            continue

        if role == 'assistant':
            # A non-null function_call means the assistant is calling a tool.
            # A present-but-null key is treated as a regular assistant turn.
            function_call = msg.get('function_call')
            if function_call is not None:
                # NOTE(review): str() on a dict yields a Python repr (single
                # quotes), not JSON -- confirm the consumer expects this.
                converted_msg = {"role": "tool_call", "content": str(function_call)}
            else:
                content = msg['content']
                if content is None or content == "":
                    # No visible answer: fall back to the reasoning text.
                    content = msg["reasoning_content"]
                converted_msg = {"role": role, "content": content}
        elif role == 'function':
            converted_msg = {"role": "tool_response", "content": msg['content']}
        elif role == 'user':
            converted_msg = {"role": role, "content": msg['content']}
        else:
            raise ValueError(f"Unknown role: {role}")

        converted_messages.append(converted_msg)

    return {
        "tools": json.dumps(tool_descs, ensure_ascii=False),
        "messages": converted_messages,
    }


def convert_file(input_file, saveto="json"):
    """
    Convert every record of a ``.json`` / ``.jsonl`` file and save the result.

    The output is written next to the input as ``<stem>_converted.json`` or
    ``<stem>_converted.jsonl`` depending on ``saveto``. Records that fail to
    convert are reported on stdout and skipped.

    Args:
        input_file: Path to a ``.json`` file (a list of records, or a single
            record object) or a ``.jsonl`` file (one record per line; blank
            lines are ignored).
        saveto: Output format, ``"json"`` (indented list) or ``"jsonl"``.

    Returns:
        list of the successfully converted records.

    Raises:
        ValueError: if the input extension or ``saveto`` is unsupported.
    """
    import os

    # Validate arguments before any file is read or created.
    if not input_file.endswith(('.json', '.jsonl')):
        raise ValueError("Unsupported file format. Please provide a .json or .jsonl file.")
    if saveto not in ("json", "jsonl"):
        raise ValueError("Unsupported saveto format. Please use 'json' or 'jsonl'.")

    # Paths containing "oss" (OpenAI agent) keep un-prefixed tool names.
    # The check runs on the full path, not only the file name.
    add_server_name_in_tool_name = "oss" not in input_file.lower()

    # Build the output path from the stem; str.replace() on the extension
    # produced "..._converted.jsonll" for .jsonl input with saveto="jsonl".
    stem, _ = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(os.path.dirname(input_file), f"{stem}_converted.{saveto}")

    print(f"Reading data from {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        if input_file.endswith('.jsonl'):
            dataset = [
                json.loads(line)
                for line in tqdm(f, desc="Loading jsonl")
                if line.strip()
            ]
        else:
            # Previously the parsed .json content was never placed into the
            # dataset, so nothing was converted for .json input.
            loaded = json.load(f)
            dataset = loaded if isinstance(loaded, list) else [loaded]

    converted_dataset = []

    if saveto == "jsonl":
        with open(output_file, 'w', encoding='utf-8') as f:
            for entry in tqdm(dataset, desc="Converting to JSONL"):
                try:
                    converted = convert_to_target_format(
                        entry, add_server_name_in_tool_name=add_server_name_in_tool_name
                    )
                    line = json.dumps(converted, ensure_ascii=False)
                except Exception as e:
                    # Best effort: report the bad record and keep going.
                    print(f"Error converting entry: {e}")
                    continue
                # Written outside the try so I/O failures are not swallowed.
                f.write(line + "\n")
                converted_dataset.append(converted)
        print(f"Successfully converted {len(converted_dataset)} entries to JSONL format: {output_file}")
    else:
        for entry in tqdm(dataset, desc="Converting to JSON"):
            try:
                converted = convert_to_target_format(
                    entry, add_server_name_in_tool_name=add_server_name_in_tool_name
                )
            except Exception as e:
                # Best effort: report the bad record and keep going.
                print(f"Error converting entry: {e}")
                continue
            converted_dataset.append(converted)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(converted_dataset, f, ensure_ascii=False, indent=4)
        print(f"Successfully converted {len(converted_dataset)} entries to JSON format: {output_file}")

    return converted_dataset

In [None]:
# Convert the Qwen3 export to a JSON file and report how many entries
# were converted.
converted_dataset = convert_file(
    input_file='./HF_Toucan/Qwen3_1002_No_PII.jsonl',
    saveto="json",
)

print(len(converted_dataset))