In [None]:
#%pip install torch --index-url https://download.pytorch.org/whl/cpu
%pip install datasets python-dotenv
from utils import *

In [None]:

from datasets import load_dataset
import json

# Lookup from the speaker label found in a source record (its "from" field)
# to the chat role written to the converted dataset. Built from one
# (role, aliases) group per output role so the aliases of a role sit together.
role_map = {
    alias: role
    for role, aliases in (
        ("system", ("system", "instruction")),
        ("assistant", ("assistant", "agent", "gpt", "bot")),
        ("user", ("user", "human", "input", "question")),
        ("tool", ("tool", "function", "function response", "function_response")),
    )
    for alias in aliases
}

def hermes_func_to_oai(example):
    """Convert one Hermes function-calling record to OpenAI chat format.

    Args:
        example: a record with a string "id" and a "conversations" list of
            {"from": ..., "value": ...} turns. The first turn is treated as
            the system prompt.

    Returns:
        A one-element list holding a dict with the keys "messages", "tools"
        and "response_format" (always None here).

    Raises:
        KeyError: if a plain turn has a "from" label missing from role_map.

    NOTE(review): find_all_xml_tags and parse come from utils; this code
    assumes the former returns a list of tag bodies and the latter a parsed
    object exposing 'name' / 'arguments' for tool calls - confirm in utils.
    """
    convo = example["conversations"]

    # The first turn is only mined for its <tools> block; it is not emitted
    # as a message. When several <tools> blocks exist the last one is used.
    system = convo[0]["value"]
    tools_text = find_all_xml_tags(system, 'tools')
    tools = parse(tools_text[-1]) if tools_text else None

    messages = []
    for msg in convo[1:]:
        tool_calls_text = find_all_xml_tags(msg["value"], 'tool_call')
        tool_responses_text = find_all_xml_tags(msg["value"], 'tool_response')

        if tool_calls_text:
            # All calls made in one turn belong to ONE assistant message:
            # OpenAI chat format carries parallel calls in a single
            # "tool_calls" list, followed by one tool message per call.
            tool_calls = []
            for i, tool_call_text in enumerate(tool_calls_text):
                tool_call = parse(tool_call_text)
                tool_calls.append({
                    # The index is per turn, so the i-th response of the
                    # following tool turn refers back to the i-th call.
                    "id": "call_" + example["id"] + "_" + str(i),
                    "type": "function",
                    "function": {
                        "name": str(tool_call['name']),
                        # OpenAI expects arguments as a JSON-encoded string.
                        "arguments": json.dumps(tool_call['arguments'])
                    }
                })
            messages.append({
                "role": "assistant",
                "tool_calls": tool_calls,
            })
        elif tool_responses_text:
            for i, tool_response_text in enumerate(tool_responses_text):
                messages.append({
                    "role": "tool",
                    "content": str(tool_response_text),
                    "tool_call_id": "call_" + example["id"] + "_" + str(i),
                })
        else:
            # Plain turn: only the speaker label is normalized.
            messages.append({
                "role": role_map[msg['from']],
                "content": msg["value"]
            })

    return [{
        "messages": messages,
        "tools": tools,
        "response_format": None
    }]


# Load the two function-calling files of the Hermes dataset (train split).
hermes_func_calling = load_dataset(
    "NousResearch/hermes-function-calling-v1",
    data_files='func-calling.json',
    split="train",
)
hermes_glaive_func_calling = load_dataset(
    "NousResearch/hermes-function-calling-v1",
    data_files='glaive-function-calling-5k.json',
    split="train",
)

# Convert both to OpenAI format in batches of 10, dropping the source columns.
# A fresh column dict is passed to each sane_batch_map call on purpose.
# NOTE(review): sane_batch_map comes from utils; its second argument looks
# like a template of the output columns - confirm there.
hermes_func_calling = hermes_func_calling.map(
    sane_batch_map(
        hermes_func_to_oai,
        {"messages": [], "tools":[], "response_format": []},
    ),
    batched=True,
    batch_size=10,
    remove_columns=hermes_func_calling.column_names,
)
hermes_glaive_func_calling = hermes_glaive_func_calling.map(
    sane_batch_map(
        hermes_func_to_oai,
        {"messages": [], "tools":[], "response_format": []},
    ),
    batched=True,
    batch_size=10,
    remove_columns=hermes_glaive_func_calling.column_names,
)

# Preview the first 100 converted rows as the cell output.
hermes_glaive_func_calling[0:100]

In [None]:

from datasets import load_dataset

def hermes_agent_to_oai(example):
    """Convert one Hermes JSON-mode record to OpenAI chat format.

    The first turn of example["conversations"] is treated as the system
    prompt: it is searched for <schema> tags and is not emitted as a message.
    The first schema found (if any) is parsed and placed under
    response_format["json_schema"]; otherwise that entry is None.

    Returns a one-element list holding a dict with the keys "messages",
    "tools" (always None) and "response_format".

    NOTE(review): find_all_xml_tags and parse are utils helpers; the exact
    type parse returns is not visible here.
    """
    turns = example["conversations"]

    found_schemas = find_all_xml_tags(turns[0]["value"], 'schema')
    json_schema = parse(found_schemas[0]) if found_schemas else None

    # Every remaining turn is copied over with only its role normalized.
    chat = [
        {"role": role_map[turn['from']], "content": turn["value"]}
        for turn in turns[1:]
    ]

    return [{
        "messages": chat,
        "tools": None,
        "response_format": {
            "type": "json_schema",
            "json_schema": json_schema,
        },
    }]

# Load the JSON-mode (structured output) file of the Hermes dataset.
hermes_json_mode_agentic = load_dataset(
    "NousResearch/hermes-function-calling-v1",
    data_files='json-mode-agentic.json',
    split="train",
)

# Convert in batches of 10 and drop the source columns.
hermes_json_mode_agentic = hermes_json_mode_agentic.map(
    sane_batch_map(hermes_agent_to_oai),
    batched=True,
    batch_size=10,
    remove_columns=hermes_json_mode_agentic.column_names,
)

# Preview the first 100 converted rows as the cell output.
hermes_json_mode_agentic[0:100]

In [39]:

from datasets import load_dataset
import uuid

def glaive_func_to_oai(example):
    """Convert one Glaive function-calling record to OpenAI chat format.

    Args:
        example: a record with two strings, "system" (prompt that may list
            tool definitions) and "chat" (the transcript, with paragraphs
            separated by blank lines and prefixed by 'SYSTEM: ', 'USER: ',
            'ASSISTANT: ' or 'FUNCTION RESPONSE: ').

    Returns:
        A one-element list holding a dict with the keys "messages", "tools"
        (list of parsed tool definitions, or None) and "response_format"
        (always None here).

    Paragraphs without a known prefix are treated as a continuation of the
    previous message and appended to its content after a newline. Such a
    paragraph is skipped when there is no previous message.

    NOTE(review): parse comes from utils; it is assumed to turn a JSON-like
    string into a Python object - confirm there.
    """
    # The dataset has no record id, so one random id is generated per record
    # and shared by every tool call / tool response inside it.
    call_id = "call_" + str(uuid.uuid4())

    system: str = example["system"]
    if 'Use them if required' in system:
        # Each tool definition starts on a new line with '{'. Splitting on
        # '\n{' removes that brace, so it is put back before parsing.
        tools = [parse('{' + tool_str) for tool_str in system.split('\n{')[1:]]
    else:
        tools = None

    chat: str = example["chat"]
    end_marker = '<|endoftext|>'

    messages = []
    for raw in chat.split('\n\n'):
        msg = raw.strip()
        if len(msg) == 0:
            continue

        if msg.startswith('SYSTEM: '):
            messages.append({
                "role": "system",
                "content": msg[len('SYSTEM: '):].strip(),
            })
        elif msg.startswith('USER: '):
            messages.append({
                "role": "user",
                "content": msg[len('USER: '):].strip(),
            })
        elif msg.startswith('ASSISTANT: '):
            txt = msg[len('ASSISTANT: '):].strip()
            if txt.endswith(end_marker):
                txt = txt[:-len(end_marker)].strip()
            if txt.startswith('<functioncall>'):
                call_text = txt[len('<functioncall>'):].strip()
                if '", "arguments": \'' in call_text:
                    # Arguments are wrapped in single quotes, so the call
                    # text is taken apart by hand. The [:-2] drops the two
                    # trailing characters (presumably the closing "'}").
                    name_part = call_text.split('", "arguments": \'')[0].split('"name": "')[1]
                    arguments_part = call_text.split('", "arguments": \'')[1][:-2]
                    tool_call = {"name": name_part, "arguments": parse(arguments_part)}
                else:
                    tool_call = parse(call_text)
                    # Some records use 'parameters' instead of 'arguments'.
                    if 'parameters' in tool_call:
                        tool_call['arguments'] = tool_call['parameters']
                        del tool_call['parameters']
                messages.append({
                    "role": "assistant",
                    "tool_calls": [
                        {
                            "id": call_id,
                            "type": "function",
                            "function": {
                                "name": str(tool_call['name']),
                                # OpenAI expects a JSON-encoded string here.
                                "arguments": json.dumps(tool_call['arguments'])
                            }
                        }
                    ],
                })
            else:
                messages.append({
                    "role": "assistant",
                    "content": txt,
                })
        elif msg.startswith('FUNCTION RESPONSE: '):
            messages.append({
                "role": "tool",
                "content": msg[len('FUNCTION RESPONSE: '):].strip(),
                "tool_call_id": call_id,
            })
        else:
            # Continuation paragraph of the previous message.
            if msg.endswith(end_marker):
                msg = msg[:-len(end_marker)].strip()
            if not messages:
                # Nothing to attach the fragment to.
                continue
            previous = messages[-1]
            if "content" in previous:
                previous["content"] += "\n" + msg.strip()
            else:
                # Tool-call messages carry no content yet.
                previous["content"] = msg.strip()

    return [{
        "messages": messages,
        "tools": tools,
        "response_format": None
    }]

# Load the Glaive function-calling dataset (train split).
glaive_func = load_dataset(
    "glaiveai/glaive-function-calling-v2",
    split="train",
)

# Convert in batches of 10 and drop the source columns.
glaive_func = glaive_func.map(
    sane_batch_map(glaive_func_to_oai),
    batched=True,
    batch_size=10,
    remove_columns=glaive_func.column_names,
)

# Preview the first 200 converted rows as the cell output.
glaive_func[0:200]

Map: 100%|██████████| 112960/112960 [00:09<00:00, 11912.43 examples/s]


{'messages': ['[{"role": "user", "content": "Can you book a flight for me from New York to London?"}, {"role": "assistant", "content": "I\'m sorry, but I don\'t have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!"}]',
  '[{"role": "user", "content": "Can you tell me the latest news headlines for the United States?"}, {"role": "assistant", "tool_calls": [{"id": "call_73384623-f222-452e-bf07-eaa451bceb7e", "type": "function", "function": {"name": "get_news_headlines", "arguments": "{\\"country\\": \\"United States\\"}"}}]}, {"role": "tool", "content": "{\\"headlines\\": [\\"Biden announces new vaccine mandates\\", \\"Hurricane Ida devastates Louisiana\\", \\"Apple unveils new iPhone\\", \\"NASA\'s Perseverance rover collects first Mars rock sample\\"]}", "tool_call_id": "call_73384623-f222-452e-bf07-eaa451bceb7e"}, {"role": "assistant", "content": "Here are the latest news headli

In [41]:
from datasets import load_dataset, concatenate_datasets

# Combine both converted Hermes function-calling subsets into one dataset.
hermes_full = concatenate_datasets([hermes_glaive_func_calling, hermes_func_calling])
# Publish each converted dataset to its own Hugging Face Hub repository.
hermes_full.push_to_hub('lucaelin/hermes-function-calling-v1')
hermes_json_mode_agentic.push_to_hub('lucaelin/hermes-response-format')
glaive_func.push_to_hub('lucaelin/glaive-function-calling-v2')


Creating parquet from Arrow format: 100%|██████████| 113/113 [00:00<00:00, 167.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:11<00:00, 11.23s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lucaelin/glaive-function-calling-v2/commit/ae2b4ea89353f4d2bb7e01ca0f7530173626a57c', commit_message='Upload dataset', commit_description='', oid='ae2b4ea89353f4d2bb7e01ca0f7530173626a57c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset, concatenate_datasets

# Merge every converted dataset from the cells above into one training set
# and publish it to the Hugging Face Hub.
merged_dataset = concatenate_datasets(
    [
        hermes_func_calling,
        hermes_glaive_func_calling,
        hermes_json_mode_agentic,
        glaive_func,
    ]
)
merged_dataset.push_to_hub("lucaelin/covas_next")