In [12]:
from datasets import load_dataset
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import yaml
import time
import json

In [14]:
# Load the training split of the Glaive function-calling corpus. This call must run
# before the slice below: without it `dataset` is undefined on a fresh kernel and
# the cell fails with NameError under Restart & Run All.
dataset = load_dataset("glaiveai/glaive-function-calling-v2", split="train")
# Keep only the first 100 rows as a small validation subset.
val_dataset = dataset.select(range(100))

print(val_dataset[1])

{'system': 'SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_news_headlines",\n    "description": "Get the latest news headlines",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "country": {\n                "type": "string",\n                "description": "The country for which to fetch news"\n            }\n        },\n        "required": [\n            "country"\n        ]\n    }\n}\n', 'chat': 'USER: Can you tell me the latest news headlines for the United States?\n\n\nASSISTANT: <functioncall> {"name": "get_news_headlines", "arguments": \'{"country": "United States"}\'} <|endoftext|>\n\n\nFUNCTION RESPONSE: {"headlines": ["Biden announces new vaccine mandates", "Hurricane Ida devastates Louisiana", "Apple unveils new iPhone", "NASA\'s Perseverance rover collects first Mars rock sample"]}\n\n\nASSISTANT: Here are the latest news headlines for the United States:\n1. Biden a

In [13]:
def _parse_function_call(payload):
    """Extract ``(name, arguments)`` from a raw function-call payload.

    The payload looks like ``{"name": "f", "arguments": '{"k": "v"}'}``: the
    arguments are a JSON document wrapped in single quotes, which makes the
    payload as a whole invalid JSON.

    Args:
        payload: Function-call text with the ``<functioncall>`` tag already removed.

    Returns:
        A ``(name, arguments)`` tuple; ``arguments`` is the decoded JSON value.

    Raises:
        json.JSONDecodeError: If the payload cannot be decoded.
        KeyError: If the decoded payload lacks ``name`` or ``arguments``.
    """
    # Isolate the single-quoted arguments blob and decode it on its own. A global
    # replace of "'{" / "'}" would also rewrite those sequences when they occur
    # inside an argument string value (e.g. "d = {'a': 'b'}") and corrupt it.
    match = re.fullmatch(
        r"""\{\s*"name"\s*:\s*"(?P<name>[^"]+)"\s*,\s*"arguments"\s*:\s*'(?P<args>.*)'\s*\}""",
        payload,
        flags=re.DOTALL,
    )
    if match is not None:
        try:
            return match.group("name"), json.loads(match.group("args"))
        except json.JSONDecodeError:
            # Not the expected shape after all; try the permissive path below.
            pass

    # Fallback: strip the quotes around the arguments object textually. This also
    # covers payloads whose arguments are already a plain (unquoted) JSON object.
    data_json = json.loads(payload.replace("'{", "{").replace("'}", "}"))
    return data_json["name"], data_json["arguments"]


def parse_conversation(input_string):
    """Split a role-tagged transcript into a list of chat messages.

    The transcript is cut at every ``SYSTEM:``, ``USER:``, ``ASSISTANT:`` or
    ``FUNCTION RESPONSE:`` marker. ``<|endoftext|>`` tokens are removed from each
    turn. A turn whose text starts with ``<functioncall>`` is converted into
    ``{'tool_uses': [{'recipient_name': 'functions.<name>', 'parameters': ...}]}``.

    Args:
        input_string: Raw transcript text containing role markers.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in transcript order,
        where role is one of ``system``, ``user``, ``assistant`` or ``tool``.
        Text before the first marker is discarded; a string with no markers
        yields an empty list.

    Raises:
        json.JSONDecodeError: If a function-call turn cannot be decoded.
        KeyError: If a decoded function call lacks ``name`` or ``arguments``.
    """
    ROLE_MAPPING = {"USER" : "user", "ASSISTANT" : "assistant", "SYSTEM" : "system", "FUNCTION RESPONSE" : "tool"}

    # The capturing group makes re.split keep the role markers in its output:
    # [text_before_first_marker, role, content, role, content, ...]
    pattern = r"(SYSTEM|USER|ASSISTANT|FUNCTION RESPONSE):"
    parts = re.split(pattern, input_string)

    conversation = []

    # Skip parts[0] (text before the first marker) and walk the role/content pairs.
    for raw_role, raw_content in zip(parts[1::2], parts[2::2]):
        role = raw_role.strip()
        content = raw_content.replace("<|endoftext|>", "").strip()

        if content.startswith('<functioncall>'):
            # Turn the raw function-call text into structured data.
            payload = content.replace('<functioncall>', '').strip()
            name, arguments = _parse_function_call(payload)
            # Make it compatible with the OpenAI prompt format.
            func_call = {'recipient_name': f"functions.{name}", 'parameters': arguments}
            content = {'tool_uses': [func_call]}

        conversation.append({"role": ROLE_MAPPING[role], "content": content})

    return conversation

def apply_chat_template(examples):
    """Render a batch of raw rows into chat-template text.

    For each (system, chat) pair in ``examples`` the two transcripts are parsed
    with ``parse_conversation`` and concatenated, system messages first. A pair
    that raises while parsing is reported with ``print`` and skipped, so the
    returned list can be shorter than the input batch.

    Relies on a module-level ``tokenizer`` that is not defined in this cell.
    NOTE(review): presumably meant for ``Dataset.map(..., batched=True)`` — confirm.

    Args:
        examples: Mapping with parallel ``"system"`` and ``"chat"`` sequences.

    Returns:
        ``{"text": [...]}`` with one rendered string per successfully parsed pair.
    """
    parsed_batch = []
    for system_raw, chat_raw in zip(examples["system"], examples["chat"]):
        try:
            parsed_batch.append(parse_conversation(system_raw) + parse_conversation(chat_raw))
        except Exception as e:
            # Best effort: report the failure and move on to the next pair.
            print(e)

    rendered = []
    for message in parsed_batch:
        rendered.append(
            tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        )
    return {"text": rendered}


In [20]:
# Parse every validation row into one message list: system turns first, then chat turns.
processed_val_dataset = []
for row_idx in range(len(val_dataset)):
    row = val_dataset[row_idx]
    processed_val_dataset.append(
        parse_conversation(row["system"]) + parse_conversation(row["chat"])
    )

In [19]:
### First-level response: the prompt is every turn before the first assistant turn; the answer is that turn
def get_qna_pairs(message):
    """Split a conversation at its first assistant turn.

    Args:
        message: Iterable of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        ``(prompt, answer)`` where ``prompt`` is the list of turns that come
        before the first assistant turn and ``answer`` is a one-element list
        holding that turn's content. If there is no assistant turn, ``prompt``
        holds every turn and ``answer`` is empty.
    """
    turns = list(message)
    # Position of the first assistant turn, or the end of the list if none exists.
    cut = next(
        (pos for pos, turn in enumerate(turns) if turn['role'] == 'assistant'),
        len(turns),
    )
    # The slice is empty when no assistant turn was found, giving an empty answer.
    return turns[:cut], [turn['content'] for turn in turns[cut:cut + 1]]


In [26]:
# Split each parsed conversation once, then separate the prompts from the answers.
first_level_pairs = [get_qna_pairs(conversation) for conversation in processed_val_dataset]
first_level_prompts = [pair[0] for pair in first_level_pairs]
first_level_responses = [pair[1] for pair in first_level_pairs]

print(first_level_prompts[1])
print(first_level_responses[1])

[{'role': 'system', 'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_news_headlines",\n    "description": "Get the latest news headlines",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "country": {\n                "type": "string",\n                "description": "The country for which to fetch news"\n            }\n        },\n        "required": [\n            "country"\n        ]\n    }\n}'}, {'role': 'user', 'content': 'Can you tell me the latest news headlines for the United States?'}]
[{'tool_uses': [{'recipient_name': 'functions.get_news_headlines', 'parameters': {'country': 'United States'}}]}]


In [27]:
## Second-level response: the prompt runs up to (not including) the second assistant turn; the answer is that turn
def get_qna_pairs(message):
    """Split a conversation at its second assistant turn.

    Args:
        message: Iterable of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        ``(prompt, answer)`` where ``prompt`` is the list of turns before the
        second assistant turn (the first assistant turn included) and ``answer``
        is a one-element list holding the second assistant turn's content.
        ``(None, None)`` when the conversation has fewer than two assistant
        turns, so callers can skip it with ``if prompt is not None``.
    """
    prompt = []
    seen_first_response = False
    for item in message:
        if item['role'] == 'assistant':
            if seen_first_response:
                # Second assistant turn: this is the answer to predict.
                return prompt, [item['content']]
            seen_first_response = True
        prompt.append(item)

    # No second assistant turn was found. The previous check `answer is not None`
    # was always true for a list, so such conversations were never filtered out.
    return None, None


In [28]:
# Collect second-level pairs, skipping conversations for which no pair was produced.
second_level_prompts = []
second_level_responses = []
for prompt, answer in map(get_qna_pairs, processed_val_dataset):
    if prompt is None:
        continue
    second_level_prompts.append(prompt)
    second_level_responses.append(answer)

print(second_level_prompts[1])
print(second_level_responses[1])

[{'role': 'system', 'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_news_headlines",\n    "description": "Get the latest news headlines",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "country": {\n                "type": "string",\n                "description": "The country for which to fetch news"\n            }\n        },\n        "required": [\n            "country"\n        ]\n    }\n}'}, {'role': 'user', 'content': 'Can you tell me the latest news headlines for the United States?'}, {'role': 'assistant', 'content': {'tool_uses': [{'recipient_name': 'functions.get_news_headlines', 'parameters': {'country': 'United States'}}]}}, {'role': 'tool', 'content': '{"headlines": ["Biden announces new vaccine mandates", "Hurricane Ida devastates Louisiana", "Apple unveils new iPhone", "NASA\'s Perseverance rover collects first Mars rock sample"]}'}]
["Here are the latest new