In [1]:
import sys
import os
import yaml
from datetime import datetime
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component
from azure.ai.ml import command
from azure.ai.ml.entities import Data, Environment, BuildContext
from azure.ai.ml.entities import Model
from azure.ai.ml import Input
from azure.ai.ml import Output
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError
from azureml.core import Workspace, Run


In [2]:
with open('./llama-fc_config.yaml') as f:
    d = yaml.load(f, Loader=yaml.FullLoader)
    
AZURE_SUBSCRIPTION_ID = d['config']['AZURE_SUBSCRIPTION_ID']
AZURE_RESOURCE_GROUP = d['config']['AZURE_RESOURCE_GROUP']
AZURE_WORKSPACE = d['config']['AZURE_WORKSPACE']
AZURE_DATA_NAME = d['config']['AZURE_SFT_DATA_NAME']    
DATA_DIR = d['config']['SFT_DATA_DIR']
CLOUD_DIR = d['config']['CLOUD_DIR']
HF_MODEL_NAME_OR_PATH = d['config']['HF_MODEL_NAME_OR_PATH']


In [3]:
credential = DefaultAzureCredential()
ml_client = None
try:
    ml_client = MLClient.from_config(credential)
except Exception as ex:
    print(ex)
    ml_client = MLClient(credential, AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE)

def get_or_create_model_asset(ml_client, model_name, experiment_name = None, job_name = None, model_dir="outputs", model_type="custom_model", update=False):
    
    try:
        latest_model_version = max([int(m.version) for m in ml_client.models.list(name=model_name)])
        if update:
            raise ResourceExistsError('Found Model asset, but will update the Model.')
        else:
            model_asset = ml_client.models.get(name=model_name, version=latest_model_version)
            print(f"Found Model asset: {model_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        print(f"Exception: {e}") 
        ws = Workspace.from_config()  
  
        # Get the run by its ID   
        run = Run(ws.experiments[experiment_name], job_name)  
        # Register the model  
        model_asset = run.register_model(  
            model_name=model_name,  # this is the name the model will be registered under  
            model_path=model_dir  # this is the path to the model file in the run's outputs  
        )         
        print(f"Created Model asset: {model_name}")

    return model_asset


Found the config file in: /config.json


In [4]:
model = get_or_create_model_asset(ml_client, d['serve']['azure_model_name'], update = False)

Found Model asset: llama-fc-ft. Will not create again


In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

model_kwargs = dict(
        trust_remote_code=True,    
        device_map={"":0},
        torch_dtype="auto" 
    )
    
model = AutoModelForCausalLM.from_pretrained("./model/outputs", **model_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained("./model/outputs")

In [37]:
dataset = load_dataset("glaiveai/glaive-function-calling-v2", split="train")
val_dataset = dataset.select(range(2000, 2200))

print(val_dataset[-1])

{'system': 'SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "create_event",\n    "description": "Create a new event in the calendar",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "title": {\n                "type": "string",\n                "description": "The title of the event"\n            },\n            "start_time": {\n                "type": "string",\n                "description": "The start time of the event"\n            },\n            "end_time": {\n                "type": "string",\n                "description": "The end time of the event"\n            }\n        },\n        "required": [\n            "title",\n            "start_time",\n            "end_time"\n        ]\n    }\n}\n\n{\n    "name": "generate_random_number",\n    "description": "Generate a random number within a given range",\n    "parameters": {\n        "type": "object",\n        "properties": {\

In [16]:
import json

def parse_conversation(input_string):  
    
    ROLE_MAPPING = {"USER" : "user", "ASSISTANT" : "assistant", "SYSTEM" : "system", "FUNCTION RESPONSE" : "tool"}

    # Regular expression to split the conversation based on SYSTEM, USER, and ASSISTANT  
    pattern = r"(SYSTEM|USER|ASSISTANT|FUNCTION RESPONSE):"  
      
    # Split the input string and keep the delimiters  
    parts = re.split(pattern, input_string)  
      
    # Initialize the list to store conversation entries  
    conversation = []  
      
    # Iterate over the parts, skipping the first empty string  
    for i in range(1, len(parts), 2):  
        role = parts[i].strip()  
        content = parts[i + 1].strip()  
        content = content.replace("<|endoftext|>", "").strip()

        if content.startswith('<functioncall>'):  # build structured data for function call
                # try to turn function call from raw text to structured data
                content = content.replace('<functioncall>', '').strip()
                # replace single quotes with double quotes for valid JSON
                clean_content = content.replace("'{", '{').replace("'}", '}')
                data_json = json.loads(clean_content)
                # Make it compatible with openAI prompt format
                func_call = {'recipient_name': f"functions.{data_json['name']}", 'parameters': data_json['arguments']}
                content = {'tool_uses': [func_call]}
          
        # Append a dictionary with the role and content to the conversation list  
        conversation.append({"role": ROLE_MAPPING[role], "content": content})  
      
    return conversation  

def apply_chat_template(examples):
        conversations = []
        for system, chat in zip(examples["system"], examples["chat"]):
            try:
                system_message = parse_conversation(system)
                chat_message = parse_conversation(chat)
                message = system_message + chat_message
                conversations.append(message)
            except Exception as e:
                print(e) 

        text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in conversations]
        return {"text": text}


In [38]:
processed_val_dataset = []
for i in range(len(val_dataset)):
    system_message = parse_conversation(val_dataset[i]["system"])
    chat_message = parse_conversation(val_dataset[i]["chat"])

    message = system_message + chat_message
    processed_val_dataset.append(message)  

In [42]:
### First level response
def get_qna_pairs(message):
    prompt = []
    answer = []
    for item in message:
        if item['role'] == 'assistant':
            response = item['content']
            answer.append(response)
            break
        else:
            prompt.append(item)
    
    return prompt, answer


In [43]:
first_level_prompts = []
first_level_responses = []
for example in processed_val_dataset:
    prompt, answer = get_qna_pairs(example)
    first_level_prompts.append(prompt)
    first_level_responses.append(answer)

# print(first_level_prompts[1])
# print(first_level_responses[1])

input_data = first_level_prompts[-1]
print(input_data)

[{'role': 'system', 'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "create_event",\n    "description": "Create a new event in the calendar",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "title": {\n                "type": "string",\n                "description": "The title of the event"\n            },\n            "start_time": {\n                "type": "string",\n                "description": "The start time of the event"\n            },\n            "end_time": {\n                "type": "string",\n                "description": "The end time of the event"\n            }\n        },\n        "required": [\n            "title",\n            "start_time",\n            "end_time"\n        ]\n    }\n}\n\n{\n    "name": "generate_random_number",\n    "description": "Generate a random number within a given range",\n    "parameters": {\n        "type": "object",\n        "pro

In [48]:
## Second-level response
def get_level2_qna_pairs(message):
    prompt = []
    answer = []
    is_first_response = False
    for item in message:
        if item['role'] == 'assistant' and not is_first_response:
            is_first_response = True
            prompt.append(item)
        elif item['role'] == 'assistant' and is_first_response:
            response = item['content']
            answer.append(response)
            break
        else:
            prompt.append(item)
    
    if answer is not None:
        return prompt, answer
    else:
        return None, None

In [52]:
second_level_prompts = []
second_level_responses = []
for example in processed_val_dataset:
    prompt, answer = get_level2_qna_pairs(example)
    if prompt is not None:
        second_level_prompts.append(prompt)
        second_level_responses.append(answer)

input_data = second_level_prompts[-1]
print(second_level_responses[-1])
print(input_data)

[{'tool_uses': [{'recipient_name': 'functions.create_event', 'parameters': {'title': 'Project Discussion', 'start_time': '10:00 AM', 'end_time': '11:00 AM'}}]}]
[{'role': 'system', 'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "create_event",\n    "description": "Create a new event in the calendar",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "title": {\n                "type": "string",\n                "description": "The title of the event"\n            },\n            "start_time": {\n                "type": "string",\n                "description": "The start time of the event"\n            },\n            "end_time": {\n                "type": "string",\n                "description": "The end time of the event"\n            }\n        },\n        "required": [\n            "title",\n            "start_time",\n            "end_time"\n        ]\n    }\n}\n\n{\n    "na

In [93]:
def get_chat_completion(input_data):
    inputs = tokenizer.apply_chat_template(input_data, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    outputs = model.generate(input_ids = inputs, max_new_tokens = 1024, do_sample = True, temperature = 0.1)
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens = True)

    return response
    
response = get_chat_completion(input_data)
print(response)

{'tool_uses': [{'recipient_name': 'functions.create_event', 'parameters': {'title': 'Project Discussion','start_time': '10:00 AM', 'end_time': '11:00 AM'}}]}


In [94]:
# if response[1:12] == "'tool_uses'":
#     predicted_response = ast.literal_eval(response)
#     if isinstance(predicted_response, dict):
#         predicted_functions = [func["recipient_name"] for func in predicted_response["tool_uses"]]
#         predicted_function_args = [func["parameters"] for func in predicted_response["tool_uses"]]

#         actual_response = second_level_responses[-1][0]
#         actual_functions = [func["recipient_name"] for func in actual_response["tool_uses"]]
#         actual_function_args = [func["parameters"] for func in actual_response["tool_uses"]]

#         print(predicted_functions == actual_functions)
#         print(predicted_function_args == actual_function_args)


['functions.create_event']
True
True


In [45]:
import numpy as np
import json
import os
from IPython.display import display
import pandas as pd
import itertools
import time
import base64
from typing import Any, Dict, List, Generator
import ast

In [96]:
!{sys.executable} -m pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [98]:
# from rouge_score import rouge_scorer  

# # Reference and candidate summaries  
# reference = "The cat sat on the mat."  
# candidate = "The cat is sitting on the mat."  

# # Initialize the ROUGE scorer  
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)  

# # Calculate ROUGE scores  
# scores = scorer.score(reference, candidate)  

# # Print the scores  
# print("ROUGE-1:", scores['rouge1'].fmesaure)  
# print("ROUGE-2:", scores['rouge2'].fmeasure)  
# print("ROUGE-L:", scores['rougeL'].fmeasure)  

AttributeError: 'Score' object has no attribute 'fmesaure'

In [133]:
def eval(input_data : List, expected_output):
    """
    Evaluate the performance of a model in selecting the correct function based on given prompts.

    Args:
        model (str): The name of the model to be evaluated.
        system_prompt (str): The system prompt to be used in the chat completion.
        function_list (list): A list of functions that the model can call.
        prompts_to_expected_tool_name (dict): A dictionary mapping prompts to their expected function names.

    Returns:
        None
    """
    # Initialize the ROUGE Scorer where llm response is not function-call
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) 

    # For generic model response without function-call, set a threshold to classify it as a match
    match_threshold_g = 0.75

    function_call_match = []
    function_call_args_match = []
    rouge_fmeasure_score = []
    result_list = []
    latencies = []

    for prompt, answer in zip(input_data, expected_output):

        start_time = time.time()
        predicted_response = get_chat_completion(prompt)
        end_time = time.time()

        actual_response = answer[0]

        latency = (end_time - start_time) * 1000  # convert to milliseconds
        latencies.append(latency)

        if predicted_response[1:12] == "'tool_uses'":
            predicted_response = ast.literal_eval(predicted_response)
            if isinstance(predicted_response, dict):
                predicted_functions = [func["recipient_name"] for func in predicted_response["tool_uses"]]
                predicted_function_args = [func["parameters"] for func in predicted_response["tool_uses"]]

                actual_functions = [func["recipient_name"] for func in actual_response["tool_uses"]]
                actual_function_args = [func["parameters"] for func in actual_response["tool_uses"]]

                fcall_match = predicted_functions == actual_functions
                fcall_args_match = predicted_function_args == actual_function_args
                function_call_match.append(fcall_match)
                function_call_args_match.append(fcall_args_match)
                match = "Yes" if fcall_match and fcall_args_match else "No"
        else:
            fmeasure_score = scorer.score(actual_response, predicted_response)['rougeL'].fmeasure 
            rouge_fmeasure_score.append(fmeasure_score)
            match = "Yes" if fmeasure_score >= match_threshold_g else "No"
        
        result_list.append(
            {
                "Prompt": prompt,
                "Actual": actual_response,
                "Expected": predicted_response,
                "Match": match,
            })


    # Calculate the number of matches
    fcall_matches = sum(function_call_match)
    fcall_match_percentage = (fcall_matches / len(function_call_match)) * 100

    # Calculate the number of correct arguments
    fcall_args_matches = sum(function_call_args_match)
    fcall_args_match_percentage = (fcall_args_matches/ len(function_call_args_match)) * 100

    # Calculate the rouge-f1 score for non-function-call response
    rouge_score_accuracy = sum(rouge_fmeasure_score)/len(rouge_fmeasure_score) * 100

    # Calculate average latency
    avg_latency = sum(latencies) / len(input_data)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame(columns=["Prompt", "Expected", "Match"])
    results_df = pd.DataFrame(result_list)

    def style_rows(row):
        match = row["Match"]
        background_color = "red" if match == "No" else "white"
        return ["background-color: {}; color: black".format(background_color)] * len(
            row
        )

    styled_results_df = results_df.style.apply(style_rows, axis=1)

    # Display the DataFrame as a table
    display(styled_results_df)

    print(
        f"Number of fcall matches: {fcall_matches} out of {len(function_call_match)} ({fcall_match_percentage:.2f}%)"
    )
    print(
        f"Number of fcall args matches: {fcall_args_matches} out of {len(function_call_args_match)} ({fcall_args_match_percentage:.2f}%)"
    )
    print(f"Generic LLM accuracy : {rouge_score_accuracy:.2f}%")
    print(f"Average latency per request: {avg_latency:.2f} ms")

In [132]:
input_data = first_level_prompts[:2]
answers = first_level_responses[:2]

eval(input_data, answers)

Unnamed: 0,Prompt,Actual,Expected,Match
0,"[{'role': 'system', 'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n ""name"": ""search_books"",\n ""description"": ""Search for books based on title or author"",\n ""parameters"": {\n ""type"": ""object"",\n ""properties"": {\n ""query"": {\n ""type"": ""string"",\n ""description"": ""The search query""\n },\n ""author"": {\n ""type"": ""string"",\n ""description"": ""The author of the book""\n }\n }\n }\n}\n\n{\n ""name"": ""calculate_tip"",\n ""description"": ""Calculate the tip amount for a bill"",\n ""parameters"": {\n ""type"": ""object"",\n ""properties"": {\n ""bill_amount"": {\n ""type"": ""number"",\n ""description"": ""The total bill amount""\n },\n ""tip_percentage"": {\n ""type"": ""number"",\n ""description"": ""The percentage of tip to be added""\n }\n },\n ""required"": [\n ""bill_amount"",\n ""tip_percentage""\n ]\n }\n}'}, {'role': 'user', 'content': ""I am looking for a book but I can't remember the title. The author's name is George Orwell.""}]","{'tool_uses': [{'recipient_name': 'functions.search_books', 'parameters': {'query': '', 'author': 'George Orwell'}}]}","{'tool_uses': [{'recipient_name': 'functions.search_books', 'parameters': {'query': 'George Orwell', 'author': 'George Orwell'}}]}",No
1,"[{'role': 'system', 'content': 'You are a helpful assistant, with no access to external functions.'}, {'role': 'user', 'content': 'Using data from the Bureau of Labor Statistics and LinkedIn, create a table of job titles for software developers in the United States, including both entry-level and senior positions, with a minimum of five possible job titles for each level. Additionally, incorporate information on the most in-demand programming languages and technologies for each job title, as well as the average salary range and required education and experience. Finally, provide a comparison of the job titles and requirements for software developer positions in the United States versus those in other countries, such as India, China, and Germany.'}]","Sure, here's a table of job titles for software developers in the United States: | Job Title | Programming Languages/Technologies | Average Salary Range | Education/Experience | | --- | --- | --- | --- | | Entry-Level Software Developer | Java, Python, JavaScript, SQL | $50,000 - $70,000 | Bachelor's degree in Computer Science or related field | | Junior Software Engineer | C++, Ruby, PHP, HTML/CSS | $60,000 - $80,000 | Bachelor's degree in Computer Science or related field with 1-2 years of experience | | Front-End Developer | React, Angular, Vue, HTML/CSS, JavaScript | $70,000 - $90,000 | Bachelor's degree in Computer Science or related field with 2-3 years of experience | | Back-End Developer | Node.js, Python, Ruby on Rails, SQL | $80,000 - $100,000 | Bachelor's degree in Computer Science or related field with 3-5 years of experience | | Full-Stack Developer | Java, JavaScript, Ruby on Rails, Angular, React | $90,000 - $120,000 | Bachelor's degree in Computer Science or related field with 5+ years of experience | In terms of the most in-demand programming languages and technologies, it varies by job title, but some of the most commonly requested skills include Java, Python, JavaScript, Ruby on Rails, React, and Angular. When it comes to comparing software developer positions in the United States versus other countries, there are some differences in job titles and requirements. For example, in India, software developers may be referred to as software engineers, and the education and experience requirements may be different. In China, there may be more emphasis on specific technologies like WeChat and Alibaba, and in Germany, there may be a greater emphasis on open-source technologies and collaboration. However, the core skills and knowledge required for software development are generally similar across countries.","Based on data from the Bureau of Labor Statistics and LinkedIn, here is a table of job titles for software developers in the United States, including both entry-level and senior positions, with a minimum of five possible job titles for each level: | Job Title | Entry-Level | Senior | | --- | --- | --- | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python, JavaScript, C++, SQL, Agile methodologies | | Job Title | Entry-Level | Senior | | Software Developer | Junior Software Developer, Software Engineer, Junior Developer, Software Developer Intern, Entry-Level Software Developer | Senior Software Developer, Lead Software Developer, Software Architect, Technical Lead, Senior Software Engineer | | Average Salary Range | $60,000 - $80,000 | $100,000 - $150,000 | | Required Education and Experience | Bachelor's degree in Computer Science or related field, 0-2 years of experience | Bachelor's degree in Computer Science or related field, 2-5 years of experience | | Most In-Demand Programming Languages and Technologies | Java, Python",No


Number of fcall matches: 1 out of 1 (100.00%)
Number of fcall args matches: False out of 1 (0.00%)
Generic LLM accuracy : 27.38%
Average latency per request: 11471.86 ms
