In [1]:
from openai import OpenAI
import replicate, json
from tools import sampleQuestionsFromMMLU, callLLM, calculateAccuracy
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain.agents import tool
from termcolor import colored 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client used for every chat-completion request below (constructed with default settings).
client = OpenAI()

# How many MMLU questions the agent is asked to sample and evaluate.
n_questions = 3

# System message placed at the start of the conversation.
sys_prompt = {
    "role": "system",
    "content": "You are an agent that quantitatively evaluates exactly three other LLMs on sample MMLU questions. The three models are meta/meta-llama-3-8b, meta/llama-2-7b-chat, meta/llama-2-7b. You will call appropriate functions to achieve this goal.",
}

# Ordered steps of the evaluation. Each entry pairs the user instruction for
# that step with the list of Python functions associated with it (an empty
# list means the step has no functions attached).
workflow = [
    (
        f"Sample {n_questions} questions from MMLU dataset.",
        [sampleQuestionsFromMMLU],
    ),
    (
        f"Ask each of the three LLMs to respond with a choice to all {n_questions} questions and check their answers. Each LLM should be asked all {n_questions} with their choices.",
        [callLLM],
    ),
    (
        "Calculate the number of correct answers for each model and then calculate the accuracy of each model",
        [calculateAccuracy],
    ),
    (
        "Report the accuracy of all the models. What is the best model?",
        [],
    ),
]

In [3]:
def pretty_print_conversation(messages):
    """Print a conversation, one colour per role.

    Args:
        messages: Iterable of message dicts. Each must have a ``"role"`` key.
            ``system``, ``user`` and ``function`` messages need ``"content"``
            (``function`` messages also need ``"name"``); ``assistant``
            messages show ``"function_call"`` when it is present and truthy,
            otherwise ``"content"``. Messages with any other role are skipped.
    """
    palette = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "function": "magenta",
    }

    for entry in messages:
        speaker = entry["role"]
        if speaker == "system":
            text = f"system: {entry['content']}\n"
        elif speaker == "user":
            text = f"user: {entry['content']}\n"
        elif speaker == "assistant":
            # A pending function call takes precedence over plain content.
            if entry.get("function_call"):
                text = f"assistant: {entry['function_call']}\n"
            else:
                text = f"assistant: {entry['content']}\n"
        elif speaker == "function":
            text = f"function ({entry['name']}): {entry['content']}\n"
        else:
            # Unrecognised roles produce no output.
            continue
        print(colored(text, palette[speaker]))

In [4]:
def requires_tool_calls(response):
    """Return True when the first choice's finish reason is ``"tool_calls"``."""
    first_choice = response.choices[0]
    return first_choice.finish_reason == "tool_calls"

def gpt_process_function_calling(response, available_functions=None):
    """Run the tool calls requested by ``response``, or return its text reply.

    Only ``response.choices[0]`` is inspected.

    Args:
        response: Chat-completion response object.
        available_functions: Optional mapping of tool name -> callable that
            the model is allowed to invoke. When omitted, names are resolved
            in this module's global namespace, as before.

    Returns:
        str: For a tool-call response, the string results of all requested
        functions in call order, each followed by a blank-line separator.
        Otherwise the message content, or an empty string when the content
        is ``None``.

    Raises:
        KeyError: If the model requests a function name that cannot be
            resolved.
    """
    message = response.choices[0].message

    if not requires_tool_calls(response):
        # Content can be None; the previous `"" + None` raised TypeError here.
        return message.content or ""

    # NOTE(security): with the globals() fallback the model-supplied name can
    # reach any module-level callable. Pass `available_functions` to restrict
    # the model to an explicit allow-list.
    registry = globals() if available_functions is None else available_functions

    tool_calls = message.tool_calls
    print(f"Tool calls {[call.function.name for call in tool_calls]}")

    pieces = []
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        arguments = json.loads(tool_call.function.arguments)
        if function_name not in registry:
            raise KeyError(f"Model requested unknown function {function_name!r}")
        pieces.append(registry[function_name](**arguments) + "\n\n ")
    return "".join(pieces)

In [5]:
# Run the workflow step by step, keeping one growing conversation history.
messages = [sys_prompt]
for instruction, functions in workflow:
    if functions:
        # Expose this step's functions to the model as OpenAI tool schemas.
        functions = [convert_to_openai_tool(tool(f)) for f in functions]
        kwargs = {"tools": functions}
    else:
        # No tools for this step: the request is sent without a `tools` argument.
        kwargs = {}

    messages.append({"role": "user", "content": instruction})

    output = client.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=messages,
        seed=0,
        **kwargs,
    )
    # Either the executed tool results or the model's text reply; it is
    # stored in the history as the assistant turn for the next step.
    output = gpt_process_function_calling(output)
    messages.append({"role": "assistant", "content": output})

pretty_print_conversation(messages)


Tool calls ['sampleQuestionsFromMMLU']
Tool calls ['callLLM', 'callLLM', 'callLLM']
Tool calls ['calculateAccuracy']
[31msystem: You are an agent that quantitatively evaluates exactly three other LLMs on sample MMLU questions. The three models are meta/meta-llama-3-8b, meta/llama-2-7b-chat, meta/llama-2-7b. You will call appropriate functions to achieve this goal.
[0m
[32muser: Sample 3 questions from MMLU dataset.
[0m
[34massistant: Question 1: Identify the conclusion of the following argument. It is hard not to verify in our peers the same weakened intelligence due to emotions that we observe in our everyday patients. The arrogance of our consciousness, which in general, belongs to the strongest defense mechanisms, blocks the unconscious complexes. Because of this, it is difficult to convince people of the unconscious, and in turn to teach them what their conscious knowledge contradicts. (Sigmund Freud, The Origin and Development of Psychoanalysis) 

Choices:
 1. It is hard not 