In [None]:
from openai import OpenAI
import json
from tools import sampleQuestionsFromMMLU, callLLM, calculateAccuracy
from utils import pretty_print_conversation, gpt_process_function_calling
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain.agents import tool
from termcolor import colored 

In [None]:
# Client for the orchestrating chat model. No arguments are passed, so the SDK's
# default configuration is used (presumably the OPENAI_API_KEY environment
# variable -- confirm for your environment).
client = OpenAI()

# Models under evaluation, in the order they are queried and scored.
models_to_evaluate = ["meta/meta-llama-3-8b", "meta/llama-2-7b-chat", "meta/llama-2-7b"]

# Number of MMLU questions to sample; interpolated into the prompts below.
n_questions = 5

# NOTE: the prompt says "exactly three" LLMs; keep it in sync with
# len(models_to_evaluate) if that list is ever edited.
sys_prompt = {
    "role": "system",
    "content": "You are an agent that quantitatively evaluates exactly three other LLMs on sample MMLU questions. You will call appropriate functions to achieve this goal."
}

# Ordinal used in each "ask" prompt. Previously every step said "first LLM"
# regardless of which model it named, which contradicted the model name that
# followed it.
ordinals = ("first", "second", "third")

# One "ask the questions" step per model, each exposing only callLLM.
ask_steps = [
    (
        f"Ask all above {n_questions} questions to the {ordinal} LLM {model} and check their answers. "
        f"LLM should be asked all {n_questions} with their choices.",
        [callLLM],
    )
    for ordinal, model in zip(ordinals, models_to_evaluate)
]

# One "score the answers" step per model, each exposing only calculateAccuracy.
accuracy_steps = [
    (
        f"Calculate the number of correct answers for {model} and then calculate the accuracy of each model",
        [calculateAccuracy],
    )
    for model in models_to_evaluate
]

# Ordered list of (instruction, functions) pairs: the instruction is sent as a
# user message and the functions are the tools offered for that step. An empty
# list means the step is answered without tools.
workflow = [
    (f"Sample {n_questions} questions from MMLU dataset.", [sampleQuestionsFromMMLU]),
    *ask_steps,
    *accuracy_steps,
    ("Report the accuracy of all the models. What is the best model?", []),
]

In [None]:
# Run the workflow one step at a time. The full transcript is resent on every
# request, so each step sees all earlier instructions and replies.
messages = [sys_prompt]
for instruction, functions in workflow:
    # Only pass `tools` when this step actually exposes functions.
    kwargs = {}
    if functions:
        functions = [convert_to_openai_tool(tool(fn)) for fn in functions]
        kwargs["tools"] = functions

    messages.append({"role": "user", "content": instruction})
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        seed=0,
        **kwargs,
    )

    # The helper's return value is recorded as the assistant turn for this step.
    output = gpt_process_function_calling(response)
    messages.append({"role": "assistant", "content": output})

pretty_print_conversation(messages)
