# Evaluating OpenAI Functions

This is a first-pass evaluation of OpenAI Functions. I consider the following tasks:
1. Information Extraction
2. Tagging

We measure different kinds of errors: 
1. Hallucinating keys
1. Hallucinating values
1. Missing keys
1. Missing values

The evaluation is human-driven, systematic, and based on datasets curated and tagged by humans. The comparison function does not use LLMs. This is an honest, fair evaluation to the best of my ability.

## Information Extraction Evaluation

In [9]:
%load_ext autoreload
%autoreload 2

GPT_MODEL = "gpt-3.5-turbo-0613"

import json
from collections import defaultdict
from agentai.api import chat_complete, chat_complete_execute_fn
from agentai.annotations import tool, ToolRegistry
from agentai.conversation import Conversation, Message
from pydantic import BaseModel, Field, validate_arguments
from typing import List, Optional
from tqdm.auto import tqdm

extraction_registry = ToolRegistry()  # Namespace for functions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# One medication entry of the patient's history; used as the item type of
# `PatientInformation.medicines`.
# Only `name` is a plain `str`; the other fields are typed `Optional[str]`.
# NOTE(review): every field passes `...` as its default, which presumably marks
# it as required-but-nullable in pydantic -- confirm against the pydantic
# version in use.
class Medicines(BaseModel):
    name: str = Field(..., description="The name of the medicine")
    dosage: Optional[str] = Field(..., description="The dosage of the medicine")
    frequency: Optional[str] = Field(..., description="The frequency of the medicine")
    effect: Optional[str] = Field(..., description="The response to existing medication")


# One symptom entry of the patient's history; used as the item type of
# `PatientInformation.symptoms`.
class Symptoms(BaseModel):
    description: str = Field(..., description="The description of the symptom")
    # The allowed values are passed as `enum`, the same keyword the `gender`
    # field of `PatientInformation` uses. The previous spelling `enums` is not
    # a JSON Schema keyword, so the list was not advertised as a constraint.
    duration: Optional[str] = Field(
        ..., description="The duration of the symptom", enum=["days", "weeks", "months", "years", "ongoing"]
    )
    body_parts_or_organs: Optional[str] = Field(
        ...,
        description="The body parts affected by the symptom e.g. head, liver. Give me a single comma separated list of body parts.",
    )


# Top-level schema of the `history` argument accepted by `patient_history`:
# the patient's gender and age plus lists of `Symptoms` and `Medicines` entries.
# NOTE(review): `symptoms` and `medicines` are typed Optional but use `...` as
# default, which presumably makes them required-but-nullable -- confirm against
# the pydantic version in use.
class PatientInformation(BaseModel):
    gender: str = Field(..., description="Gender of the patient", enum=["Male", "Female", "Other"])
    age: int = Field(..., description="Age of the patient")
    symptoms: Optional[List[Symptoms]] = Field(
        ...,
        description="Symptoms that the patient is currently experiencing. Each symptom should be classified into separate item in the list.",
    )
    medicines: Optional[List[Medicines]] = Field(
        ...,
        description="Medicines that the patient is currently taking. Each medicine should be classified into separate item in the list.",
    )


@tool(extraction_registry)
def patient_history(history: PatientInformation) -> None:
    """
    Save the patient's history which consists of gender, age, symptoms and medicines

    Args:
        history (PatientInformation): The patient's history

    Returns:
        None: The history is only printed; nothing is returned
    """
    # Placeholder "save": the evaluation only inspects the arguments the model
    # proposes, so printing is sufficient here.
    print(f"Saving patient history: {history}")


# System prompt placed as the first message of every evaluation conversation.
# NOTE(review): the last bullet tells the model to "Use None" and in the same
# breath "Never use null", although JSON can only express a missing value as
# null -- confirm the intended wording before changing it, since any edit
# alters what is being evaluated.
system_ = """You are an AI assistant and programmer. You need to extract the patient's history from the notes.
- Think about the functions you can call to extract information
- Ask for more information if you need it. Be concise.
- DO NOT make assumptions or guess. Only extract information that is explicitly stated. Do not infer 
- Use None unless you are very sure of the value. Never use null or unknown."""

# Evaluation inputs. Each conversation starts with the `system_` prompt and is
# paired by position with the entry at the same index of
# `expected_completion_message` (the evaluation loop zips the two lists).
conversations = [
    # 0: a single symptom, no age/gender given.
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient presents with acute headache."),
        ],
        id="single_user_message_simple_medical_information",
    ),
    # 1: two consecutive user messages, still no age/gender.
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient was admitted with acute headache."),
            Message(role="user", content="Now also has a low fever."),
        ],
        id="multiple_user_messages_simple_medical_information",
    ),
    # 2: same as 1, followed by an assistant question and the user's answer
    # supplying age and gender ("33F").
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient was admitted with acute headache."),
            Message(role="user", content="Now also has a low fever."),
            Message(role="assistant", content="What is the patient's age?"),
            Message(role="user", content="Patient is 33F from Texas."),
        ],
        id="multiple_user_messages_with_assistant_simple_medical_information",
    ),
    # 3: one message carrying age, gender and three symptoms.
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(
                role="user",
                content="33F from Texas with acute asthma, wet cough and fever.",
            ),
        ],
        id="1_complex_medical_information",
    ),
    # 4: one clinical-shorthand message with symptoms, body parts and a medication response.
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(
                role="user",
                content="""49 y/o Male with chronic macular rash to face & hair, worse in beard, eyebrows & nares. Itchy, flaky, slightly scaly. Moderate response to OTC steroid cream""",
            ),
        ],
        id="2_complex_medical_information",
    ),
]

# Hand-written expected completions, one per entry of `conversations` (same
# order). An entry with "content" expects a plain assistant message; an entry
# with "function_call" expects a function call whose
# ["arguments"]["history"] is compared field by field.
# NOTE(review): "function_name" is "save_patient_history" while the registered
# tool is named `patient_history`; the evaluation loop only reads
# ["function_call"]["arguments"]["history"], so the name is never compared.
expected_completion_message = [
    # 0: not enough information -> the assistant should ask a question.
    {
        "role": "assistant",
        "content": "What is the patient's age?",
    },
    # 1: still not enough information -> the assistant should ask a question.
    {
        "role": "assistant",
        "content": "Need more info?",
    },
    # 2: age and gender are now known -> function call expected.
    # NOTE(review): the conversation also mentions "a low fever", which is not
    # listed among the expected symptoms -- confirm whether that is intentional.
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Female",
                    "age": 33,
                    "symptoms": [
                        {
                            "description": "acute headache",
                            "duration": None,
                            "body_parts_or_organs": None,
                        }
                    ],
                    "medicines": [],
                },
            },
        },
        "role": "assistant",
    },
    # 3: three symptoms, no medicines.
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Female",
                    "age": 33,
                    "symptoms": [
                        {"description": "acute asthma", "duration": None, "body_parts_or_organs": None},
                        {"description": "wet cough", "duration": None, "body_parts_or_organs": None},
                        {"description": "fever", "duration": None, "body_parts_or_organs": None},
                    ],
                    "medicines": [],
                }
            },
        },
        "role": "assistant",
    },
    # 4: three symptom entries plus one medicine with its observed effect.
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Male",
                    "age": 49,
                    "symptoms": [
                        {
                            "description": "chronic macular rash to face & hair",
                            "duration": None,
                            "body_parts_or_organs": "face & hair",
                        },
                        {
                            "description": "worse in beard, eyebrows & nares",
                            "duration": None,
                            "body_parts_or_organs": "beard, eyebrows, nares",
                        },
                        {
                            "description": "itchy, flaky, slightly scaly",
                            "duration": None,
                            "body_parts_or_organs": None,
                        },
                    ],
                    "medicines": [
                        {
                            "name": "OTC steroid cream",
                            "dosage": None,
                            "frequency": None,
                            "effect": "moderate response",
                        }
                    ],
                },
            },
        },
        "role": "assistant",
    },
]


def _value_in_conversation(value, convo):
    """Return True when ``value`` occurs (case-insensitively) inside any message of ``convo``."""
    needle = str(value).lower()
    return any(needle in str(message.content).lower() for message in convo.history if message.content)


def compare_dicts(expected, actual, convo, parent_key=""):
    """
    Compare an extracted dictionary against the hand-written expectation.

    Nested dictionaries and lists are walked recursively and every finding is
    reported under its full path (e.g. ``symptoms[0].duration``).

    Args:
        expected (dict): The expected (human-curated) values.
        actual (dict): The values produced by the model.
        convo: The conversation the values were extracted from; only
            ``convo.history`` and each message's ``content`` are read.
        parent_key (str): Path prefix of ``expected`` inside the overall
            structure. Empty for the top-level call.

    Returns:
        dict: Five lists of key paths:
            - ``missing_keys``: expected keys (or list items) absent from ``actual``
            - ``extra_keys``: keys (or list items) in ``actual`` that were not expected
            - ``llm_hallucinated``: wrong values that appear nowhere in the conversation
            - ``incorrect``: wrong values that are empty or do appear in the conversation
            - ``missing_values``: values left empty although a value was expected
    """

    def qualify(key):
        return f"{parent_key}.{key}" if parent_key else key

    result = {
        "missing_keys": [qualify(key) for key in expected if key not in actual],
        "extra_keys": [qualify(key) for key in actual if key not in expected],
        "llm_hallucinated": [],
        "incorrect": [],
        "missing_values": [],
    }

    def absorb(nested):
        # Keep every bucket of a nested comparison; dropping some of them would
        # hide missing keys/values that sit below the top level.
        for bucket, entries in nested.items():
            result[bucket].extend(entries)

    def compare_leaf(full_key, expected_value, actual_value):
        actual_is_empty = actual_value is None or actual_value == ""
        if expected_value != actual_value:
            # A non-empty wrong value that cannot be found in any message was
            # made up by the model; otherwise it is merely incorrect.
            if not actual_is_empty and not _value_in_conversation(actual_value, convo):
                result["llm_hallucinated"].append(full_key)
            else:
                result["incorrect"].append(full_key)
        # An empty value is only "missing" when a value was actually expected.
        expected_is_empty = expected_value is None or expected_value == ""
        if actual_is_empty and not expected_is_empty:
            result["missing_values"].append(full_key)

    for key, expected_value in expected.items():
        if key not in actual:
            continue
        full_key = qualify(key)
        actual_value = actual[key]

        if isinstance(expected_value, list) and isinstance(actual_value, list):
            for idx, (expected_item, actual_item) in enumerate(zip(expected_value, actual_value)):
                item_key = f"{full_key}[{idx}]"
                if isinstance(expected_item, dict) and isinstance(actual_item, dict):
                    absorb(compare_dicts(expected_item, actual_item, convo, item_key))
                else:
                    compare_leaf(item_key, expected_item, actual_item)
            # zip() stops at the shorter list, so report the unmatched tail explicitly.
            for idx in range(len(actual_value), len(expected_value)):
                result["missing_keys"].append(f"{full_key}[{idx}]")
            for idx in range(len(expected_value), len(actual_value)):
                result["extra_keys"].append(f"{full_key}[{idx}]")
        elif isinstance(expected_value, dict) and isinstance(actual_value, dict):
            absorb(compare_dicts(expected_value, actual_value, convo, full_key))
        else:
            compare_leaf(full_key, expected_value, actual_value)

    return result


def _blank_result(**overrides):
    """Return a result dict with every error bucket empty except those given in ``overrides``."""
    result = {
        "missing_keys": [],
        "extra_keys": [],
        "llm_hallucinated": [],
        "incorrect": [],
        "missing_values": [],
    }
    result.update(overrides)
    return result


def _parse_function_arguments(raw_arguments):
    """Parse the function-call argument string; return a dict, or None when it cannot be parsed into one."""
    import ast  # local import: only needed for the Python-literal fallback below

    try:
        parsed = json.loads(raw_arguments)
    except json.decoder.JSONDecodeError:
        # The model sometimes emits Python-style literals (single quotes, None).
        # SECURITY: this used to be eval() on model output; literal_eval accepts
        # the same literal payloads without executing arbitrary code.
        try:
            parsed = ast.literal_eval(raw_arguments)
        except (ValueError, SyntaxError):
            return None
    return parsed if isinstance(parsed, dict) else None


def _score_function_call(completion_message, expected, convo):
    """Score a completion that contains a function call; return ``(result, actual)``."""
    try:
        expected_arguments = expected["function_call"]["arguments"]["history"]
    except KeyError:
        # Was expecting a message, but got a function call instead.
        missing = [] if "content" in expected else ["content"]
        return _blank_result(extra_keys=["function_call"], missing_values=missing), completion_message

    raw_arguments = completion_message["function_call"]["arguments"]
    arguments = _parse_function_arguments(raw_arguments)
    if arguments is None:
        # Record unparsable arguments instead of aborting the whole run.
        return _blank_result(missing_values=["arguments"]), raw_arguments

    history = arguments.get("history")
    if not isinstance(history, dict):
        return _blank_result(missing_values=["history"]), arguments

    return compare_dicts(expected_arguments, history, convo), history


def evaluate_chat_completions(conversations, expected_completions, model, tool_registry, iterations):
    """
    Run every conversation through the model repeatedly and score each completion.

    Args:
        conversations: Conversations to complete; paired by position with
            ``expected_completions`` (extra entries on either side are ignored).
        expected_completions (list[dict]): Expected assistant message per
            conversation, holding either ``"content"`` or ``"function_call"``.
        model (str): Model name forwarded to ``chat_complete``.
        tool_registry: Tool registry forwarded to ``chat_complete``.
        iterations (int): How many times the full set of conversations is run.

    Returns:
        list[dict]: One record per completion with the keys ``conversation_id``,
        ``expected``, ``actual`` and ``result``. ``result`` always has the keys
        ``missing_keys``, ``extra_keys``, ``llm_hallucinated``, ``incorrect``
        and ``missing_values``.
    """
    results = []
    for _ in tqdm(range(iterations)):
        for convo, expected in zip(conversations, expected_completions):
            completion = chat_complete(convo, model=model, tool_registry=tool_registry)
            completion_message = completion.choices[0].message

            if "function_call" in completion_message:
                # Compare the arguments of the function call with the expectation.
                result, actual = _score_function_call(completion_message, expected, convo)
            else:
                # Plain message: only an error when a function call was expected.
                # The message text itself is not compared.
                missing = [] if "content" in expected else ["content"]
                result = _blank_result(missing_values=missing)
                actual = completion_message["content"]

            results.append(
                {
                    "conversation_id": convo.id,
                    "expected": expected,
                    "actual": actual,
                    "result": result,
                }
            )

    return results


# Run the extraction evaluation: every conversation is completed `iterations`
# times (5 conversations x 100 iterations = 500 result records) and the full
# list of records is printed as indented JSON.
evaluation_results = evaluate_chat_completions(
    conversations,
    expected_completion_message,
    model=GPT_MODEL,
    tool_registry=extraction_registry,
    iterations=100,
)
print(json.dumps(evaluation_results, indent=4))

  0%|          | 0/100 [00:00<?, ?it/s]

[
    {
        "conversation_id": "single_user_message_simple_medical_information",
        "expected": {
            "role": "assistant",
            "content": "What is the patient's age?"
        },
        "actual": "Can you provide any additional information about the patient's symptoms?",
        "result": {
            "missing_keys": [],
            "extra_keys": [],
            "llm_hallucinated": [],
            "incorrect": [],
            "missing_values": []
        }
    },
    {
        "conversation_id": "multiple_user_messages_simple_medical_information",
        "expected": {
            "role": "assistant",
            "content": "Need more info?"
        },
        "actual": "Please provide more information about the patient's history.",
        "result": {
            "missing_keys": [],
            "extra_keys": [],
            "llm_hallucinated": [],
            "incorrect": [],
            "missing_values": []
        }
    },
    {
        "conversation_id": "

In [13]:
len(evaluation_results)

500

In [2]:
import json
# Persist the evaluation records as indented JSON.
# Needs `evaluation_results` from the evaluation cell to exist in the current
# kernel session; running this cell on a fresh kernel raises NameError.
# NOTE(review): the file name "evaluation_resuls.json" looks like a typo of
# "results"; it is kept as-is because other code may read this exact path.
with open("evaluation_resuls.json", "w") as results_file:
    json.dump(evaluation_results, results_file, indent=4)

NameError: name 'evaluation_results' is not defined