# Evaluating OpenAI Functions

This is a first pass evaluation of OpenAI Functions. I consider the following tasks:
1. Information Extraction
2. Tagging

We measure different kinds of errors: 
1. Hallucinating keys
1. Hallucinating values
1. Missing keys
1. Missing values

The evaluation is human, systemic and based on datasets curated and tagged by humans. The comparison function does not use LLMs. This is a honest, fair evaluation to the best of my ability.

## Information Extraction Evaluation

In [11]:
%load_ext autoreload
%autoreload 2

GPT_MODEL = "gpt-3.5-turbo-0613"

import json
from agentai.api import chat_complete, chat_complete_execute_fn
from agentai.annotations import tool, ToolRegistry
from agentai.conversation import Conversation, Message
from pydantic import BaseModel, Field, validate_arguments
from typing import List, Optional

extraction_registry = ToolRegistry()  # Namespace for functions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
class Medicines(BaseModel):
    name: str = Field(..., description="The name of the medicine")
    dosage: Optional[str] = Field(..., description="The dosage of the medicine")
    frequency: Optional[str] = Field(..., description="The frequency of the medicine")
    effect: Optional[str] = Field(..., description="The response to existing medication")


class Symptoms(BaseModel):
    description: str = Field(..., description="The description of the symptom")
    duration: Optional[str] = Field(..., description="The duration of the symptom")
    body_parts: Optional[str] = Field(..., description="The body part which has the visible or experienced symptom")


class PatientInformation(BaseModel):
    gender: str = Field(..., description="Gender of the patient", enum=["Male", "Female", "Other"])
    age: int = Field(..., description="Age of the patient")
    symptoms: Optional[List[Symptoms]] = Field(
        ...,
        description="Symptoms that the patient is currently experiencing. Each symptom should be classified into separate item in the list.",
    )
    medicines: Optional[List[Medicines]] = Field(
        ...,
        description="Medicines that the patient is currently taking. Each medicine should be classified into separate item in the list.",
    )


@tool(extraction_registry)
def patient_history(history: PatientInformation) -> None:
    """
    Save the patient's history which consists of gender, age, symptoms and medicines

    Args:
        history (PatientInformation): The patient's history

    Returns:
        str: The current weather
    """
    print(f"Saving patient history: {history}")


system_ = """You are an AI assistant working with medical notes from a doctor.
- DO NOT make assumptions
- Think about the functions you can call to extract information
- Ask for more information if you need it. Be concise.
- Use `null`, or 'unknown' if you are not confident"""

conversations = [
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient presents with acute headache."),
        ],
        id="single_user_message_simple_medical_information",
    ),
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient was admitted with acute headache."),
            Message(role="user", content="Now also has a low fever."),
        ],
        id="multiple_user_messages_simple_medical_information",
    ),
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(role="user", content="Patient was admitted with acute headache."),
            Message(role="user", content="Now also has a low fever."),
            Message(role="assistant", content="What is the patient's age?"),
            Message(role="user", content="Patient is 33F from Texas."),
        ],
        id="multiple_user_messages_with_assistant_simple_medical_information",
    ),
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(
                role="user",
                content="33F from Texas with acute bronchitis, wet cough and fever.",
            ),
        ],
        id="1_complex_medical_information",
    ),
    Conversation(
        history=[
            Message(role="system", content=system_),
            Message(
                role="user",
                content="""49 y/o Male with chronic macular rash to face & hair, worse in beard, eyebrows & nares. 
Itchy, flaky, slightly scaly. Moderate response to OTC steroid cream""",
            ),
        ],
        id="2_complex_medical_information",
    ),
]

expected_completion_message = [
    {
        "role": "assistant",
        "content": "What is the patient's age?",
    },
    {
        "role": "assistant",
        "content": "Need more info?",
    },
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Female",
                    "age": 33,
                    "symptoms": [
                        {
                            "description": "acute headache",
                            "duration": "unknown",
                            "body_parts": "head",
                        }
                    ],
                    "medicines": [],
                },
            },
        },
        "role": "assistant",
    },
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Female",
                    "age": 33,
                    "symptoms": [
                        {
                            "description": "acute bronchitis",
                            "duration": "unknown",
                            "body_parts": "lungs",
                        },
                        {
                            "description": "wet cough",
                            "duration": "unknown",
                            "body_parts": "chest",
                        },
                        {
                            "description": "fever",
                            "duration": "unknown",
                            "body_parts": "whole body",
                        },
                    ],
                    "medicines": [],
                }
            },
        },
        "role": "assistant",
    },
    {
        "function_call": {
            "function_name": "save_patient_history",
            "arguments": {
                "history": {
                    "gender": "Male",
                    "age": 49,
                    "symptoms": [
                        {
                            "description": "chronic macular rash to face & hair",
                            "duration": "unknown",
                            "body_parts": "face, hair",
                        },
                        {
                            "description": "worse in beard, eyebrows & nares",
                            "duration": "ongoing",
                            "body_parts": "beard, eyebrows, nares",
                        },
                        {
                            "description": "itchy, flaky, slightly scaly",
                            "duration": "unknown",
                            "body_parts": "unknown",
                        },
                    ],
                    "medicines": [
                        {
                            "name": "OTC steroid cream",
                            "dosage": "unknown",
                            "frequency": "unknown",
                            "effect": "moderate response",
                        }
                    ],
                },
            },
        },
        "role": "assistant",
    },
]


def compare_dicts(expected, actual):
    missing_keys = [key for key in expected if key not in actual]
    extra_keys = [key for key in actual if key not in expected]
    incorrect_values = []
    missing_values = []

    for key in expected:
        if key in actual:
            expected_value = expected[key]
            actual_value = actual[key]
            if isinstance(expected_value, dict) and isinstance(actual_value, dict):
                # Recursively compare nested dictionaries
                nested_result = compare_dicts(expected_value, actual_value)
                if nested_result != {
                    "missing_keys": [],
                    "extra_keys": [],
                    "incorrect_values": [],
                    "missing_values": [],
                }:
                    incorrect_values.append({key: nested_result})
            else:
                if expected_value != actual_value:
                    incorrect_values.append(key)
                if actual_value is None or actual_value == "":
                    missing_values.append(key)

    return {
        "missing_keys": missing_keys,
        "extra_keys": extra_keys,
        "incorrect_values": incorrect_values,
        "missing_values": missing_values,
    }


def evaluate_chat_completions(conversations, expected_completions, model, tool_registry):
    results = []
    for convo, expected in zip(conversations, expected_completions):
        completion = chat_complete(convo, model=model, tool_registry=tool_registry)
        completion_message = completion.choices[0].message

        if "function_call" in completion_message:
            # If the completion message has a function call, we compare the arguments of the function call.
            expected_arguments = expected["function_call"]["arguments"]
            try:
                arguments = json.loads(completion_message["function_call"]["arguments"])
            except json.decoder.JSONDecodeError:
                arguments = eval(completion_message["function_call"]["arguments"])
                if not isinstance(arguments, dict):
                    result = {
                        "missing_keys": [],
                        "extra_keys": [],
                        "incorrect_values": [],
                        "missing_values": ["arguments"],
                    }
            else:
                result = compare_dicts(expected_arguments, arguments)
                actual = arguments
        else:
            if "content" not in expected:
                # Was expecting a function call, but got a message instead.
                result = {
                    "missing_keys": [],
                    "extra_keys": [],
                    "incorrect_values": [],
                    "missing_values": ["content"],
                }
            else:
                result = True
            actual = completion_message["content"]

        results.append(
            {
                "conversation_id": convo.id,
                "expected": expected,
                "actual": actual,
                "result": result,
            }
        )

    return results


evaluation_results = evaluate_chat_completions(
    conversations,
    expected_completion_message,
    model=GPT_MODEL,
    tool_registry=extraction_registry,
)

In [46]:
evaluation_results

[{'conversation_id': 'single_user_message_simple_medical_information',
  'expected': {'role': 'assistant', 'content': "What is the patient's age?"},
  'actual': 'I see. Has the patient mentioned any other symptoms or provided any additional information about the headache?',
  'result': True},
 {'conversation_id': 'multiple_user_messages_simple_medical_information',
  'expected': {'role': 'assistant', 'content': 'Need more info?'},
  'actual': 'Thank you for the update. \n\nTo better assist you, I would need some additional information:\n\n1. Has the patient experienced any other symptoms besides the acute headache and low fever?\n2. How long has the patient been experiencing these symptoms?\n3. Is there any specific part of the body where the headache is located?\n4. Has the patient taken any medication for the headache or fever?\n\nPlease provide the necessary details so that I can help you further.',
  'result': True},
 {'conversation_id': 'multiple_user_messages_with_assistant_simpl

In [47]:
from collections import defaultdict

def summarize_results(results):
    summary = defaultdict(lambda: defaultdict(int))

    for result in results:
        convo_id = result['conversation_id']
        if result['result'] == True:
            summary[convo_id]['correct'] += 1
        if result['result'] != True:  # If the result is not True, it's a dictionary with error details
            if result['result']['missing_keys']:
                summary[convo_id]['missing_keys'] += len(result['result']['missing_keys'])
                
            if result['result']['extra_keys']:
                summary[convo_id]['extra_keys'] += len(result['result']['extra_keys'])
                
            if result['result']['incorrect_values']:
                summary[convo_id]['incorrect_values'] += len(result['result']['incorrect_values'])
                
            if result['result']['missing_values']:
                summary[convo_id]['missing_values'] += len(result['result']['missing_values'])
        else:
            summary[convo_id]['correct'] += 1

    return dict(summary)

summarize_results(evaluation_results)

{'single_user_message_simple_medical_information': defaultdict(int,
             {'correct': 2}),
 'multiple_user_messages_simple_medical_information': defaultdict(int,
             {'correct': 2}),
 '1_complex_medical_information': defaultdict(int, {'incorrect_values': 1}),
 '2_complex_medical_information': defaultdict(int, {'incorrect_values': 1})}