# 🧪 Prompt Test Runner for Function Calling LLM

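This notebook validates the function-call outputs of an LLM (e.g., GPT-4, Azure OpenAI, Microsoft Fabric LLM) against a labeled test set of utterances and expected results. For each utterance, the function name and parameters returned by the model are compared with the expected values, and the accuracy of each is reported.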

In [None]:
# 📦 Step 1: Load Test Set
import pandas as pd
# Read the labeled test set into `df`. Later cells read its 'utterance',
# 'function' and 'parameters' columns.
df = pd.read_csv(filepath_or_buffer='prompt_test_set.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# 🧠 Step 2: Define your LLM function calling API (mocked for now)
def call_llm(prompt):
    """Return the function call the LLM would make for ``prompt``.

    This is a mock: ``prompt`` is ignored and the same fixed payload is
    returned on every call. Replace the body with a real LLM call
    (OpenAI, Azure OpenAI, Fabric, or a local model) that returns a dict
    with the same two keys.

    Returns:
        A dict with a "function" key (the function name) and a
        "parameters" key (a dict of argument names to values).
    """
    mock_arguments = {"mock_param": "value"}
    mock_response = dict(function="mock_function", parameters=mock_arguments)
    return mock_response

In [None]:
# ✅ Step 3: Evaluate each prompt
import ast
import json

# Column order of the results table. Passing it explicitly keeps the
# columns present even when the test set has no rows.
_RESULT_COLUMNS = [
    "utterance",
    "expected_function",
    "actual_function",
    "match_function",
    "expected_parameters",
    "actual_parameters",
    "match_parameters",
]


def _parse_expected_params(raw):
    """Parse one 'parameters' cell of the test set without executing code.

    The cell is first read as a Python literal (e.g. ``{'city': 'Paris'}``)
    and, if that fails, as JSON (e.g. ``{"enabled": true}``).

    Args:
        raw: The cell value as loaded from the CSV; it is converted with
            ``str()`` before parsing.

    Returns:
        The parsed Python object.

    Raises:
        ValueError: If the cell is neither a Python literal nor valid JSON.
    """
    text = str(raw).strip()
    try:
        # SECURITY: the CSV is external data. literal_eval only accepts
        # literals, so a crafted cell cannot run code the way eval() would.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        pass
    try:
        return json.loads(text)
    except json.JSONDecodeError as err:
        raise ValueError(
            f"Cannot parse expected parameters: {text!r}"
        ) from err


results = []
for _, row in df.iterrows():
    prompt = row['utterance']
    expected_func = row['function']
    expected_params = _parse_expected_params(row['parameters'])

    response = call_llm(prompt)
    actual_func = response['function']
    actual_params = response['parameters']

    # One record per utterance: expected vs. actual, plus exact-match flags.
    results.append({
        "utterance": prompt,
        "expected_function": expected_func,
        "actual_function": actual_func,
        "match_function": expected_func == actual_func,
        "expected_parameters": expected_params,
        "actual_parameters": actual_params,
        "match_parameters": expected_params == actual_params,
    })

df_results = pd.DataFrame(results, columns=_RESULT_COLUMNS)
df_results

In [None]:
# 📊 Step 4: Score the model
# The match_* columns come from == comparisons in the evaluation cell, so
# the mean of each column is the fraction of rows that matched.
accuracy_func, accuracy_params = (
    df_results[flag_column].mean()
    for flag_column in ('match_function', 'match_parameters')
)
print(f"Function Accuracy: {accuracy_func:.2%}")
print(f"Parameter Accuracy: {accuracy_params:.2%}")