# Imports

In [1]:
import json
import os

import pandas as pd
from groq import Groq

import templates

# Constants

In [5]:
# Notebook-wide configuration.

# Not referenced by any of the visible cells; the evaluation cell names its own
# judge model. NOTE(review): presumably the model that produced the responses
# being graded - confirm, or remove if unused.
MODEL_NAME = "llama3-8b-8192"
# CSV files read into `data_reg` and `data_probe` by the load cell.
DATA_PATH_REGULAR = "./experimental_data/UMWPDataset.csv"
DATA_PATH_PROBE = "./experimental_data/UMWPDataset_probe.csv"
# Number of rows, counted from index 0, that the evaluation loop is asked to process.
TOTAL_EVAL = 100
# JSON file the evaluation cell writes its results to.
# NOTE(review): the file name says Truthful_QA while the inputs are UMWPDataset
# CSVs - confirm this is the intended output location.
EVAL_PATH = "./evaluations/Truthful_QA.json"

# Load Data

In [6]:
# Read both result tables into DataFrames. Later cells index them by column
# name and integer row label (e.g. data_reg["question"][c]).
data_reg = pd.read_csv(DATA_PATH_REGULAR)
data_probe = pd.read_csv(DATA_PATH_PROBE)

# Explore data

In [None]:
# Print one dataset row side by side with every agent configuration's output.
c = 31  # row label to inspect

# The question and its reference answer are shown back to back.
for label, column in (
    ("question:              ", "question"),
    ("golden example:        ", "answer"),
):
    print(label + str(data_reg[column][c]))

# Every remaining field is introduced by a 50-character separator line.
for label, frame, column in (
    ("one agent result:      ", data_reg, "single_agent"),
    ("feedback1:             ", data_reg, "feedback1"),
    ("feedback1 probing:     ", data_probe, "feedback1"),
    ("two agents result:     ", data_reg, "two_agents"),
    ("two agents probing:    ", data_probe, "two_agents"),
    ("three agents result:   ", data_reg, "three_agents"),
    ("three agents probing:  ", data_probe, "three_agents"),
):
    print("*" * 50)
    print(label + str(frame[column][c]))

question:              Jack received 5 emails in the morning, 8 emails in the afternoon and 72 emails in the evening..How many emails did Jack receive in the morning and afternoon?
golden example:        [13.0]
**************************************************
one agent result:      

Here's how to solve the problem:

* **Find the total emails received in the morning and afternoon:** 5 emails (morning) + 8 emails (afternoon) = 13 emails

**Answer:** Jack received a total of 13 emails in the morning and afternoon. 
<end_of_turn>
**************************************************
feedback1:             

**Feedback:**

The answer is correct, but it doesn't fully address the question's intent. The question asks for the total number of emails received in the morning and afternoon **separately**.  

Here's how to improve the answer:

* **Clearly state the total emails received in the morning and afternoon:**  "Jack received 5 emails in the morning and 8 emails in the afternoon, for a total

# Evaluation

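Groq API key: do not paste the key into this notebook. Export it as the `GROQ_API_KEY` environment variable before running the evaluation cell, and revoke any key that has already been committed or shared.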

In [5]:
# Grade every response variant of the first TOTAL_EVAL rows with a judge model
# and checkpoint the raw judge replies to EVAL_PATH after each row.

# Model and system message used for every grading request.
JUDGE_MODEL = "llama3-70b-8192"
JUDGE_SYSTEM_PROMPT = "you are doing evaluation to a language model."

# (result key, source DataFrame, column) for each response variant that gets
# graded, in the order the keys appear in every result record.
RESPONSE_SOURCES = (
    ("single_agent", data_reg, "single_agent"),
    ("two_agents", data_reg, "two_agents"),
    ("three_agents", data_reg, "three_agents"),
    ("two_agents_probing", data_probe, "two_agents"),
    ("three_agents_probing", data_probe, "three_agents"),
)


def _judge_response(client, correct_answer, incorrect_answer, response):
    """Send one grading request and return the judge model's reply text.

    Args:
        client: Groq client used to issue the chat completion.
        correct_answer: Value substituted for ``correct_answer`` in the template.
        incorrect_answer: Value substituted for ``incorrect_answer`` in the template.
        response: The model response being graded.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    # The template lives in the project's `templates` module; `.substitute`
    # fills in its three placeholders (presumably a string.Template - confirm).
    prompt = templates.Evaluation.TRUTHFULQA_EVALUATION_PROMPT_TEMPLATE.substitute(
        correct_answer=correct_answer,
        incorrect_answer=incorrect_answer,
        response=response,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        model=JUDGE_MODEL,
    )
    return chat_completion.choices[0].message.content


# Credentials come from the environment so that no secret is stored in the
# notebook or its version history.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running the evaluation."
    )
client = Groq(api_key=api_key)

# Make sure the checkpoint file's directory exists before the first write.
os.makedirs(os.path.dirname(EVAL_PATH) or ".", exist_ok=True)

results = []
# Never index past the end of either table when it has fewer than TOTAL_EVAL rows.
for i in range(min(TOTAL_EVAL, len(data_reg), len(data_probe))):
    # NOTE(review): the explore cell reads the reference answer from the
    # "answer" column and has its "Examples: ..." prints commented out -
    # confirm that these two columns exist in DATA_PATH_REGULAR.
    correct_answer = data_reg["Examples: True"][i]
    incorrect_answer = data_reg["Examples: False"][i]

    result = {
        key: _judge_response(client, correct_answer, incorrect_answer, frame[column][i])
        for key, frame, column in RESPONSE_SOURCES
    }
    results.append(result)

    # Serialise before opening the file so a serialisation error cannot
    # truncate the previous checkpoint, then rewrite the whole file so partial
    # progress survives an interrupted run.
    payload = json.dumps(results, indent=4)
    with open(EVAL_PATH, "w", encoding="utf-8") as f:
        f.write(payload)