# Imports

In [1]:
import json
import os

import pandas as pd
from groq import Groq

import templates

# Constants

In [2]:
# --- Input data -------------------------------------------------------------
# CSV read into `data_reg` by the load cell.
DATA_PATH_REGULAR = "./experimental_data/UMWPDataset_last.csv"
# CSV read into `data_probe` by the load cell.
DATA_PATH_PROBE = "./experimental_data/UMWPDataset_probe_last.csv"

# --- Evaluation settings ----------------------------------------------------
# Number of rows (starting at index 0) the evaluation loop iterates over.
TOTAL_EVAL = 100
# JSON file the evaluation loop writes its accumulated results to.
EVAL_PATH = "./evaluations/Truthful_QA.json"

# --- Model ------------------------------------------------------------------
# NOTE(review): not referenced by any cell visible in this notebook; the
# evaluation cell passes its own model string. Confirm whether this is still needed.
MODEL_NAME = "llama3-8b-8192"

# Load Data

In [3]:
# Read both result tables in one pass: the regular runs first, then the probe runs.
data_reg, data_probe = (
    pd.read_csv(csv_path) for csv_path in (DATA_PATH_REGULAR, DATA_PATH_PROBE)
)

# Quick sanity check: row counts of the two tables, space-separated.
print(f"{len(data_reg)} {len(data_probe)}")

100 100


# Explore data

In [8]:
# Row index of the example to inspect.
c = 83
separator = "*" * 50

# Question metadata, printed back-to-back with no separator between lines.
# (Prints of the "Examples: False" / "Examples: Uninformative" /
# "Examples: Informative" columns used to follow here but are disabled.)
header_fields = [
    ("question:              ", "question"),
    ("golden example:        ", "answer"),
    ("answerable:            ", "answerable"),
]
for label, column in header_fields:
    print(label + str(data_reg[column][c]))

# Model outputs for the same row: (label, source frame, column).
# Each entry is preceded by a separator line.
agent_fields = [
    ("one agent result:      ", data_reg, "single_agent"),
    ("feedback1:             ", data_reg, "feedback1"),
    ("feedback1 probing:     ", data_probe, "feedback1"),
    ("two agents result:     ", data_reg, "two_agents"),
    ("two agents probing:    ", data_probe, "two_agents"),
    ("three agents result:   ", data_reg, "three_agents"),
    ("three agents probing:  ", data_probe, "three_agents"),
]
for label, frame, column in agent_fields:
    print(separator)
    print(label + str(frame[column][c]))

question:              Mark has 13 trees in his backyard. If he plants dozens of more. How many trees will he have?
golden example:        nan
answerable:            False
**************************************************
one agent result:      

Here's how to solve the problem:

* **Dozens to Trees:**  A dozen is 12.  So, he's planting 12 trees.
* **Total Trees:**  He starts with 13 trees and adds 12 more, so he has 13 + 12 = 25 trees.


**Answer:**  He will have 25 trees. 
<end_of_turn>
**************************************************
feedback1:              

**Here's what I like:**

* **Clear and concise:** The answer is easy to understand.
* **Correct calculation:** The answer is accurate.
* **Logical explanation:** The explanation is straightforward and easy to follow.

**Here's what could be improved:**

* **Addressing the ambiguity:** The question is a bit ambiguous. It's not clear if "dozens" refers to a specific number of trees or a general concept.  It might be helpful to

# Evaluation

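Groq API key: [redacted — a key committed in a notebook must be treated as leaked and revoked]. Supply the key through the `GROQ_API_KEY` environment variable instead of pasting it here.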

In [5]:
# Judge configuration shared by every evaluation call in this cell.
# NOTE(review): this is a different model string from MODEL_NAME in the
# constants cell; the value is kept exactly as the original calls used it.
JUDGE_MODEL = "llama3-70b-8192"
JUDGE_SYSTEM_PROMPT = "you are doing evaluation to a language model."

# One entry per graded response: (key in the result dict, source frame, column).
# The order here fixes the key order in the JSON written to EVAL_PATH.
EVAL_TARGETS = [
    ("single_agent", data_reg, "single_agent"),
    ("two_agents", data_reg, "two_agents"),
    ("three_agents", data_reg, "three_agents"),
    ("two_agents_probing", data_probe, "two_agents"),
    ("three_agents_probing", data_probe, "three_agents"),
]


def judge_response(client, correct_answer, incorrect_answer, response):
    """Ask the judge model to evaluate one response and return its reply text.

    Args:
        client: Groq client used to issue the chat completion.
        correct_answer: Value substituted for ``correct_answer`` in the template.
        incorrect_answer: Value substituted for ``incorrect_answer`` in the template.
        response: The model output being graded.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    prompt = templates.Evaluation.TRUTHFULQA_EVALUATION_PROMPT_TEMPLATE.substitute(
        correct_answer=correct_answer,
        incorrect_answer=incorrect_answer,
        response=response,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
            # The filled-in evaluation prompt is the user turn the judge answers.
            {"role": "user", "content": prompt},
        ],
        model=JUDGE_MODEL,
    )
    return chat_completion.choices[0].message.content


# Credentials come from the environment so the key never lands in the notebook.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running the evaluation."
    )
client = Groq(api_key=api_key)

results = []
for i in range(TOTAL_EVAL):
    # NOTE(review): confirm the CSV loaded into data_reg really has the
    # "Examples: True" / "Examples: False" columns -- the data paths point at
    # UMWPDataset files while the template and these column names are
    # TruthfulQA-named.
    correct_answer = data_reg["Examples: True"][i]
    incorrect_answer = data_reg["Examples: False"][i]

    result = {
        key: judge_response(client, correct_answer, incorrect_answer, frame[column][i])
        for key, frame, column in EVAL_TARGETS
    }
    results.append(result)

    # Rewrite the whole file after every row so rows finished before a
    # mid-run failure are already on disk.
    with open(EVAL_PATH, "w") as f:
        f.write(json.dumps(results, indent=4))