In [2]:
from reason_agent import ReasonAgent
from refine_agent import RefineAgent
from retrive_agent import RetrieveAgent
from memory_agent import MemoryAgent

import json
import os
import time
import agentscope
from agentscope.message import Msg

KeyboardInterrupt: 

In [16]:
def init_model():
    """Load ``configs/model_config.json`` and register the models with agentscope.

    Credentials are taken from the environment instead of being embedded in
    the notebook:

    * configs whose ``model_type`` is ``"post_api_chat"`` get an
      ``Authorization: Bearer <HTTP_LLM_API_KEY>`` header;
    * every other config gets ``api_key`` from ``DASHSCOPE_API_KEY`` when that
      variable is set (otherwise the value from the config file is kept).

    Raises:
        RuntimeError: a ``post_api_chat`` config is present but
            ``HTTP_LLM_API_KEY`` is unset or empty.
    """
    import warnings

    # models can be configured by loading config file
    with open("configs/model_config.json", "r", encoding="utf-8") as f:
        model_configs = json.load(f)

    http_llm_api_key = os.environ.get("HTTP_LLM_API_KEY")
    dashscope_api_key = os.environ.get("DASHSCOPE_API_KEY")

    for config in model_configs:
        if config.get("model_type", "") == "post_api_chat":
            # for gpt4 API
            if not http_llm_api_key:
                raise RuntimeError(
                    "HTTP_LLM_API_KEY is not set; export it before calling init_model()."
                )
            # setdefault: a config without a "headers" entry must not raise KeyError.
            config.setdefault("headers", {})["Authorization"] = (
                "Bearer " + http_llm_api_key
            )
        elif dashscope_api_key:
            # for dashscope
            config["api_key"] = dashscope_api_key
        else:
            # Do not overwrite the config with the literal string "None".
            warnings.warn(
                "DASHSCOPE_API_KEY is not set; keeping api_key from the config file."
            )
    agentscope.init(model_configs=model_configs)

In [33]:
from agentscope.parsers import MarkdownJsonDictParser
# Load the simplified HotpotQA dev file the demo question is taken from.
with open("data/hotpot_dev_v1_simplified.json", "r", encoding="utf-8") as f:
    hotpotqa = json.load(f)

# Models are registered before any agent is constructed.
init_model()

# Agent settings: entries 0, 1 and 2 supply the args for the reason, refine
# and retrieve agents respectively.
with open("configs/agent_config.json", "r", encoding="utf-8") as f:
    agent_configs = json.load(f)

# The parser's hint dict defines both the expected JSON keys and their order;
# the same keys are routed to the message content and to memory.
analysis_hints = {
    "Key components": "main elements of the problem",
    "Relationship between components": "reason behind your analysis",
    "Clarify the problem": "a clearer and simpler restatement",
    "Scope": "decide the boundary of the problem",
    "Sub-questions": "break into sub-questions",
}
parser_analysis = MarkdownJsonDictParser(
    content_hint=analysis_hints,
    keys_to_content=list(analysis_hints),
    keys_to_memory=list(analysis_hints),
    keys_to_metadata=[],
)

reason_agent = ReasonAgent(parser_analysis=parser_analysis, **agent_configs[0]["args"])
refine_agent = RefineAgent(**agent_configs[1]["args"])
data_sources = ['text', 'category list', 'infobox', 'table', 'images with caption']
retrive_agent = RetrieveAgent(**agent_configs[2]["args"], data_sources=data_sources)
memory_agent = MemoryAgent()
print("Agents initialized.")

# Pick one question and run the structure-analysis step on it.
q = hotpotqa[10]['question']
print(f"Original question:{q}")

# Request payload for the reason agent; the remaining fields start out empty.
x = {
    "type": "structure_analysis",
    "query": q,
    **dict.fromkeys(("analysis", "step", "action", "info")),
}
structure_analysis = reason_agent(x)
print(structure_analysis.content)

Agents initialized.
Original question:What is the name of the fight song of the university whose main campus is in Lawrence, Kansas and whose branch campuses are in the Kansas City metropolitan area?
{'Key components': 'The main elements of the question are the location of the main campus of the university (Lawrence, Kansas), the location of the branch campuses of the university (Kansas City metropolitan area), and the fight song of the mentioned university.', 'Relationship between components': 'The locations are used to specify one particular university. Once the university is determined, the next step is to find out the name of its fight song.', 'Clarify the problem': 'What is the name of the song that is played for spirit or athletic events at the university which is primarily located in Lawrence, Kansas but also has branches in Kansas City metropolitan area?', 'Scope': "This problem involves university identification (based on its main and branch campus locations) and the identific

In [34]:
# Show the format-instruction text exposed by the parser built above.
print(parser_analysis.format_instruction)

Respond a JSON dictionary in a markdown's fenced code block as follows:
```json
{"Key components": "main elements of the problem", "Relationship between components": "reason behind your analysis", "Clarify the problem": "a clearer and simpler restatement", "Scope": "decide the boundary of the problem", "Sub-questions": "break into sub-questions"}
```


### eval

In [3]:
import json
import requests

def llm(input_text, stop=["\n"]):
    """Send ``input_text`` to the GPT-4 HTTP endpoint and return the reply text.

    The request carries a fixed system prompt, a single user message,
    ``n=1`` and ``temperature=0.0``.

    Args:
        input_text: Prompt sent as the user message.
        stop: Kept for backward compatibility only; it is not forwarded to
            the API and is never mutated.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: ``HTTP_LLM_API_KEY`` is unset or empty.
        requests.HTTPError: the endpoint answered with a non-2xx status.
        requests.Timeout: no response arrived within the timeout.
    """
    import os  # local import so this cell does not rely on the first cell having run

    url = "http://47.88.8.18:8088/api/ask"
    # The bearer token is read from the environment; it must not live in the notebook.
    api_key = os.environ.get("HTTP_LLM_API_KEY")
    if not api_key:
        raise RuntimeError(
            "HTTP_LLM_API_KEY is not set; export it before calling llm()."
        )
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key,
    }
    data = {
        "model": 'gpt-4',
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_text},
        ],
        "n": 1,
        "temperature": 0.0,
    }
    # Without a timeout a stalled connection would hang the evaluation loop forever.
    response = requests.post(url, json=data, headers=headers, timeout=120)
    # Fail with the HTTP status rather than a confusing KeyError on an error body.
    response.raise_for_status()
    payload = response.json()
    new_response = payload['data']['response']
    return new_response["choices"][0]["message"]["content"]

def llm_equal(question, gt, pred):
    """Ask the LLM whether ``pred`` matches the ground truth ``gt`` for ``question``.

    Args:
        question: The question text.
        gt: Ground-truth answer.
        pred: Candidate ("suspect") answer to judge.

    Returns:
        The raw reply from ``llm``; the prompt asks for a ``Correct:`` line
        followed by a ``Reason:`` line.
    """
    # The values are interpolated exactly once. The prompt must NOT be passed
    # through str.format afterwards: braces inside the question or answers
    # (e.g. LaTeX such as \frac{1}{2}) would be parsed as replacement fields
    # and raise KeyError/IndexError/ValueError, or be silently rewritten.
    prompt = (
        "I have a ground truth answer and a suspect answer for a question. "
        "I need to determine if the suspect answer is correct by comparing it to the ground truth answer. "
        "Please compare the two answers and let me know if the suspect answer is correct. "
        "Please also provide the reason behind your comparison.\n"
        f"Question:{question}\n"
        f"Ground Truth Answer: {gt}\n"
        f"Suspect Answer: {pred}\n"
        "You need respond in the following strcure.\n\n"
        "Correct:[True or False]\n"
        "Reason:"
    )
    return llm(prompt)

In [9]:
# Load the saved prediction records. The encoding is stated explicitly (as in
# the other open() calls in this notebook) so the read does not depend on the
# platform's default encoding.
with open("output/gpt4_mmluphy_cot.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

In [10]:
# Collect the per-record exact-match and F1 scores, then report their means.
ems = [record['em'] for record in data]
f1s = [record['f1'] for record in data]
for label, scores in (("em", ems), ("f1", f1s)):
    print(f"{label}:{sum(scores)/len(scores)}")

em:0.0
f1:0.146325110678423


In [11]:
# Ask the LLM judge about every record. An explicit loop is used so that the
# replies gathered so far remain in `llm_eval` if a call fails part-way.
llm_eval = []
for item in data:
    verdict_text = llm_equal(
        question=item['question'],
        gt=item['gt'],
        pred=item['pred'],
    )
    llm_eval.append(verdict_text)

In [12]:
def parse_output(text):
    """Parse an LLM verdict of the form ``Correct: <True|False>`` / ``Reason: <text>``.

    Args:
        text: Raw reply string, expected to contain a line starting with
            ``Correct:`` and a line starting with ``Reason:``.

    Returns:
        A dict with two keys:

        * ``"correct"`` - ``True``/``False`` when a verdict line is recognised.
          Without one it is ``False`` for a single-line reply and ``None``
          for a multi-line reply.
        * ``"reason"`` - the text after ``Reason:`` on its line, or ``None``.
    """
    # A reply with no newline cannot follow the two-line template, so it
    # defaults to "incorrect"; a recognised "Correct:" line still overrides this.
    result_dict = {"correct": None if '\n' in text else False, "reason": None}

    for line in text.split('\n'):
        if line.startswith('Correct:'):
            # Slice by the matched prefix only (no trailing space), so that
            # "Correct:True" is not truncated to "rue". Brackets and letter
            # case are normalised because the prompt shows "[True or False]".
            verdict = line[len('Correct:'):].strip().strip('[]').strip().lower()
            if verdict == 'true':
                result_dict["correct"] = True
            elif verdict == 'false':
                result_dict["correct"] = False
        elif line.startswith('Reason:'):
            result_dict["reason"] = line[len('Reason:'):].strip()

    return result_dict

In [13]:
# Turn every raw judge reply into a {"correct", "reason"} dict.
llm_evals = [parse_output(raw_reply) for raw_reply in llm_eval]

# Share of answers the judge marked as correct.
trues = [parsed['correct'] for parsed in llm_evals]
accuracy = sum(trues) / len(trues)
print(f"llm true:{accuracy}")

llm true:0.57
