In [2]:
from reason_agent import ReasonAgent
from refine_agent import RefineAgent
from retrive_agent import RetrieveAgent
from memory_agent import MemoryAgent

import json
import os
import time
import agentscope
from agentscope.message import Msg

In [16]:
def init_model():
    HTTP_LLM_API_KEY='eyJ0eXAiOiJqd3QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6IjM5NDc3MyIsInBhc3N3b3JkIjoiMzk0NzczMTIzIiwiZXhwIjoyMDIxNjE4MzE3fQ.oQx2Rh-GJ_C29AfHTHE4x_2kVyy7NamwQRKRA4GPA94'
    # models can be configured by loading config file
    with open("configs/model_config.json", "r", encoding="utf-8") as f:
        model_configs = json.load(f)
    for config in model_configs:
        if config.get("model_type", "") == "post_api_chat":
            # for gpt4 API
            config["headers"]["Authorization"] = (
                    "Bearer " + HTTP_LLM_API_KEY
            )
        else:
            # for dashscope
            config["api_key"] = f"{os.environ.get('DASHSCOPE_API_KEY')}"
    agentscope.init(model_configs=model_configs)

In [33]:
from agentscope.parsers import MarkdownJsonDictParser
with open("data/hotpot_dev_v1_simplified.json", "r", encoding="utf-8") as f:
        hotpotqa = json.load(f)
#initalize models
init_model()
#initialize agents
with open("configs/agent_config.json", "r", encoding="utf-8") as f:
    agent_configs = json.load(f)
parser_analysis = MarkdownJsonDictParser(
    content_hint={
        "Key components": "main elements of the problem",
        "Relationship between components": "reason behind your analysis",
        "Clarify the problem": "a clearer and simpler restatement",
        "Scope":"decide the boundary of the problem",
        "Sub-questions":"break into sub-questions"
    },
    keys_to_content=["Key components", "Relationship between components", "Clarify the problem", "Scope", "Sub-questions"],
    keys_to_memory=["Key components", "Relationship between components", "Clarify the problem", "Scope", "Sub-questions"],
    keys_to_metadata=[]
)
reason_agent = ReasonAgent(parser_analysis=parser_analysis,**agent_configs[0]["args"])
refine_agent = RefineAgent(**agent_configs[1]["args"])
data_sources = ['text', 'category list', 'infobox', 'table', 'images with caption']
retrive_agent = RetrieveAgent(**agent_configs[2]["args"], data_sources=data_sources)
memory_agent = MemoryAgent()
print(f"Agents initialized.")
    
q=hotpotqa[10]['question']
print(f"Original question:{q}")
# structure analysis
x={
    "type": "structure_analysis",
    "query": q,
    "analysis": None,
    "step": None,
    "action": None,
    "info": None
    }
structure_analysis = reason_agent(x)
print(structure_analysis.content)

Agents initialized.
Original question:What is the name of the fight song of the university whose main campus is in Lawrence, Kansas and whose branch campuses are in the Kansas City metropolitan area?
{'Key components': 'The main elements of the question are the location of the main campus of the university (Lawrence, Kansas), the location of the branch campuses of the university (Kansas City metropolitan area), and the fight song of the mentioned university.', 'Relationship between components': 'The locations are used to specify one particular university. Once the university is determined, the next step is to find out the name of its fight song.', 'Clarify the problem': 'What is the name of the song that is played for spirit or athletic events at the university which is primarily located in Lawrence, Kansas but also has branches in Kansas City metropolitan area?', 'Scope': "This problem involves university identification (based on its main and branch campus locations) and the identific

In [34]:
print(parser_analysis.format_instruction)

Respond a JSON dictionary in a markdown's fenced code block as follows:
```json
{"Key components": "main elements of the problem", "Relationship between components": "reason behind your analysis", "Clarify the problem": "a clearer and simpler restatement", "Scope": "decide the boundary of the problem", "Sub-questions": "break into sub-questions"}
```


### eval

In [1]:
import json
import requests

def llm(input_text, stop=["\n"]):
    url = "http://47.88.8.18:8088/api/ask"
    HTTP_LLM_API_KEY='eyJ0eXAiOiJqd3QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6IjM5NDc3MyIsInBhc3N3b3JkIjoiMzk0NzczMTIzIiwiZXhwIjoyMDIxNjE4MzE3fQ.oQx2Rh-GJ_C29AfHTHE4x_2kVyy7NamwQRKRA4GPA94'
    headers = {
                "Content-Type": "application/json",
                "Authorization": "Bearer " + HTTP_LLM_API_KEY
                }
    data = {
            "model": 'gpt-4',
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": input_text}
            ],
            "n": 1,
            "temperature": 0.0,
            # "stop": ["\n"]
            }
    response = requests.post(url, json=data, headers=headers)
    response = response.json()
    new_response = response['data']['response']
    return new_response["choices"][0]["message"]["content"]

def llm_equal(question, gt, pred):
    prompt = f"I have a ground truth answer and a suspect answer for a question. I need to determine if the suspect answer is correct by comparing it to the ground truth answer. Please compare the two answers and let me know if the suspect answer is correct. Please also provide the reason behind your comparison.\nQuestion:{question}\nGround Truth Answer: {gt}\nSuspect Answer: {pred}\nYou need respond in the following strcure.\n\nCorrect:[True or False]\nReason:"
    return llm(prompt)

In [9]:
with open("output/gpt4_math_icl_analysisFalse.json", 'r') as file:
    data = json.load(file)
len(data)

948

In [3]:
### for gsm8k

import re

ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"


def extract_answer(completion):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS

number_pattern = r"-?\d{1,3}(?:,\d{3})*(?:\.\d+)?"
def extract_number(text):
    match = re.search(number_pattern, text)
    if match:
        return match.group()
    else:
        return None


ems=[]
for item in data:
    gt=extract_answer(item['gt'])
    gt=float(gt)
    pred=extract_number(item['pred'])
    if ',' in pred:
        pred = float(pred.replace(',', ''))
    else:
        pred = float(pred)
    ems.append(int(gt==pred))
print(f"em:{sum(ems)/len(ems)}")

ValueError: could not convert string to float: '[invalid]'

In [66]:
data[2]['question']

'Evaluate $i^5+i^{-25}+i^{45}$.'

In [9]:
# suitable for stretagy
ems = [item['em'] for item in data]
f1s = [item['f1'] for item in data]
print(f"em:{sum(ems)/len(ems)}")
print(f"f1:{sum(f1s)/len(f1s)}")


em:0.7763157894736842
f1:0.7763157894736842


In [76]:
data[0]['question']

'How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?'

In [12]:
#suitable for math
import time
llm_eval = []
idx=0

In [13]:
#if dead, rerun this block
for i in range(idx, len(data)):
    print(f"Index {i}")
    llm_eval.append(llm_equal(question=data[i]['question'], gt=data[i]['gt'], pred=data[i]['pred']))
    idx=i
    time.sleep(2)

Index 0


Index 1
Index 2
Index 3
Index 4
Index 5
Index 6
Index 7
Index 8
Index 9
Index 10
Index 11
Index 12
Index 13
Index 14
Index 15
Index 16
Index 17
Index 18
Index 19
Index 20
Index 21
Index 22
Index 23
Index 24
Index 25
Index 26
Index 27
Index 28
Index 29
Index 30
Index 31
Index 32
Index 33
Index 34
Index 35
Index 36
Index 37
Index 38
Index 39
Index 40
Index 41
Index 42
Index 43
Index 44
Index 45
Index 46
Index 47
Index 48
Index 49
Index 50
Index 51
Index 52
Index 53
Index 54
Index 55
Index 56
Index 57
Index 58
Index 59
Index 60
Index 61
Index 62
Index 63
Index 64
Index 65
Index 66
Index 67
Index 68
Index 69
Index 70
Index 71
Index 72
Index 73
Index 74
Index 75
Index 76
Index 77
Index 78
Index 79
Index 80
Index 81
Index 82
Index 83
Index 84
Index 85
Index 86
Index 87
Index 88
Index 89
Index 90
Index 91
Index 92
Index 93
Index 94
Index 95
Index 96
Index 97
Index 98
Index 99
Index 100
Index 101
Index 102
Index 103
Index 104
Index 105
Index 106
Index 107
Index 108
Index 109
Index 110
Index 11

In [14]:
def parse_output(text):
    # Initialize the dictionary
    result_dict = {"correct": None, "reason": None}
    
    # Split the output text into lines
    if '\n' not in text:
        result_dict['correct']=False
        result_dict['reason']=None

    lines = text.split('\n')
    
    # Extract the "Correct" and "Reason" parts
    for line in lines:
        if line.startswith('Correct:'):
            correct_value = line[len('Correct: '):].strip()
            # Convert 'True' and 'False' to boolean values
            if correct_value == 'True':
                result_dict["correct"] = True
            elif correct_value == 'False':
                result_dict["correct"] = False
        elif line.startswith('Reason:'):
            result_dict["reason"] = line[len('Reason: '):].strip()
    
    return result_dict

In [15]:
llm_evals=[]
for item in llm_eval:
    llm_evals.append(parse_output(item))
    
trues = [item['correct'] for item in llm_evals]
print(f"llm true:{sum(trues)/len(trues)}")

llm true:0.6128691983122363


In [34]:
import string
def finalize(thought):
    # current_answer = thought['Answer'].strip().lower()
    cleaned_thought = ''.join(char for char in thought if char not in string.punctuation)
    current_answer = cleaned_thought.lower()
    words = current_answer.split()
    if words[:2] == ['not', 'yet']:
        return False
    return True

In [35]:
thought = 'Not yet, the calculation needs to be executed to find the answer.'
finalize(thought)

False

In [29]:
cleaned_text = ''.join(char for char in thought if char not in string.punctuation)

['N',
 'o',
 't',
 ' ',
 'y',
 'e',
 't',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'c',
 'a',
 'l',
 'c',
 'u',
 'l',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 'n',
 'e',
 'e',
 'd',
 's',
 ' ',
 't',
 'o',
 ' ',
 'b',
 'e',
 ' ',
 'e',
 'x',
 'e',
 'c',
 'u',
 't',
 'e',
 'd',
 ' ',
 't',
 'o',
 ' ',
 'f',
 'i',
 'n',
 'd',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'a',
 'n',
 's',
 'w',
 'e',
 'r']