Support Logic Reasoning Benchmark (#1973)
Ren-Ma committed May 30, 2024
1 parent 01ef902 commit a982349
Showing 10 changed files with 836 additions and 0 deletions.
@@ -0,0 +1,10 @@
{
"Dataset": "ProntoQA",
"Data split": "validation",
"Number of Samples": 6,
"Agent class": "CodeActAgent",
"Model name": "gpt-4o-2024-05-13",
"Start_time": "2024-05-29 17:51:09",
"End_time": "2024-05-29 17:52:24",
"Final Accuracy": "0.83"
}

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions evaluation/logic_reasoning/.cache_program/facts.kfb
@@ -0,0 +1,12 @@
Cold(Bob, True)
Quiet(Bob, True)
Red(Bob, True)
Smart(Bob, True)
Kind(Charlie, True)
Quiet(Charlie, True)
Red(Charlie, True)
Rough(Charlie, True)
Cold(Dave, True)
Kind(Dave, True)
Smart(Dave, True)
Quiet(Fiona, True)
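
These are Pyke fact-base (`.kfb`) statements, one ground fact per line, as written out by `create_fact_file` in `logic_inference.py` below.
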
52 changes: 52 additions & 0 deletions evaluation/logic_reasoning/.cache_program/rules.krb
@@ -0,0 +1,52 @@
fact1
foreach
facts.Quiet($x, True)
facts.Cold($x, True)
assert
facts.Smart($x, True)

fact2
foreach
facts.Red($x, True)
facts.Cold($x, True)
assert
facts.Round($x, True)

fact3
foreach
facts.Kind($x, True)
facts.Rough($x, True)
assert
facts.Red($x, True)

fact4
foreach
facts.Quiet($x, True)
assert
facts.Rough($x, True)

fact5
foreach
facts.Cold($x, True)
facts.Smart($x, True)
assert
facts.Red($x, True)

fact6
foreach
facts.Rough($x, True)
assert
facts.Cold($x, True)

fact7
foreach
facts.Red($x, True)
assert
facts.Rough($x, True)

fact8
foreach
facts.Smart(Dave, True)
facts.Kind(Dave, True)
assert
facts.Quiet(Dave, True)
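
The rules above are Pyke forward-chaining rules generated by `parse_forward_rule` in `logic_inference.py` below. As a minimal sketch (assuming `pyke` is installed and the working directory contains `.cache_program`), the generated knowledge base can be queried directly, mirroring the calls used in `logic_inference.py`:

```python
from pyke import knowledge_engine

engine = knowledge_engine.engine('.cache_program')
engine.reset()
engine.activate('rules')  # forward-chain rules.krb over facts.kfb

# e.g. Rough(Bob, True) should follow from Quiet(Bob, True) via fact4
with engine.prove_goal('facts.Rough(Bob, $label)') as gen:
    for vars, plan in gen:
        print(vars['label'])
```
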
43 changes: 43 additions & 0 deletions evaluation/logic_reasoning/README.md
@@ -0,0 +1,43 @@
# Logic Reasoning Evaluation

This folder contains the evaluation harness for evaluating agents on the logic reasoning benchmarks [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter).

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.

Add the following configurations:

```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
ssh_hostname = "localhost"
enable_auto_lint = true

# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Run Inference on logic_reasoning
The following command runs inference on the first example of the ProntoQA dataset with the model `gpt-4o`.
```bash
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
```
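
The three positional arguments are, in order, the dataset name (`ProntoQA` or `ProofWriter`), the model configuration to evaluate (see `config.toml` above), and the number of examples to run.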


## Examples

See example output in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl`
and the final evaluation performance in
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json`.
Empty file.
20 changes: 20 additions & 0 deletions evaluation/logic_reasoning/instruction.txt
@@ -0,0 +1,20 @@
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in "<execute_ipython>" tags.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate the **LogicInferenceEngine** class and use its **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, and *error_message* as output.

An example would look like this:
<execute_ipython>
import sys
sys.path.append(workspace_mount_path)
from logic_inference import LogicInferenceEngine  # module name may differ; see [[logic_inference_path.py]]
engine = LogicInferenceEngine(dataset_name, workspace_mount_path)
answer, flag, error_message = engine.safe_execute_program(logic_programs)
</execute_ipython>

Please send the *answer* variable through message.

dataset_name:
[[dataset_name]]

logic_programs:
[[logic_programs]]

220 changes: 220 additions & 0 deletions evaluation/logic_reasoning/logic_inference.py
@@ -0,0 +1,220 @@
import os
import random
import re
import shutil

from pyke import knowledge_engine


class PykeProgram:
def __init__(
self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
) -> None:
self.logic_program = logic_program
self.flag = self.parse_logic_program()
self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')
        # ensure the cache directory for the generated fact/rule files exists
        os.makedirs(self.cache_dir, exist_ok=True)

        # prepare the files for facts and rules
        try:
self.create_fact_file(self.Facts)
self.create_rule_file(self.Rules)
self.flag = True
except Exception:
self.flag = False

self.answer_map = {
'ProntoQA': self.answer_map_prontoqa,
'ProofWriter': self.answer_map_proofwriter,
}

def parse_logic_program(self):
keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
program_str = self.logic_program
for keyword in keywords:
try:
program_str, segment_list = self._parse_segment(program_str, keyword)
setattr(self, keyword[:-1], segment_list)
except Exception:
setattr(self, keyword[:-1], None)

return self.validate_program()

def _parse_segment(self, program_str, key_phrase):
remain_program_str, segment = program_str.split(key_phrase)
segment_list = segment.strip().split('\n')
for i in range(len(segment_list)):
segment_list[i] = segment_list[i].split(':::')[0].strip()
return remain_program_str, segment_list
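
    # Illustrative input format for parse_logic_program (the statements are
    # hypothetical examples). Sections appear in the order
    # Predicates/Facts/Rules/Query, and ':::' starts an inline comment that
    # _parse_segment strips:
    #
    #   Predicates:
    #   Quiet($x, bool) ::: Is x quiet?
    #   Facts:
    #   Quiet(Bob, True)
    #   Rules:
    #   Quiet($x, True) >>> Rough($x, True)
    #   Query:
    #   Rough(Bob, True)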

# check if the program is valid; if not, try to fix it
def validate_program(self):
if self.Rules is not None and self.Facts is not None:
if not self.Rules[0] == '' and not self.Facts[0] == '':
return True
# try to fix the program
tmp_rules = []
tmp_facts = []
statements = self.Facts if self.Facts is not None else self.Rules
if statements is None:
return False

for fact in statements:
if fact.find('>>>') >= 0: # this is a rule
tmp_rules.append(fact)
else:
tmp_facts.append(fact)
self.Rules = tmp_rules
self.Facts = tmp_facts
return False

def create_fact_file(self, facts):
with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
for fact in facts:
# check for invalid facts
if not fact.find('$x') >= 0:
f.write(fact + '\n')

def create_rule_file(self, rules):
pyke_rules = []
for idx, rule in enumerate(rules):
pyke_rules.append(self.parse_forward_rule(idx + 1, rule))

with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
f.write('\n\n'.join(pyke_rules))

# example rule: Furry($x, True) && Quite($x, True) >>> White($x, True)
def parse_forward_rule(self, f_index, rule):
premise, conclusion = rule.split('>>>')
premise = premise.strip()
# split the premise into multiple facts if needed
premise = premise.split('&&')
premise_list = [p.strip() for p in premise]

conclusion = conclusion.strip()
# split the conclusion into multiple facts if needed
conclusion = conclusion.split('&&')
conclusion_list = [c.strip() for c in conclusion]

# create the Pyke rule
pyke_rule = f"""fact{f_index}\n\tforeach"""
for p in premise_list:
pyke_rule += f"""\n\t\tfacts.{p}"""
pyke_rule += """\n\tassert"""
for c in conclusion_list:
pyke_rule += f"""\n\t\tfacts.{c}"""
return pyke_rule
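
    # For example, parse_forward_rule(1, 'Furry($x, True) && Quite($x, True) >>> White($x, True)')
    # returns the Pyke rule (tabs rendered as indentation):
    #
    #   fact1
    #       foreach
    #           facts.Furry($x, True)
    #           facts.Quite($x, True)
    #       assert
    #           facts.White($x, True)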

"""
for example: Is Marvin from Mars?
Query: FromMars(Marvin, $label)
"""

def check_specific_predicate(self, subject_name, predicate_name, engine):
results = []
with engine.prove_goal(
f'facts.{predicate_name}({subject_name}, $label)'
) as gen:
for vars, plan in gen:
results.append(vars['label'])

with engine.prove_goal(
f'rules.{predicate_name}({subject_name}, $label)'
) as gen:
for vars, plan in gen:
results.append(vars['label'])

if len(results) == 1:
return results[0]
elif len(results) == 2:
return results[0] and results[1]
elif len(results) == 0:
return None
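
    # check_specific_predicate returns the single binding when exactly one of
    # the fact/rule knowledge bases proves the goal, the conjunction of both
    # bindings when both do, and None (unknown) when neither does.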

"""
Input Example: Metallic(Wren, False)
"""

def parse_query(self, query):
pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
match = re.match(pattern, query)
if match:
function_name = match.group(1)
arg1 = match.group(2)
arg2 = match.group(3)
arg2 = True if arg2 == 'True' else False
return function_name, arg1, arg2
else:
raise ValueError(f'Invalid query: {query}')

def execute_program(self):
        # delete any stale compiled_krb dir so pyke recompiles the rule base
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

try:
engine = knowledge_engine.engine(self.cache_dir)
engine.reset()
engine.activate('rules')
engine.get_kb('facts')

# parse the logic query into pyke query
predicate, subject, value_to_check = self.parse_query(self.Query[0])
result = self.check_specific_predicate(subject, predicate, engine)
answer = self.answer_map[self.dataset_name](result, value_to_check)
except Exception as err:
return None, err

return answer, ''

def answer_mapping(self, answer):
return answer

def answer_map_prontoqa(self, result, value_to_check):
if result == value_to_check:
return 'A'
else:
return 'B'

def answer_map_proofwriter(self, result, value_to_check):
if result is None:
return 'C'
elif result == value_to_check:
return 'A'
else:
return 'B'
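
    # Answer letters: 'A' means the derived value matches the queried value,
    # 'B' that it does not (or, for ProntoQA, that nothing was derived), and
    # 'C' (ProofWriter only) that the predicate could not be derived at all.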


class LogicInferenceEngine:
def __init__(self, dataset_name, workspace_mount_path):
self.dataset_name = dataset_name
self.workspace_mount_path = workspace_mount_path

def random_backup(self):
if self.dataset_name == 'ProntoQA':
return random.choice(['A', 'B'])
elif self.dataset_name == 'ProofWriter':
return random.choice(['A', 'B', 'C'])

def safe_execute_program(self, logic_program):
program = PykeProgram(
logic_program, self.dataset_name, self.workspace_mount_path
)
# cannot parse the program
if not program.flag:
answer = self.random_backup()
return answer, 'parsing error', ''
# execute the program
answer, error_message = program.execute_program()
# not executable
if answer is None:
answer = self.random_backup()
return answer, 'execution error', error_message
# successfully executed
answer = program.answer_mapping(answer)
return answer, 'success', ''
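
For reference, a minimal usage sketch of the engine above. The logic program string is a hypothetical ProofWriter-style example (real programs come from the benchmark data), and it assumes `logic_inference.py` is importable and the workspace directory exists:

```python
from logic_inference import LogicInferenceEngine

logic_programs = """Predicates:
Quiet($x, bool) ::: Is x quiet?
Facts:
Quiet(Bob, True)
Rules:
Quiet($x, True) >>> Rough($x, True)
Query:
Rough(Bob, True)
"""

engine = LogicInferenceEngine('ProofWriter', './')
answer, flag, error_message = engine.safe_execute_program(logic_programs)
print(answer, flag)  # 'A', 'success' if the proof goes through
```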
