-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Logic Reasoning Benchmark (#1973)
- Loading branch information
Showing
10 changed files
with
836 additions
and
0 deletions.
There are no files selected for viewing
10 changes: 10 additions & 0 deletions
10
...s/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"Dataset": "ProntoQA", | ||
"Data split": "validation", | ||
"Number of Samples": 6, | ||
"Agent class": "CodeActAgent", | ||
"Model name": "gpt-4o-2024-05-13", | ||
"Start_time": "2024-05-29 17:51:09", | ||
"End_time": "2024-05-29 17:52:24", | ||
"Final Accuracy": "0.83" | ||
} |
6 changes: 6 additions & 0 deletions
6
...ts/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Cold(Bob, True) | ||
Quiet(Bob, True) | ||
Red(Bob, True) | ||
Smart(Bob, True) | ||
Kind(Charlie, True) | ||
Quiet(Charlie, True) | ||
Red(Charlie, True) | ||
Rough(Charlie, True) | ||
Cold(Dave, True) | ||
Kind(Dave, True) | ||
Smart(Dave, True) | ||
Quiet(Fiona, True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
fact1 | ||
foreach | ||
facts.Quiet($x, True) | ||
facts.Cold($x, True) | ||
assert | ||
facts.Smart($x, True) | ||
|
||
fact2 | ||
foreach | ||
facts.Red($x, True) | ||
facts.Cold($x, True) | ||
assert | ||
facts.Round($x, True) | ||
|
||
fact3 | ||
foreach | ||
facts.Kind($x, True) | ||
facts.Rough($x, True) | ||
assert | ||
facts.Red($x, True) | ||
|
||
fact4 | ||
foreach | ||
facts.Quiet($x, True) | ||
assert | ||
facts.Rough($x, True) | ||
|
||
fact5 | ||
foreach | ||
facts.Cold($x, True) | ||
facts.Smart($x, True) | ||
assert | ||
facts.Red($x, True) | ||
|
||
fact6 | ||
foreach | ||
facts.Rough($x, True) | ||
assert | ||
facts.Cold($x, True) | ||
|
||
fact7 | ||
foreach | ||
facts.Red($x, True) | ||
assert | ||
facts.Rough($x, True) | ||
|
||
fact8 | ||
foreach | ||
facts.Smart(Dave, True) | ||
facts.Kind(Dave, True) | ||
assert | ||
facts.Quiet(Dave, True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Logic Reasoning Evaluation | ||
|
||
This folder contains evaluation harness for evaluating agents on the logic reasoning benchmark [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter). | ||
|
||
## Configure OpenDevin and your LLM | ||
|
||
Create a `config.toml` file if it does not exist at the root of the workspace. | ||
|
||
Add the following configurations: | ||
|
||
```toml | ||
[core] | ||
max_iterations = 100 | ||
cache_dir = "/tmp/cache" | ||
ssh_hostname = "localhost" | ||
enable_auto_lint = true | ||
|
||
# TODO: Change these to the model you want to evaluate | ||
[eval_gpt4_1106_preview] | ||
model = "gpt-4-1106-preview" | ||
api_key = "XXX" | ||
temperature = 0.0 | ||
|
||
[eval_some_openai_compatible_model] | ||
model = "openai/MODEL_NAME" | ||
base_url = "https://OPENAI_COMPATIBLE_URL/v1" | ||
api_key = "XXX" | ||
temperature = 0.0 | ||
``` | ||
|
||
## Run Inference on logic_reasoning | ||
The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o. | ||
```bash | ||
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1 | ||
``` | ||
|
||
|
||
## Examples | ||
|
||
See example output in | ||
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl` | ||
and final evaluation performance in | ||
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json` |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in "<execute_ipython>" tags.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, *error_message* from the output. | ||
|
||
An example would look like this:
<execute_ipython> | ||
import sys | ||
sys.path.append(workspace_mount_path) | ||
engine = LogicInferenceEngine(dataset_name, workspace_mount_path) | ||
answer, flag, error_message = engine.safe_execute_program(logic_programs) | ||
</execute_ipython> | ||
|
||
Please send the *answer* variable through message. | ||
|
||
dataset_name: | ||
[[dataset_name]] | ||
|
||
logic_programs: | ||
[[logic_programs]] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
import os | ||
import random | ||
import re | ||
import shutil | ||
|
||
from pyke import knowledge_engine | ||
|
||
|
||
class PykeProgram:
    """Compile a textual logic program into Pyke knowledge-base files and run it.

    The logic program text is expected to contain ``Predicates:``, ``Facts:``,
    ``Rules:`` and ``Query:`` sections. Facts and rules are written to
    ``<workspace>/.cache_program/facts.kfb`` and ``rules.krb``, which the Pyke
    engine then loads to prove the query.

    Attributes set by parsing (one per keyword): ``self.Query``, ``self.Rules``,
    ``self.Facts``, ``self.Predicates`` — each a list of statement strings, or
    ``None`` when the section was missing.
    """

    def __init__(
        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
    ) -> None:
        self.logic_program = logic_program
        # Parses the program into self.Query/Rules/Facts/Predicates and
        # reports whether the result looks well-formed.
        self.flag = self.parse_logic_program()
        self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')

        # Prepare the knowledge-base files for facts and rules.
        # Fix: the cache directory may not exist on a fresh workspace; without
        # creating it, the open() calls below always raise FileNotFoundError
        # and self.flag is unconditionally forced to False.
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            self.create_fact_file(self.Facts)
            self.create_rule_file(self.Rules)
            # NOTE(review): this overwrites a False parse result whenever the
            # files could be written; kept as-is to preserve behavior.
            self.flag = True
        except Exception:
            self.flag = False

        # Maps dataset name -> function converting a proof result to a label.
        self.answer_map = {
            'ProntoQA': self.answer_map_prontoqa,
            'ProofWriter': self.answer_map_proofwriter,
        }

    def parse_logic_program(self):
        """Split the raw program into its sections and validate the result.

        Sections are consumed back-to-front (Query, then Rules, Facts,
        Predicates); a missing section leaves the attribute set to ``None``.

        Returns:
            bool: True when the parsed program looks valid (see
            ``validate_program``).
        """
        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
        program_str = self.logic_program
        for keyword in keywords:
            try:
                program_str, segment_list = self._parse_segment(program_str, keyword)
                # 'Query:' -> attribute 'Query', etc.
                setattr(self, keyword[:-1], segment_list)
            except Exception:
                setattr(self, keyword[:-1], None)

        return self.validate_program()

    def _parse_segment(self, program_str, key_phrase):
        """Split off the trailing section introduced by ``key_phrase``.

        Returns the remaining (preceding) program text and the section's
        statements, one per line, with any ':::'-delimited trailing
        explanation removed.
        """
        remain_program_str, segment = program_str.split(key_phrase)
        segment_list = segment.strip().split('\n')
        for i in range(len(segment_list)):
            segment_list[i] = segment_list[i].split(':::')[0].strip()
        return remain_program_str, segment_list

    def validate_program(self):
        """Check whether the parsed program is valid; if not, try to fix it.

        A program is valid when both Rules and Facts are present and
        non-empty. Otherwise, whichever section was found is re-partitioned:
        statements containing '>>>' are treated as rules, the rest as facts.

        Returns:
            bool: True only when the program was already valid; False after a
            repair attempt (or when nothing could be parsed at all).
        """
        if self.Rules is not None and self.Facts is not None:
            if not self.Rules[0] == '' and not self.Facts[0] == '':
                return True
        # try to fix the program
        tmp_rules = []
        tmp_facts = []
        statements = self.Facts if self.Facts is not None else self.Rules
        if statements is None:
            return False

        for fact in statements:
            if fact.find('>>>') >= 0:  # '>>>' marks an implication, i.e. a rule
                tmp_rules.append(fact)
            else:
                tmp_facts.append(fact)
        self.Rules = tmp_rules
        self.Facts = tmp_facts
        return False

    def create_fact_file(self, facts):
        """Write ground facts to ``facts.kfb`` in the cache directory.

        Statements containing the variable '$x' are not ground facts and are
        skipped.
        """
        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
            for fact in facts:
                # check for invalid (non-ground) facts
                if not fact.find('$x') >= 0:
                    f.write(fact + '\n')

    def create_rule_file(self, rules):
        """Translate all rules to Pyke syntax and write them to ``rules.krb``."""
        pyke_rules = []
        for idx, rule in enumerate(rules):
            # Pyke rule names are 1-based: fact1, fact2, ...
            pyke_rules.append(self.parse_forward_rule(idx + 1, rule))

        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
            f.write('\n\n'.join(pyke_rules))

    def parse_forward_rule(self, f_index, rule):
        """Convert one implication into a Pyke forward-chaining rule.

        Example input: ``Furry($x, True) && Quite($x, True) >>> White($x, True)``
        '&&' separates conjunct facts on either side of the '>>>' implication.
        """
        premise, conclusion = rule.split('>>>')
        premise = premise.strip()
        # split the premise into multiple facts if needed
        premise = premise.split('&&')
        premise_list = [p.strip() for p in premise]

        conclusion = conclusion.strip()
        # split the conclusion into multiple facts if needed
        conclusion = conclusion.split('&&')
        conclusion_list = [c.strip() for c in conclusion]

        # create the Pyke rule (tab-indented foreach/assert clauses)
        pyke_rule = f"""fact{f_index}\n\tforeach"""
        for p in premise_list:
            pyke_rule += f"""\n\t\tfacts.{p}"""
        pyke_rule += """\n\tassert"""
        for c in conclusion_list:
            pyke_rule += f"""\n\t\tfacts.{c}"""
        return pyke_rule

    # For example, for "Is Marvin from Mars?" the query is:
    #   FromMars(Marvin, $label)
    def check_specific_predicate(self, subject_name, predicate_name, engine):
        """Prove ``predicate(subject, $label)`` against both fact and rule bases.

        Returns the single proven label, the conjunction of two proven labels,
        or None when nothing could be proven.
        """
        results = []
        with engine.prove_goal(
            f'facts.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for bindings, _plan in gen:
                results.append(bindings['label'])

        with engine.prove_goal(
            f'rules.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for bindings, _plan in gen:
                results.append(bindings['label'])

        if len(results) == 1:
            return results[0]
        elif len(results) == 2:
            return results[0] and results[1]
        elif len(results) == 0:
            return None

    # Input example: Metallic(Wren, False)
    def parse_query(self, query):
        """Split a query string into (predicate, subject, truth value).

        Raises:
            ValueError: when the query does not match ``Pred(arg, value)``.
        """
        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
        match = re.match(pattern, query)
        if match:
            function_name = match.group(1)
            arg1 = match.group(2)
            arg2 = match.group(3)
            # Any value other than the literal string 'True' is treated False.
            arg2 = True if arg2 == 'True' else False
            return function_name, arg1, arg2
        else:
            raise ValueError(f'Invalid query: {query}')

    def execute_program(self):
        """Run the Pyke engine on the compiled program and map the result.

        Returns:
            tuple: (answer label, '') on success, or (None, exception) when
            the engine or query parsing failed.
        """
        # Delete the compiled_krb dir so Pyke recompiles from fresh sources.
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

        try:
            engine = knowledge_engine.engine(self.cache_dir)
            engine.reset()
            engine.activate('rules')
            engine.get_kb('facts')

            # parse the logic query into a pyke query
            predicate, subject, value_to_check = self.parse_query(self.Query[0])
            result = self.check_specific_predicate(subject, predicate, engine)
            answer = self.answer_map[self.dataset_name](result, value_to_check)
        except Exception as err:
            return None, err

        return answer, ''

    def answer_mapping(self, answer):
        """Identity hook — answers are already mapped by ``answer_map``."""
        return answer

    def answer_map_prontoqa(self, result, value_to_check):
        """ProntoQA labels: 'A' when the proven value matches, else 'B'."""
        if result == value_to_check:
            return 'A'
        else:
            return 'B'

    def answer_map_proofwriter(self, result, value_to_check):
        """ProofWriter labels: 'C' unknown, 'A' match, 'B' mismatch."""
        if result is None:
            return 'C'
        elif result == value_to_check:
            return 'A'
        else:
            return 'B'
|
||
|
||
class LogicInferenceEngine:
    """Run a logic program through ``PykeProgram`` with a random fallback.

    When the program cannot be parsed or executed, a uniformly random answer
    is returned instead so downstream evaluation always receives a label.
    """

    def __init__(self, dataset_name, workspace_mount_path):
        self.dataset_name = dataset_name
        self.workspace_mount_path = workspace_mount_path

    def random_backup(self):
        """Return a uniformly random answer label for the current dataset.

        Raises:
            ValueError: when the dataset name is not supported. (Fix:
            previously this fell through and returned None silently, which
            safe_execute_program then reported as a valid fallback answer.)
        """
        if self.dataset_name == 'ProntoQA':
            return random.choice(['A', 'B'])
        elif self.dataset_name == 'ProofWriter':
            return random.choice(['A', 'B', 'C'])
        raise ValueError(f'Unsupported dataset: {self.dataset_name}')

    def safe_execute_program(self, logic_program):
        """Execute ``logic_program`` and always return a usable answer.

        Returns:
            tuple: (answer, status, error_message) where status is one of
            'success', 'parsing error' or 'execution error'. On failure the
            answer is a random fallback label.
        """
        program = PykeProgram(
            logic_program, self.dataset_name, self.workspace_mount_path
        )
        # cannot parse the program
        if not program.flag:
            answer = self.random_backup()
            return answer, 'parsing error', ''
        # execute the program
        answer, error_message = program.execute_program()
        # not executable
        if answer is None:
            answer = self.random_backup()
            return answer, 'execution error', error_message
        # successfully executed
        answer = program.answer_mapping(answer)
        return answer, 'success', ''
Oops, something went wrong.