-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Logic Reasoning Benchmark (#1973)
- Loading branch information
Showing
10 changed files
with
836 additions
and
0 deletions.
There are no files selected for viewing
10 changes: 10 additions & 0 deletions
10
...s/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"Dataset": "ProntoQA", | ||
"Data split": "validation", | ||
"Number of Samples": 6, | ||
"Agent class": "CodeActAgent", | ||
"Model name": "gpt-4o-2024-05-13", | ||
"Start_time": "2024-05-29 17:51:09", | ||
"End_time": "2024-05-29 17:52:24", | ||
"Final Accuracy": "0.83" | ||
} |
6 changes: 6 additions & 0 deletions
6
...ts/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Cold(Bob, True) | ||
Quiet(Bob, True) | ||
Red(Bob, True) | ||
Smart(Bob, True) | ||
Kind(Charlie, True) | ||
Quiet(Charlie, True) | ||
Red(Charlie, True) | ||
Rough(Charlie, True) | ||
Cold(Dave, True) | ||
Kind(Dave, True) | ||
Smart(Dave, True) | ||
Quiet(Fiona, True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
fact1 | ||
foreach | ||
facts.Quiet($x, True) | ||
facts.Cold($x, True) | ||
assert | ||
facts.Smart($x, True) | ||
|
||
fact2 | ||
foreach | ||
facts.Red($x, True) | ||
facts.Cold($x, True) | ||
assert | ||
facts.Round($x, True) | ||
|
||
fact3 | ||
foreach | ||
facts.Kind($x, True) | ||
facts.Rough($x, True) | ||
assert | ||
facts.Red($x, True) | ||
|
||
fact4 | ||
foreach | ||
facts.Quiet($x, True) | ||
assert | ||
facts.Rough($x, True) | ||
|
||
fact5 | ||
foreach | ||
facts.Cold($x, True) | ||
facts.Smart($x, True) | ||
assert | ||
facts.Red($x, True) | ||
|
||
fact6 | ||
foreach | ||
facts.Rough($x, True) | ||
assert | ||
facts.Cold($x, True) | ||
|
||
fact7 | ||
foreach | ||
facts.Red($x, True) | ||
assert | ||
facts.Rough($x, True) | ||
|
||
fact8 | ||
foreach | ||
facts.Smart(Dave, True) | ||
facts.Kind(Dave, True) | ||
assert | ||
facts.Quiet(Dave, True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Logic Reasoning Evaluation | ||
|
||
This folder contains evaluation harness for evaluating agents on the logic reasoning benchmark [ProntoQA](https://github.com/asaparov/prontoqa) and [ProofWriter](https://allenai.org/data/proofwriter). | ||
|
||
## Configure OpenDevin and your LLM | ||
|
||
Create a `config.toml` file if it does not exist at the root of the workspace. | ||
|
||
Add the following configurations: | ||
|
||
```toml | ||
[core] | ||
max_iterations = 100 | ||
cache_dir = "/tmp/cache" | ||
ssh_hostname = "localhost" | ||
enable_auto_lint = true | ||
|
||
# TODO: Change these to the model you want to evaluate | ||
[eval_gpt4_1106_preview] | ||
model = "gpt-4-1106-preview" | ||
api_key = "XXX" | ||
temperature = 0.0 | ||
|
||
[eval_some_openai_compatible_model] | ||
model = "openai/MODEL_NAME" | ||
base_url = "https://OPENAI_COMPATIBLE_URL/v1" | ||
api_key = "XXX" | ||
temperature = 0.0 | ||
``` | ||
|
||
## Run Inference on logic_reasoning | ||
The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o. | ||
```bash | ||
./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1 | ||
``` | ||
|
||
|
||
## Examples | ||
|
||
See example output in | ||
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/output.jsonl` | ||
and final evaluation performance in | ||
`./evaluation_outputs/outputs/logic_reasoning/CodeActAgent/ProntoQA/gpt-4o-2024-05-13_maxiter_10_N_v1.5/metadata.json` |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
You are a helpful assistant assigned a logic reasoning task. You need to determine the correctness of a query given some facts and rules.
You can interact with an interactive Python (Jupyter Notebook) environment and receive the corresponding output when needed. The code should be enclosed in "<execute_ipython>" tags.
In this task, you need to use the code in [[logic_inference_path.py]] to help you. Specifically, you first need to instantiate a **LogicInferenceEngine** class and use the **safe_execute_program** method to prove the **logic programs**. You should receive *answer*, *flag*, *error_message* from the output. | ||
|
||
An example would look like this:
<execute_ipython> | ||
import sys | ||
sys.path.append(workspace_mount_path) | ||
engine = LogicInferenceEngine(dataset_name, workspace_mount_path) | ||
answer, flag, error_message = engine.safe_execute_program(logic_programs) | ||
</execute_ipython> | ||
|
||
Please send the *answer* variable through message. | ||
|
||
dataset_name: | ||
[[dataset_name]] | ||
|
||
logic_programs: | ||
[[logic_programs]] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
import os | ||
import random | ||
import re | ||
import shutil | ||
|
||
from pyke import knowledge_engine | ||
|
||
|
||
class PykeProgram:
    """Compile a textual logic program into Pyke knowledge-base files and run it.

    The logic program text is expected to contain ``Predicates:``, ``Facts:``,
    ``Rules:`` and ``Query:`` sections. Facts and rules are written to
    ``<workspace>/.cache_program/facts.kfb`` and ``rules.krb``, which the Pyke
    engine then loads to prove the query.

    Attributes set by parsing (one per keyword): ``self.Query``, ``self.Rules``,
    ``self.Facts``, ``self.Predicates`` — each a list of statement strings, or
    ``None`` when the section was missing.
    """

    def __init__(
        self, logic_program: str, dataset_name='ProntoQA', workspace_mount_path='./'
    ) -> None:
        self.logic_program = logic_program
        # Parses the program into self.Query/Rules/Facts/Predicates and
        # reports whether the result looks well-formed.
        self.flag = self.parse_logic_program()
        self.dataset_name = dataset_name
        self.cache_dir = os.path.join(workspace_mount_path, '.cache_program')

        # Prepare the knowledge-base files for facts and rules.
        # Fix: the cache directory may not exist on a fresh workspace; without
        # creating it, the open() calls below always raise FileNotFoundError
        # and self.flag is unconditionally forced to False.
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            self.create_fact_file(self.Facts)
            self.create_rule_file(self.Rules)
            # NOTE(review): this overwrites a False parse result whenever the
            # files could be written; kept as-is to preserve behavior.
            self.flag = True
        except Exception:
            self.flag = False

        # Maps dataset name -> function converting a proof result to a label.
        self.answer_map = {
            'ProntoQA': self.answer_map_prontoqa,
            'ProofWriter': self.answer_map_proofwriter,
        }

    def parse_logic_program(self):
        """Split the raw program into its sections and validate the result.

        Sections are consumed back-to-front (Query, then Rules, Facts,
        Predicates); a missing section leaves the attribute set to ``None``.

        Returns:
            bool: True when the parsed program looks valid (see
            ``validate_program``).
        """
        keywords = ['Query:', 'Rules:', 'Facts:', 'Predicates:']
        program_str = self.logic_program
        for keyword in keywords:
            try:
                program_str, segment_list = self._parse_segment(program_str, keyword)
                # 'Query:' -> attribute 'Query', etc.
                setattr(self, keyword[:-1], segment_list)
            except Exception:
                setattr(self, keyword[:-1], None)

        return self.validate_program()

    def _parse_segment(self, program_str, key_phrase):
        """Split off the trailing section introduced by ``key_phrase``.

        Returns the remaining (preceding) program text and the section's
        statements, one per line, with any ':::'-delimited trailing
        explanation removed.
        """
        remain_program_str, segment = program_str.split(key_phrase)
        segment_list = segment.strip().split('\n')
        for i in range(len(segment_list)):
            segment_list[i] = segment_list[i].split(':::')[0].strip()
        return remain_program_str, segment_list

    def validate_program(self):
        """Check whether the parsed program is valid; if not, try to fix it.

        A program is valid when both Rules and Facts are present and
        non-empty. Otherwise, whichever section was found is re-partitioned:
        statements containing '>>>' are treated as rules, the rest as facts.

        Returns:
            bool: True only when the program was already valid; False after a
            repair attempt (or when nothing could be parsed at all).
        """
        if self.Rules is not None and self.Facts is not None:
            if not self.Rules[0] == '' and not self.Facts[0] == '':
                return True
        # try to fix the program
        tmp_rules = []
        tmp_facts = []
        statements = self.Facts if self.Facts is not None else self.Rules
        if statements is None:
            return False

        for fact in statements:
            if fact.find('>>>') >= 0:  # '>>>' marks an implication, i.e. a rule
                tmp_rules.append(fact)
            else:
                tmp_facts.append(fact)
        self.Rules = tmp_rules
        self.Facts = tmp_facts
        return False

    def create_fact_file(self, facts):
        """Write ground facts to ``facts.kfb`` in the cache directory.

        Statements containing the variable '$x' are not ground facts and are
        skipped.
        """
        with open(os.path.join(self.cache_dir, 'facts.kfb'), 'w') as f:
            for fact in facts:
                # check for invalid (non-ground) facts
                if not fact.find('$x') >= 0:
                    f.write(fact + '\n')

    def create_rule_file(self, rules):
        """Translate all rules to Pyke syntax and write them to ``rules.krb``."""
        pyke_rules = []
        for idx, rule in enumerate(rules):
            # Pyke rule names are 1-based: fact1, fact2, ...
            pyke_rules.append(self.parse_forward_rule(idx + 1, rule))

        with open(os.path.join(self.cache_dir, 'rules.krb'), 'w') as f:
            f.write('\n\n'.join(pyke_rules))

    def parse_forward_rule(self, f_index, rule):
        """Convert one implication into a Pyke forward-chaining rule.

        Example input: ``Furry($x, True) && Quite($x, True) >>> White($x, True)``
        '&&' separates conjunct facts on either side of the '>>>' implication.
        """
        premise, conclusion = rule.split('>>>')
        premise = premise.strip()
        # split the premise into multiple facts if needed
        premise = premise.split('&&')
        premise_list = [p.strip() for p in premise]

        conclusion = conclusion.strip()
        # split the conclusion into multiple facts if needed
        conclusion = conclusion.split('&&')
        conclusion_list = [c.strip() for c in conclusion]

        # create the Pyke rule (tab-indented foreach/assert clauses)
        pyke_rule = f"""fact{f_index}\n\tforeach"""
        for p in premise_list:
            pyke_rule += f"""\n\t\tfacts.{p}"""
        pyke_rule += """\n\tassert"""
        for c in conclusion_list:
            pyke_rule += f"""\n\t\tfacts.{c}"""
        return pyke_rule

    # For example, for "Is Marvin from Mars?" the query is:
    #   FromMars(Marvin, $label)
    def check_specific_predicate(self, subject_name, predicate_name, engine):
        """Prove ``predicate(subject, $label)`` against both fact and rule bases.

        Returns the single proven label, the conjunction of two proven labels,
        or None when nothing could be proven.
        """
        results = []
        with engine.prove_goal(
            f'facts.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for bindings, _plan in gen:
                results.append(bindings['label'])

        with engine.prove_goal(
            f'rules.{predicate_name}({subject_name}, $label)'
        ) as gen:
            for bindings, _plan in gen:
                results.append(bindings['label'])

        if len(results) == 1:
            return results[0]
        elif len(results) == 2:
            return results[0] and results[1]
        elif len(results) == 0:
            return None

    # Input example: Metallic(Wren, False)
    def parse_query(self, query):
        """Split a query string into (predicate, subject, truth value).

        Raises:
            ValueError: when the query does not match ``Pred(arg, value)``.
        """
        pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'
        match = re.match(pattern, query)
        if match:
            function_name = match.group(1)
            arg1 = match.group(2)
            arg2 = match.group(3)
            # Any value other than the literal string 'True' is treated False.
            arg2 = True if arg2 == 'True' else False
            return function_name, arg1, arg2
        else:
            raise ValueError(f'Invalid query: {query}')

    def execute_program(self):
        """Run the Pyke engine on the compiled program and map the result.

        Returns:
            tuple: (answer label, '') on success, or (None, exception) when
            the engine or query parsing failed.
        """
        # Delete the compiled_krb dir so Pyke recompiles from fresh sources.
        compiled_krb_dir = './models/compiled_krb'
        if os.path.exists(compiled_krb_dir):
            print('removing compiled_krb')
            shutil.rmtree(compiled_krb_dir)

        try:
            engine = knowledge_engine.engine(self.cache_dir)
            engine.reset()
            engine.activate('rules')
            engine.get_kb('facts')

            # parse the logic query into a pyke query
            predicate, subject, value_to_check = self.parse_query(self.Query[0])
            result = self.check_specific_predicate(subject, predicate, engine)
            answer = self.answer_map[self.dataset_name](result, value_to_check)
        except Exception as err:
            return None, err

        return answer, ''

    def answer_mapping(self, answer):
        """Identity hook — answers are already mapped by ``answer_map``."""
        return answer

    def answer_map_prontoqa(self, result, value_to_check):
        """ProntoQA labels: 'A' when the proven value matches, else 'B'."""
        if result == value_to_check:
            return 'A'
        else:
            return 'B'

    def answer_map_proofwriter(self, result, value_to_check):
        """ProofWriter labels: 'C' unknown, 'A' match, 'B' mismatch."""
        if result is None:
            return 'C'
        elif result == value_to_check:
            return 'A'
        else:
            return 'B'
|
||
|
||
class LogicInferenceEngine:
    """Run a logic program through ``PykeProgram`` with a random fallback.

    When the program cannot be parsed or executed, a uniformly random answer
    is returned instead so downstream evaluation always receives a label.
    """

    def __init__(self, dataset_name, workspace_mount_path):
        self.dataset_name = dataset_name
        self.workspace_mount_path = workspace_mount_path

    def random_backup(self):
        """Return a uniformly random answer label for the current dataset.

        Raises:
            ValueError: when the dataset name is not supported. (Fix:
            previously this fell through and returned None silently, which
            safe_execute_program then reported as a valid fallback answer.)
        """
        if self.dataset_name == 'ProntoQA':
            return random.choice(['A', 'B'])
        elif self.dataset_name == 'ProofWriter':
            return random.choice(['A', 'B', 'C'])
        raise ValueError(f'Unsupported dataset: {self.dataset_name}')

    def safe_execute_program(self, logic_program):
        """Execute ``logic_program`` and always return a usable answer.

        Returns:
            tuple: (answer, status, error_message) where status is one of
            'success', 'parsing error' or 'execution error'. On failure the
            answer is a random fallback label.
        """
        program = PykeProgram(
            logic_program, self.dataset_name, self.workspace_mount_path
        )
        # cannot parse the program
        if not program.flag:
            answer = self.random_backup()
            return answer, 'parsing error', ''
        # execute the program
        answer, error_message = program.execute_program()
        # not executable
        if answer is None:
            answer = self.random_backup()
            return answer, 'execution error', error_message
        # successfully executed
        answer = program.answer_mapping(answer)
        return answer, 'success', ''
Oops, something went wrong.