In [1]:
import json
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import re

# Extract Data

In [2]:
# Load the records to post-process. Later cells read the 'id', 'context', 'question',
# 'logic', 'output' and 'predicted_answer' columns of this frame.
df_gpt_logic = pd.read_json('proof_train_logic.json')

In [3]:
# Records that failed at tool: 'predicted_answer' holds the literal string 'None'
tool_failed_mask = df_gpt_logic['predicted_answer'] == 'None'
n_tool_failed = len(df_gpt_logic[tool_failed_mask])
print(f"No. of records that failed at tool: {n_tool_failed}")

# Mismatch: the tool returned an answer, but it differs from the 'output' column
tool_answered_mask = df_gpt_logic['predicted_answer'] != 'None'
answer_differs_mask = df_gpt_logic['output'] != df_gpt_logic['predicted_answer']
n_mismatched = len(df_gpt_logic[tool_answered_mask & answer_differs_mask])
print(f"No. of records with incorrect predictions: {n_mismatched}")

No. of records that failed at tool: 3881
No. of records with incorrect predictions: 695


In [4]:
# Keep only the rows where the tool returned an answer and that answer equals 'output'.
# .copy() detaches the result so the column assignments in later cells do not touch df_gpt_logic.
has_prediction = df_gpt_logic['predicted_answer'] != "None"
prediction_matches = df_gpt_logic['output'] == df_gpt_logic['predicted_answer']
df_proof = df_gpt_logic.loc[has_prediction & prediction_matches].copy()

In [5]:
# Number of rows that survived the filter
print(df_proof.shape[0])

10424


# Correct Data

In [6]:
# Separate premise and conclusions
def create_conclusion(logic):
  """Return the first line of the conclusion section of a logic program.

  The text is first cleaned by deleting every run of digits (with an optional
  trailing dot and whitespace), every run of dashes (with trailing whitespace),
  every asterisk and every backtick. Note that this removes digits anywhere in
  the text, not only list numbering. "Question" is then treated as a synonym
  of "Conclusion".

  Args:
    logic: Raw logic-program text.

  Returns:
    The stripped first line that follows the first "Conclusion" marker. If the
    marker is absent, the stripped first line of the whole cleaned text.
  """
  cleaned = re.sub(r'(\d+\.?\s*|-+\s*|\*|\`)', '', logic).replace("Question", "Conclusion")
  sections = cleaned.split("Conclusion")
  # With a marker present, take the piece right after its first occurrence;
  # without one, `sections` has a single element: the whole cleaned text.
  section = sections[1] if len(sections) > 1 else sections[-1]
  lines = section.strip(':').strip('\n').strip().split('\n')
  return lines[0].strip()

def create_premise(logic):
  """Return the premise lines of a logic program as a list of strings.

  The text is cleaned the same way as in `create_conclusion`: digit runs
  (with optional trailing dot and whitespace), dash runs (with trailing
  whitespace), asterisks and backticks are deleted, and "Question" is
  rewritten to "Conclusion".

  Args:
    logic: Raw logic-program text.

  Returns:
    The stripped lines that lie after the last "Premises" marker and before
    the first "Conclusion" marker, keeping only lines that contain ':::'.
  """
  cleaned = re.sub(r'(\d+\.?\s*|-+\s*|\*|\`)', '', logic)
  cleaned = cleaned.replace("Question", "Conclusion")
  before_conclusion = cleaned.split("Conclusion")[0]
  # Anything ahead of the "Premises" header is dropped.
  after_header = before_conclusion.split("Premises")[-1]
  body = after_header.strip(':').strip('\n').strip()
  kept = []
  for line in body.split('\n'):
    # Only lines containing the ':::' separator are kept.
    if ':::' in line:
      kept.append(line.strip())
  return kept

In [7]:
# Parse every raw logic program twice: once for its premise list, once for its conclusion line
logic_programs = df_proof['logic']
df_proof['Premise'] = logic_programs.apply(create_premise)
df_proof['Conclusion'] = logic_programs.apply(create_conclusion)

In [8]:
# Create FOL
def _join_fol(row):
  """Serialise one row's premise list and conclusion into a single ';'-separated string."""
  premise_part = "Premise_FOL- " + ";".join(row['Premise'])
  conclusion_part = ";Conclusion_FOL- " + str(row['Conclusion'])
  return premise_part + conclusion_part

df_proof['fol'] = df_proof.apply(_join_fol, axis=1)

In [9]:
# Extract predicates
def extract_predicates(fol_statement):
    """Summarise the predicate signatures that occur in a FOL string.

    Every `Name(arg, ...)` occurrence whose argument list contains no nested
    parentheses is reduced to a signature with placeholder arguments:
    `x, y, z` for up to three arguments, or `n, o, p, ...` when there are more
    than three. An empty argument list counts as one argument.

    Args:
        fol_statement: Text containing FOL formulas.

    Returns:
        "Predicates: " followed by the distinct signatures joined with "; ",
        in order of first appearance. The order is deterministic, so the
        exported dataset is identical from one kernel run to the next.
    """
    # A name made of letters/digits, followed by one parenthesised group with no nesting.
    pattern = r'\b[A-Za-z][A-Za-z0-9]*\([^()]*\)'

    generalized_predicates = []
    for predicate in re.findall(pattern, fol_statement):
        # The pattern guarantees exactly one '(' and a closing ')' as the last character.
        predicate_name, _, arg_text = predicate.partition('(')
        arity = len(arg_text[:-1].split(','))

        # Start at 'n' (110) for wide predicates so the placeholders stay
        # within the lowercase letters; otherwise start at 'x' (120).
        first_code = 110 if arity > 3 else 120
        new_args = [chr(first_code + i) for i in range(arity)]
        generalized_predicates.append(f"{predicate_name}({', '.join(new_args)})")

    # dict.fromkeys de-duplicates while keeping first-seen order; set() order
    # for strings changes with hash randomisation between interpreter runs.
    unique_predicates = list(dict.fromkeys(generalized_predicates))
    return f"Predicates: {'; '.join(unique_predicates)}"

# Derive the "Predicates: ..." summary line for every row from its serialised 'fol' string
df_proof['predicates'] = df_proof['fol'].apply(extract_predicates)

In [10]:
# NL input
def _compose_nl_input(row):
  """Combine one row's 'context' and 'question' into a single natural-language prompt string."""
  return "Premise: " + str(row['context']) + " \nConclusion: " + str(row['question'])

df_proof["input"] = df_proof.apply(_compose_nl_input, axis=1)

In [11]:
# Additional parsing
# Move the existing 'output' values to 'label' so that 'output' can be reused for the
# predicates + FOL text. The rename is guarded: without the guard, re-running this cell
# would rename the freshly built 'output' column into a second, duplicate 'label' column.
if 'label' not in df_proof.columns:
  df_proof.rename(columns={"output":"label"}, inplace=True)
# Order matters: this assignment must come after the rename above.
df_proof['output'] = df_proof['predicates'] + '\n' + df_proof['fol']
df_proof_final = df_proof[['id', 'input', 'label', 'output']]

In [12]:
# Export the final frame as a JSON array with one object per row (orient="records"),
# pretty-printed with a 4-space indent.
df_proof_final.to_json("proof_train_fol.json", orient="records", indent=4)