In [1]:
import openai
import time
openai.api_key = ""


def get_GPT4_response(input, temp=1.0, max_tokens=256, logit_dict={}, model="gpt-4-0613"):
    """Ask a chat model a single question and return the text of its reply.

    The request is re-sent until it succeeds: any exception raised while
    making the call is printed, followed by a five-second pause before the
    next attempt. There is no upper bound on the number of attempts.

    Args:
        input: Text sent as the user message.
        temp: Forwarded to the API as ``temperature``.
        max_tokens: Forwarded to the API as ``max_tokens``.
        logit_dict: Forwarded to the API as ``logit_bias``; not modified here.
        model: Forwarded to the API as ``model``.

    Returns:
        The ``"content"`` entry of the first choice's message.
    """
    retry_delay = 5
    while True:
        try:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful factual assistant."},
                    {"role": "user", "content": input},
                ],
                max_tokens=max_tokens,
                temperature=temp,
                logit_bias=logit_dict,
            )
        except Exception as e:
            print(e, f"Sleep {retry_delay} seconds.")
            time.sleep(retry_delay)
        else:
            # Reached only after a successful call; errors here are not retried.
            return completion.choices[0].message["content"]

def get_davinci3_response(input, temp=1.0, max_tokens=256):
    """Send a prompt to the text-completion endpoint and return the stripped reply.

    The call uses the ``gpt-3.5-turbo-instruct`` model. Any exception raised
    while making the call is printed, followed by a thirty-second pause, and
    the request is then repeated without limit until it succeeds.

    Args:
        input: Prompt text, forwarded as ``prompt``.
        temp: Forwarded to the API as ``temperature``.
        max_tokens: Forwarded to the API as ``max_tokens``.

    Returns:
        The ``"text"`` of the first choice with surrounding whitespace removed.
    """
    retry_delay = 30
    while True:
        try:
            response = openai.Completion.create(
                model='gpt-3.5-turbo-instruct',
                prompt=input,
                temperature=temp,
                max_tokens=max_tokens,
            )
        except Exception as e:
            print(e, f"Sleep {retry_delay} seconds.")
            time.sleep(retry_delay)
        else:
            # Reached only after a successful call; errors here are not retried.
            first_choice = response["choices"][0]
            return first_choice["text"].strip()


def get_chat_response(inputs_list, temp=0.0, max_tokens=256, logit_dict={}, model="gpt-4-0613"):
    """Send a five-turn conversation to a chat model and return its reply text.

    The conversation is a fixed system message followed by the first five
    entries of ``inputs_list`` in the order user, assistant, user, assistant,
    user. Any exception raised by the API call is printed, followed by a
    five-second pause, and the request is repeated without limit.

    Args:
        inputs_list: Indexable sequence holding at least five message texts.
        temp: Forwarded to the API as ``temperature``.
        max_tokens: Forwarded to the API as ``max_tokens``.
        logit_dict: Forwarded to the API as ``logit_bias``; not modified here.
        model: Forwarded to the API as ``model``.

    Returns:
        The ``"content"`` entry of the first choice's message.

    Raises:
        IndexError: If ``inputs_list`` has fewer than five entries.
    """
    # The conversation is assembled before the retry loop on purpose: a
    # too-short inputs_list is a caller error and must surface immediately
    # instead of being swallowed by the blanket ``except`` and retried forever.
    turn_roles = ("user", "assistant", "user", "assistant", "user")
    messages = [{"role": "system", "content": "You are a helpful factual assistant."}]
    for position, role in enumerate(turn_roles):
        # Each turn consumes the next entry, so indices 0..4 are used exactly once.
        messages.append({"role": role, "content": inputs_list[position]})

    retry_delay = 5
    while True:
        try:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temp,
                logit_bias=logit_dict,
            )
        except Exception as e:
            print(e, f"Sleep {retry_delay} seconds.")
            time.sleep(retry_delay)
        else:
            return completion.choices[0].message["content"]

In [2]:
def parse_sentence_to_conclusion_premise(each_rule):
    """Split a ``Conclusion:- Premise1, Premise2, ...`` rule into its parts.

    The rule is split once on ``":- "``. The right-hand side is then split on
    ``"),"``; the ``")"`` consumed by that split is put back on every premise
    except the last, and a single trailing ``";"`` or ``"."`` is removed from
    the last premise. An empty final fragment, produced when the rule ends
    with ``"),"``, is discarded.

    Args:
        each_rule: Rule text containing exactly one ``":- "`` separator.

    Returns:
        A ``(conclusion, premises_list)`` tuple of a stripped string and a
        list of stripped premise strings.

    Raises:
        AssertionError: If ``":- "`` does not occur in the rule.
        ValueError: If ``":- "`` occurs more than once.
    """
    each_rule = each_rule.strip()

    assert ":- " in each_rule
    conclusion, premises = each_rule.split(":- ")

    fragments = [fragment.strip() for fragment in premises.split("),")]
    *leading, last = fragments

    # Every fragment but the last lost its closing parenthesis to the split.
    premises_list = [fragment + ")" for fragment in leading]

    if last:
        # endswith() is safe on any string, unlike indexing with [-1].
        if last.endswith((";", ".")):
            last = last[:-1]
        premises_list.append(last)
    # else: the rule ended with "),", so there is no further premise to keep.

    return conclusion.strip(), premises_list

# parsing a premise/conclusion into arguments
def argument_parsing(premise, output_rela=False):
    """Split a ``Relation(Type Var, Type Var, ...)`` atom into its arguments.

    Only atoms containing exactly one ``"("`` are parsed; anything else gives
    an empty argument list (and ``None`` as the relation). The last character
    of the stripped atom is dropped as the closing parenthesis. Each argument
    must contain a space and is returned with its whitespace runs collapsed
    to single spaces.

    Args:
        premise: Atom text, e.g. ``"Have(Person X, Age Z1)"``.
        output_rela: When true, also return the relation name.

    Returns:
        The list of ``"Type Var"`` strings, or ``(relation, list)`` when
        ``output_rela`` is true.

    Raises:
        AssertionError: If an argument contains no space.
    """
    premise = premise.strip()
    relation = None
    typed_args = []

    if premise.count("(") == 1:
        relation, _, remainder = premise.partition("(")
        # remainder[:-1] discards the final character (the closing parenthesis).
        for raw_arg in remainder[:-1].split(","):
            raw_arg = raw_arg.strip()
            assert " " in raw_arg
            # Joining all whitespace-separated tokens is the same as
            # "<type tokens> <variable>" with single spaces between them.
            typed_args.append(" ".join(raw_arg.split()))

    if output_rela:
        return relation, typed_args
    return typed_args

In [34]:
def get_affordance_verbalized_critic_input(each_rule):    
    input = "True or False? Please predict whether the input rule is very likely to be true, and also explain why. Please note that the rule need not necessarily but just very likely to be true. \n\nExamples:\n" + \
            "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y. \n" + \
            "Output: True. Because Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Input: If Person X was born in Season Z and Plant Y blooms in the same Season Z, then Person X can access Plant Y. \n" + \
            "Output: False. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Input: " + each_rule + " \n" + \
            "Output:\n"
    return input

In [182]:
def get_affordance_symbolic_critic_input(each_rule):    
    input = "True or False? Please predict whether the input symbolic rule is very likely to be true, and also explain why. Please note that the rule need not necessarily but just very likely to be true. \n\nExamples:\n" + \
            "Input: CanDrive(Person X, Vehicle Y):- Have(Person X, Age Z1), RequireMinimumAge(Vehicle Y, Age Z2), BiggerThan(Age Z1, Age Z2); \n" + \
            "Output: True. Because Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Input: CanAccess(Person X, Plant Y):- BornIn(Person X, Season Z), BloomsIn(Plant Y, Season Z); \n" + \
            "Output: False. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Input: " + each_rule + " \n" + \
            "Output:\n"
    return input

In [3]:
def get_affordance_verbalized_critic_input_Think(each_rule):
    """Build the True/False "thought then answer" prompt for a verbalized rule.

    The prompt asks for a one-sentence thought before the answer, shows two
    worked examples (one True, one False) and ends with ``each_rule`` followed
    by an open ``Thought:`` line.

    Args:
        each_rule: Rule to be judged; formatted into the prompt with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be true. ",
        "",
        "Examples:",
        "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y.",
        "Thought: If Person X's age (Z1) is indeed greater than the required age (Z2) to drive Vehicle Y, then logically, Person X meets the age requirement to drive Vehicle Y. ",
        "Answer: Based on this thought process, the statement is True.",
        "Input: If Person X was born in Season Z and Plant Y blooms in the same Season Z, then Person X can access Plant Y.",
        "Thought: The fact that Person X was born in Season Z and Plant Y blooms in Season Z is a coincidence in timing and does not inherently establish a causal or enabling relationship for access. ",
        "Answer: Based on this thought process, the statement is False.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",
    )
    return "\n".join(prompt_lines)


def get_affordance_verbalized_critic_input_Think_v2(each_rule):
    """Build the True/False "three thoughts + majority vote" prompt for a rule.

    The prompt asks for three thoughts, one answer per thought and a final
    majority-vote answer, shows one fully worked example and ends with
    ``each_rule`` followed by an open ``Thought 1:`` line.

    Args:
        each_rule: Rule to be judged; formatted into the prompt with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Note that the rule need not necessarily but just very likely to be true. ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y.",
        "Thought 1: The condition states that Person X's age (Z1) is greater than the required age (Z2) for driving Vehicle Y, which directly satisfies the age requirement for driving the vehicle.",
        "Thought 2: In most circumstances, meeting the age requirement is a primary factor in determining eligibility to drive a vehicle, suggesting that Person X is likely eligible to drive Vehicle Y based on age.",
        "Thought 3: Assuming all other necessary conditions for driving (like having a valid driver's license) are also met, the age criterion being satisfied strongly indicates that Person X can legally drive Vehicle Y.",
        "Answer 1: Based on the Thought 1, the statement is True.",
        "Answer 2: Based on the Thought 2, the statement is True.",
        "Answer 3: Based on the Thought 3, the statement is True.",
        "Final Answer: The statement is True.",
        "",
        f"Input: {each_rule}",
        "Thought 1:",
        "",
    )
    return "\n".join(prompt_lines)

In [35]:
def get_affordance_verbalized_critic_input_v2(each_rule):    
    input = "Right or Wrong? Please predict whether the input rule is valid and correct, and also explain why. Please note that the rule need not necessarily but just very likely to be valid and correct. \n\nExamples:\n" + \
            "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y. \n" + \
            "Output: Right. Because Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Input: If Person X was born in Season Z and Plant Y blooms in the same Season Z, then Person X can access Plant Y. \n" + \
            "Output: Wrong. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Input: " + each_rule + " \n" + \
            "Output:\n"
    return input

In [183]:
def get_affordance_symbolic_critic_input_v2(each_rule):    
    input = "Right or Wrong? Please predict whether the input symbolic rule is valid and correct, and also explain why. Please note that the rule need not necessarily but just very likely to be valid and correct. \n\nExamples:\n" + \
            "Input: CanDrive(Person X, Vehicle Y):- Have(Person X, Age Z1), RequireMinimumAge(Vehicle Y, Age Z2), BiggerThan(Age Z1, Age Z2); \n" + \
            "Output: Right. Because Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Input: CanAccess(Person X, Plant Y):- BornIn(Person X, Season Z), BloomsIn(Plant Y, Season Z); \n" + \
            "Output: Wrong. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Input: " + each_rule + " \n" + \
            "Output:\n"
    return input

In [4]:
def get_affordance_verbalized_critic_input_v2_Think(each_rule):
    """Build the Right/Wrong "thought then answer" prompt for a verbalized rule.

    The prompt asks for a one-sentence thought before the answer, shows two
    worked examples (one Right, one Wrong) and ends with ``each_rule``
    followed by an open ``Thought:`` line.

    Args:
        each_rule: Rule to be judged; formatted into the prompt with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be valid and correct.",
        "",
        "Examples:",
        "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y.",
        "Thought: If Person X's age (Z1) is indeed greater than the required age (Z2) to drive Vehicle Y, then logically, Person X meets the age requirement to drive Vehicle Y. ",
        "Answer: Based on this thought process, the statement is Right.",
        "Input: If Person X was born in Season Z and Plant Y blooms in the same Season Z, then Person X can access Plant Y.",
        "Thought: The fact that Person X was born in Season Z and Plant Y blooms in Season Z is a coincidence in timing and does not inherently establish a causal or enabling relationship for access. ",
        "Answer: Based on this thought process, the statement is Wrong.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",
    )
    return "\n".join(prompt_lines)


def get_affordance_verbalized_critic_input_v2_Think_v2(each_rule):
    """Build the Right/Wrong "three thoughts + majority vote" prompt for a rule.

    The prompt asks for three thoughts, one answer per thought and a final
    majority-vote answer, shows one fully worked example and ends with
    ``each_rule`` followed by an open ``Thought 1:`` line.

    Args:
        each_rule: Rule to be judged; formatted into the prompt with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Note that the rule need not necessarily but just very likely to be valid and correct.",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Input: If Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, then Person X can drive Vehicle Y.",
        "Thought 1: The condition states that Person X's age (Z1) is greater than the required age (Z2) for driving Vehicle Y, which directly satisfies the age requirement for driving the vehicle.",
        "Thought 2: In most circumstances, meeting the age requirement is a primary factor in determining eligibility to drive a vehicle, suggesting that Person X is likely eligible to drive Vehicle Y based on age.",
        "Thought 3: Assuming all other necessary conditions for driving (like having a valid driver's license) are also met, the age criterion being satisfied strongly indicates that Person X can legally drive Vehicle Y.",
        "Answer 1: Based on the Thought 1, the statement is Right.",
        "Answer 2: Based on the Thought 2, the statement is Right.",
        "Answer 3: Based on the Thought 3, the statement is Right.",
        "Final Answer: The statement is Right.",
        "",
        f"Input: {each_rule}",
        "Thought 1:",
        "",
    )
    return "\n".join(prompt_lines)

In [36]:
def get_affordance_verbalized_critic_input_v3(premise, conclusion):    
    input = "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\nExamples:\n" + \
            "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2. \n" + \
            "Conclusion: Person X can drive Vehicle Y. \n" + \
            "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Premise: Person X was born in Season Z and Plant Y blooms in the same Season Z. \n" + \
            "Conclusion: Person X can access Plant Y. \n" + \
            "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Premise: " + premise + ". \n" + \
            "Conclusion: " + conclusion + " \n" + \
            "Output:\n"
    return input

In [184]:
def get_affordance_symbolic_critic_input_v3(premise, conclusion):    
    input = "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\nExamples:\n" + \
            "Premise: Have(Person X, Age Z1), RequireMinimumAge(Vehicle Y, Age Z2), BiggerThan(Age Z1, Age Z2). \n" + \
            "Conclusion: CanDrive(Person X, Vehicle Y). \n" + \
            "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle. \n" + \
            "Premise: BornIn(Person X, Season Z), BloomsIn(Plant Y, Season Z). \n" + \
            "Conclusion: CanAccess(Person X, Plant Y). \n" + \
            "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection. \n\n"  + \
            "Premise: " + premise + " \n" + \
            "Conclusion: " + conclusion + " \n" + \
            "Output:\n"
    return input

In [5]:
def get_affordance_verbalized_critic_input_v3_Think(premise, conclusion):
    """Build the Yes/No entailment "thought then answer" prompt for a rule.

    The prompt asks for a one-sentence thought before the answer, shows two
    worked premise/conclusion examples (one Yes, one No) and ends with the
    given pair followed by an open ``Thought:`` line. A period is appended
    after ``premise``; ``conclusion`` is inserted unchanged.

    Args:
        premise: Premise text without its final period; formatted with ``str``.
        conclusion: Conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Yes or No? Please predict whether the premise entails the conclusion. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the premise need not necessarily but just very likely to entail the conclusion. ",
        "",
        "Examples:",
        "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2.",
        "Conclusion: Person X can drive Vehicle Y.",
        "Thought: If Person X's age (Z1) is indeed greater than the required age (Z2) to drive Vehicle Y, then logically, Person X meets the age requirement to drive Vehicle Y. ",
        "Answer: Based on this thought process, the answer is Yes.",
        # The negative example's premise must contain only the observations;
        # it must not restate the conclusion it is labelled as not entailing.
        "Premise: Person X was born in Season Z and Plant Y blooms in the same Season Z.",
        "Conclusion: Person X can access Plant Y.",
        "Thought: The fact that Person X was born in Season Z and Plant Y blooms in Season Z is a coincidence in timing and does not inherently establish a causal or enabling relationship for access. ",
        "Answer: Based on this thought process, the answer is No.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        "Thought:",
        "",
    )
    return "\n".join(prompt_lines)


def get_affordance_verbalized_critic_input_v3_Think_v2(premise, conclusion):
    """Build the Yes/No entailment "three thoughts + majority vote" prompt.

    The prompt asks for three thoughts, one answer per thought and a final
    majority-vote answer, shows one fully worked example and ends with the
    given pair followed by an open ``Thought 1:`` line. A period is appended
    after ``premise``; ``conclusion`` is inserted unchanged.

    Args:
        premise: Premise text without its final period; formatted with ``str``.
        conclusion: Conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Yes or No? Please predict whether the premise entails the conclusion. Note that the premise need not necessarily but just very likely to entail the conclusion.  ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2.",
        "Conclusion: Person X can drive Vehicle Y.",
        "Thought 1: The condition states that Person X's age (Z1) is greater than the required age (Z2) for driving Vehicle Y, which directly satisfies the age requirement for driving the vehicle.",
        "Thought 2: In most circumstances, meeting the age requirement is a primary factor in determining eligibility to drive a vehicle, suggesting that Person X is likely eligible to drive Vehicle Y based on age.",
        "Thought 3: Assuming all other necessary conditions for driving (like having a valid driver's license) are also met, the age criterion being satisfied strongly indicates that Person X can legally drive Vehicle Y.",
        "Answer 1: Based on the Thought 1, the answer is Yes.",
        "Answer 2: Based on the Thought 2, the answer is Yes.",
        "Answer 3: Based on the Thought 3, the answer is Yes.",
        "Final Answer: The answer is Yes.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        "Thought 1:",
        "",
    )
    return "\n".join(prompt_lines)

In [37]:
def get_affordance_verbalized_critic_input_v4(premise, conclusion):
    """Build the "is this conclusion supported?" Yes/No prompt for a rule.

    The prompt shows two worked premise/conclusion examples (one Yes, one
    No), each followed by the question, and ends with the given pair, the
    same question and an open ``Output:`` line. A period is appended after
    ``premise``; ``conclusion`` is inserted unchanged.

    Args:
        premise: Premise text without its final period; formatted with ``str``.
        conclusion: Conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Examples:",
        "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2.",
        "Conclusion: Person X can drive Vehicle Y. ",
        question,
        "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle.",
        "Premise: Person X was born in Season Z and Plant Y blooms in the same Season Z. ",
        "Conclusion: Person X can access Plant Y. ",
        question,
        "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection.",
        "",
        f"Premise: {premise}. ",
        f"Conclusion: {conclusion} ",
        question,
        "Output: ",
        "",
    )
    return "\n".join(prompt_lines)

In [192]:
def get_affordance_symbolic_critic_input_v4(premise, conclusion):
    """Build the "is this conclusion supported?" Yes/No prompt for a symbolic rule.

    The prompt shows two worked premise/conclusion examples in predicate form
    (one Yes, one No), each followed by the question, and ends with the given
    pair, the same question and an open ``Output:`` line. Both arguments are
    inserted unchanged.

    Args:
        premise: Symbolic premise text; formatted with ``str``.
        conclusion: Symbolic conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    prompt_lines = (
        "Examples:",
        "Premise: Have(Person X, Age Z1), RequireMinimumAge(Vehicle Y, Age Z2), BiggerThan(Age Z1, Age Z2).",
        "Conclusion: CanDrive(Person X, Vehicle Y).",
        question,
        "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle.",
        "Premise: BornIn(Person X, Season Z), BloomsIn(Plant Y, Season Z).",
        "Conclusion: CanAccess(Person X, Plant Y).",
        question,
        "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection.",
        "",
        f"Premise: {premise}",
        f"Conclusion: {conclusion}",
        question,
        "Output:",
        "",
    )
    return "\n".join(prompt_lines)

In [6]:
def get_affordance_verbalized_critic_input_v4_Think(premise, conclusion):
    """Build the "is this conclusion supported?" thought-then-answer prompt.

    The prompt shows two worked premise/conclusion examples (one Yes, one
    No), each with the question, a one-sentence thought and an answer, and
    ends with the given pair, the same question and an open ``Thought:``
    line. A period is appended after ``premise``; ``conclusion`` is inserted
    unchanged.

    Args:
        premise: Premise text without its final period; formatted with ``str``.
        conclusion: Conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    question = "Is this conclusion logically supported by the given premise? Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the premise need not necessarily but just very likely to support the conclusion."
    # One entry per prompt line; trailing spaces inside the literals are part
    # of the prompt text.
    prompt_lines = (
        "Examples:",
        "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2.",
        "Conclusion: Person X can drive Vehicle Y.",
        question,
        "Thought: If Person X's age (Z1) is indeed greater than the required age (Z2) to drive Vehicle Y, then logically, Person X meets the age requirement to drive Vehicle Y. ",
        "Answer: Based on this thought process, the answer is Yes.",
        # The negative example's premise must contain only the observations;
        # it must not restate the conclusion it is labelled as not supporting.
        "Premise: Person X was born in Season Z and Plant Y blooms in the same Season Z.",
        "Conclusion: Person X can access Plant Y.",
        question,
        "Thought: The fact that Person X was born in Season Z and Plant Y blooms in Season Z is a coincidence in timing and does not inherently establish a causal or enabling relationship for access. ",
        "Answer: Based on this thought process, the answer is No.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        question,
        "Thought:",
        "",
    )
    return "\n".join(prompt_lines)

def get_affordance_verbalized_critic_input_v4_Think_v2(premise, conclusion):
    """Build the "is this conclusion supported?" three-thoughts majority-vote prompt.

    The prompt shows one fully worked example (question, instruction, three
    thoughts, three answers and a final answer) and ends with the given
    pair, the same question and instruction and an open ``Thought 1:`` line.
    A period is appended after ``premise``; ``conclusion`` is inserted
    unchanged.

    Args:
        premise: Premise text without its final period; formatted with ``str``.
        conclusion: Conclusion text; formatted with ``str``.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    question = "Is this conclusion logically supported by the given premise? Note that the premise need not necessarily but just very likely to support the conclusion."
    # The trailing space is part of the prompt text.
    instruction = "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting. "
    prompt_lines = (
        "Examples:",
        "Premise: Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2.",
        "Conclusion: Person X can drive Vehicle Y.",
        question,
        instruction,
        "Thought 1: The condition states that Person X's age (Z1) is greater than the required age (Z2) for driving Vehicle Y, which directly satisfies the age requirement for driving the vehicle.",
        "Thought 2: In most circumstances, meeting the age requirement is a primary factor in determining eligibility to drive a vehicle, suggesting that Person X is likely eligible to drive Vehicle Y based on age.",
        "Thought 3: Assuming all other necessary conditions for driving (like having a valid driver's license) are also met, the age criterion being satisfied strongly indicates that Person X can legally drive Vehicle Y.",
        "Answer 1: Based on the Thought 1, the answer is Yes.",
        "Answer 2: Based on the Thought 2, the answer is Yes.",
        "Answer 3: Based on the Thought 3, the answer is Yes.",
        "Final Answer: The answer is Yes.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        question,
        instruction,
        "Thought 1:",
        "",
    )
    return "\n".join(prompt_lines)

In [38]:
def get_affordance_verbalized_critic_input_v5(premise, conclusion):
    """Build the single-question "given the observations ..." Yes/No prompt.

    The prompt shows two worked examples (one Yes, one No) and ends with a
    question built from ``premise`` and ``conclusion`` followed by an open
    ``Output:`` line. Because the conclusion is embedded in a question ending
    in ``"?"``, trailing whitespace and one trailing ``"."`` or ``";"`` are
    removed from it first; a conclusion without such punctuation is used as
    is.

    Args:
        premise: Premise text (a string), inserted unchanged.
        conclusion: Conclusion text (a string), with or without final period.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # Remove terminal punctuation only when it is really there, so that an
    # unpunctuated conclusion does not lose its last character.
    conclusion_text = conclusion.rstrip()
    if conclusion_text.endswith((".", ";")):
        conclusion_text = conclusion_text[:-1]

    instruction = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    # The spelling "conlcusion" is part of the existing prompt wording and is
    # deliberately left as is, since changing it changes what the model sees.
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, can we draw the conlcusion that Person X can drive Vehicle Y?",
        instruction,
        "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle.",
        "Given the observations that Person X was born in Season Z and Plant Y blooms in the same Season Z, can we draw the conlcusion that Person X can access Plant Y?",
        instruction,
        "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion_text}?",
        instruction,
        "Output: ",
        "",
    )
    return "\n".join(prompt_lines)

In [39]:
def get_affordance_symbolic_critic_input_v5(premise, conclusion):
    """Build the single-question "given the observations ..." prompt for a symbolic rule.

    The prompt shows two worked examples in predicate form (one Yes, one No)
    and ends with a question built from ``premise`` and ``conclusion``
    followed by an open ``Output:`` line. Both texts are embedded mid-
    sentence, so trailing whitespace and one trailing ``"."`` or ``";"`` are
    removed from each first; text without such punctuation is used as is.

    Args:
        premise: Symbolic premise text (a string).
        conclusion: Symbolic conclusion text (a string).

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    def _without_terminator(text):
        # Remove terminal punctuation only when it is really there, so that
        # e.g. a closing parenthesis is never cut off.
        text = text.rstrip()
        return text[:-1] if text.endswith((".", ";")) else text

    premise_text = _without_terminator(premise)
    conclusion_text = _without_terminator(conclusion)

    instruction = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    # The spelling "conlcusion" is part of the existing prompt wording and is
    # deliberately left as is, since changing it changes what the model sees.
    prompt_lines = (
        "Examples:",
        "Given the observations that Have(Person X, Age Z1), RequireMinimumAge(Vehicle Y, Age Z2), BiggerThan(Age Z1, Age Z2), can we draw the conlcusion that CanDrive(Person X, Vehicle Y)?",
        instruction,
        "Output: Yes. Because the premise implies that Person X has achieved the minimum age requirement for driving vehicle.",
        "Given the observations that BornIn(Person X, Season Z), BloomsIn(Plant Y, Season Z), can we draw the conlcusion that CanAccess(Person X, Plant Y)?",
        instruction,
        "Output: No. Because the season of a person's birth and the blooming season of a plant has no logical connection.",
        "",
        f"Given the observations that {premise_text}, can we draw the conlcusion that {conclusion_text}?",
        instruction,
        "Output: ",
        "",
    )
    return "\n".join(prompt_lines)

In [7]:
def get_affordance_verbalized_critic_input_v5_Think(premise, conclusion):
    """Build the "given the observations ..." thought-then-answer prompt.

    The prompt shows two worked examples (one Yes, one No), each with a
    one-sentence thought and an answer, and ends with a question built from
    ``premise`` and ``conclusion`` followed by an open ``Thought:`` line.
    Because the conclusion is embedded in a question ending in ``"?"``,
    trailing whitespace and one trailing ``"."`` or ``";"`` are removed from
    it first; a conclusion without such punctuation is used as is.

    Args:
        premise: Premise text (a string), inserted unchanged.
        conclusion: Conclusion text (a string), with or without final period.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # Remove terminal punctuation only when it is really there, so that an
    # unpunctuated conclusion does not lose its last character.
    conclusion_text = conclusion.rstrip()
    if conclusion_text.endswith((".", ";")):
        conclusion_text = conclusion_text[:-1]

    instruction = "Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the observations need not necessarily but just very likely to draw the conclusion."
    # The spelling "conlcusion" is part of the existing prompt wording and is
    # deliberately left as is, since changing it changes what the model sees.
    # Trailing spaces inside the literals are part of the prompt text too.
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, can we draw the conlcusion that Person X can drive Vehicle Y?",
        instruction,
        "Thought: If Person X's age (Z1) is indeed greater than the required age (Z2) to drive Vehicle Y, then logically, Person X meets the age requirement to drive Vehicle Y. ",
        "Answer: Based on this thought process, the answer is Yes.",
        "Given the observations that Person X was born in Season Z and Plant Y blooms in the same Season Z, can we draw the conlcusion that Person X can access Plant Y?",
        instruction,
        "Thought: The fact that Person X was born in Season Z and Plant Y blooms in Season Z is a coincidence in timing and does not inherently establish a causal or enabling relationship for access. ",
        "Answer: Based on this thought process, the answer is No.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion_text}?",
        instruction,
        "Thought: ",
        "",
    )
    return "\n".join(prompt_lines)

def get_affordance_verbalized_critic_input_v5_Think_v2(premise, conclusion):
    """Build the "given the observations ..." three-thoughts majority-vote prompt.

    The prompt shows one fully worked example (three thoughts, three answers
    and a final answer) and ends with a question built from ``premise`` and
    ``conclusion`` followed by an open ``Thought 1:`` line. Because the
    conclusion is embedded in a question ending in ``"?"``, trailing
    whitespace and one trailing ``"."`` or ``";"`` are removed from it first;
    a conclusion without such punctuation is used as is.

    Args:
        premise: Premise text (a string), inserted unchanged.
        conclusion: Conclusion text (a string), with or without final period.

    Returns:
        The assembled prompt string, terminated by a newline.
    """
    # Remove terminal punctuation only when it is really there, so that an
    # unpunctuated conclusion does not lose its last character.
    conclusion_text = conclusion.rstrip()
    if conclusion_text.endswith((".", ";")):
        conclusion_text = conclusion_text[:-1]

    note = "Note that the observations need not necessarily but just very likely to draw the conclusion."
    instruction = "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting."
    # The spelling "conlcusion" is part of the existing prompt wording and is
    # deliberately left as is, since changing it changes what the model sees.
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X has an Age Z1 and Vehicle Y requires an Age above Z2 for driving, with Age Z1 being greater than Age Z2, can we draw the conlcusion that Person X can drive Vehicle Y? " + note,
        # In the worked example the instruction line carries a trailing space.
        instruction + " ",
        "Thought 1: The condition states that Person X's age (Z1) is greater than the required age (Z2) for driving Vehicle Y, which directly satisfies the age requirement for driving the vehicle.",
        "Thought 2: In most circumstances, meeting the age requirement is a primary factor in determining eligibility to drive a vehicle, suggesting that Person X is likely eligible to drive Vehicle Y based on age.",
        "Thought 3: Assuming all other necessary conditions for driving (like having a valid driver's license) are also met, the age criterion being satisfied strongly indicates that Person X can legally drive Vehicle Y.",
        "Answer 1: Based on the Thought 1, the answer is Yes.",
        "Answer 2: Based on the Thought 2, the answer is Yes.",
        "Answer 3: Based on the Thought 3, the answer is Yes.",
        "Final Answer: The answer is Yes.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion_text}? " + note,
        instruction,
        "Thought 1:",
        "",
    )
    return "\n".join(prompt_lines)

In [40]:
def get_location_verbalized_critic_input(each_rule):
    input = "True or False? Please predict whether the input rule is very likely to be true, and also explain why. Please note that the rule need not necessarily but just very likely to be true. \n\nExamples:\n" + \
            "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.\n" + \
            "Output: False. Because the place of birth is not always indicative of the current place of residence. \n" + \
            "Input: If Person X attends School Z and School Z is located in Region Y, then Person X studies in Region Y.\n" + \
            "Output: True. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n" + \
            "Input: " + each_rule + " \n" + \
            "Output:\n"
    return input

In [185]:
def get_location_symbolic_critic_input(each_rule):
    """Build the "True or False?" critic prompt for a symbolic location rule.

    Args:
        each_rule: Symbolic rule text to judge. It is joined with the surrounding
            prompt text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples written as ``Head:- Body;``
        rules (one False, one True) and the query rule, ending with an open
        "Output:" line.
    """
    instruction = (
        "True or False? Please predict whether the input symbolic rule is very likely to be true, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be true. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: LivesIn(Person X, Region Y):- BornIn(Person X, City Z), LocatedIn(City Z, Region Y); \n"
        "Output: False. Because the place of birth is not always indicative of the current place of residence. \n"
        "Input: StudiesIn(Person X, Region Y):- Attends(Person X, School Z), LocatedIn(School Z, Region Y); \n"
        "Output: True. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [10]:
def get_location_verbalized_critic_input_Think(each_rule):
    """Build the "True or False?" prompt that asks for one thought before the answer.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, two worked Thought/Answer examples (one False, one
        True) and the query rule, ending with an open "Thought:" line.
    """
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be true. ",
        "",
        "Examples:",
        "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.",
        "Thought: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not mean Person X currently lives in Region Y.",
        "Answer: Based on this thought process, the statement is False.",
        "Input: If Person X attends School Z and School Z is located in Region Y, then Person X studies in Region Y.",
        "Thought: If Person X attends School Z, and School Z is located in Region Y, then it logically follows that Person X is studying in Region Y, as the location of the school determines where the education is taking place.",
        "Answer: Based on this thought process, the statement is True.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_location_verbalized_critic_input_Think_v2(each_rule):
    """Build the "True or False?" prompt that asks for three thoughts and a majority vote.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, one worked example with three Thought/Answer pairs and
        a final answer (all False), and the query rule, ending with an open
        "Thought 1:" line.
    """
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Note that the rule need not necessarily but just very likely to be true. ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.",
        "Thought 1: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not necessarily indicate their current residency.",
        "Thought 2: The birthplace of a person often denotes their initial place of residence; however, it does not guarantee they continue to reside there throughout their life.",
        "Thought 3: While Person X's birth in City Z situates them in Region Y at the time of birth, it doesn't account for any possible relocation or movement since then.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the statement is False." for k in (1, 2, 3)),
        "Final Answer: The statement is False.",
        "",
        f"Input: {each_rule}",
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [41]:
def get_location_verbalized_critic_input_v2(each_rule):
    """Build the "Right or Wrong?" critic prompt for a verbalized location rule.

    Args:
        each_rule: Rule text to judge. It is joined with the surrounding prompt
            text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples (one Wrong, one Right) and the
        query rule, ending with an open "Output:" line.
    """
    instruction = (
        "Right or Wrong? Please predict whether the input rule is valid and correct, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be valid and correct. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.\n"
        "Output: Wrong. Because the place of birth is not always indicative of the current place of residence. \n"
        "Input: If Person X attends School Z and School Z is located in Region Y, then Person X studies in Region Y.\n"
        "Output: Right. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [186]:
def get_location_symbolic_critic_input_v2(each_rule):
    """Build the "Right or Wrong?" critic prompt for a symbolic location rule.

    Args:
        each_rule: Symbolic rule text to judge. It is joined with the surrounding
            prompt text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples written as ``Head:- Body;``
        rules (one Wrong, one Right) and the query rule, ending with an open
        "Output:" line.
    """
    instruction = (
        "Right or Wrong? Please predict whether the input rule is valid and correct, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be valid and correct. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: LivesIn(Person X, Region Y):- BornIn(Person X, City Z), LocatedIn(City Z, Region Y); \n"
        "Output: Wrong. Because the place of birth is not always indicative of the current place of residence. \n"
        "Input: StudiesIn(Person X, Region Y):- Attends(Person X, School Z), LocatedIn(School Z, Region Y); \n"
        "Output: Right. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [11]:
def get_location_verbalized_critic_input_v2_Think(each_rule):
    """Build the "Right or Wrong?" prompt that asks for one thought before the answer.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, two worked Thought/Answer examples (one Wrong, one
        Right) and the query rule, ending with an open "Thought:" line.
    """
    prompt_lines = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be valid and correct. ",
        "",
        "Examples:",
        "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.",
        "Thought: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not mean Person X currently lives in Region Y.",
        "Answer: Based on this thought process, the statement is Wrong.",
        "Input: If Person X attends School Z and School Z is located in Region Y, then Person X studies in Region Y.",
        "Thought: If Person X attends School Z, and School Z is located in Region Y, then it logically follows that Person X is studying in Region Y, as the location of the school determines where the education is taking place.",
        "Answer: Based on this thought process, the statement is Right.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_location_verbalized_critic_input_v2_Think_v2(each_rule):
    """Build the "Right or Wrong?" prompt that asks for three thoughts and a majority vote.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, one worked example with three Thought/Answer pairs and
        a final answer (all Wrong), and the query rule, ending with an open
        "Thought 1:" line.
    """
    prompt_lines = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Note that the rule need not necessarily but just very likely to be valid and correct. ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Input: If Person X is born in City Z and City Z is located in Region Y, then Person X lives in Region Y.",
        "Thought 1: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not necessarily indicate their current residency.",
        "Thought 2: The birthplace of a person often denotes their initial place of residence; however, it does not guarantee they continue to reside there throughout their life.",
        "Thought 3: While Person X's birth in City Z situates them in Region Y at the time of birth, it doesn't account for any possible relocation or movement since then.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the statement is Wrong." for k in (1, 2, 3)),
        "Final Answer: The statement is Wrong.",
        "",
        f"Input: {each_rule}",
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [42]:
def get_location_verbalized_critic_input_v3(premise, conclusion):
    """Build the "Yes or No?" entailment prompt for a verbalized location rule.

    Args:
        premise: Premise text (``str``). A period and a space are appended after
            it, so it is presumably passed without a trailing period.
        conclusion: Conclusion text (``str``); only a space is appended after it.

    Returns:
        The task instruction, two worked Premise/Conclusion examples (one No,
        one Yes) and the query pair, ending with an open "Output:" line.
    """
    instruction = (
        "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. "
        "Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Premise: Person X is born in City Z and City Z is located in Region Y. \n"
        "Conclusion: Person X lives in Region Y. \n"
        "Output: No. Because the place of birth is not always indicative of the current place of residence. \n"
        "Premise: Person X attends School Z and School Z is located in Region Y. \n"
        "Conclusion: Person X studies in Region Y. \n"
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n"
    )
    query = "".join((
        "Premise: ", premise, ". \n",
        "Conclusion: ", conclusion, " \n",
        "Output:\n",
    ))
    return instruction + demonstrations + query

In [187]:
def get_location_symbolic_critic_input_v3(premise, conclusion):
    """Build the "Yes or No?" entailment prompt for a symbolic location rule.

    Args:
        premise: Symbolic premise text (``str``); only a space is appended.
        conclusion: Symbolic conclusion text (``str``); only a space is appended.

    Returns:
        The task instruction, two worked Premise/Conclusion examples in
        predicate form (one No, one Yes) and the query pair, ending with an
        open "Output:" line.
    """
    instruction = (
        "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. "
        "Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Premise: BornIn(Person X, City Z), LocatedIn(City Z, Region Y). \n"
        "Conclusion: LivesIn(Person X, Region Y). \n"
        "Output: No. Because the place of birth is not always indicative of the current place of residence. \n"
        "Premise: Attends(Person X, School Z), LocatedIn(School Z, Region Y). \n"
        "Conclusion: StudiesIn(Person X, Region Y). \n"
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located. \n\n"
    )
    query = "".join((
        "Premise: ", premise, " \n",
        "Conclusion: ", conclusion, " \n",
        "Output:\n",
    ))
    return instruction + demonstrations + query

In [12]:
def get_location_verbalized_critic_input_v3_Think(premise, conclusion):
    """Build the "Yes or No?" entailment prompt that asks for one thought before the answer.

    Args:
        premise: Premise text; a period is appended after it in the prompt.
        conclusion: Conclusion text; inserted unchanged.

    Returns:
        The instruction, two worked Premise/Conclusion/Thought/Answer examples
        (one No, one Yes) and the query pair, ending with an open "Thought:" line.
    """
    prompt_lines = (
        "Yes or No? Please predict whether the premise entails the conclusion. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the premise need not necessarily but just very likely to entail the conclusion. ",
        "",
        "Examples:",
        "Premise: Person X is born in City Z and City Z is located in Region Y.",
        "Conclusion: Person X lives in Region Y.",
        "Thought: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not mean Person X currently lives in Region Y.",
        "Answer: Based on this thought process, the answer is No.",
        # The second example premise carries a trailing space in the original template.
        "Premise: Person X attends School Z and School Z is located in Region Y. ",
        "Conclusion: Person X studies in Region Y.",
        "Thought: If Person X attends School Z, and School Z is located in Region Y, then it logically follows that Person X is studying in Region Y, as the location of the school determines where the education is taking place.",
        "Answer: Based on this thought process, the answer is Yes.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_location_verbalized_critic_input_v3_Think_v2(premise, conclusion):
    """Build the "Yes or No?" entailment prompt that asks for three thoughts and a majority vote.

    Args:
        premise: Premise text; a period is appended after it in the prompt.
        conclusion: Conclusion text; inserted unchanged.

    Returns:
        The instruction, one worked example with three Thought/Answer pairs and
        a final answer (all No), and the query pair, ending with an open
        "Thought 1:" line.
    """
    prompt_lines = (
        # The first instruction line ends with two spaces in the original template.
        "Yes or No? Please predict whether the premise entails the conclusion. Note that the premise need not necessarily but just very likely to entail the conclusion.  ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Premise: Person X is born in City Z and City Z is located in Region Y.",
        "Conclusion: Person X lives in Region Y.",
        "Thought 1: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not necessarily indicate their current residency.",
        "Thought 2: The birthplace of a person often denotes their initial place of residence; however, it does not guarantee they continue to reside there throughout their life.",
        "Thought 3: While Person X's birth in City Z situates them in Region Y at the time of birth, it doesn't account for any possible relocation or movement since then.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the answer is No." for k in (1, 2, 3)),
        "Final Answer: The answer is No.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [43]:
def get_location_verbalized_critic_input_v4(premise, conclusion):
    """Build the "logically supported?" prompt for a verbalized location rule.

    The question is repeated after every Premise/Conclusion pair instead of
    being stated once at the top.

    Args:
        premise: Premise text; a period and a space are appended in the prompt.
        conclusion: Conclusion text; inserted unchanged.

    Returns:
        Two worked examples (one No, one Yes) followed by the query pair and
        the question, ending with an open "Output:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    prompt_lines = (
        "Examples:",
        # Both example premises carry a trailing space in the original template.
        "Premise: Person X is born in City Z and City Z is located in Region Y. ",
        "Conclusion: Person X lives in Region Y.",
        question,
        "Output: No. Because the place of birth is not always indicative of the current place of residence.",
        "Premise: Person X attends School Z and School Z is located in Region Y. ",
        "Conclusion: Person X studies in Region Y.",
        question,
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located.",
        "",
        f"Premise: {premise}. ",
        f"Conclusion: {conclusion}",
        question,
        "Output:",
        "",  # the prompt ends with a newline after "Output:"
    )
    return "\n".join(prompt_lines)

In [78]:
def get_location_symbolic_critic_input_v4(premise, conclusion):
    """Build the "logically supported?" prompt for a symbolic location rule.

    The question is repeated after every Premise/Conclusion pair instead of
    being stated once at the top.

    Args:
        premise: Symbolic premise text; inserted unchanged.
        conclusion: Symbolic conclusion text; inserted unchanged.

    Returns:
        Two worked examples in predicate form (one No, one Yes) followed by the
        query pair and the question, ending with an open "Output:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    prompt_lines = (
        "Examples:",
        "Premise: BornIn(Person X, City Z), LocatedIn(City Z, Region Y).",
        "Conclusion: LivesIn(Person X, Region Y).",
        question,
        "Output: No. Because the place of birth is not always indicative of the current place of residence.",
        "Premise: Attends(Person X, School Z), LocatedIn(School Z, Region Y).",
        "Conclusion: StudiesIn(Person X, Region Y).",
        question,
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located.",
        "",
        f"Premise: {premise}",
        f"Conclusion: {conclusion}",
        question,
        "Output:",
        "",  # the prompt ends with a newline after "Output:"
    )
    return "\n".join(prompt_lines)

In [13]:
def get_location_verbalized_critic_input_v4_Think(premise, conclusion):
    """Build the "logically supported?" prompt that asks for one thought before the answer.

    The question is repeated after every Premise/Conclusion pair.

    Args:
        premise: Premise text; a period is appended after it in the prompt.
        conclusion: Conclusion text; inserted unchanged.

    Returns:
        Two worked Thought/Answer examples (one No, one Yes) followed by the
        query pair and the question, ending with an open "Thought:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the premise need not necessarily but just very likely to support the conclusion."
    prompt_lines = (
        "Examples:",
        "Premise: Person X is born in City Z and City Z is located in Region Y.",
        "Conclusion: Person X lives in Region Y.",
        question,
        "Thought: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not mean Person X currently lives in Region Y.",
        "Answer: Based on this thought process, the answer is No.",
        # The second example premise carries a trailing space in the original template.
        "Premise: Person X attends School Z and School Z is located in Region Y. ",
        "Conclusion: Person X studies in Region Y.",
        question,
        "Thought: If Person X attends School Z, and School Z is located in Region Y, then it logically follows that Person X is studying in Region Y, as the location of the school determines where the education is taking place.",
        "Answer: Based on this thought process, the answer is Yes.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        question,
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_location_verbalized_critic_input_v4_Think_v2(premise, conclusion):
    """Build the "logically supported?" prompt that asks for three thoughts and a majority vote.

    The question and the three-thought instruction are repeated after every
    Premise/Conclusion pair.

    Args:
        premise: Premise text; a period is appended after it in the prompt.
        conclusion: Conclusion text; inserted unchanged.

    Returns:
        One worked example with three Thought/Answer pairs and a final answer
        (all No), followed by the query pair and the instructions, ending with
        an open "Thought 1:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Note that the premise need not necessarily but just very likely to support the conclusion."
    voting_instruction = "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting."
    prompt_lines = (
        "Examples:",
        "Premise: Person X is born in City Z and City Z is located in Region Y.",
        "Conclusion: Person X lives in Region Y.",
        question,
        voting_instruction,
        "Thought 1: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not necessarily indicate their current residency.",
        "Thought 2: The birthplace of a person often denotes their initial place of residence; however, it does not guarantee they continue to reside there throughout their life.",
        "Thought 3: While Person X's birth in City Z situates them in Region Y at the time of birth, it doesn't account for any possible relocation or movement since then.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the answer is No." for k in (1, 2, 3)),
        "Final Answer: The answer is No.",
        "",
        f"Premise: {premise}.",
        f"Conclusion: {conclusion}",
        question,
        voting_instruction,
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [56]:
def get_location_verbalized_critic_input_v5(premise, conclusion):
    """Build the "Given the observations ... can we draw the conclusion" prompt (verbalized).

    Args:
        premise: Premise text; inserted unchanged.
        conclusion: Conclusion text. Its last character is dropped before
            insertion (presumably a trailing period -- confirm against callers),
            and a "?" follows it in the prompt.

    Returns:
        Two worked examples (one No, one Yes) followed by the query question,
        ending with an open "Output: " line.
    """
    # NOTE: the spelling "conlcusion" is reproduced verbatim from the original template.
    instruction = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X is born in City Z and City Z is located in Region Y, can we draw the conlcusion that Person X lives in Region Y?",
        instruction,
        "Output: No. Because the place of birth is not always indicative of the current place of residence.",
        "Given the observations that Person X attends School Z and School Z is located in Region Y, can we draw the conlcusion that Person X studies in Region Y?",
        instruction,
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion[:-1]}?",
        instruction,
        "Output: ",  # trailing space is part of the original template
        "",  # the prompt ends with a newline after "Output: "
    )
    return "\n".join(prompt_lines)

In [44]:
def get_location_symbolic_critic_input_v5(premise, conclusion):
    """Build the "Given the observations ... can we draw the conclusion" prompt (symbolic).

    Args:
        premise: Symbolic premise text. Its last character is dropped before
            insertion (presumably a closing punctuation mark -- confirm against
            callers).
        conclusion: Symbolic conclusion text. Its last character is dropped as
            well, and a "?" follows it in the prompt.

    Returns:
        Two worked examples in predicate form (one No, one Yes) followed by the
        query question, ending with an open "Output:" line.
    """
    # NOTE: the spelling "conlcusion" is reproduced verbatim from the original template.
    instruction = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    prompt_lines = (
        "Examples:",
        "Given the observations that BornIn(Person X, City Z), LocatedIn(City Z, Region Y), can we draw the conlcusion that LivesIn(Person X, Region Y)?",
        instruction,
        "Output: No. Because the place of birth is not always indicative of the current place of residence.",
        "Given the observations that Attends(Person X, School Z), LocatedIn(School Z, Region Y), can we draw the conlcusion that StudiesIn(Person X, Region Y)?",
        instruction,
        "Output: Yes. Because if a person attends a school, then the region in which they study is the region where the school is located.",
        "",
        f"Given the observations that {premise[:-1]}, can we draw the conlcusion that {conclusion[:-1]}?",
        instruction,
        "Output:",
        "",  # the prompt ends with a newline after "Output:"
    )
    return "\n".join(prompt_lines)

In [14]:
def get_location_verbalized_critic_input_v5_Think(premise, conclusion):
    """Build the "Given the observations ..." prompt that asks for one thought before the answer.

    Args:
        premise: Premise text; inserted unchanged.
        conclusion: Conclusion text. Its last character is dropped before
            insertion (presumably a trailing period -- confirm against callers),
            and a "?" follows it in the prompt.

    Returns:
        Two worked Thought/Answer examples (one No, one Yes) followed by the
        query question, ending with an open "Thought:" line.
    """
    # NOTE: the spelling "conlcusion" is reproduced verbatim from the original template.
    instruction = "Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the observations need not necessarily but just very likely to draw the conclusion."
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X is born in City Z and City Z is located in Region Y, can we draw the conlcusion that Person X lives in Region Y?",
        instruction,
        "Thought: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not mean Person X currently lives in Region Y.",
        "Answer: Based on this thought process, the answer is No.",
        "Given the observations that Person X attends School Z and School Z is located in Region Y, can we draw the conlcusion that Person X studies in Region Y?",
        instruction,
        "Thought: If Person X attends School Z, and School Z is located in Region Y, then it logically follows that Person X is studying in Region Y, as the location of the school determines where the education is taking place.",
        "Answer: Based on this thought process, the answer is Yes.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion[:-1]}?",
        instruction,
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_location_verbalized_critic_input_v5_Think_v2(premise, conclusion):
    """Build the "Given the observations ..." prompt that asks for three thoughts and a majority vote.

    Args:
        premise: Premise text; inserted unchanged.
        conclusion: Conclusion text. Its last character is dropped before
            insertion (presumably a trailing period -- confirm against callers),
            and a "?" follows it in the prompt.

    Returns:
        One worked example with three Thought/Answer pairs and a final answer
        (all No), followed by the query question and the instruction, ending
        with an open "Thought 1:" line.
    """
    # NOTE: the spelling "conlcusion" is reproduced verbatim from the original template.
    caveat = "Note that the observations need not necessarily but just very likely to draw the conclusion."
    voting_instruction = "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting."
    prompt_lines = (
        "Examples:",
        "Given the observations that Person X is born in City Z and City Z is located in Region Y, can we draw the conlcusion that Person X lives in Region Y? " + caveat,
        voting_instruction,
        "Thought 1: Being born in City Z, which is located in Region Y, implies that Person X was originally from Region Y, but it does not necessarily indicate their current residency.",
        "Thought 2: The birthplace of a person often denotes their initial place of residence; however, it does not guarantee they continue to reside there throughout their life.",
        "Thought 3: While Person X's birth in City Z situates them in Region Y at the time of birth, it doesn't account for any possible relocation or movement since then.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the answer is No." for k in (1, 2, 3)),
        "Final Answer: The answer is No.",
        "",
        f"Given the observations that {premise}, can we draw the conlcusion that {conclusion[:-1]}? " + caveat,
        voting_instruction,
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [46]:
def get_accessibility_verbalized_critic_input(each_rule):
    """Build the "True or False?" critic prompt for a verbalized accessibility rule.

    Args:
        each_rule: Rule text to judge. It is joined with the surrounding prompt
            text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples (one False, one True) and the
        query rule, ending with an open "Output:" line.
    """
    instruction = (
        "True or False? Please predict whether the input rule is very likely to be true, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be true. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.\n"
        # Two spaces precede the newline here in the original prompt.
        "Output: False. Because Show Y was available before Person X died.  \n"
        "Input: If Person X lives in Region Z and Animal Y inhabits the same Region Z, then Person X can access Animal Y.\n"
        "Output: True. Because person and animal exist in the same region. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [188]:
def get_accessibility_symbolic_critic_input(each_rule):
    """Build the "True or False?" critic prompt for a symbolic accessibility rule.

    Args:
        each_rule: Symbolic rule text to judge. It is joined with the surrounding
            prompt text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples written as ``Head:- Body;``
        rules (one False, one True) and the query rule, ending with an open
        "Output:" line.
    """
    instruction = (
        "True or False? Please predict whether the input rule is very likely to be true, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be true. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: CanNotAccess(Person X, Show Y):- ProducedAt(Show Y, Time Period Z1), DiedAt(Person X, Time Period Z2), EarlierThan(Time Period Z1, Time Period Z2); \n"
        "Output: False. Because Show Y was available before Person X died. \n"
        "Input: CanAccess(Person X, Animal Y):- LivesIn(Person X, Region Z), Inhabits(Animal Y, Region Z); \n"
        "Output: True. Because person and animal exist in the same region. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [15]:
def get_accessibility_verbalized_critic_input_Think(each_rule):
    """Build the accessibility "True or False?" prompt that asks for one thought before the answer.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, two worked Thought/Answer examples (one False, one
        True) and the query rule, ending with an open "Thought:" line.
    """
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be true. ",
        "",
        "Examples:",
        "Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.",
        "Thought: If Show Y was produced in Time Period Z1, and Person X died in a later Time Period Z2, then logically, Person X lived during the time when Show Y was available, suggesting a possibility of access.",
        "Answer: Based on this thought process, the statement is False.",
        "Input: If Person X lives in Region Z and Animal Y inhabits the same Region Z, then Person X can access Animal Y.",
        "Thought: If Person X lives in Region Z and Animal Y also inhabits the same Region Z, then Person X is in the same geographical area as Animal Y, which makes access feasible.",
        "Answer: Based on this thought process, the statement is True.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)
# Answer: Assuming all other necessary conditions are met

def get_accessibility_verbalized_critic_input_Think_v2(each_rule):
    """Build the accessibility "True or False?" prompt that asks for three thoughts and a majority vote.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, one worked example with three Thought/Answer pairs and
        a final answer (all False), and the query rule, ending with an open
        "Thought 1:" line.
    """
    prompt_lines = (
        "True or False? Please predict whether the input rule is very likely to be true. Note that the rule need not necessarily but just very likely to be true. ",
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. ",
        "",
        "Examples:",
        "Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.",
        "Thought 1: Since Show Y was produced in Time Period Z1 and Person X died in Time Period Z2, with Z1 being earlier than Z2, it suggests that Show Y existed during Person X's lifetime.",
        "Thought 2: The chronological order of Show Y's production and Person X's death implies that there was a period when Person X was alive and Show Y was available, allowing for potential access.",
        "Thought 3: Assuming no other barriers, the fact that Show Y was produced before Person X died means there was a possibility for Person X to have accessed Show Y.",
        # One answer line per thought, all with the same verdict.
        *(f"Answer {k}: Based on the Thought {k}, the statement is False." for k in (1, 2, 3)),
        "Final Answer: The statement is False.",
        "",
        f"Input: {each_rule}",
        "Thought 1:",
        "",  # the prompt ends with a newline after "Thought 1:"
    )
    return "\n".join(prompt_lines)

In [47]:
def get_accessibility_verbalized_critic_input_v2(each_rule):
    """Build the "Right or Wrong?" critic prompt for a verbalized accessibility rule.

    Args:
        each_rule: Rule text to judge. It is joined with the surrounding prompt
            text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples (one Wrong, one Right) and the
        query rule, ending with an open "Output:" line.
    """
    instruction = (
        "Right or Wrong? Please predict whether the input rule is valid and correct, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be valid and correct. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.\n"
        # Two spaces precede the newline here in the original prompt.
        "Output: Wrong. Because Show Y was available before Person X died.  \n"
        "Input: If Person X lives in Region Z and Animal Y inhabits the same Region Z, then Person X can access Animal Y.\n"
        "Output: Right. Because person and animal exist in the same region. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [189]:
def get_accessibility_symbolic_critic_input_v2(each_rule):
    """Build the "Right or Wrong?" critic prompt for a symbolic accessibility rule.

    Args:
        each_rule: Symbolic rule text to judge. It is joined with the surrounding
            prompt text, so it must be a ``str``.

    Returns:
        The task instruction, two worked examples written as ``Head:- Body;``
        rules (one Wrong, one Right) and the query rule, ending with an open
        "Output:" line.
    """
    instruction = (
        "Right or Wrong? Please predict whether the input rule is valid and correct, and also explain why. "
        "Please note that the rule need not necessarily but just very likely to be valid and correct. \n\n"
    )
    demonstrations = (
        "Examples:\n"
        "Input: CanNotAccess(Person X, Show Y):- ProducedAt(Show Y, Time Period Z1), DiedAt(Person X, Time Period Z2), EarlierThan(Time Period Z1, Time Period Z2); \n"
        "Output: Wrong. Because Show Y was available before Person X died. \n"
        "Input: CanAccess(Person X, Animal Y):- LivesIn(Person X, Region Z), Inhabits(Animal Y, Region Z); \n"
        "Output: Right. Because person and animal exist in the same region. \n\n"
    )
    # The space between the rule and the newline is part of the prompt format.
    query = "".join(("Input: ", each_rule, " \n", "Output:\n"))
    return instruction + demonstrations + query

In [16]:
def get_accessibility_verbalized_critic_input_v2_Think(each_rule):
    """Build the accessibility "Right or Wrong?" prompt that asks for one thought before the answer.

    Args:
        each_rule: Rule text to judge; interpolated after the final "Input: ".

    Returns:
        The instruction, two worked Thought/Answer examples (one Wrong, one
        Right) and the query rule, ending with an open "Thought:" line.
    """
    prompt_lines = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the rule need not necessarily but just very likely to be valid and correct. ",
        "",
        "Examples:",
        "Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.",
        "Thought: If Show Y was produced in Time Period Z1, and Person X died in a later Time Period Z2, then logically, Person X lived during the time when Show Y was available, suggesting a possibility of access.",
        "Answer: Based on this thought process, the statement is Wrong.",
        "Input: If Person X lives in Region Z and Animal Y inhabits the same Region Z, then Person X can access Animal Y.",
        "Thought: If Person X lives in Region Z and Animal Y also inhabits the same Region Z, then Person X is in the same geographical area as Animal Y, which makes access feasible.",
        "Answer: Based on this thought process, the statement is Right.",
        "",
        f"Input: {each_rule}",
        "Thought:",
        "",  # the prompt ends with a newline after "Thought:"
    )
    return "\n".join(prompt_lines)

def get_accessibility_verbalized_critic_input_v2_Think_v2(each_rule):
    """Build the Right/Wrong critic prompt that asks for three thoughts and a majority vote.

    The prompt contains the instruction, one worked example with three
    thoughts, three per-thought answers and a "Final Answer:" line, and then
    the query rule followed by an open "Thought 1:" line.

    Args:
        each_rule: verbalized rule text, inserted verbatim after "Input: ".

    Returns:
        The complete prompt string.
    """
    # Both instruction lines end with a space; regular literals keep that visible.
    instruction = (
        "Right or Wrong? Please predict whether the input rule is valid and correct. Note that the rule need not necessarily but just very likely to be valid and correct. \n"
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. \n"
        "\n"
    )
    demonstration = '''Examples:
Input: If Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, then Person X had no chance to access Show Y.
Thought 1: Since Show Y was produced in Time Period Z1 and Person X died in Time Period Z2, with Z1 being earlier than Z2, it suggests that Show Y existed during Person X's lifetime.
Thought 2: The chronological order of Show Y's production and Person X's death implies that there was a period when Person X was alive and Show Y was available, allowing for potential access.
Thought 3: Assuming no other barriers, the fact that Show Y was produced before Person X died means there was a possibility for Person X to have accessed Show Y.
Answer 1: Based on the Thought 1, the statement is Wrong.
Answer 2: Based on the Thought 2, the statement is Wrong.
Answer 3: Based on the Thought 3, the statement is Wrong.
Final Answer: The statement is Wrong.

'''
    return instruction + demonstration + f"Input: {each_rule}\nThought 1:\n"

In [49]:
def get_accessibility_verbalized_critic_input_v3(premise, conclusion):
    """Build the Yes/No entailment prompt for a verbalized premise/conclusion pair.

    Args:
        premise: premise text; the template appends ". " after it.
        conclusion: conclusion text; the template appends " " after it.

    Returns:
        The complete prompt string, ending with an open "Output:" line.
    """
    pieces = [
        # Task instruction.
        "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\n",
        "Examples:\n",
        # Demonstration answered "No".
        "Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2. \n",
        "Conclusion: Person X had no chance to access Show Y. \n",
        "Output: No. Because the premise implies that Show Y was available before Person X died. \n",
        # Demonstration answered "Yes".
        "Premise: Person X lives in Region Z and Animal Y inhabits the same Region Z. \n",
        "Conclusion: Person X can access Animal Y. \n",
        "Output: Yes. Because the premise implies that person and animal exist in the same region. \n\n",
        # Query.
        "Premise: ", premise, ". \n",
        "Conclusion: ", conclusion, " \n",
        "Output:\n",
    ]
    return "".join(pieces)

In [191]:
def get_accessibility_symbolic_critic_input_v3(premise, conclusion):
    """Build the Yes/No entailment prompt for a symbolic premise/conclusion pair.

    Args:
        premise: symbolic premise text; the template appends " " after it
            (no period is added here, unlike the verbalized v3 template).
        conclusion: symbolic conclusion text; the template appends " " after it.

    Returns:
        The complete prompt string, ending with an open "Output:" line.
    """
    pieces = [
        # Task instruction.
        "Yes or No? Please predict whether the premise entails the conclusion, and also explain why. Please note that the premise need not necessarily but just very likely to entail the conclusion. \n\n",
        "Examples:\n",
        # Demonstration answered "No".
        "Premise: ProducedAt(Show Y, Time Period Z1), DiedAt(Person X, Time Period Z2), EarlierThan(Time Period Z1, Time Period Z2). \n",
        "Conclusion: CanNotAccess(Person X, Show Y). \n",
        "Output: No. Because the premise implies that Show Y was available before Person X died. \n",
        # Demonstration answered "Yes".
        "Premise: LivesIn(Person X, Region Z), Inhabits(Animal Y, Region Z). \n",
        "Conclusion: CanAccess(Person X, Animal Y). \n",
        "Output: Yes. Because the premise implies that person and animal exist in the same region. \n\n",
        # Query.
        "Premise: ", premise, " \n",
        "Conclusion: ", conclusion, " \n",
        "Output:\n",
    ]
    return "".join(pieces)

In [17]:
def get_accessibility_verbalized_critic_input_v3_Think(premise, conclusion):
    """Build the Yes/No entailment prompt with a one-sentence thought step.

    Args:
        premise: premise text; the template appends "." after it.
        conclusion: conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Thought:" line.
    """
    instruction_and_examples = '''Yes or No? Please predict whether the premise entails the conclusion. Please first briefly explain your thought process in one sentence, and then give your answer. Note that the premise need not necessarily but just very likely to entail the conclusion.

Examples:
Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2.
Conclusion: Person X had no chance to access Show Y.
Thought: If Show Y was produced in Time Period Z1, and Person X died in a later Time Period Z2, then logically, Person X lived during the time when Show Y was available, suggesting a possibility of access.
Answer: Based on this thought process, the answer is No.
Premise: Person X lives in Region Z and Animal Y inhabits the same Region Z.
Conclusion: Person X can access Animal Y.
Thought: If Person X lives in Region Z and Animal Y also inhabits the same Region Z, then Person X is in the same geographical area as Animal Y, which makes access feasible.
Answer: Based on this thought process, the answer is Yes.

'''
    query = f"Premise: {premise}.\nConclusion: {conclusion}\nThought:\n"
    return instruction_and_examples + query

def get_accessibility_verbalized_critic_input_v3_Think_v2(premise, conclusion):
    """Build the Yes/No entailment prompt that asks for three thoughts and a majority vote.

    Args:
        premise: premise text; the template appends "." after it.
        conclusion: conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Thought 1:" line.
    """
    # The first instruction line ends with two spaces and the second with one;
    # regular literals keep that trailing whitespace visible.
    instruction = (
        "Yes or No? Please predict whether the premise entails the conclusion. Note that the premise need not necessarily but just very likely to entail the conclusion.  \n"
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to give your answer. Finally, output the final answer according to majority voting. \n"
        "\n"
    )
    demonstration = '''Examples:
Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2.
Conclusion: Person X had no chance to access Show Y.
Thought 1: Since Show Y was produced in Time Period Z1 and Person X died in Time Period Z2, with Z1 being earlier than Z2, it suggests that Show Y existed during Person X's lifetime.
Thought 2: The chronological order of Show Y's production and Person X's death implies that there was a period when Person X was alive and Show Y was available, allowing for potential access.
Thought 3: Assuming no other barriers, the fact that Show Y was produced before Person X died means there was a possibility for Person X to have accessed Show Y.
Answer 1: Based on the Thought 1, the answer is No.
Answer 2: Based on the Thought 2, the answer is No.
Answer 3: Based on the Thought 3, the answer is No.
Final Answer: The answer is No.

'''
    query = f"Premise: {premise}.\nConclusion: {conclusion}\nThought 1:\n"
    return instruction + demonstration + query

In [50]:
def get_accessibility_verbalized_critic_input_v4(premise, conclusion):
    """Build the "logically supported?" Yes/No prompt for a verbalized premise/conclusion pair.

    The same question line follows each of the two demonstrations and the
    query, so it is defined once and reused.

    Args:
        premise: premise text; the template appends ". " after it.
        conclusion: conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Output:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    # The "Premise:" lines end with a space, made explicit by the "\n" literals.
    negative_demo = (
        "Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2. \n"
        "Conclusion: Person X had no chance to access Show Y.\n"
        f"{question}\n"
        "Output: No. Because the premise implies that Show Y was available before Person X died.\n"
    )
    positive_demo = (
        "Premise: Person X lives in Region Z and Animal Y inhabits the same Region Z. \n"
        "Conclusion: Person X can access Animal Y.\n"
        f"{question}\n"
        "Output: Yes. Because the premise implies that person and animal exist in the same region.\n"
    )
    query = f"Premise: {premise}. \nConclusion: {conclusion}\n{question}\nOutput:\n"
    return "Examples:\n" + negative_demo + positive_demo + "\n" + query

In [198]:
def get_accessibility_symbolic_critic_input_v4(premise, conclusion):
    """Build the "logically supported?" Yes/No prompt for a symbolic premise/conclusion pair.

    The same question line follows each of the two demonstrations and the
    query, so it is defined once and reused.

    Args:
        premise: symbolic premise text, inserted verbatim.
        conclusion: symbolic conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Output:" line.
    """
    question = "Is this conclusion logically supported by the given premise? Please answer Yes or No, and also explain why. Note that the premise need not necessarily but just very likely to support the conclusion."
    negative_demo = (
        "Premise: ProducedAt(Show Y, Time Period Z1), DiedAt(Person X, Time Period Z2), EarlierThan(Time Period Z1, Time Period Z2).\n"
        "Conclusion: CanNotAccess(Person X, Show Y).\n"
        f"{question}\n"
        "Output: No. Because the premise implies that Show Y was available before Person X died.\n"
    )
    positive_demo = (
        "Premise: LivesIn(Person X, Region Z), Inhabits(Animal Y, Region Z).\n"
        "Conclusion: CanAccess(Person X, Animal Y).\n"
        f"{question}\n"
        "Output: Yes. Because the premise implies that person and animal exist in the same region.\n"
    )
    query = f"Premise: {premise}\nConclusion: {conclusion}\n{question}\nOutput:\n"
    return "Examples:\n" + negative_demo + positive_demo + "\n" + query

In [18]:
def get_accessibility_verbalized_critic_input_v4_Think(premise, conclusion):
    """Build the "logically supported?" prompt with a one-sentence thought step.

    The prompt ends exactly at "Thought:" plus a newline. The previous version
    had its closing triple quote indented, which appended four stray spaces
    after that newline; no sibling template in this notebook does that.

    Args:
        premise: premise text; the template appends "." after it.
        conclusion: conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Thought:" line.
    """
    # Asked after both demonstrations and after the query, so defined once.
    question = "Is this conclusion logically supported by the given premise? Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the premise need not necessarily but just very likely to support the conclusion."
    return f'''Examples:
Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2.
Conclusion: Person X had no chance to access Show Y.
{question}
Thought: If Show Y was produced in Time Period Z1, and Person X died in a later Time Period Z2, then logically, Person X lived during the time when Show Y was available, suggesting a possibility of access.
Answer: Based on this thought process, the answer is No.
Premise: Person X lives in Region Z and Animal Y inhabits the same Region Z.
Conclusion: Person X can access Animal Y.
{question}
Thought: If Person X lives in Region Z and Animal Y also inhabits the same Region Z, then Person X is in the same geographical area as Animal Y, which makes access feasible.
Answer: Based on this thought process, the answer is Yes.

Premise: {premise}.
Conclusion: {conclusion}
{question}
Thought:
'''

def get_accessibility_verbalized_critic_input_v4_Think_v2(premise, conclusion):
    """Build the "logically supported?" prompt that asks for three thoughts and a majority vote.

    The two-line task statement appears after the demonstration's
    premise/conclusion and again after the query's, so it is defined once.

    Args:
        premise: premise text; the template appends "." after it.
        conclusion: conclusion text, inserted verbatim.

    Returns:
        The complete prompt string, ending with an open "Thought 1:" line.
    """
    task = (
        "Is this conclusion logically supported by the given premise? Note that the premise need not necessarily but just very likely to support the conclusion.\n"
        "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting.\n"
    )
    demo_statement = (
        "Premise: Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2.\n"
        "Conclusion: Person X had no chance to access Show Y.\n"
    )
    demo_reasoning = (
        "Thought 1: Since Show Y was produced in Time Period Z1 and Person X died in Time Period Z2, with Z1 being earlier than Z2, it suggests that Show Y existed during Person X's lifetime.\n"
        "Thought 2: The chronological order of Show Y's production and Person X's death implies that there was a period when Person X was alive and Show Y was available, allowing for potential access.\n"
        "Thought 3: Assuming no other barriers, the fact that Show Y was produced before Person X died means there was a possibility for Person X to have accessed Show Y.\n"
        "Answer 1: Based on the Thought 1, the answer is No.\n"
        "Answer 2: Based on the Thought 2, the answer is No.\n"
        "Answer 3: Based on the Thought 3, the answer is No.\n"
        "Final Answer: The answer is No.\n"
    )
    query_statement = f"Premise: {premise}.\nConclusion: {conclusion}\n"
    return (
        "Examples:\n"
        + demo_statement + task + demo_reasoning
        + "\n"
        + query_statement + task + "Thought 1:\n"
    )

In [52]:
def get_accessibility_verbalized_critic_input_v5(premise, conclusion):
    """Build the "can we draw the conclusion" Yes/No prompt for a verbalized pair.

    The query is phrased as a question, so the conclusion's final period is
    dropped before the template appends "?". Only an actual trailing "." is
    removed; the previous unconditional `conclusion[:-1]` also cut the last
    letter off a conclusion that had no final period.

    The prompt wording (including the "conlcusion" spelling) is kept verbatim.

    Args:
        premise: premise text, inserted verbatim.
        conclusion: conclusion text, with or without a final period.

    Returns:
        The complete prompt string, ending with an open "Output: " line.
    """
    claim = conclusion[:-1] if conclusion.endswith(".") else conclusion
    # Asked after both demonstrations and after the query, so defined once.
    ask = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    return (
        "Examples:\n"
        "Given the observations that Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, can we draw the conlcusion that Person X had no chance to access Show Y?\n"
        f"{ask}\n"
        "Output: No. Because the premise implies that Show Y was available before Person X died.\n"
        "Given the observations that Person X lives in Region Z and Animal Y inhabits the same Region Z, can we draw the conlcusion that Person X can access Animal Y?\n"
        f"{ask}\n"
        "Output: Yes. Because the premise implies that person and animal exist in the same region.\n"
        "\n"
        f"Given the observations that {premise}, can we draw the conlcusion that {claim}?\n"
        f"{ask}\n"
        # The trailing space after "Output:" is part of the original template.
        "Output: \n"
    )

In [199]:
def get_accessibility_symbolic_critic_input_v5(premise, conclusion):
    """Build the "can we draw the conclusion" Yes/No prompt for a symbolic pair.

    Premise and conclusion are embedded inside a question, so one trailing
    terminator ("." or ";") is removed from each. The previous unconditional
    `[:-1]` also removed a closing ")" when no terminator was present.

    The prompt wording (including the "conlcusion" spelling) is kept verbatim.

    Args:
        premise: symbolic premise text, optionally ending in "." or ";".
        conclusion: symbolic conclusion text, optionally ending in "." or ";".

    Returns:
        The complete prompt string, ending with an open "Output:" line.
    """
    def _without_terminator(text):
        # Drop exactly one trailing "." or ";" and leave everything else alone.
        return text[:-1] if text.endswith((".", ";")) else text

    observations = _without_terminator(premise)
    claim = _without_terminator(conclusion)
    # Asked after both demonstrations and after the query, so defined once.
    ask = "Please answer Yes or No, and also explain why. Note that the observations need not necessarily but just very likely to draw the conclusion."
    return (
        "Examples:\n"
        "Given the observations that ProducedAt(Show Y, Time Period Z1), DiedAt(Person X, Time Period Z2), EarlierThan(Time Period Z1, Time Period Z2), can we draw the conlcusion that CanNotAccess(Person X, Show Y)?\n"
        f"{ask}\n"
        "Output: No. Because the premise implies that Show Y was available before Person X died.\n"
        "Given the observations that LivesIn(Person X, Region Z), Inhabits(Animal Y, Region Z), can we draw the conlcusion that CanAccess(Person X, Animal Y)?\n"
        f"{ask}\n"
        "Output: Yes. Because the premise implies that person and animal exist in the same region.\n"
        "\n"
        f"Given the observations that {observations}, can we draw the conlcusion that {claim}?\n"
        f"{ask}\n"
        "Output:\n"
    )

In [19]:
def get_accessibility_verbalized_critic_input_v5_Think(premise, conclusion):
    """Build the "can we draw the conclusion" prompt with a one-sentence thought step.

    The conclusion's final period is dropped before the template appends "?".
    Only an actual trailing "." is removed; the previous unconditional
    `conclusion[:-1]` also cut the last letter off a conclusion that had no
    final period.

    The prompt wording (including the "conlcusion" spelling) is kept verbatim.

    Args:
        premise: premise text, inserted verbatim.
        conclusion: conclusion text, with or without a final period.

    Returns:
        The complete prompt string, ending with an open "Thought:" line.
    """
    claim = conclusion[:-1] if conclusion.endswith(".") else conclusion
    # Asked after both demonstrations and after the query, so defined once.
    ask = "Please first briefly explain your thought process in one sentence, and then answer Yes or No. Note that the observations need not necessarily but just very likely to draw the conclusion."
    return f'''Examples:
Given the observations that Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, can we draw the conlcusion that Person X had no chance to access Show Y?
{ask}
Thought: If Show Y was produced in Time Period Z1, and Person X died in a later Time Period Z2, then logically, Person X lived during the time when Show Y was available, suggesting a possibility of access.
Answer: Based on this thought process, the answer is No.
Given the observations that Person X lives in Region Z and Animal Y inhabits the same Region Z, can we draw the conlcusion that Person X can access Animal Y?
{ask}
Thought: If Person X lives in Region Z and Animal Y also inhabits the same Region Z, then Person X is in the same geographical area as Animal Y, which makes access feasible.
Answer: Based on this thought process, the answer is Yes.

Given the observations that {premise}, can we draw the conlcusion that {claim}?
{ask}
Thought:
'''

def get_accessibility_verbalized_critic_input_v5_Think_v2(premise, conclusion):
    """Build the "can we draw the conclusion" prompt asking for three thoughts and a majority vote.

    The conclusion's final period is dropped before the template appends "?".
    Only an actual trailing "." is removed; the previous unconditional
    `conclusion[:-1]` also cut the last letter off a conclusion that had no
    final period.

    The prompt wording (including the "conlcusion" spelling) is kept verbatim.

    Args:
        premise: premise text, inserted verbatim.
        conclusion: conclusion text, with or without a final period.

    Returns:
        The complete prompt string, ending with an open "Thought 1:" line.
    """
    claim = conclusion[:-1] if conclusion.endswith(".") else conclusion
    # Follows the demonstration question and the query question, so defined once.
    ask = "Please first generate three different sentences to respectively explain your three thought processes briefly, and then based on the corresponding thought to answer Yes or No. Finally, output the final answer according to majority voting."
    return f'''Examples:
Given the observations that Show Y was produced at Time Period Z1, Person X died at a Time Period Z2, and Time Period Z1 is earlier than Time Period Z2, can we draw the conlcusion that Person X had no chance to access Show Y? Note that the observations need not necessarily but just very likely to draw the conclusion.
{ask}
Thought 1: Since Show Y was produced in Time Period Z1 and Person X died in Time Period Z2, with Z1 being earlier than Z2, it suggests that Show Y existed during Person X's lifetime.
Thought 2: The chronological order of Show Y's production and Person X's death implies that there was a period when Person X was alive and Show Y was available, allowing for potential access.
Thought 3: Assuming no other barriers, the fact that Show Y was produced before Person X died means there was a possibility for Person X to have accessed Show Y.
Answer 1: Based on the Thought 1, the answer is No.
Answer 2: Based on the Thought 2, the answer is No.
Answer 3: Based on the Thought 3, the answer is No.
Final Answer: The answer is No.

Given the observations that {premise}, can we draw the conlcusion that {claim}? Note that the observations need not necessarily but just very likely to draw the conclusion.
{ask}
Thought 1:
'''

In [20]:
import csv
def get_human_result(result_file):
    """Group one column of a human-annotation CSV by entry id.

    The first row is treated as a header and skipped. For every following row,
    column 27 is used as the entry id and column 32 as the annotation value.
    The number of distinct entry ids is printed before returning.

    Args:
        result_file: path of the CSV file to read.

    Returns:
        dict mapping each entry id to the list of its annotation values, in
        file order.

    Raises:
        IndexError: if a data row has fewer than 33 columns.
    """
    entry_id_col = 27
    annotation_col = 32

    annotation_dict_correct = {}
    # newline="" is what the csv module documents for reader input; without it
    # line breaks embedded in quoted fields are newline-translated.
    with open(result_file, newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            annotation_dict_correct.setdefault(row[entry_id_col], []).append(row[annotation_col])
    print(len(annotation_dict_correct))
    return annotation_dict_correct

In [54]:
import random
from tqdm import tqdm
import os

def _negate_verbalized_conclusion(conclusion):
    """Return `conclusion` with its polarity flipped.

    Tried in order: "cannot" -> "can"; "can" -> "cannot"; remove a standalone
    "not"; otherwise insert "not" two tokens after the first X/Y variable
    token. Matching is on whole words, so words that merely contain "can" or
    "not" (e.g. "American", "nothing") are no longer rewritten.

    Raises:
        ValueError: if no rewrite applies and none of the tokens "X", "Y" or
            "Y." is present.
    """
    import re  # function-scope import keeps this cell self-contained

    if re.search(r"\bcannot\b", conclusion):
        return re.sub(r"\bcannot\b", "can", conclusion)
    if re.search(r"\bcan\b", conclusion):
        return re.sub(r"\bcan\b", "cannot", conclusion)
    if re.search(r"\bnot\b", conclusion):
        # Remove "not" together with one neighbouring space: the leading one
        # where there is one, otherwise the trailing one.
        return re.sub(r"\bnot ", "", re.sub(r" not\b", "", conclusion))

    conc_tokens = conclusion.split()
    variable_positions = [conc_tokens.index(v) for v in ("X", "Y") if v in conc_tokens]
    if variable_positions:
        person_index = min(variable_positions)
    else:
        print(conc_tokens)
        person_index = conc_tokens.index("Y.")
    # "+ 2" skips the variable token and the word right after it.
    insert_at = person_index + 2
    return " ".join(conc_tokens[:insert_at] + ['not'] + conc_tokens[insert_at:])


def probing_both_valid_invalid_rules(rules, model='GPT-4', type="", version="", think_v="", max_tokens=400):
    """Probe a model with each verbalized rule and with its polarity-flipped counterpart.

    Up to 150 rules are selected by shuffling the indices with fixed seed 234.
    For each one, a prompt is built for the original rule and for the rule
    with a negated conclusion, both are sent to the model at temperature 0,
    and the rule is labelled 1 only when the answer to the original contains
    the positive word alone and the answer to the flipped rule contains the
    negative word alone. Labels are merged into
    `ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json`,
    keyed by `think_v` + the stripped `s_rule`, and the file is rewritten at
    the end.

    NOTE(review): every response must contain "Final Answer:". Of the prompt
    templates visible in this part of the notebook only the `_Think_v2` ones
    demonstrate such a line, so other `think_v` values will likely trip the
    assertion. Confirm this is intended.

    Args:
        rules: sequence of dicts with keys 'v_rule' ("If ..., then ..."),
            's_rule' and 'domain'.
        model: 'GPT-4' / 'GPT-4-preview' use get_GPT4_response with its default
            model, 'GPT-3.5' uses it with "gpt-3.5-turbo-0613", anything else
            uses get_davinci3_response.
        type: free-form label, only echoed in the final print.
        version: prompt family: "", "_v2", "_v3", "_v4" or "_v5". "" and "_v2"
            builders take the whole rule; the others take (premise, conclusion).
        think_v: suffix appended to the prompt-builder name and used as the
            label-key prefix.
        max_tokens: completion budget passed to the model call.

    Raises:
        NameError: if no prompt builder with the derived name is defined.
        AssertionError: if a rule is not "If ..., then ..." or a response has
            no "Final Answer:".
    """
    import json  # function-scope so this cell does not depend on another cell's import

    annotation_path = f'ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json'
    if os.path.exists(annotation_path):
        with open(annotation_path, 'r') as r_f:
            label_list = json.load(r_f)
    else:
        label_list = {}

    random.seed(234)
    # Slicing is a no-op below 150 rules, matching the former explicit length check.
    selected_index = random.sample(list(range(len(rules))), len(rules))[:150]
    print(len(selected_index))

    # (word expected for the original rule, word expected for the flipped rule)
    answer_words = {
        "": ("True", "False"),
        "_v2": ("Right", "Wrong"),
        "_v3": ("Yes", "No"),
        "_v4": ("Yes", "No"),
        "_v5": ("Yes", "No"),
    }
    whole_rule_prompt = version == "" or version == "_v2"

    acc_num = 0
    unnecessary_num = 0  # never updated; kept so the summary print is unchanged
    for i in tqdm(selected_index):
        each_rule = rules[i]['v_rule'].strip()

        # Turn the rule into its counterpart with a negated conclusion.
        assert each_rule[:3] == "If " and ", then " in each_rule
        premise, original_conclusion = each_rule.split(", then ")
        conclusion = _negate_verbalized_conclusion(original_conclusion)
        edit_rule = premise + ", then " + conclusion

        # Any domain other than affordance/accessibility uses the location prompts.
        domain = rules[i]['domain']
        prefix = domain if domain in ("affordance", "accessibility") else "location"
        builder_name = f"get_{prefix}_verbalized_critic_input{version}{think_v}"
        try:
            build_prompt = globals()[builder_name]
        except KeyError:
            raise NameError(f"name '{builder_name}' is not defined") from None

        if whole_rule_prompt:
            probing_input = build_prompt(each_rule)
            valid_probing_input = build_prompt(edit_rule)
        else:
            # premise[3:] drops the leading "If ".
            probing_input = build_prompt(premise[3:], original_conclusion)
            valid_probing_input = build_prompt(premise[3:], conclusion)

        if model == 'GPT-4' or model == 'GPT-4-preview':
            response = get_GPT4_response(probing_input, max_tokens=max_tokens, temp=0)
            valid_response = get_GPT4_response(valid_probing_input, max_tokens=max_tokens, temp=0)
        elif model == 'GPT-3.5':
            response = get_GPT4_response(probing_input, max_tokens=max_tokens, temp=0, model="gpt-3.5-turbo-0613")
            valid_response = get_GPT4_response(valid_probing_input, max_tokens=max_tokens, temp=0, model="gpt-3.5-turbo-0613")
        else:
            response = get_davinci3_response(probing_input, max_tokens=max_tokens, temp=0)
            valid_response = get_davinci3_response(valid_probing_input, max_tokens=max_tokens, temp=0)

        # Only the text after the last "Final Answer:" is scored.
        assert "Final Answer:" in response and "Final Answer:" in valid_response, "Error in response"
        response = response.split("Final Answer:")[-1].strip()
        valid_response = valid_response.split("Final Answer:")[-1].strip()

        if version in answer_words:
            positive, negative = answer_words[version]
            passed = (
                positive in response and negative not in response
                and negative in valid_response and positive not in valid_response
            )
            if passed:
                acc_num += 1
            label_list[f"{think_v}{rules[i]['s_rule'].strip()}"] = 1 if passed else 0

    print(acc_num, unnecessary_num)
    with open(annotation_path, 'w') as w_f:
        json.dump(label_list, w_f, indent=1)
    # Guard against an empty selection instead of raising ZeroDivisionError.
    accuracy = acc_num / len(selected_index) if selected_index else 0.0
    print(model, type, acc_num, accuracy)

In [200]:
import random
from tqdm import tqdm
import os

def probing_both_valid_invalid_symbolic_rules(rules, model='GPT-4', type="", version="", rule_type="symbolic", max_tokens=50):
    """Probe a model with each symbolic rule and with its polarity-flipped counterpart.

    Up to 150 rules are selected by shuffling the indices with fixed seed 234.
    Rules already present in the annotation file (key "Sym " + stripped
    `s_rule`) reuse the stored label. Otherwise a prompt is built for the
    original rule and for the rule with a negated head, both are sent to the
    model at temperature 0, and the rule is labelled 1 only when the answer to
    the original contains the positive word alone and the answer to the flipped
    rule contains the negative word alone. Labels are merged into
    `ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json`,
    which is rewritten at the end.

    NOTE(review): the head is negated by plain substring replacement of
    "CanNot"/"Cannot"/"Can"/"Not"/"not ", so any other occurrence of those
    substrings in the head or its arguments is rewritten too. Verify against
    the predicate vocabulary.

    Args:
        rules: sequence of dicts with keys 's_rule' ("Head:- Body;") and 'domain'.
        model: 'GPT-4' uses get_GPT4_response with its default model, 'GPT-3.5'
            uses it with "gpt-3.5-turbo-0613", anything else uses
            get_davinci3_response.
        type: free-form label, only echoed in the final print.
        version: prompt family: "", "_v2", "_v3", "_v4" or "_v5". "" and "_v2"
            builders take the whole rule; the others take (premise, conclusion),
            each with a "." appended.
        rule_type: middle part of the prompt-builder name.
        max_tokens: completion budget passed to the model call (was a
            hard-coded 50).

    Raises:
        NameError: if no prompt builder with the derived name is defined.
        AssertionError: if a rule does not end with ";".
    """
    import json  # function-scope so this cell does not depend on another cell's import

    annotation_path = f'ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json'
    if os.path.exists(annotation_path):
        with open(annotation_path, 'r') as r_f:
            label_list = json.load(r_f)
    else:
        label_list = {}

    random.seed(234)
    # Slicing is a no-op below 150 rules, matching the former explicit length check.
    selected_index = random.sample(list(range(len(rules))), len(rules))[:150]
    print(len(selected_index))

    # (word expected for the original rule, word expected for the flipped rule)
    answer_words = {
        "": ("True", "False"),
        "_v2": ("Right", "Wrong"),
        "_v3": ("Yes", "No"),
        "_v4": ("Yes", "No"),
        "_v5": ("Yes", "No"),
    }
    whole_rule_prompt = version == "" or version == "_v2"

    acc_num = 0
    for i in tqdm(selected_index):
        label_key = "Sym " + rules[i]['s_rule'].strip()
        if label_key in label_list:
            # Already annotated in a previous run: reuse the stored label.
            acc_num += label_list[label_key]
            continue

        each_rule = rules[i]['s_rule'].strip()
        # Turn the rule into its counterpart with a negated head.
        assert each_rule[-1] == ";"
        original_conclusion, premise = each_rule[:-1].split(":- ")

        conclusion = original_conclusion
        if "CanNot" in conclusion:
            conclusion = conclusion.replace("CanNot", "Can")
        elif "Cannot" in conclusion:
            conclusion = conclusion.replace("Cannot", "Can")
        elif "Can" in conclusion:
            conclusion = conclusion.replace("Can", "CanNot")
        elif "Not" in conclusion or "not" in conclusion:
            conclusion = conclusion.replace("Not", "").replace("not ", "")
        else:
            conclusion = "Not" + conclusion
        edit_rule = conclusion + ":- " + premise + ";"

        # Any domain other than affordance/accessibility uses the location prompts.
        domain = rules[i]['domain']
        prefix = domain if domain in ("affordance", "accessibility") else "location"
        builder_name = f"get_{prefix}_{rule_type}_critic_input{version}"
        try:
            build_prompt = globals()[builder_name]
        except KeyError:
            raise NameError(f"name '{builder_name}' is not defined") from None

        if whole_rule_prompt:
            probing_input = build_prompt(each_rule)
            valid_probing_input = build_prompt(edit_rule)
        else:
            probing_input = build_prompt(premise+".", original_conclusion+".")
            valid_probing_input = build_prompt(premise+".", conclusion+".")

        if model == 'GPT-4':
            response = get_GPT4_response(probing_input, max_tokens=max_tokens, temp=0)
            valid_response = get_GPT4_response(valid_probing_input, max_tokens=max_tokens, temp=0)
        elif model == 'GPT-3.5':
            response = get_GPT4_response(probing_input, max_tokens=max_tokens, temp=0, model="gpt-3.5-turbo-0613")
            valid_response = get_GPT4_response(valid_probing_input, max_tokens=max_tokens, temp=0, model="gpt-3.5-turbo-0613")
        else:
            response = get_davinci3_response(probing_input, max_tokens=max_tokens, temp=0)
            valid_response = get_davinci3_response(valid_probing_input, max_tokens=max_tokens, temp=0)

        if version in answer_words:
            positive, negative = answer_words[version]
            passed = (
                positive in response and negative not in response
                and negative in valid_response and positive not in valid_response
            )
            if passed:
                acc_num += 1
            label_list[label_key] = 1 if passed else 0

    with open(annotation_path, 'w') as w_f:
        json.dump(label_list, w_f, indent=1)
    # Guard against an empty selection instead of raising ZeroDivisionError.
    accuracy = acc_num / len(selected_index) if selected_index else 0.0
    print(model, type, acc_num, accuracy)

In [None]:
from tqdm import tqdm
import random
def probing_valid_rules_human(rules, type=""):
    """Print the fraction of sampled rules that human annotators judged consistently.

    Indices are shuffled with fixed seed 234 and capped at 150. A rule counts
    when its 'label' is truthy, its 'original_turk' equals "2" and its
    'flipped_turk' equals "1".

    Args:
        rules: sequence of dicts with keys 'label', 'original_turk' and
            'flipped_turk'.
        type: free-form label, only echoed in the printed line.

    Raises:
        ZeroDivisionError: if `rules` is empty.
    """
    random.seed(234)
    shuffled = random.sample(list(range(len(rules))), len(rules))
    selected_index = shuffled[:150] if len(rules) >= 150 else shuffled

    # `and` short-circuits, so the turk fields are read only for labelled rules.
    acc_num = sum(
        1
        for i in tqdm(selected_index)
        if rules[i]['label']
        and rules[i]['original_turk'] == "2"
        and rules[i]['flipped_turk'] == "1"
    )
    print("human", type, acc_num/len(selected_index))

#### Probing experiment with high-quality data

In [205]:
from tqdm import tqdm
def compute_overlap(candidate_list, new_sentence, threshold=0.8):
    """Return True when ``new_sentence`` is distinct enough from every candidate.

    Despite the name, the result is a keep/drop flag: ``False`` means the
    sentence overlaps too much with an already accepted candidate.

    Args:
        candidate_list: Iterable of token lists (already lower-cased and split
            by the callers in this notebook).
        new_sentence: Raw sentence; it is lower-cased and whitespace-split here.
        threshold: Overlap ratio above which the sentence is rejected. The
            ratio is checked relative to both the new sentence's token set and
            the candidate's token set.

    Returns:
        ``False`` if, for any candidate, the shared-token count divided by
        either token-set size exceeds ``threshold``; ``True`` otherwise.
        An empty token set contributes a ratio of zero instead of raising
        ZeroDivisionError.
    """
    new_tokens = set(new_sentence.lower().split())
    for candidate in candidate_list:
        # Build the candidate set and the intersection once per candidate.
        candidate_tokens = set(candidate)
        shared = len(candidate_tokens & new_tokens)
        if new_tokens and shared / len(new_tokens) > threshold:
            return False
        if candidate_tokens and shared / len(candidate_tokens) > threshold:
            return False
    return True

# seed = 42
import json
def analysis_length(model='GPT-4', valid=True, positive=False, version=""):
    """Bucket the probing data by rule length (1-4) and probe each bucket.

    Records are read from the high-quality probing data file. Within each
    length bucket a record is kept only if ``compute_overlap`` accepts its
    ``v_rule`` against the token lists already kept for that bucket. Records
    with any other length are ignored. The four bucket sizes are printed.

    Args:
        model: ``"human"`` routes to ``probing_valid_rules_human``; any other
            value is forwarded to ``probing_both_valid_invalid_symbolic_rules``.
        valid: Not used by this function.
        positive: Not used by this function.
        version: Forwarded to the symbolic probing function.
    """
    with open('ScriptData/Primitive/Analysis_data/high_quality_probing_data.json', 'r') as data_file:
        records = json.load(data_file)

    tracked_lengths = (1, 2, 3, 4)
    kept_rules = {n: [] for n in tracked_lengths}
    kept_tokens = {n: [] for n in tracked_lengths}
    for record in records:
        for n in tracked_lengths:
            if record['length'] == n:
                sentence = record['v_rule']
                if compute_overlap(kept_tokens[n], sentence):
                    kept_tokens[n].append(sentence.lower().split())
                    kept_rules[n].append(record)
                break
    print(*(len(kept_rules[n]) for n in tracked_lengths))

    # Buckets are probed from the longest rules down to the shortest.
    longest_first = (4, 3, 2, 1)
    if model == "human":
        for n in longest_first:
            probing_valid_rules_human(kept_rules[n], type=f"Length {n}")
    else:
        for n in longest_first:
            probing_both_valid_invalid_symbolic_rules(
                kept_rules[n], model=model, type=f"Length {n}", version=version
            )

In [153]:
# Length-bucketed analysis using the human-annotation branch.
analysis_length("human")

254 432 195 116


100%|██████████| 116/116 [00:00<00:00, 181747.95it/s]


human Length 4 0.896551724137931


100%|██████████| 150/150 [00:00<00:00, 294130.72it/s]


human Length 3 0.9333333333333333


100%|██████████| 150/150 [00:00<00:00, 251256.23it/s]


human Length 2 0.9866666666666667


100%|██████████| 150/150 [00:00<00:00, 60096.06it/s]

human Length 1 1.0





In [206]:
# Length-bucketed symbolic-rule probing for each model with the default version ("").
analysis_length("GPT-3.5")
analysis_length("GPT-3.5-Instruct")
analysis_length("GPT-4")

# analysis_length("GPT-3.5", version="_v2")
# analysis_length("GPT-3.5", version="_v3")
# analysis_length("GPT-3.5", version="_v4")
# analysis_length("GPT-3.5", version="_v5")

# analysis_length("GPT-4", version="_v2")
# analysis_length("GPT-4", version="_v3")
# analysis_length("GPT-4", version="_v4")
# analysis_length("GPT-4", version="_v5")

# analysis_length("GPT-3.5-Instruct", version="_v2")
# analysis_length("GPT-3.5-Instruct", version="_v3")
# analysis_length("GPT-3.5-Instruct", version="_v4")
# analysis_length("GPT-3.5-Instruct", version="_v5")

254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 323067.24it/s]


GPT-3.5 Length 4 22 0.1896551724137931
150


100%|██████████| 150/150 [00:00<00:00, 181624.02it/s]


GPT-3.5 Length 3 29 0.19333333333333333
150


100%|██████████| 150/150 [00:00<00:00, 287806.77it/s]


GPT-3.5 Length 2 49 0.32666666666666666
150


100%|██████████| 150/150 [00:00<00:00, 139654.96it/s]

GPT-3.5 Length 1 98 0.6533333333333333





254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 152233.81it/s]


GPT-3.5-Instruct Length 4 2 0.017241379310344827
150


100%|██████████| 150/150 [00:00<00:00, 216797.24it/s]


GPT-3.5-Instruct Length 3 7 0.04666666666666667
150


100%|██████████| 150/150 [00:00<00:00, 209785.13it/s]


GPT-3.5-Instruct Length 2 5 0.03333333333333333
150


100%|██████████| 150/150 [00:00<00:00, 182097.13it/s]

GPT-3.5-Instruct Length 1 47 0.31333333333333335





254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 395560.38it/s]


GPT-4 Length 4 69 0.5948275862068966
150


100%|██████████| 150/150 [00:00<00:00, 134663.01it/s]


GPT-4 Length 3 106 0.7066666666666667
150


100%|██████████| 150/150 [00:00<00:00, 266474.21it/s]


GPT-4 Length 2 143 0.9533333333333334
150


100%|██████████| 150/150 [00:00<00:00, 353850.17it/s]

GPT-4 Length 1 128 0.8533333333333334





In [214]:
# Only the GPT-3.5-Instruct runs for versions _v2 to _v5 are active in this cell.
# analysis_length("GPT-3.5", version="_v2")
# analysis_length("GPT-3.5", version="_v3")
# analysis_length("GPT-3.5", version="_v4")
# analysis_length("GPT-3.5", version="_v5")

# analysis_length("GPT-4", version="_v2")
# analysis_length("GPT-4", version="_v3")
# analysis_length("GPT-4", version="_v4")
# analysis_length("GPT-4", version="_v5")

analysis_length("GPT-3.5-Instruct", version="_v2")
analysis_length("GPT-3.5-Instruct", version="_v3")
analysis_length("GPT-3.5-Instruct", version="_v4")
analysis_length("GPT-3.5-Instruct", version="_v5")

254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 136822.07it/s]


GPT-3.5-Instruct Length 4 0 0.0
150


100%|██████████| 150/150 [00:00<00:00, 56771.85it/s]


GPT-3.5-Instruct Length 3 0 0.0
150


100%|██████████| 150/150 [00:00<00:00, 175005.73it/s]


GPT-3.5-Instruct Length 2 1 0.006666666666666667
150


100%|██████████| 150/150 [00:00<00:00, 251256.23it/s]

GPT-3.5-Instruct Length 1 5 0.03333333333333333





254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 192080.25it/s]


GPT-3.5-Instruct Length 4 10 0.08620689655172414
150


100%|██████████| 150/150 [00:00<00:00, 136622.28it/s]


GPT-3.5-Instruct Length 3 13 0.08666666666666667
150


100%|██████████| 150/150 [00:00<00:00, 151528.32it/s]


GPT-3.5-Instruct Length 2 50 0.3333333333333333
150


100%|██████████| 150/150 [00:00<00:00, 172984.77it/s]

GPT-3.5-Instruct Length 1 116 0.7733333333333333





254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 143015.66it/s]


GPT-3.5-Instruct Length 4 1 0.008620689655172414
150


100%|██████████| 150/150 [00:00<00:00, 115736.87it/s]


GPT-3.5-Instruct Length 3 4 0.02666666666666667
150


100%|██████████| 150/150 [00:00<00:00, 457560.44it/s]


GPT-3.5-Instruct Length 2 27 0.18
150


100%|██████████| 150/150 [00:00<00:00, 462607.06it/s]

GPT-3.5-Instruct Length 1 98 0.6533333333333333





254 432 195 116
116


100%|██████████| 116/116 [00:00<00:00, 208994.53it/s]


GPT-3.5-Instruct Length 4 24 0.20689655172413793
150


100%|██████████| 150/150 [00:00<00:00, 187524.77it/s]


GPT-3.5-Instruct Length 3 44 0.29333333333333333
150


100%|██████████| 150/150 [00:00<00:00, 207775.96it/s]


GPT-3.5-Instruct Length 2 73 0.4866666666666667
150


100%|██████████| 150/150 [00:00<00:00, 230794.42it/s]

GPT-3.5-Instruct Length 1 108 0.72





In [14]:
import json
import numpy as np
import os

def get_accuracy(rules, model='GPT-4', version="", think_v=""):
    """Average the stored 0/1 annotations of a sampled subset of ``rules``.

    The annotation file for ``model`` and ``version`` is loaded, the rule
    indices are shuffled with a fixed seed (the same seed and sampling call as
    ``probing_valid_rules_human``), truncated to 150 when at least 150 rules
    are given, and the stored labels of the selected rules are averaged.

    Args:
        rules: Sequence of dict-like records with an ``s_rule`` string.
        model: Annotation file stem.
        version: Suffix of the annotation directory (``""``, ``"_v2"``, ...).
        think_v: Optional prefix prepended to each stripped ``s_rule`` to form
            the annotation key, e.g. ``"Sym "`` for keys stored with that
            prefix. Defaults to no prefix. This keeps the signature compatible
            with the later ``get_accuracy`` definition in this notebook, so
            re-running this cell does not break callers that pass ``think_v``.

    Returns:
        The mean label over the selected rules, or ``nan`` when ``rules`` is
        empty (previously a ZeroDivisionError).

    Raises:
        AssertionError: If the annotation file or a rule's key is missing.
    """
    annotation_path = f'ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json'
    assert os.path.exists(annotation_path), f"annotation file not found: {annotation_path}"
    with open(annotation_path, 'r') as r_f:
        label_list = json.load(r_f)

    # Re-seed the module-level RNG so the subset is reproducible across calls.
    random.seed(234)
    selected_index = random.sample(list(range(len(rules))), len(rules))
    if len(rules) >= 150:
        selected_index = selected_index[:150]
    if not selected_index:
        # Accuracy over zero samples is undefined.
        return float("nan")

    acc_num = 0
    for i in selected_index:
        key = think_v + rules[i]['s_rule'].strip()
        assert key in label_list, f"no annotation stored for rule key: {key!r}"
        acc_num += label_list[key]
    return acc_num / len(selected_index)
    
from tqdm import tqdm
def compute_overlap(candidate_list, new_sentence, threshold=0.8):
    """Return True when ``new_sentence`` is distinct enough from every candidate.

    Despite the name, the result is a keep/drop flag: ``False`` means the
    sentence overlaps too much with an already accepted candidate.

    Args:
        candidate_list: Iterable of token lists (already lower-cased and split
            by the callers in this notebook).
        new_sentence: Raw sentence; it is lower-cased and whitespace-split here.
        threshold: Overlap ratio above which the sentence is rejected. The
            ratio is checked relative to both the new sentence's token set and
            the candidate's token set.

    Returns:
        ``False`` if, for any candidate, the shared-token count divided by
        either token-set size exceeds ``threshold``; ``True`` otherwise.
        An empty token set contributes a ratio of zero instead of raising
        ZeroDivisionError.
    """
    new_tokens = set(new_sentence.lower().split())
    for candidate in candidate_list:
        # Build the candidate set and the intersection once per candidate.
        candidate_tokens = set(candidate)
        shared = len(candidate_tokens & new_tokens)
        if new_tokens and shared / len(new_tokens) > threshold:
            return False
        if candidate_tokens and shared / len(candidate_tokens) > threshold:
            return False
    return True

def analysis_length_alltogether(model='GPT-4', positive=False):
    """Summarise accuracy per rule-length bucket across all prompt versions.

    Records of length 1-4 are bucketed by length and filtered with
    ``compute_overlap`` against the token lists already kept in the same
    bucket. Every other record goes, unfiltered, into a fifth bucket. The five
    bucket sizes are printed.

    For ``model == "human"`` each bucket is passed to
    ``probing_valid_rules_human``. Otherwise ``get_accuracy`` is evaluated for
    every version suffix, and the per-bucket mean and standard deviation
    (rounded to three decimals) are printed as two lists.

    Args:
        model: ``"human"`` or an annotation file stem understood by
            ``get_accuracy``.
        positive: Not used by this function.
    """
    with open('ScriptData/Primitive/Analysis_data/high_quality_probing_data.json', 'r') as data_file:
        records = json.load(data_file)

    prompt_versions = ["", "_v2", "_v3", "_v4", "_v5"]

    filtered_lengths = (1, 2, 3, 4)
    all_buckets = (1, 2, 3, 4, 5)
    rules_by_length = {n: [] for n in all_buckets}
    seen_tokens = {n: [] for n in filtered_lengths}
    for record in records:
        for n in filtered_lengths:
            if record['length'] == n:
                sentence = record['v_rule']
                if compute_overlap(seen_tokens[n], sentence):
                    seen_tokens[n].append(sentence.lower().split())
                    rules_by_length[n].append(record)
                break
        else:
            # No tracked length matched: collect the record as-is.
            rules_by_length[5].append(record)
    print(*(len(rules_by_length[n]) for n in all_buckets))

    labelled_buckets = [(f"Length {n}", rules_by_length[n]) for n in all_buckets]
    if model == "human":
        for label, bucket in labelled_buckets:
            probing_valid_rules_human(bucket, type=label)
    else:
        mean_per_bucket = []
        std_per_bucket = []
        for _label, bucket in labelled_buckets:
            scores = [
                get_accuracy(bucket, model=model, version=suffix)
                for suffix in prompt_versions
            ]
            mean_per_bucket.append(round(sum(scores) / len(scores), 3))
            std_per_bucket.append(round(np.std(scores), 3))
        print(mean_per_bucket)
        print(std_per_bucket)
    

In [15]:
# Per-length mean accuracy and standard deviation over all versions for GPT-4.
analysis_length_alltogether(model='GPT-4')

254 432 195 116 107
[0.904, 0.885, 0.823, 0.716, 0.665]
[0.037, 0.027, 0.029, 0.034, 0.049]


In [16]:
# Same per-length summary for the two GPT-3.5 variants.
analysis_length_alltogether(model='GPT-3.5')
analysis_length_alltogether(model='GPT-3.5-Instruct')

254 432 195 116 107
[0.913, 0.917, 0.715, 0.693, 0.568]
[0.033, 0.037, 0.051, 0.06, 0.051]
254 432 195 116 107
[0.811, 0.732, 0.483, 0.448, 0.321]
[0.092, 0.177, 0.161, 0.176, 0.207]


In [163]:
# seed = 42
import json
def analysis_depth(model='GPT-4', valid=True, positive=False, version=""):
    """Bucket the probing data by ``depth`` (0-3) and probe each bucket.

    Records with any other depth are ignored. The four bucket sizes are
    printed before probing.

    Args:
        model: ``"human"`` routes to ``probing_valid_rules_human`` (deepest
            bucket first); any other value is forwarded to
            ``probing_both_valid_invalid_rules`` (shallowest bucket first).
        valid: Not used by this function.
        positive: Not used by this function.
        version: Forwarded to ``probing_both_valid_invalid_rules``.
    """
    with open('ScriptData/Primitive/Analysis_data/high_quality_probing_data.json', 'r') as data_file:
        records = json.load(data_file)

    tracked_depths = (0, 1, 2, 3)
    rules_by_depth = {d: [] for d in tracked_depths}
    for record in records:
        for d in tracked_depths:
            if record['depth'] == d:
                rules_by_depth[d].append(record)
                break
    print(*(len(rules_by_depth[d]) for d in tracked_depths))

    if model == "human":
        for d in reversed(tracked_depths):
            probing_valid_rules_human(rules_by_depth[d], type=f"Depth {d}")
    else:
        for d in tracked_depths:
            probing_both_valid_invalid_rules(
                rules_by_depth[d], model=model, type=f"Depth {d}", version=version
            )

In [175]:
# Depth-bucketed analysis using the human-annotation branch.
analysis_depth("human")

764 149 114 77


100%|██████████| 77/77 [00:00<00:00, 58956.08it/s]


human Depth 3 0.7792207792207793


100%|██████████| 114/114 [00:00<00:00, 527760.11it/s]


human Depth 2 0.8421052631578947


100%|██████████| 149/149 [00:00<00:00, 289195.42it/s]


human Depth 1 0.912751677852349


100%|██████████| 150/150 [00:00<00:00, 94225.79it/s]

human Depth 0 0.98





In [None]:
# Default-version runs are disabled; only GPT-4 with versions _v2 to _v5 is probed here.
# analysis_depth("GPT-3.5")
# analysis_depth("GPT-4")
# analysis_depth("GPT-3.5-Instruct")

analysis_depth("GPT-4", version="_v2")
analysis_depth("GPT-4", version="_v3")
analysis_depth("GPT-4", version="_v4")
analysis_depth("GPT-4", version="_v5")

In [179]:
import json
import numpy as np

def get_accuracy(rules, model='GPT-4', version="", think_v=""):
    """Average the stored 0/1 annotations of a sampled subset of ``rules``.

    The annotation file for ``model`` and ``version`` is loaded, the rule
    indices are shuffled with a fixed seed, truncated to 150 when at least 150
    rules are given, and the stored labels of the selected rules are averaged.

    Args:
        rules: Sequence of dict-like records with an ``s_rule`` string.
        model: Annotation file stem.
        version: Suffix of the annotation directory (``""``, ``"_v2"``, ...).
        think_v: Optional prefix prepended to each stripped ``s_rule`` to form
            the annotation key. Defaults to no prefix. Accepting it keeps this
            definition interchangeable with the other ``get_accuracy``
            definition in this notebook that callers invoke with ``think_v=``.

    Returns:
        The mean label over the selected rules, or ``nan`` when ``rules`` is
        empty (previously a ZeroDivisionError).

    Raises:
        AssertionError: If the annotation file or a rule's key is missing.
    """
    annotation_path = f'ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json'
    assert os.path.exists(annotation_path), f"annotation file not found: {annotation_path}"
    with open(annotation_path, 'r') as r_f:
        label_list = json.load(r_f)

    # Re-seed the module-level RNG so the subset is reproducible across calls.
    random.seed(234)
    selected_index = random.sample(list(range(len(rules))), len(rules))
    if len(rules) >= 150:
        selected_index = selected_index[:150]
    if not selected_index:
        # Accuracy over zero samples is undefined.
        return float("nan")

    acc_num = 0
    for i in selected_index:
        key = think_v + rules[i]['s_rule'].strip()
        assert key in label_list, f"no annotation stored for rule key: {key!r}"
        acc_num += label_list[key]
    return acc_num / len(selected_index)
    

def analysis_depth_alltogether(model='GPT-4', positive=False):
    """Summarise accuracy per depth bucket (0-3) across all prompt versions.

    Records are bucketed by ``depth``; other depths are ignored. The four
    bucket sizes are printed.

    For ``model == "human"`` each bucket is passed to
    ``probing_valid_rules_human``, deepest first. Otherwise ``get_accuracy``
    is evaluated for every version suffix, and the per-bucket mean and
    standard deviation (rounded to three decimals) are printed as two lists
    ordered from depth 0 to depth 3.

    Args:
        model: ``"human"`` or an annotation file stem understood by
            ``get_accuracy``.
        positive: Not used by this function.
    """
    with open('ScriptData/Primitive/Analysis_data/high_quality_probing_data.json', 'r') as data_file:
        records = json.load(data_file)

    prompt_versions = ["", "_v2", "_v3", "_v4", "_v5"]

    tracked_depths = (0, 1, 2, 3)
    rules_by_depth = {d: [] for d in tracked_depths}
    for record in records:
        for d in tracked_depths:
            if record['depth'] == d:
                rules_by_depth[d].append(record)
                break
    print(*(len(rules_by_depth[d]) for d in tracked_depths))

    if model == "human":
        for d in reversed(tracked_depths):
            probing_valid_rules_human(rules_by_depth[d], type=f"Depth {d}")
    else:
        mean_per_bucket = []
        std_per_bucket = []
        for d in tracked_depths:
            bucket = rules_by_depth[d]
            scores = [
                get_accuracy(bucket, model=model, version=suffix)
                for suffix in prompt_versions
            ]
            mean_per_bucket.append(round(sum(scores) / len(scores), 3))
            std_per_bucket.append(round(np.std(scores), 3))
        print(mean_per_bucket)
        print(std_per_bucket)
    

In [180]:
# Per-depth mean accuracy and standard deviation over all versions, one call per model.
analysis_depth_alltogether(model='GPT-4')
analysis_depth_alltogether("GPT-3.5")
analysis_depth_alltogether("GPT-3.5-Instruct")

764 149 114 77
[0.915, 0.777, 0.681, 0.665]
[0.02, 0.04, 0.039, 0.052]
764 149 114 77
[0.859, 0.737, 0.686, 0.629]
[0.026, 0.044, 0.055, 0.077]
764 149 114 77
[0.672, 0.514, 0.463, 0.348]
[0.166, 0.16, 0.186, 0.234]


In [55]:
import json
def analysis_detailed_structure(model='GPT-4', positive=True, version = "", think_v="_Think_v2"):
    """Probe all records of one polarity and print a structure/depth breakdown.

    Only records whose ``positive`` field equals ``positive`` are considered.
    Their counts are printed on three lines:

    1. disjunctive records;
    2. transitive records with ``length > 1``, then those at depth 0, 1 and 2,
       then the total number of polarity-matched records;
    3. transitive-disjunctive records, then those at depth 1, 2 and 3.

    The breakdown is informational only: the probing call always receives the
    full polarity-matched list.

    Args:
        model: ``"human"`` routes to ``probing_valid_rules_human``; any other
            value is forwarded to ``probing_both_valid_invalid_rules``.
        positive: Polarity to select; also passed on as the ``type`` tag.
        version: Forwarded to ``probing_both_valid_invalid_rules``.
        think_v: Forwarded to ``probing_both_valid_invalid_rules``.
    """
    with open('ScriptData/Primitive/Analysis_data/high_quality_probing_data.json', 'r') as data_file:
        records = json.load(data_file)

    polarity_matched = []
    disjunctive = []
    transitive_multi = []
    transitive_depth_0, transitive_depth_1, transitive_depth_2 = [], [], []
    joint = []
    joint_depth_1, joint_depth_2, joint_depth_3 = [], [], []

    for record in records:
        if record['positive'] != positive:
            continue
        polarity_matched.append(record)

        structure = record['structure']
        if structure == "disjunctive":
            disjunctive.append(record)
        elif structure == "transitive" and record['length'] > 1:
            transitive_multi.append(record)
            if record['depth'] == 0:
                transitive_depth_0.append(record)
            elif 1 <= record['depth'] <= 3:
                # Depth 3 is deliberately not tracked in its own list.
                if record['depth'] == 1:
                    transitive_depth_1.append(record)
                elif record['depth'] == 2:
                    transitive_depth_2.append(record)
        elif structure == "transitive-disjunctive":
            joint.append(record)
            if record['depth'] == 1:
                joint_depth_1.append(record)
            elif record['depth'] == 2:
                joint_depth_2.append(record)
            elif record['depth'] == 3:
                joint_depth_3.append(record)

    print(len(disjunctive))
    print(
        len(transitive_multi),
        len(transitive_depth_0),
        len(transitive_depth_1),
        len(transitive_depth_2),
        len(polarity_matched),
    )
    print(len(joint), len(joint_depth_1), len(joint_depth_2), len(joint_depth_3))

    if model == "human":
        probing_valid_rules_human(polarity_matched, type=positive)
    else:
        probing_both_valid_invalid_rules(
            polarity_matched, model=model, type=positive, version=version, think_v=think_v
        )

In [89]:
# Human-annotation branch on the records whose `positive` field is False.
analysis_detailed_structure("human", positive=False)

181
83 51 17 15 373
109 49 34 26


100%|██████████| 150/150 [00:00<00:00, 660867.23it/s]

human False 0.9333333333333333





In [58]:
# Default version; note the polarity differs between the first call and the other two.
analysis_detailed_structure("GPT-3.5", positive=True)
analysis_detailed_structure("GPT-4", positive=False)
analysis_detailed_structure("GPT-3.5-Instruct", positive=False)

181
83 51 17 15 438
109 49 34 26
150


  0%|          | 0/150 [00:00<?, ?it/s]

100%|██████████| 150/150 [50:46<00:00, 20.31s/it]

105 0
GPT-4 False 105 0.7





In [57]:
# Versions _v2 to _v5: positive records for GPT-3.5 and GPT-4, negative records for GPT-3.5-Instruct.
analysis_detailed_structure("GPT-3.5", version="_v2", positive=True)
analysis_detailed_structure("GPT-3.5", version="_v3", positive=True)
analysis_detailed_structure("GPT-3.5", version="_v4", positive=True)
analysis_detailed_structure("GPT-3.5", version="_v5", positive=True)

analysis_detailed_structure("GPT-4", version="_v2", positive=True)
analysis_detailed_structure("GPT-4", version="_v3", positive=True)
analysis_detailed_structure("GPT-4", version="_v4", positive=True)
analysis_detailed_structure("GPT-4", version="_v5", positive=True)

analysis_detailed_structure("GPT-3.5-Instruct", version="_v2", positive=False)
analysis_detailed_structure("GPT-3.5-Instruct", version="_v3", positive=False)
analysis_detailed_structure("GPT-3.5-Instruct", version="_v4", positive=False)
analysis_detailed_structure("GPT-3.5-Instruct", version="_v5", positive=False)


137
206 141 35 20 666
134 48 45 41
150


  0%|          | 0/150 [00:00<?, ?it/s]

100%|██████████| 150/150 [58:53<00:00, 23.56s/it] 


97 0
GPT-4 True 97 0.6466666666666666
137
206 141 35 20 666
134 48 45 41
150


100%|██████████| 150/150 [1:01:06<00:00, 24.44s/it]


123 0
GPT-4 True 123 0.82
137
206 141 35 20 666
134 48 45 41
150


100%|██████████| 150/150 [53:43<00:00, 21.49s/it] 


107 0
GPT-4 True 107 0.7133333333333334
137
206 141 35 20 666
134 48 45 41
150


100%|██████████| 150/150 [52:11<00:00, 20.87s/it]

114 0
GPT-4 True 114 0.76





In [59]:
# GPT-4 on the negative records for versions _v2 to _v5.
analysis_detailed_structure("GPT-4", version="_v2", positive=False)
analysis_detailed_structure("GPT-4", version="_v3", positive=False)
analysis_detailed_structure("GPT-4", version="_v4", positive=False)
analysis_detailed_structure("GPT-4", version="_v5", positive=False)

181
83 51 17 15 438
109 49 34 26
150


100%|██████████| 150/150 [55:35<00:00, 22.23s/it]


102 0
GPT-4 False 102 0.68
181
83 51 17 15 438
109 49 34 26
150


100%|██████████| 150/150 [48:54<00:00, 19.57s/it]


118 0
GPT-4 False 118 0.7866666666666666
181
83 51 17 15 438
109 49 34 26
150


100%|██████████| 150/150 [43:35<00:00, 17.44s/it]


111 0
GPT-4 False 111 0.74
181
83 51 17 15 438
109 49 34 26
150


100%|██████████| 150/150 [43:05<00:00, 17.24s/it]

113 0
GPT-4 False 113 0.7533333333333333





In [72]:
import json
import numpy as np

def get_accuracy(rules, model='GPT-4', version="", think_v=""):
    """Average the stored 0/1 annotations of a sampled subset of ``rules``.

    The annotation file for ``model`` and ``version`` is loaded, the rule
    indices are shuffled with a fixed seed, truncated to 150 when at least 150
    rules are given, and the stored labels of the selected rules are averaged.

    Args:
        rules: Sequence of dict-like records with an ``s_rule`` string.
        model: Annotation file stem.
        version: Suffix of the annotation directory (``""``, ``"_v2"``, ...).
        think_v: Prefix prepended to each stripped ``s_rule`` to form the
            annotation key. Defaults to no prefix.

    Returns:
        The mean label over the selected rules, or ``nan`` when ``rules`` is
        empty (previously a ZeroDivisionError).

    Raises:
        AssertionError: If the annotation file or a rule's key is missing.
    """
    annotation_path = f'ScriptData/Primitive/Analysis_data/annotations{version}/{model}.json'
    assert os.path.exists(annotation_path), f"annotation file not found: {annotation_path}"
    with open(annotation_path, 'r') as r_f:
        label_list = json.load(r_f)

    # Re-seed the module-level RNG so the subset is reproducible across calls.
    random.seed(234)
    selected_index = random.sample(list(range(len(rules))), len(rules))
    if len(rules) >= 150:
        selected_index = selected_index[:150]
    if not selected_index:
        # Accuracy over zero samples is undefined.
        return float("nan")

    acc_num = 0
    for i in selected_index:
        key = think_v + rules[i]['s_rule'].strip()
        assert key in label_list, f"no annotation stored for rule key: {key!r}"
        acc_num += label_list[key]
    return acc_num / len(selected_index)
    

def analysis_structure_alltogether(model='GPT-4', positive=True, think_v=""):
    """Print the mean and spread of accuracy over all versions for one polarity.

    All records whose ``positive`` field equals ``positive`` are selected and
    scored with ``get_accuracy`` once per version suffix. Two single-element
    lists are printed: the mean accuracy and its standard deviation, each
    rounded to three decimals.

    Only the ``positive`` field of each record is read here; the per-structure
    and per-depth buckets that used to be built alongside were never consumed
    and have been dropped.

    Args:
        model: Annotation file stem understood by ``get_accuracy``.
        positive: Polarity of the records to score.
        think_v: Key prefix forwarded to ``get_accuracy``.
    """
    all_data_file = 'ScriptData/Primitive/Analysis_data/high_quality_probing_data.json'
    with open(all_data_file, 'r') as r_f:
        all_data = json.load(r_f)

    versions = ["", "_v2", "_v3", "_v4", "_v5"]
    rules = [record for record in all_data if record['positive'] == positive]

    all_acc = [
        get_accuracy(rules, model=model, version=each_v, think_v=think_v)
        for each_v in versions
    ]
    avg_acc = sum(all_acc) / len(all_acc)
    deviation = np.std(all_acc)

    # Kept as one-element lists to preserve the printed format.
    print([round(avg_acc, 3)])
    print([round(deviation, 3)])
    

In [74]:
# Accuracy summary on positive records (the default polarity), one call per model.
analysis_structure_alltogether("GPT-4")
analysis_structure_alltogether("GPT-3.5")
analysis_structure_alltogether("GPT-3.5-Instruct")

[0.851]
[0.041]
[0.809]
[0.033]
[0.661]
[0.174]


In [76]:
# Accuracy summary on negative records, one call per model.
analysis_structure_alltogether("GPT-4", positive=False)
analysis_structure_alltogether("GPT-3.5", positive=False)
analysis_structure_alltogether("GPT-3.5-Instruct", positive=False)

[0.787]
[0.035]
[0.808]
[0.037]
[0.539]
[0.193]


In [69]:
# Repeats the negative-record summary calls from the preceding cell.
analysis_structure_alltogether("GPT-4", positive=False)
analysis_structure_alltogether("GPT-3.5", positive=False)
analysis_structure_alltogether("GPT-3.5-Instruct", positive=False)

[0.787]
[0.035]
