## Imports and chains

In [1]:
from langchain.llms import OpenAI 
from langchain.prompts import PromptTemplate 
from langchain.chains.llm import LLMChain
from cot import Collection
import json
from langchain.chat_models import ChatOpenAI
# from dataloader import to_Collection

In [2]:
"""CoT Chain"""

llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo") #ADA #for chat: gpt-3.5-turbo
#llm = ChatOpenAI(temperature=.0,model_name="gpt-4") 
#llm = OpenAI(temperature=.0,model_name="text-davinci-003") 
"""answer extraction chain"""

extraction_template = """{instruction}

Question: {question}
Answer_choices: {answer_choices}

Cot: {cot_trigger}{cot}
{answer_extraction}
"""

prompt_template = PromptTemplate(input_variables=["instruction","question","answer_choices","cot_trigger","cot","answer_extraction"], template=extraction_template)
answer_chain = LLMChain(llm=llm, prompt=prompt_template,output_key="predicted_answer")

In [3]:
reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)


In [None]:
"""Extract script: Assumes there are CoTs in the dataset already"""
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","open_book_qa","worldtree"])


input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely to be correct for this question. Therefore, among A through D, the answer is" 
}

extract = ts_hard.extract_flexible(chain=answer_chain,input_dict=input_dict)

In [5]:
test = Collection.to_Collection(extract,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [None]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa","worldtree"])
extract = ts_hard.extract_flexible(chain=answer_chain,input_dict=input_dict)

In [7]:
test = Collection.to_Collection(extract,"open_book_qa",'test','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/21 [00:00<?, ?ex/s]

In [39]:
"""Da Vinci"""

llm = OpenAI(temperature=.0,model_name="text-davinci-003")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)


In [46]:
#llm = ChatOpenAI(temperature=.0,model_name="gpt-4")   #text-davinci-003
llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)



In [45]:
instruction = "Answer the following question through step-by-step reasoning."
question = "Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of",
answer_choices = [
                    "competition",
                    "conservation",
                    "decomposition",
                    "pollution"
                ]
cot_trigger = "Answer: Let's think step by step."
cot = "Aggression is needed to defend something, one needs to defend their spot when they are in competition"
answer_extraction = "Therefore, the answer is"
answer = "competition"
reflection_prompt = "do you agree with the cot yes or no"
reflection = "Great reasoning mate!"
reflect_answer_extraction = "Based on the text above the answer is:"

In [29]:
print(reflect_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt))


    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step.Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    


In [37]:
print(ans_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt,
                                     reflection=reflection,reflect_answer_extraction=reflect_answer_extraction))



    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step. Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    Great reasoning mate!

    Based on the text above the answer is:
    


In [7]:
coll = Collection.load_thoughtsource_100(names='strategy_qa',load_pregenerated_cots=True) #random_sample=False?
coll = coll.select(split="all", number_samples=1)
coll.select_generated_cots(cot_trigger = "kojima-01", api_service='cohere') #have one

In [11]:
coll

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |       1 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

## Run reflection

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Do you have any reason to believe that the reasoning or the answer might be wrong? Answer with one word Yes or No",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)


In [None]:
#These prompts only let the model say no reason to believe the answer is wrong

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The goal is to correct the Answer if needed, let's think step by step",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-3.5-turbo"
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [27]:
#sloppy metareasoning leads to a performance decrease

  0%|          | 0/20 [00:00<?, ?ex/s]

In [None]:
"""Use TS_hard dataset"""
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)

In [None]:
"""2/11 correct"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
""".36 corrected but a bit misleading""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""18% accuracy""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, let's revise the previous answer step-by-step",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""DaVinci 0.09 accuracy""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"text-davinci-003"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""ChatGPT as language model I have no access to the creator's answer"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be multiple valid but only one will be regarded as correct. Pick the one answer that the creator of the question picked. If you give more than one, you are evaluated as wrong.",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""27% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question. ",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [53]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

In [55]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      11 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [57]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa","worldtree"])
ts_hard

| Name         | Train   | Valid   |   Test |
|--------------|---------|---------|--------|
| open_book_qa | -       | -       |     21 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp', 'worldtree']

In [None]:
"""As an AI language model, I agree with the given answer"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""As an AI language model, I stand corrected., 14% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","open_book_qa","worldtree"])

"""9%"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)



In [70]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [71]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Overall review

In [18]:
from cot import Collection
ts_hard = Collection.from_json('ts_hard_v1.json')

In [None]:
"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question. "

In [None]:
"The goal is to correct the Answer (generated by a language model) if needed, let's think step by step" 
"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model"
"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question."

## First prompt

In [None]:
#check answer extraction
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The goal is to correct the Answer (generated by a language model) if needed, let's think step by step" ,
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"gpt-3.5-turbo"
}
ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard.evaluate()
ts_hard.dump("first_prompt.json")

In [None]:
ts_hard.evaluate()
ts_hard.dump("first_prompt_evaluated.json")

In [17]:
ts_hard.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.142857},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-yes-no': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the an

## Second prompt

In [4]:
ts_hard = Collection.from_json('ts_hard_v1.json')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model" ,
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"gpt-3.5-turbo"
}
ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [6]:
ts_hard.dump("second_prompt.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
ts_hard.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.083333},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.142857},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.090909},
    'text-davinci-003': {'None_kojima-01_kojima-yes-no': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.047619},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}}}

In [11]:
ts_hard.dump("second_evaluated")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
ts_hard_2 = Collection.from_json('ts_hard_v1.json')
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"gpt-3.5-turbo"
}
ts_hard_2.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard_2.dump("third_prompt.json")

In [9]:
ts_hard_2.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.333333},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.428571},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-yes-no': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.238095},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._Therefore, the answer is": 0.05},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}}}

In [10]:
ts_hard_2.dump("third_evaluated")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
#next step: inspect if other ans extraction can boost

In [5]:
from cot.stats import evaluation_as_table
test = Collection.from_json("third_evaluated.json")
evaluation_as_table(test)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


IndexError: list index out of range

In [4]:
# join strings from a list with underscore
def join_strings(list_of_strings):
    if list_of_strings is None:
        list_of_strings = ["None"]
    if isinstance(list_of_strings, str):
        list_of_strings = [list_of_strings]
    joined_string = ""
    for string in list_of_strings:
        joined_string += ("-" + str(string))
    # delete first underscore
    joined_string = joined_string[1:]
    return(joined_string)

In [None]:
coll.dump("thoughtsource_100" + "_" + config['api_service'] + "_" + config['engine'].replace("/", "_") + "_" + join_strings(config["instruction_keys"]) + "_" + join_strings(config["cot_trigger_keys"]) + ".json")

In [6]:
ts_hard = Collection.from_json('ts_hard_v1.json')

In [7]:
ts_hard

| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 12      | -      |
| worldtree      | -       | -       | 7      |
| strategy_qa    | 11      | -       | -      |
| open_book_qa   | -       | -       | 21     |
| med_qa         | -       | -       | 20     |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'medmc_qa', 'pubmed_qa', 'qed', 'svamp']

In [None]:
#Prompt 3 and GPT-4

In [18]:
ts_hard = Collection.from_json('ts_hard_v1.json')

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"gpt-4"
}
ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard.dump("third_prompt_gpt_4.json")

In [22]:
ts_hard.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {'_The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question._Based on the reflection, the answer is therefore': 0.75},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {'_The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question._Based on the reflection, the answer is therefore': 0.857143},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {'_The Answer was given by a language model and may be 

In [23]:
ts_hard.dump("third_evaluated")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Da Vinci on Da Vinci

In [29]:
ts_hard = Collection.from_json('ts_hard_v1.json')

In [None]:
#check answer extraction
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The goal is to correct the Answer (generated by a language model) if needed, let's think step by step" ,
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"gpt-3.5-turbo"
}
ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard.evaluate()
ts_hard.dump("first_prompt_davinci.json")

In [33]:
ts_hard.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer is therefore": 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-yes-no': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_The goal is to correct the Answer (generated by a language model) if needed, let's think step by step_Based on the reflection, the answer 

In [None]:
ts_hard_2 = Collection.from_json('ts_hard_v1.json')
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model" ,
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"text-davinci-003"
}
ts_hard_2.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard_2.evaluate()
ts_hard_2.dump("second_prompt_davinci.json")

In [34]:
ts_hard_2.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {'_The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model_Based on the reflection, the answer is therefore': 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {'_The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model_Based on the reflection, the answer is therefore': 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {'_The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer made by a language model_Based on the reflection, the answer is therefore': 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-yes-no': 0.0}}}},
 'open_b

In [None]:
ts_hard_3 = Collection.from_json('ts_hard_v1.json')
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
    'model_name':"text-davinci-003"
}
ts_hard_3.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
ts_hard_3.evaluate()
ts_hard_3.dump("third_prompt_davinci.json")

In [36]:
ts_hard_3.evaluate()

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/11 [00:00<?, ?ex/s]

  0%|          | 0/21 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {'_The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question._Based on the reflection, the answer is therefore': 0.083333},
    'text-davinci-003': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'': {'_The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question._Based on the reflection, the answer is therefore': 0.0},
    'text-davinci-003': {'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'': {'_The Answer was given by a language model and may be i

## Binary classification

In [15]:
#llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo") #ADA #for chat: gpt-3.5-turbo
#llm = ChatOpenAI(temperature=.0,model_name="gpt-4") 
llm = OpenAI(temperature=.0,model_name="text-davinci-003") 

"""answer extraction chain"""

extraction_template = """{instruction}

Question: {question}
Answer_choices: {answer_choices}

Reasoning: {cot_trigger}{cot}
{answer_extraction}
"""

prompt_template = PromptTemplate(input_variables=["instruction","question","answer_choices","cot_trigger","cot","answer_extraction"], template=extraction_template)
answer_chain = LLMChain(llm=llm, prompt=prompt_template,output_key="predicted_answer")


In [42]:
#{instruction}
llm = OpenAI(temperature=.0,model_name="text-davinci-003") 
reflect_extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflect_answer_extraction}
    """
    #Get reflection
reflect_ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflect_answer_extraction'], template=reflect_extraction_template)
single_reflect_answer_chain = LLMChain(llm=llm, prompt=reflect_ans_prompt_template,output_key="predicted_answer")

In [5]:
ts_hard = Collection.from_json('ts_hard_v1.json')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
ts_hard

| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 4       | -      |
| med_qa         | -       | -       | 1      |
| open_book_qa   | -       | -       | 12     |
| strategy_qa    | 3       | -       | -      |
| worldtree      | -       | -       | 1      |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'medmc_qa', 'pubmed_qa', 'qed', 'svamp']

In [None]:
#connect answer
"""

First: "answer_extraction": "The Reasoning is done by a language model and may contain bias, noise or factual inconsistencies. If the reasoning is correct, answer with 'True'. If there is suspicion that the Reasoning is incorrect, answer with 'False'" 
Second: "answer_extraction": "You are a corrector model that needs to judge whether the provided Reasoning, which was done by a language model, is factually correct. If you think there might be a problem with the Reasoning, state 'False.' If you believe nothing is wrong with the reasoning, state 'True'."
Third: "You will provide a Critique of the Reasoning, highlighting the limitations, inaccuracies, or areas that need improvement or expansion, while providing guidance on how to address these issues in a response. Start with 'False' if the Reasoning leads to the wrong answer, else start with 'True'." 
Fourth: "A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer."
"""

In [17]:
ts_hard = Collection.from_json("ts_hard_chatgpt.json")

"""Extract script: Assumes there are CoTs in the dataset already"""

input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer."
}

ts_hard.extract_flexible(chain=answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...
Generating strategy_qa...
Generating worldtree...


In [18]:
ts_hard.dump("chatgpt_checks_chatgpt_4")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
ts_hard.evaluate()

  0%|          | 0/4 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.25,
     'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.0,
     'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.25,
     'None_kojima-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.0,
     'None_kojima-01_kojima-yes-no': 0.0}}}},
 'worldtree': {'test': {'accuracy': {'gpt-3.5-turbo': {'N

In [19]:
#fourth
import re
data = [('open_book_qa','test'),('strategy_qa','train'),('commonsense_qa','validation'),('med_qa','test'),('worldtree','test')]

correct = 0
incorrect = 0
undecided = 0
unclassified = 0

for pair in data:
    #df_pandas = .to_pandas()
    for row in ts_hard[pair[0]][pair[1]]:
        text = row['generated_cot'][0]['answers'][1]['answer']
        print(text)
        true = bool(re.search(r"(?i)True", text))
        false = bool(re.search(r"(?i)False", text))
        if true & false:
            undecided +=1 
        elif true:
            correct+=1
        elif false:
            incorrect+=1
        else:
            unclassified+=1

print(f"Correct:{correct}")
print(f"Incorrect:{incorrect}")
print(f"Undecided:{undecided}")
print(f"Unclassified:{unclassified}")

Correct answer: C) annually.

Reasoning: An equinox occurs twice a year, in March and September, when the tilt of the Earth's axis is inclined neither away from nor towards the Sun, resulting in a nearly equal amount of daylight and darkness at all latitudes. Therefore, a person has a chance to experience an equinox once a year.
A better answer would be:

Answer: A large cargo ship or a freight train.

As mentioned in the reasoning, these modes of transportation are designed to carry heavy loads and have a high total weight, making them the transportation modes with the most mass.
Correct answer: A) The color of my hair. 

Reasoning: Over a period of time, the weather can cause changes in the amount of sunlight and UV radiation that a person is exposed to, which can lead to changes in hair color. This is especially true for people who spend a lot of time outdoors, as the sun can bleach or darken hair over time. While the weather can also affect the performance of electronic devices lik

In [79]:
ts_hard = Collection.from_json("../notebooks/chatgpt_checks_chatgpt_1.json")

In [80]:
#1st
import re
data = [('open_book_qa','test'),('strategy_qa','train'),('commonsense_qa','validation'),('med_qa','test'),('worldtree','test')]

correct = 0
incorrect = 0
undecided = 0
unclassified = 0

for pair in data:
    #df_pandas = .to_pandas()
    for row in ts_hard[pair[0]][pair[1]]:
        text = row['generated_cot'][0]['answers'][1]['answer']
        print(text)
        true = bool(re.search(r"(?i)True", text))
        false = bool(re.search(r"(?i)False", text))
        if true & false:
            undecided +=1 
        elif true:
            correct+=1
        elif false:
            incorrect+=1
        else:
            unclassified+=1

print(f"Correct:{correct}")
print(f"Incorrect:{incorrect}")
print(f"Undecided:{undecided}")
print(f"Unclassified:{unclassified}")

True
True
True
True
True
True
True
True
True
True
True
True
True
False
True.
True
True
True
True
True
False
Correct:19
Incorrect:2
Undecided:0
Unclassified:0


In [78]:
#2nd
import re
data = [('open_book_qa','test'),('strategy_qa','train'),('commonsense_qa','validation'),('med_qa','test'),('worldtree','test')]

correct = 0
incorrect = 0
undecided = 0
unclassified = 0

for pair in data:
    #df_pandas = .to_pandas()
    for row in ts_hard[pair[0]][pair[1]]:
        text = row['generated_cot'][0]['answers'][1]['answer']
        print(text)
        true = bool(re.search(r"(?i)True", text))
        false = bool(re.search(r"(?i)False", text))
        if true & false:
            undecided +=1 
        elif true:
            correct+=1
        elif false:
            incorrect+=1
        else:
            unclassified+=1

print(f"Correct:{correct}")
print(f"Incorrect:{incorrect}")
print(f"Undecided:{undecided}")
print(f"Unclassified:{unclassified}")

True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
True.
False. The Moon cycles through its phases occur once per lunar month, not once per day.
Correct:20
Incorrect:1
Undecided:0
Unclassified:0


In [16]:
#3rd        

True. The reasoning is accurate and correctly identifies that an equinox occurs twice a year, resulting in nearly equal amounts of daylight and darkness at all latitudes. The answer choice C) annually is the correct choice. No limitations, inaccuracies, or areas needing improvement or expansion were identified in the reasoning.
True: The reasoning is accurate in explaining the concept of mass and transportation modes. However, the options provided in the question are limited and do not include all possible transportation modes. The correct answer, as mentioned in the reasoning, would be a large cargo ship or a freight train. To improve the question, the options should be expanded to include all possible transportation modes that could have the most mass. Additionally, the question could be more specific about the type of transportation being referred to, such as passenger transportation or cargo transportation.
True. The reasoning is accurate and correctly eliminates options A, B, and 

## try with double call

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "The answer is therefore", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [25]:
ts_hard.evaluate()

  0%|          | 0/4 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/12 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {'_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer._Based on the reflection, what is the definite answer?': 0.25},
    'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.25,
     'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {'_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer._Based on the reflection, what is the definite answer?': 0.0},
    'gpt-3.5-turbo': {'None_kojima-01_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer.': 0.0,
     'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {'_A corrector model indicates the Reasoning leads to the wrong answer to the question. Provide a better answer._Based

In [26]:
ts_hard.dump("chatgpt_checks_chatgpt_double_4")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
print(ans_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt,
                                     reflection=reflection,reflect_answer_extraction=reflect_answer_extraction))

## Continue

In [43]:
multiple_ans_test = Collection.from_json("clean_chat_hard_multiple_ans.json")

In [37]:
multiple_ans_test

| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 7       | -      |
| med_qa         | -       | -       | 3      |
| open_book_qa   | -       | -       | 4      |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp', 'worldtree']

In [81]:
"""
1:"The above Answer to the Question contains multiple answers, however, only one answer is correct. The best answer is therefore"
2:"The above Answer to the Question contains multiple answers, however, only one answer is correct. Select one answer. The best answer is therefore"
3:"The multiple-choice question only contains one correct answer, giving multiple answers is wrong. From the Answer given, select the one best answer. The best answer is therefore"
"""

'\n1:"The above Answer to the Question contains multiple answers, however, only one answer is correct. The best answer is therefore"\n2:"The above Answer to the Question contains multiple answers, however, only one answer is correct. Select one answer. The best answer is therefore"\n3:"The multiple-choice question only contains one correct answer, giving multiple answers is wrong. From the Answer given, select the one best answer. The best answer is therefore"\n'

In [70]:


"""Extract script: Assumes there are CoTs in the dataset already"""

input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "The answer is therefore",
    'reflect_answer_extraction': "The multiple-choice question only contains one correct answer, giving multiple answers is wrong. From the Answer given, select the one best answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=single_reflect_answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [42]:
#1
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.142857},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.666667},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.25},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [45]:
#2
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.214286},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.5},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.25},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [71]:
#3
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.142857},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.333333},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._The answer is therefore": 0.25},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [74]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][1]['answers'][0]['answer']
        print(text)


either A, B, or C, depending on the situation.
A and D. It is important to carefully read and analyze each answer choice before selecting the best answer.
either A or C, depending on the situation and context. It is important to carefully consider all options and eliminate those that are clearly incorrect before making a final decision.
A) adding salt or C) adding sand. It is important to understand the reasoning behind each option and how it can effectively reduce ice buildup on a sidewalk.
D) Exhaustion.
C) aggravation and E) stress.
A) apartment and E) surface of earth. It is important to carefully read and consider all options before selecting the best answer.
C) instruments, as it directly states that metal is used to make instruments. The other options may involve the use of metal, but they are not directly related to the question of what metal is used to make.
E) hurt feelings and C) resentment.
B) cardboard box or E) container.
either C) forest or D) countryside.
C) Increases l

In [43]:
multiple_ans_test.dump("multiple_ans_test_1")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [44]:
llm

OpenAI(cache=None, verbose=False, callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x7fcd11f25b50>, client=<class 'openai.api_resources.completion.Completion'>, model_name='text-davinci-003', temperature=0.0, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0, n=1, best_of=1, model_kwargs={}, openai_api_key=None, batch_size=20, request_timeout=None, logit_bias={}, max_retries=6, streaming=False)

In [None]:
"""Test with prompt-2 for GPT-4 and Da Vinci"""

In [None]:
"""Code notes:
find right answer extraction from item
"""

In [36]:
input_dict = {
    "instruction": "",
    "api_service": "chat_openai",
    "model": "GPT-4",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "The answer is therefore",
    'reflect_answer_extraction': "The above Answer to the Question contains multiple answers, however, only one answer is correct. Select one answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=single_reflect_answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [37]:
#1
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'GPT-4': {"_Answer: Let's think step by step._The answer is therefore": 0.428571},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'GPT-4': {"_Answer: Let's think step by step._The answer is therefore": 1.0},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'GPT-4': {"_Answer: Let's think step by step._The answer is therefore": 0.75},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [40]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][1]['answers'][0]['answer']
        print(text)

B) melting into sand
A) It is made from rock
C) making an adjustment to the rudder
A) adding salt
D) exhaustion
E) stress.
A) apartment
C) instruments.
C) resentment
E) container.
D) countryside
C) Increases levels of fetal hemoglobin (HgbF).
B) Meningocele
C) Nifedipine.


In [41]:
multiple_ans_test.dump("answer_correction_gpt_4")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
input_dict = {
    "instruction": "",
    "api_service": "openai",
    "model": "text-davinci-003",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "The answer is therefore",
    'reflect_answer_extraction': "The above Answer to the Question contains multiple answers, however, only one answer is correct. Select one answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=single_reflect_answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [46]:
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0},
    'text-davinci-003': {"_Answer: Let's think step by step._The answer is therefore": 0.571429}}}},
 'med_qa': {'test': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0},
    'text-davinci-003': {"_Answer: Let's think step by step._The answer is therefore": 1.0}}}},
 'open_book_qa': {'test': {'accuracy': {'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0},
    'text-davinci-003': {"_Answer: Let's think step by step._The answer is therefore": 0.75}}}}}

In [47]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][1]['answers'][0]['answer']
        print(text)


A) Mimicking other things

A and D.

Answer C: Making an adjustment to the rudder.

A) adding salt

Answer: D) Exhaustion

C) aggravation.

A) apartment.

C) Instruments.

C) Resentment.

B) cardboard box or E) container.

Answer: C) Forest

C) Increases levels of fetal hemoglobin (HgbF)

Answer: B) Meningocele

Answer: C) Nifedipine


## One answer, extraction only

In [2]:
from cot import Collection
multiple_ans_test = Collection.from_json("clean_chat_hard_multiple_ans.json")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
#chatgpt

input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [8]:
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.571429},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.666667},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.25},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [9]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][1]['answers'][0]['answer']
        print(text)

either A, B, or C, depending on the situation.
A) It is made from rock and D) It is made from metal.
either A or C, depending on the situation and the sailor's preferences.
A) adding salt.
D) Exhaustion.
C) aggravation.
E) surface of earth.
C) instruments, as it is the most specific and accurate answer to the question.
C) resentment.
either B) cardboard box or E) container.
either C) forest or D) countryside.
C) Increases levels of fetal hemoglobin (HgbF).
B) Meningocele or D) Sensorineural hearing loss.
either C) Nifedipine or D) Hydrochlorothiazide.


In [11]:
#GPT-4

input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [12]:
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.5},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.833333},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.625},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [14]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][2]['answers'][0]['answer']
        print(text)

A) mimicking other things
A) It is made from rock
C) making an adjustment to the rudder
A) adding salt
D) exhaustion
C) aggravation
A) apartment
C) instruments
C) resentment
B) cardboard box
D) countryside
C) Increases levels of fetal hemoglobin (HgbF).
B) Meningocele, as it is a neural tube defect directly associated with carbamazepine use during pregnancy.
C) Nifedipine, as it is a calcium channel blocker commonly used as a first-line anti-hypertensive medication and can cause lower extremity swelling as a side effect.


In [16]:
#Da Vinci

input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction":  "This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore"
}

multiple_ans_test.extract_flexible(chain=answer_chain,input_dict=input_dict)

Generating commonsense_qa...
Generating med_qa...
Generating open_book_qa...


In [17]:
multiple_ans_test.evaluate()

  0%|          | 0/7 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

  0%|          | 0/4 [00:00<?, ?ex/s]

{'commonsense_qa': {'validation': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.52381},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'med_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.777778},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-E': 0.0}}}},
 'open_book_qa': {'test': {'accuracy': {'': {"_Answer: Let's think step by step._This is a multiple-choice question that only contains one correct answer, giving multiple answers is wrong. Only select the one best answer. The best answer is therefore": 0.75},
    'gpt-3.5-turbo': {'None_kojima-01_kojima-A-D': 0.0}}}}}

In [23]:
data = [('open_book_qa','test'),('commonsense_qa','validation'),('med_qa','test')]


for pair in data:
    for row in multiple_ans_test[pair[0]][pair[1]]:
        text = row['generated_cot'][3]
        print(text)

{'id': '99bcd0b6-f9da-4c4d-9b46-2d1ada4f4ec1', 'fragments_version': '0.01', 'instruction': '', 'cot_trigger': "Answer: Let's think step by step.", 'cot_trigger_template': '', 'prompt_text': '', 'cot': '\n\nFirst, we need to understand that an octopus is a highly intelligent and adaptable creature. When faced with danger, it has several defense mechanisms to protect itself.\n\nOption A: Mimicking other things\n\nOctopuses are known for their ability to change color and texture to blend in with their surroundings. This is called camouflage, and it helps them avoid predators. However, it is not a foolproof defense mechanism, and sometimes an octopus may need to take more drastic measures.\n\nOption B: Melting into sand\n\nSome species of octopus have the ability to change not only their color and texture but also their shape. They can flatten themselves out and blend in with the sand or rocks on the ocean floor. This makes them almost invisible to predators.\n\nOption C: Creating new home

In [24]:
multiple_ans_test.dump("Force_one_ans_three_models")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Single reflection prompt

In [6]:
from cot import Collection
coll = Collection.load_thoughtsource_100(names=['worldtree'],load_pregenerated_cots=False) #random_sample=False?
coll = coll.select(split="all", number_samples=10)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [7]:
coll

| Name      | Train   | Valid   |   Test |
|-----------|---------|---------|--------|
| worldtree | -       | -       |     10 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [None]:
# #check answer extraction
# input_dict = {
#     "cot_trigger": "Answer: Let's think step by step.",
#     "answer_extraction": "Therefore, the answer is", 
#     'answer':"", 
#     'cot': "", 
#     'reflection_prompt':"The goal is to correct the Answer (generated by a language model) if needed, let's think step by step" ,
#     'reflect_answer_extraction':'Based on the reflection, the answer is therefore',
#     'model_name':"gpt-4",
#     'temperature': 0,
#     'max_tokens':800
# }
# coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
# coll.evaluate()
# #coll.dump("single_meta.json")
# coll

In [12]:
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-01'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [13]:
coll.generate(config=config)

Generating worldtree...


  0%|          | 0/5 [00:00<?, ?ex/s]

In [14]:
coll.dump("single_meta_experiment.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
coll['worldtree']['test'][0]['generated_cot'][27]

{'id': '52d6be0f-3757-46a5-961f-ec1e85730ac2',
 'fragments_version': '0.01',
 'instruction': None,
 'cot_trigger': 'praas-01',
 'cot_trigger_template': '{instruction}\n\n{question}\n{answer_choices}\n\n{cot_trigger}',
 'prompt_text': '',
 'cot': "Answer: A) the mass of Earth.\n\nProblems with the answer: The answer is correct, but it does not provide a complete explanation of how gravity works. It only mentions the mass of Earth as the cause of gravity but does not mention the role of distance between objects and the gravitational constant.\n\nImproved answer: Gravity on Earth is caused by the mass of Earth, the distance between objects, and the gravitational constant. The force of gravity is directly proportional to the product of the masses of the objects and inversely proportional to the square of the distance between their centers. This relationship is described by Newton's law of universal gravitation.",
 'answers': [{'id': 'ead68434-f09f-49b1-9c6b-16ba80913022',
   'answer_extrac

In [21]:
# On GPT-4 hard dataset

gpt_4_hard = Collection.from_json("filtered_thoughtsource_100_gpt-4_false_only_gpt-4_cots.json")
gpt_4_hard.select_generated_cots(cot_trigger = "zhou-01", api_service='openai_chat',model = 'gpt-4',answer=False)

In [22]:
gpt_4_hard = gpt_4_hard.filter(lambda x: len(x["generated_cot"])==1)
gpt_4_hard

| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 28      | -      |
| med_qa         | -       | -       | 24     |
| medmc_qa       | -       | 30      | -      |
| open_book_qa   | -       | -       | 5      |
| strategy_qa    | 20      | -       | -      |
| worldtree      | -       | -       | 1      |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'pubmed_qa', 'qed', 'svamp']

In [23]:
gpt_4_hard_backup = gpt_4_hard

In [24]:
gpt_4_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa"])
gpt_4_hard

| Name         | Train   | Valid   |   Test |
|--------------|---------|---------|--------|
| open_book_qa | -       | -       |      5 |
| worldtree    | -       | -       |      1 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [25]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("single_meta_experiment_2.json")


Generating open_book_qa...
Generating worldtree...


In [16]:
#Second prompt
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-02'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [27]:
from cot import Collection
gpt_4_hard = Collection.from_json("filtered_thoughtsource_100_gpt-4_false_only_gpt-4_cots.json")



In [28]:
gpt_4_hard.select_generated_cots(cot_trigger = "zhou-01", api_service='openai_chat',model = 'gpt-4',answer=False)
gpt_4_hard = gpt_4_hard.filter(lambda x: len(x["generated_cot"])==1)
gpt_4_hard

| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 28      | -      |
| med_qa         | -       | -       | 24     |
| medmc_qa       | -       | 30      | -      |
| open_book_qa   | -       | -       | 5      |
| strategy_qa    | 20      | -       | -      |
| worldtree      | -       | -       | 1      |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'pubmed_qa', 'qed', 'svamp']

In [35]:
gpt_4_hard.unload_datasets(["commonsense_qa","medmc_qa","worldtree","med_qa",'open_book_qa'])
# gpt_4_hard.select_generated_cots(cot_trigger = "zhou-01", api_service='openai_chat',model = 'gpt-4',answer=False)
# gpt_4_hard

In [36]:
gpt_4_hard

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      20 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [8]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("single_meta_experiment_prompt_2.json")

Generating open_book_qa...
Generating worldtree...


In [3]:
#Third prompt
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-03'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'], #['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [4]:
gpt_4_hard

| Name         | Train   | Valid   |   Test |
|--------------|---------|---------|--------|
| open_book_qa | -       | -       |      5 |
| worldtree    | -       | -       |      1 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [5]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("single_meta_experiment_prompt_3.json")

Generating open_book_qa...
Generating worldtree...


In [9]:
coll.generate(config=config)
coll.evaluate()
coll.dump("single_meta_experiment_prompt_2_worldtree.json")

Generating worldtree...


  0%|          | 0/10 [00:00<?, ?ex/s]

  0%|          | 0/10 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
#try commmonsense prompt 2

gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("prompt_2_cs.json")

Generating commonsense_qa...


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 17 Apr 2023 08:24:51 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7b933e4e6b6577eb-VIE'}.


In [20]:
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-03'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [19]:
#try commmonsense prompt 3

gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("prompt_3_cs.json")

Generating commonsense_qa...


In [21]:
coll.generate(config=config)
coll.evaluate()
coll.dump("single_meta_experiment_prompt_3_worldtree.json")

Generating worldtree...


In [1]:
from cot import Collection
gpt_4_hard = Collection.from_json("prompt_3_cs.json")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-04'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [3]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("prompt_4_cs.json")

Generating commonsense_qa...


  0%|          | 0/28 [00:00<?, ?ex/s]

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 17 Apr 2023 12:23:01 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7b949b0b8f3dc320-VIE'}.


  0%|          | 0/28 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [37]:
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-05'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}


In [38]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("single_meta_experiment_prompt_5_new_data.json")

Generating strategy_qa...


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 264bc179d64057c76b92a4864983b846 in your message.).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID ef6cf4fe94dd7d65e4fe1bd99f365433 in your message.).


In [33]:
gpt_4_hard.evaluate()

{'open_book_qa': {'test': {'accuracy': {'gpt-4': {'None_praas-05_kojima-A-D': 0.0,
     'None_zhou-01_kojima-A-D': 0.0}}}},
 'strategy_qa': {'train': {'accuracy': {'gpt-4': {'None_zhou-01_kojima-yes-no': 0.0}}}}}

In [34]:
gpt_4_hard.dump("single_meta_experiment_prompt_5_new_data.json")

In [39]:
config={
    # We compare three different prompts for the chain of thought generation:
    # "Answer: Let's think step by step." and 'Answer: We should think about this step by step.', and "Answer: First," 
    "cot_trigger_keys": ['praas-06'],

    # We use the same answer extraction prompt for all three prompts
    # "Therefore, among A through D, the answer is"
    "answer_extraction_keys": ['auto-kojima'],#['kojima-A-D'], 
    
    "author" : "Robert",
    "api_service": "openai_chat", 
    "engine": "gpt-4", 
    "temperature": 0,
    "max_tokens": 512,
    "verbose": False,
    "warn": True,
}

In [40]:
gpt_4_hard.generate(config=config)
gpt_4_hard.evaluate()
gpt_4_hard.dump("single_meta_experiment_prompt_6.json")

Generating strategy_qa...


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 21 Apr 2023 15:37:59 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7bb6ae894d5f0a30-ARN'}.
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code':

In [41]:
gpt_4_hard.evaluate()

{'strategy_qa': {'train': {'accuracy': {'gpt-4': {'None_praas-05_kojima-yes-no': 0.2,
     'None_praas-06_kojima-yes-no': 0.1,
     'None_zhou-01_kojima-yes-no': 0.0}}}}}

In [None]:
"Answer the question, then critique the answer. Based on the critique, reconsider the other answer options and give a single final answer. Answer: Let's work this out in a step by step way to be sure we have the right answer."


In [None]:
Answer: Let's work this out in a step by step way to be sure we have the right answer. Critique the initial answer, reconsider the other answer options and then give a single final answer.