## Imports and chains

In [1]:
from langchain.llms import OpenAI 
from langchain.prompts import PromptTemplate 
from langchain.chains.llm import LLMChain
from cot import Collection
import json
from langchain.chat_models import ChatOpenAI
# from dataloader import to_Collection

In [3]:
"""CoT Chain"""

llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo") #ADA #for chat: gpt-3.5-turbo
"""answer extraction chain"""

extraction_template = """{instruction}

Question: {question}
Answer_choices: {answer_choices}

Cot: {cot_trigger}{cot}
{answer_extraction}
"""

prompt_template = PromptTemplate(input_variables=["instruction","question","answer_choices","cot_trigger","cot","answer_extraction"], template=extraction_template)
answer_chain = LLMChain(llm=llm, prompt=prompt_template,output_key="predicted_answer")

In [None]:
"""Extract script: Assumes there are CoTs in the dataset already"""
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","open_book_qa","worldtree"])


input_dict = {
    "instruction": "",
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely to be correct for this question. Therefore, among A through D, the answer is" 
}

extract = ts_hard.extract_flexible(chain=answer_chain,input_dict=input_dict)

In [5]:
test = Collection.to_Collection(extract,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [None]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa","worldtree"])
extract = ts_hard.extract_flexible(chain=answer_chain,input_dict=input_dict)

In [7]:
test = Collection.to_Collection(extract,"open_book_qa",'test','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/21 [00:00<?, ?ex/s]

In [39]:
"""Da Vinci"""

llm = OpenAI(temperature=.0,model_name="text-davinci-003")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)


In [46]:
#llm = ChatOpenAI(temperature=.0,model_name="gpt-4")   #text-davinci-003
llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)



In [45]:
instruction = "Answer the following question through step-by-step reasoning."
question = "Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of",
answer_choices = [
                    "competition",
                    "conservation",
                    "decomposition",
                    "pollution"
                ]
cot_trigger = "Answer: Let's think step by step."
cot = "Aggression is needed to defend something, one needs to defend their spot when they are in competition"
answer_extraction = "Therefore, the answer is"
answer = "competition"
reflection_prompt = "do you agree with the cot yes or no"
reflection = "Great reasoning mate!"
reflect_answer_extraction = "Based on the text above the answer is:"

In [29]:
print(reflect_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt))


    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step.Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    


In [37]:
print(ans_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt,
                                     reflection=reflection,reflect_answer_extraction=reflect_answer_extraction))



    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step. Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    Great reasoning mate!

    Based on the text above the answer is:
    


## Data


In [7]:
coll = Collection.load_thoughtsource_100(names='strategy_qa',load_pregenerated_cots=True) #random_sample=False?
coll = coll.select(split="all", number_samples=1)
coll.select_generated_cots(cot_trigger = "kojima-01", api_service='cohere') #have one

In [11]:
coll

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |       1 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

## Run reflection

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Do you have any reason to believe that the reasoning or the answer might be wrong? Answer with one word Yes or No",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)


In [None]:
#These prompts only let the model say no reason to believe the answer is wrong

In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The goal is to correct the Answer if needed, let's think step by step",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-3.5-turbo"
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [27]:
#sloppy metareasoning leads to a performance decrease

  0%|          | 0/20 [00:00<?, ?ex/s]

In [None]:
"""Use TS_hard dataset"""
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)

In [None]:
"""2/11 correct"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
""".36 corrected but a bit misleading""" 

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
""".36 corrected but a bit misleading""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""18% accuracy""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, let's revise the previous answer step-by-step",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""DaVinci 0.09 accuracy""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"text-davinci-003"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""ChatGPT as language model I have no access to the creator's answer"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be multiple valid but only one will be regarded as correct. Pick the one answer that the creator of the question picked. If you give more than one, you are evaluated as wrong.",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""27% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question. ",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [53]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

In [55]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      11 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [57]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa","worldtree"])
ts_hard

| Name         | Train   | Valid   |   Test |
|--------------|---------|---------|--------|
| open_book_qa | -       | -       |     21 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp', 'worldtree']

In [None]:
"""As an AI language model, I agree with the given answer"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
"""As an AI language model, I stand corrected., 14% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","open_book_qa","worldtree"])

"""9%"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)



In [70]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [71]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Long prompt