## Imports and chains

In [1]:
from langchain.llms import OpenAI 
from langchain.prompts import PromptTemplate 
from langchain.chains.llm import LLMChain
from cot import Collection
import json
from langchain.chat_models import ChatOpenAI
# from dataloader import to_Collection

In [39]:
"""Da Vinci"""

llm = OpenAI(temperature=.0,model_name="text-davinci-003")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)


In [46]:
#llm = ChatOpenAI(temperature=.0,model_name="gpt-4")   #text-davinci-003
llm = ChatOpenAI(temperature=.0,model_name="gpt-3.5-turbo")  

reflect_template = """
    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    """
reflect_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt'], template=reflect_template)
reflect_chain = LLMChain(llm=llm, prompt=reflect_prompt_template,output_key="reflection")

#{instruction}
extraction_template = """

    Question: {question}
    Answer_choices: {answer_choices}

    {cot_trigger} {cot}
    {answer_extraction} {answer}
    
    Reflection: {reflection_prompt}
    {reflection}

    {reflect_answer_extraction}
    """
    #Get reflection
ans_prompt_template = PromptTemplate(input_variables=["question","answer_choices","cot_trigger","cot","answer_extraction",'answer','reflection_prompt','reflection','reflect_answer_extraction'], template=extraction_template)
reflect_answer_chain = LLMChain(llm=llm, prompt=ans_prompt_template,output_key="reflection_answer")

    # This is the overall chain where we run these two chains in sequence.
from langchain.chains import SequentialChain
reflect_overall_chain = SequentialChain(chains=[reflect_chain, reflect_answer_chain],input_variables=["question","answer_choices","cot_trigger","answer_extraction",'cot','answer','reflection_prompt','reflect_answer_extraction'],
        output_variables=["reflection", "reflection_answer"],
        verbose=True)



In [45]:
instruction = "Answer the following question through step-by-step reasoning."
question = "Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of",
answer_choices = [
                    "competition",
                    "conservation",
                    "decomposition",
                    "pollution"
                ]
cot_trigger = "Answer: Let's think step by step."
cot = "Aggression is needed to defend something, one needs to defend their spot when they are in competition"
answer_extraction = "Therefore, the answer is"
answer = "competition"
reflection_prompt = "do you agree with the cot yes or no"
reflection = "Great reasoning mate!"
reflect_answer_extraction = "Based on the text above the answer is:"

In [29]:
print(reflect_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt))


    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step.Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    


In [37]:
print(ans_prompt_template.format(question=question,answer_choices=answer_choices,cot_trigger=cot_trigger,
                                     cot=cot,answer_extraction=answer_extraction,
                                     answer=answer,reflection_prompt=reflection_prompt,
                                     reflection=reflection,reflect_answer_extraction=reflect_answer_extraction))



    Question: ('Animals may fight, make threatening sounds, and act aggressively toward members of the same species. These behaviors usually occur as the result of',)
    Answer_choices: ['competition', 'conservation', 'decomposition', 'pollution']

    Answer: Let's think step by step. Aggression is needed to defend something, one needs to defend their spot when they are in competition
    Therefore, the answer is competition
    
    Reflection: do you agree with the cot yes or no
    Great reasoning mate!

    Based on the text above the answer is:
    


## Data


In [7]:
coll = Collection.load_thoughtsource_100(names='strategy_qa',load_pregenerated_cots=True) #random_sample=False?
coll = coll.select(split="all", number_samples=1)
coll.select_generated_cots(cot_trigger = "kojima-01", api_service='cohere') #have one

In [11]:
coll

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |       1 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

## Run reflection

In [8]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Double check this",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)


Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [16]:

test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')

In [18]:
test['strategy_qa']['train']

Dataset({
    features: ['id', 'ref_id', 'question', 'type', 'choices', 'context', 'cot', 'answer', 'generated_cot', 'feedback'],
    num_rows: 1
})

In [26]:
#to dataloader

"""Creates json and collection from Thoughtsource ouptut; chain_output from chain_generation"""
def to_Collection(chain_output,dataset_name,split,file_name):

    #Force langchain into TS structure 
    ts_set = {dataset_name:{split:chain_output}}

    #create and collect a json to make collection
    with open(f"{file_name}.json", "w") as outfile:
        json.dump(ts_set, outfile)
    collect = Collection.from_json('sample.json')

    return collect

In [100]:
#Force langchain into TS structure 
worldtree_new = {'worldtree':{'test':metareason}}

#create and collect a json to make collection
with open("sample.json", "w") as outfile:
    json.dump(worldtree_new, outfile)
collect = Collection.from_json('sample.json')

collect

| Name      | Train   | Valid   |   Test |
|-----------|---------|---------|--------|
| worldtree | -       | -       |     30 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [None]:
#Conclusion: with these prompts, there is no new information and no changed answers

In [108]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Critique the reasoning above",
    'reflect_answer_extraction':'Based on the reasoning and the reflection, what is the final answer? (A-D)?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating worldtree...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m



In [109]:
#Force langchain into TS structure 
worldtree_new = {'worldtree':{'test':metareason}}

#create and collect a json to make collection
with open("sample.json", "w") as outfile:
    json.dump(worldtree_new, outfile)
collect = Collection.from_json('sample.json')

collect

| Name      | Train   | Valid   |   Test |
|-----------|---------|---------|--------|
| worldtree | -       | -       |     30 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [None]:
#chatgpt seems to be a bad corrector, what about its competitors?
#but davinci is no better

In [54]:
#example cot and ans

metareason[0]['generated_cot'][3]
metareason[0]['generated_cot'][3]['answers']

[{'id': '8b8ddb55-e1fc-4fac-85c7-333a76cacd91',
  'answer_extraction': 'Therefore, among A through D, the answer is',
  'answer_extraction_template': '',
  'answer_extraction_text': 'self_reflection',
  'answer': 'The definite answer is B) There is a change in the appearance of the objects.',
  'correct_answer': None}]

Next: evaluate in annotator

## Strategy_qa and yes/no

In [12]:
# coll = Collection.load_thoughtsource_100(names='strategy_qa',load_pregenerated_cots=True)
# coll = coll.select(split="all", number_samples=1)
# coll.select_generated_cots(cot_trigger = "kojima-01", api_service='openai') #have one

In [13]:
coll['strategy_qa']['train'][0]['generated_cot']

[{'id': '6067839f-c5f5-494b-a71c-10dde2739954',
  'fragments_version': '0.01',
  'instruction': None,
  'cot_trigger': 'kojima-01',
  'cot_trigger_template': '{instruction}\n\n{question}\n{answer_choices}\n\n{cot_trigger}',
  'prompt_text': '',
  'cot': ' First, we need to convert 900,000 pounds to US dollars. As of June 2020, 1 pound is equal to 1.25 US dollars. So, 900,000 pounds is equal to 1,125,000 US dollars.\n\nNext, we need to compare this amount to the net worth of a billionaire. According to Forbes, the minimum net worth of a US billionaire is currently $1.1 billion.\n\nTherefore, a 900,000 pound net worth person would not be an American billionaire if they exchange currency in June 2020.',
  'answers': [{'id': '5d8d5fee-f1ee-4d8f-bd71-8261f260519c',
    'answer_extraction': 'kojima-yes-no',
    'answer_extraction_template': '{instruction}\n\n{question}\n{answer_choices}\n\n{cot_trigger}{cot}\n{answer_extraction}',
    'answer_extraction_text': '',
    'answer': ' No.',
    '

In [14]:
from cot.stats import evaluation_as_table
eval = coll.evaluate()
evaluation_as_table(eval)

Unnamed: 0_level_0,kojima-01
Unnamed: 0_level_1,text-davinci-003
strategy_qa,0.0


In [None]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Do you have any reason to believe that the reasoning or the answer might be wrong? Answer with one word Yes or No",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)
#These prompts only let the model say no reason to believe the answer is wrong

In [30]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Improve the step-by-step thinking",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?'
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m


In [49]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Is there any irrelevant or incorrect information in the Answer",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-3.5-turbo"
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m


In [50]:
st_new = {'strategy_qa':{'train':metareason}}

#create and collect a json to make collection
with open("sample.json", "w") as outfile:
    json.dump(st_new, outfile)
collect = Collection.from_json('sample.json')

collect

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      30 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [41]:
collect['strategy_qa']['train'][0]['generated_cot'][1]

{'id': '03e45537-96e6-4f2d-adfc-86137df0fae0',
 'fragments_version': '0.01',
 'instruction': '',
 'cot_trigger': "Answer: Let's think step by step.",
 'cot_trigger_template': '',
 'prompt_text': '',
 'cot': 'Yes.',
 'answers': [{'id': 'cde8940d-e39c-45b4-be9d-1ad14c4d48c6',
   'answer_extraction': 'Therefore, among A through D, the answer is',
   'answer_extraction_template': '',
   'answer_extraction_text': 'self_reflection',
   'answer': 'The definite answer is False.',
   'correct_answer': True}],
 'author': '',
 'date': '2023/03/16 13:32:18',
 'api_service': '',
 'model': "{'name': '', 'temperature': 0, 'max_tokens': 800}",
 'comment': 'self_reflection cot',
 'annotations': []}

In [43]:
from cot.stats import evaluation_as_table
eval = coll.evaluate()
evaluation_as_table(eval)

  0%|          | 0/30 [00:00<?, ?ex/s]

  0%|          | 0/2207 [00:00<?, ?ex/s]

  0%|          | 0/1664 [00:00<?, ?ex/s]

  0%|          | 0/496 [00:00<?, ?ex/s]

['strategy_qa.train.accuracy.command-xlarge-nightly.None_kojima-01_kojima-yes-no']


Unnamed: 0_level_0,kojima-01
Unnamed: 0_level_1,command-xlarge-nightly
strategy_qa,0.6
worldtree,


In [51]:
from cot.stats import evaluation_as_table
eval = collect.evaluate()
evaluation_as_table(eval)

  0%|          | 0/30 [00:00<?, ?ex/s]

["strategy_qa.train.accuracy.._Answer: Let's think step by step._Therefore, among A through D, the answer is", 'strategy_qa.train.accuracy.command-xlarge-nightly.None_kojima-01_kojima-yes-no']


ValueError: too many values to unpack (expected 5)

In [55]:
from rich.pretty import pprint
pprint(eval)

In [56]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The goal is to correct the Answer if needed, let's think step by step",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-3.5-turbo"
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m


In [57]:
st_new = {'strategy_qa':{'train':metareason}}

#create and collect a json to make collection
with open("sample.json", "w") as outfile:
    json.dump(st_new, outfile)
collect = Collection.from_json('sample.json')

collect

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      30 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [11]:
"""Increase lenght of prompt"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, among A through D, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Attempt to give a better Answer than the one before",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-4"
}
metareason = coll.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [25]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','long_prompt')

In [26]:
test

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      20 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [27]:
from rich.pretty import pprint
eval = test.evaluate()
pprint(eval)

  0%|          | 0/20 [00:00<?, ?ex/s]

In [30]:
test.dump("test_evaluated")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Tailored dataset

In [4]:
from cot import Collection
col = Collection.from_json("../notebooks/internal_documentation/annotated_files/annotated_set.json")
col

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


| Name           | Train   | Valid   | Test   |
|----------------|---------|---------|--------|
| commonsense_qa | -       | 11      | -      |
| worldtree      | -       | -       | 6      |
| strategy_qa    | 9       | -       | -      |
| open_book_qa   | -       | -       | 10     |

Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'svamp']

In [9]:
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"Attempt to give a better Answer than the one before",
    'reflect_answer_extraction':'Based on the reflection, what is the definite answer?',
    'model_name':"gpt-3.5-turbo"
}
metareason = col.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating commonsense_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).



[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [24]:
metareason[3]['generated_cot'][1]

{'id': 'b77f9b09-b5b9-4a90-b655-5bfeb62fb302',
 'fragments_version': '0.01',
 'instruction': '',
 'cot_trigger': "Answer: Let's think step by step.",
 'cot_trigger_template': '',
 'prompt_text': '',
 'cot': 'Question: What is the best way to reduce carbon emissions in transportation?\nAnswer_choices: A) Increase the use of electric cars\nB) Encourage people to walk or bike\nC) Develop more fuel-efficient vehicles\nD) Implement a carbon tax\nE) All of the above\n\nAnswer: While all of the options listed can help reduce carbon emissions in transportation, the most effective way would be to implement a carbon tax. A carbon tax would put a price on carbon emissions, incentivizing individuals and companies to reduce their carbon footprint. This would encourage the development of more fuel-efficient vehicles, the use of electric cars, and even encourage people to walk or bike. Therefore, the answer is D) Implement a carbon tax.',
 'answers': [{'id': '5f08836b-ab25-4a49-bbac-f221d0841987',
  

In [11]:
len(metareason)

11

In [15]:
col['commonsense_qa']['validation'][0]

{'id': '7ed7379fc51fd35a47be022f6c56ce51',
 'ref_id': '',
 'question': "The kitten had nothing to dig it's claws into, so when it tried to stop it slid across what?",
 'type': 'multiplechoice',
 'choices': ['living room', 'floor', 'warm place', 'carpet', 'farmhouse'],
 'context': '',
 'cot': ['Floor is the lower surface of room, on one which may walk.',
  'The kitten had nothing to dig it’s claws into, so when it tried to stop it slid across floor.',
  'Kitten walks on floor and skids also on a floor and not on living room.',
  'Kitten skids might be because of snow and not a warm place.',
  'Carpet surface is not smooth, which helps kitten to dig it’s claws into to avoid skidding.',
  'Farmhouse is a whole building and kitten could be in some of the areas of it and not on whole farmhouse.'],
 'answer': ['floor'],
 'generated_cot': [{'id': '0c9db6a7-4bde-4783-b05e-816897c80c75',
   'fragments_version': '0.01',
   'instruction': None,
   'cot_trigger': 'kojima-01',
   'cot_trigger_templ

In [27]:
test = Collection.to_Collection(metareason,"commonsense_qa",'validation','file_test')

In [28]:
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

## TS_Hard

In [3]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robertpraas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      11 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [None]:
"""2/11 correct"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
""".36 corrected but a bit misleading""" 

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [None]:
""".36 corrected but a bit misleading""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

In [31]:
"""18%""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, let's revise the previous answer step-by-step",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [40]:
"""DaVinci 0.09 accuracy""" 
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question may try to trick the respondent and is designed to have one best answer. With this information, revise the previous answer",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"text-davinci-003"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [47]:
"""ChatGPT as language model I have no access to the creator's answer"""

input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be multiple valid but only one will be regarded as correct. Pick the one answer that the creator of the question picked. If you give more than one, you are evaluated as wrong.",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [52]:
"""27% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question. ",
    'reflect_answer_extraction':'What is the final answer? Answer with one word (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [53]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [55]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

| Name        |   Train | Valid   | Test   |
|-------------|---------|---------|--------|
| strategy_qa |      11 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'svamp', 'worldtree']

In [57]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_sqa = ts_hard.select(split="all", number_samples=1)
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","strategy_qa","worldtree"])
ts_hard

| Name         | Train   | Valid   |   Test |
|--------------|---------|---------|--------|
| open_book_qa | -       | -       |     21 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp', 'worldtree']

In [62]:
"""As an AI language model, I agree with the given answer"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The question was created by crowdsourcing with the goal to test AI systems. Multiple answers may be valid but only one answer will be regarded as correct. If you give more than one answer, it will be evaluated as wrong. Pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating open_book_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

In [65]:
"""As an AI language model, I stand corrected., 14% accuracy"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer from A through D?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)

Generating open_book_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

In [69]:
ts_hard = Collection.from_json('ts_hard_v1.json')
ts_hard.unload_datasets(["med_qa","medmc_qa","commonsense_qa","open_book_qa","worldtree"])

"""9%"""
input_dict = {
    "cot_trigger": "Answer: Let's think step by step.",
    "answer_extraction": "Therefore, the answer is", 
    'answer':"", 
    'cot': "", 
    'reflection_prompt':"The Answer was given by a language model and may be incorrect. Multiple answers may be valid but only one answer is the best correct answer. If you give more than one answer, it will be evaluated as wrong. Revise, then pick the one answer that is most likely for this question.",
    'reflect_answer_extraction':'What is the final answer (true/false)?',
    'model_name':"gpt-3.5-turbo"
}
metareason = ts_hard.metareason_flexible(chain=reflect_overall_chain,input_dict=input_dict)



Generating strategy_qa...


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [70]:
test = Collection.to_Collection(metareason,"strategy_qa",'train','file_test')
eval = test.evaluate()
from rich.pretty import pprint
pprint(eval)

  0%|          | 0/11 [00:00<?, ?ex/s]

In [71]:
test.evaluate()
test.dump('file_test.json')

  0%|          | 0/11 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]