## Installation and Imports

In [1]:
#after cloning the repo, install linking to your local location of the cot library
!pip install -e ./ThoughtSource-main/libs/cot

In [1]:
#imports
import os
from cot import Collection
from cot.generate import FRAGMENTS
from pprint import pprint
import json

In [2]:
# You can find your API token under Settings in your Hugging Face account.
# To use it, uncomment the line below and replace <token> with your token.

#os.environ["HUGGINGFACEHUB_API_TOKEN"] = "<token>" 


## 1. Data: Load a dataset and generate a random sample

In [3]:
# Load the worldtree dataset that the sample will be drawn from
# and print an overview of its splits.
collection_worldtree = Collection(
    ["worldtree"],
    verbose=False,
)
print(collection_worldtree)

Loading worldtree...
| Name      |   Train |   Valid |   Test |
|-----------|---------|---------|--------|
| worldtree |    2207 |     496 |   1664 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']


In [4]:
# Select 100 random rows of the train split (fixed seed, so the sample is reproducible)
collection_worldtree_100 = collection_worldtree.select(
    split="train",
    number_samples=100,
    random_samples=True,
    seed=0,
)
collection_worldtree_100


| Name      |   Train | Valid   | Test   |
|-----------|---------|---------|--------|
| worldtree |     100 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'open_book_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']

In [6]:
# Write the sample to a JSON file; it will appear in the same folder as this notebook
collection_worldtree_100.dump("worldtree_100_dataset.json")

## 2. Generate: Model Chain of Thought (CoT) Generation

## Examples of available fragments
Three groups of templates for CoT generation:
1) Instructions
2) CoT-Triggers
3) Answer-Extractions

In [8]:
# Show the available instruction fragments; choose one by its key (e.g. "qa-01")
pprint(FRAGMENTS["instructions"])

{'qa-01': 'Answer the following question through step-by-step reasoning.',
 'qa-02': 'Answer the following question through careful, concise step-by-step '
          'reasoning.',
 'qa-03': 'Answer the following question through careful, concise step-by-step '
          'reasoning. Avoid making up wrong statements. If the question does '
          'not make sense or cannot be answered, write "I cannot answer the '
          'question". If you do not have a good answer, write "I do not have a '
          'good answer". If you are uncertain, write "I am uncertain about '
          'this".',
 'qa-04': 'Answer the following question through careful, concise step-by-step '
          'reasoning. Avoid making up wrong statements. Generate sub-questions '
          'that are required to answer the original question, answer them '
          'until you can answer the original question. If the question does '
          'not make sense or cannot be answered, write "I cannot answer the '
          

In [7]:
# Show the available CoT triggers (choose one to induce chain-of-thought reasoning),
# rendered as key-sorted, indented JSON.
cot_triggers_json = json.dumps(
    FRAGMENTS["cot_triggers"],
    indent=2,
    sort_keys=True,
)
print(cot_triggers_json)

{
  "kojima-01": "Answer: Let's think step by step.",
  "kojima-02": "Answer: We should think about this step by step.",
  "kojima-03": "Answer: First,",
  "kojima-04": "Answer: Before we dive into the answer,",
  "kojima-05": "Answer: Proof followed by the answer.",
  "kojima-06": "Answer: Let's think step by step in a realistic way.",
  "kojima-07": "Answer: Let's think step by step using common sense and knowledge.",
  "kojima-08": "Answer: Let's think like a detective step by step.",
  "kojima-09": "Answer: Let's think about this logically.",
  "kojima-10": "Answer: Let's think step by step. First,",
  "kojima-11": "Answer: Let's think",
  "kojima-12": "Answer: Let's solve this problem by splitting it into steps.",
  "kojima-13": "Answer: The answer is after the proof.",
  "kojima-14": "Answer: Let's be realistic and think step by step.",
  "lievin-01": "Answer: Let's derive the differential diagnosis step by step.",
  "lievin-02": "Answer: Let's use step by step inductive reasonin

In [None]:
# After a chain of thought (CoT) has been obtained, the model receives the question together with
# that CoT and an answer-extraction prompt. Show the available answer-extraction fragments.
pprint(FRAGMENTS["answer_extractions"])

{'kojima-01': 'Therefore, the answer is',
 'kojima-02': 'Therefore,',
 'kojima-03': 'The answer is',
 'kojima-A-C': 'Therefore, among A through C, the answer is',
 'kojima-A-D': 'Therefore, among A through D, the answer is',
 'kojima-A-E': 'Therefore, among A through E, the answer is',
 'kojima-A-F': 'Therefore, among A through F, the answer is',
 'kojima-numerals': 'Therefore, the answer (arabic numerals) is',
 'kojima-yes-no': 'Therefore, the answer (Yes or No) is'}


## Generating Chain of Thought examples

In [9]:
# Load the dataset (detailed explanation in the generate_samples notebook), draw the sample with the
# same arguments as in the data section above, and define the config dict that controls generation.
collection = Collection(["worldtree"], verbose=False)
worldtree_100_random = collection.select(split="train", number_samples=100, random_samples=True, seed=0)
config={
    "idx_range": "all", # Determines which indices the generate_and_extract routine is applied to, Default: "all" (all items are used)
    "instruction_keys": ["qa-01"], # Determines which instructions are used from fragments.json, Default: None (no instructions are used)
    "cot_trigger_keys": ["kojima-01"], # Determines which cot triggers are used from fragments.json, Default: ["kojima-01"] (only the first trigger is used)
    "answer_extraction_keys": ["kojima-A-D"], # Determines which answer extraction prompts are used from fragments.json, Default: ["kojima-01"] (only the first prompt is used)
    "author" : "konstantin", # Name of the person responsible for generation, Default: ""
    "api_service": "mock_api", # Name of the API called ("openai", "huggingface_hub", or a mock for testing: "mock_api"), Default: "huggingface_hub"
    "engine": "", # Name of the engine used (for "huggingface_hub" use for example "google/flan-t5-xl"), Default: "google/flan-t5-xl"
    "temperature": 0, # Presumably the sampling temperature of the model (the previous comment here was a copy of the "author" one; verify against the cot library), Default: 0
    "max_tokens": 512, # Maximum length of output generated by the model, Default: 128
    "api_time_interval": 1.0, # Pause between two api calls in seconds, Default: 1.0
    "verbose": False, # Determines whether the progress of the generation is printed, Default: True
    "warn": True, # Determines whether a warning that external APIs will be called is printed, Default: True
}

Loading worldtree...


In [14]:
# Generate chains of thought and answer extractions.
# With "api_service": "mock_api" this is a test run that does not call a model over an API.
worldtree_100_random.generate(name="worldtree", config=config) # if you cannot enter 'y' at the prompt, set "warn" to False in config


        You are about to call an external API in total 200 times, which may produce costs.
        Number API calls for CoT generation: n_samples 100 * n_instruction_keys 1 * n_cot_trigger_keys 1
        Number API calls for answer extraction: n_samples 100 * n_instruction_keys 1 * n_cot_trigger_keys 1 * n_answer_extraction_keys 1
        Do you want to continue? y/n
        Note: You are using a mock api. When entering 'y', a test run without API calls is made.


  0%|          | 0/100 [00:00<?, ?ex/s]

In [19]:
# The call above was a mock run that produced no real model output.
# Now load a prepared dataset that contains real model answers.
worldtree_100_random = Collection.from_json("worldtree_100_generate.json")

#### Question, choices and correct answer

In [24]:
# Show the question, the answer options and the correct answer of one item of the prepared dataset
example = worldtree_100_random["worldtree"]["train"][1]
pprint("Question: " + example["question"])
pprint("Answer Options:")
pprint(example["choices"])
pprint("Answer: " + "".join(example["answer"]))

'Question: The length of a year is equivalent to the time it takes for one'
'Answer Options:'
['rotation of Earth',
 'rotation of the Sun',
 'revolution of Earth around the Sun',
 'revolution of the Sun around Earth']
'Answer: revolution of Earth around the Sun'


#### Model generated chain of thought and extracted answer (correct)

In [25]:
# Show the first generated chain of thought of the same item and the first answer extracted from it
generated = worldtree_100_random["worldtree"]["train"][1]["generated_cot"][0]
pprint(generated["cot"])
pprint(generated["answers"][0]["answer"])

('A year is equal to one revolution of Earth around the Sun. One revolution of '
 'Earth around the Sun is equal to one year. So, the final answer is C.')
'C.'


## 3. Evaluate: Performance Evaluation of model outputs

In [26]:
# Load the collection that contains the model answers
collection = Collection.from_json("worldtree_100_generate.json")

In [27]:
# Before the evaluation step, the collection does not yet record whether an answer is correct.
answer_before_eval = collection["worldtree"]["train"][0]["generated_cot"][0]["answers"][0]
answer_before_eval["correct_answer"]

In [28]:
# Evaluate the accuracy of the model predictions
# (the arguments presumably name the dataset and the split to evaluate; verify against the cot library)
collection.evaluate("worldtree","train")

  0%|          | 0/100 [00:00<?, ?ex/s]

{'accuracy': {'qa-01_kojima-01_kojima-A-D': 0.86}}


In [29]:
# After evaluation, the collection records whether the answer is correct.
answer_after_eval = collection["worldtree"]["train"][0]["generated_cot"][0]["answers"][0]
answer_after_eval["correct_answer"]

True

In [30]:
# Save the evaluated collection as JSON in the current folder
collection.dump("worldtree_100_evaluate.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

## Summary

In [31]:
# Combination of the three steps (data, generate, evaluate) in one cell

# 1) Dataset loading and selecting a random sample.
#    random_samples and seed are passed explicitly so that the sample really is random and
#    reproducible, consistent with the 100-item selection earlier in this notebook.
collection = Collection(["worldtree"], verbose=False)
collection = collection.select(split="train", number_samples=10, random_samples=True, seed=0)


# 2) Language Model generates chains of thought and then extracts answers.
#    Note: this config calls the real "huggingface_hub" API service (no mock) and, with
#    "warn": False, does so without asking for confirmation first.
config={
    "instruction_keys": ['qa-01'], # "Answer the following question through step-by-step reasoning."
    "cot_trigger_keys": ['kojima-01'], # "Answer: Let's think step by step."
    "answer_extraction_keys": ['kojima-A-D'], # "Therefore, among A through D, the answer is"
    "api_service": "huggingface_hub",
    "engine": "google/flan-t5-xl",
    "warn": False,
    "verbose": False,
}
pprint(collection.generate(config=config))

# 3) Performance evaluation
pprint(collection.evaluate())

Loading worldtree...
{'accuracy': {'qa-01_kojima-01_kojima-A-D': 0.6}}
