In [1]:
from cot import Collection
from cot.generate import TEMPLATES
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


## Generate a random sample of the WorldTree dataset

In [2]:
# Load the datasets that will be sampled from (only "worldtree" here).
dataset_names = ["worldtree"]
collection = Collection(dataset_names, verbose=False)
# Printing the collection shows a table with the item count of every split.
print(collection)

Loading worldtree...
| Name      |   Train |   Valid |   Test |
|-----------|---------|---------|--------|
| worldtree |    2207 |     496 |   1664 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'open_book_qa', 'qed', 'strategy_qa', 'svamp']


In [3]:
# Alternatively, load a previously saved collection from an existing JSON file:
# collection = Collection.from_json("generation.json")

In [4]:
# Subset (split) to draw the samples from: train / validation / test.
subset_name = "train"

# How many random samples to draw from that subset of each dataset.
number_of_samples = 100

# Flag: when truthy, the sampling cell below seeds the random number generator
# so that the same samples are drawn on every run.
seed = True

In [5]:
import random
import copy

# Work on a deep copy so that the full `collection` loaded earlier stays intact.
random_sampled_collection = copy.deepcopy(collection)

# Each item yielded by the collection unpacks into a pair; only the second
# element (a dict-like mapping from subset name to its data) is needed here.
for _, dataset_dict in random_sampled_collection:
    # No additional copy of the subset is required: it already belongs to the
    # deep copy created above and is replaced by the selection further down.
    subset = dataset_dict[subset_name]
    # number of samples available in the subset
    samples_count = subset.num_rows
    # Never request more items than exist: random.sample() raises ValueError
    # when the requested sample is larger than the population.
    sample_size = min(number_of_samples, samples_count)
    # Re-seed for every dataset so that each draw is reproducible on its own.
    if seed:
        random.seed(0)
    random_ids = random.sample(range(samples_count), sample_size)
    # keep only the randomly chosen rows
    random_subset = subset.select(random_ids)
    # Remove all subsets of this dataset, then reinsert only the sampled one.
    dataset_dict.clear()
    dataset_dict[subset_name] = random_subset

# Persist the sampled collection to "random_sample.json".
random_sampled_collection.dump("random_sample.json")

In [6]:
# Display the sampled collection to check how many items each split now holds.
random_sampled_collection

| Name      |   Train | Valid   | Test   |
|-----------|---------|---------|--------|
| worldtree |     100 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'open_book_qa', 'qed', 'strategy_qa', 'svamp']

# CoT Generation

Print the available templates:

In [7]:
# Pretty-print the TEMPLATES mapping imported from cot.generate; its keys are
# the values that can be used in the generation config below.
pprint(TEMPLATES)

{'answer-extractions': {'kojima-01': 'Therefore, the answer is',
                        'kojima-02': 'Therefore,',
                        'kojima-03': 'The answer is',
                        'kojima-A-C': 'Therefore, among A through C, the '
                                      'answer is',
                        'kojima-A-D': 'Therefore, among A through D, the '
                                      'answer is',
                        'kojima-A-E': 'Therefore, among A through E, the '
                                      'answer is',
                        'kojima-A-F': 'Therefore, among A through F, the '
                                      'answer is',
                        'kojima-numerals': 'Therefore, the answer (arabic '
                                           'numerals) is',
                        'kojima-yes-no': 'Therefore, the answer (Yes or No) '
                                         'is'},
 'cot-triggers': {'kojima-01': "Answer: Let's think step by step.

In [8]:
# Reference of the available config options. This is an inert string that is
# never evaluated as a config; the live config is defined right after it.
"""
config={
    "idx_range":(0,3), # Determines which indices the generate_and_extract routine is applied to, Default: None (All items are used)
    "debug": True, # Determines whether an api is called or a mock is returned, used for debugging, Default: True (api is not used)
    "instruction_keys": ["qa-01"], # Determines which instructions are used from templates.json, Default: None (All used)
    "cot_trigger_keys": ["kojima-01"], # Determines which cot triggers are used from templates.json, Default: None (All are used)
    "answer_extraction_keys": ["kojima-01"], # Determines which answer extraction prompts are used from templates.json, Default: None (All are used)
    "author" : "Konstantin", # Name of the person responsible for generation, Default: ""
    "api_service": "openai", # Name of the API called ("openai", "huggingface_hub"), Default: "openai"
    "engine": "text-davinci-002", # Name of the engine used (for "huggingface_hub" use for example "google/flan-t5-xl"), Default: "text-davinci-002"
    "temperature": 0, # Sampling temperature used by the model (0 gives the least random output), Default: 0
    "max_tokens": 512, # Maximum length of output generated by the model, Default: 128
    "api_time_interval": 1.0, # Pause between two api calls in seconds, Default: 1.0
    "warn": False,
}
"""

# Config actually used for this run.
config={
    # True: a mock is returned instead of calling the API (see reference above).
    "debug": True,
    # A single None entry; presumably means "use no instruction template" while
    # still counting as one instruction key — TODO confirm against cot.generate.
    "instruction_keys": [None],
    "cot_trigger_keys": [
        'kojima-01'
    ],
    "answer_extraction_keys": [
        'kojima-01'
    ],
    "author" : "konstantin",
    "api_service": "openai",
    "engine": "text-davinci-002",
    "temperature": 0,
    "max_tokens": 512,
    "api_time_interval": 1.0,
    "verbose": False
}
# Run the generation for the sampled "worldtree" items. As the recorded output
# shows, the call first reports the planned number of API calls and asks for
# confirmation (y/n).
random_sampled_collection.generate(name="worldtree", config=config)

n_samples: 100, n_instruction_keys: 1, n_cot_trigger_keys: 1, n_answer_extraction_keys: 1
You are about to call the openai API which produces costs.
Due to your settings you are about to call the openai API in total 200 times.
Number API calls for CoT generation: n_samples * n_instruction_keys * n_cot_trigger_keys
Number API calls for answer extraction: n_samples * n_instruction_keys * n_cot_trigger_keys * n_answer_extraction_keys
Do you want to continue? y/n



### Save the generated CoTs and answer extractions

In [None]:
# Write the collection (after the generation step above) to "generation.json";
# this is the file name used by the commented-out Collection.from_json example.
random_sampled_collection.dump("generation.json")