In [2]:
from cot import Collection, stats
from pprint import pprint

# Random Sampling from Datasets

In [5]:
# Names of the datasets to load and later sample from.
dataset_names = ["worldtree"]
collection = Collection(dataset_names, verbose=False)

# Print the summary table (rows per train/valid/test split).
print(collection)

Loading worldtree...
| Name      |   Train |   Valid |   Test |
|-----------|---------|---------|--------|
| worldtree |    2207 |     496 |   1664 |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'open_book_qa', 'qed', 'strategy_qa', 'svamp']


In [4]:
# Alternatively, start from an existing JSON file and extend it:
# collection = Collection.from_json("generation.json")

No config specified, defaulting to: worldtree_dataset/worldtree_thoughtsource


In [6]:
# Sampling parameters read by the sampling cell below.

# Number of random samples to draw from each loaded dataset.
number_of_samples = 100

# Split to sample from (train/validation/test); every other split is dropped
# from the sampled collection.
subset_name = 'train'

# Reproducibility switch, not the seed value itself: when truthy, the sampling
# cell seeds the random number generator with 0; when falsy, sampling is unseeded.
seed = True

In [7]:
import random
import copy

# Work on a deep copy so the originally loaded `collection` is never modified.
random_sampled_collection = copy.deepcopy(collection)

for _, dataset_dict in random_sampled_collection:
    # No second deepcopy is needed here: dataset_dict already belongs to the
    # private copy created above.
    subset = dataset_dict[subset_name]
    samples_count = subset.num_rows

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the subset size (the whole subset is kept then).
    sample_size = min(number_of_samples, samples_count)

    # Use a private generator instead of reseeding the global `random` module,
    # which would silently affect any other code relying on it.
    # random.Random(0).sample(...) yields the same indices as
    # random.seed(0) followed by random.sample(...).
    # Creating it per dataset keeps each dataset's sample independent of
    # which other datasets happen to be loaded.
    rng = random.Random(0) if seed else random
    random_ids = rng.sample(range(samples_count), sample_size)

    # Pick the sampled rows out of the chosen subset.
    random_subset = subset.select(random_ids)

    # Keep only the sampled subset; all other splits are dropped.
    dataset_dict.clear()
    dataset_dict[subset_name] = random_subset
    

In [8]:
# Display the sampled collection's summary table (row counts per split).
random_sampled_collection

| Name      |   Train | Valid   | Test   |
|-----------|---------|---------|--------|
| worldtree |     100 | -       | -      |

Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'open_book_qa', 'qed', 'strategy_qa', 'svamp']

# Generate CoT

In [9]:
# Settings for the chain-of-thought generation run.
# Per the confirmation prompt printed by generate(), the number of API calls is
#   CoT generation:    n_samples * n_instruction_keys * n_cot_trigger_keys
#   answer extraction: the above * n_answer_extraction_keys
# i.e. 100 + 100 = 200 calls for the values below.
config = {
    # NOTE(review): the stored sample shows placeholder "test" text with debug
    # on; presumably no real API requests are sent in this mode. Confirm.
    "debug": True,
    "instruction_keys": [None],
    "cot_trigger_keys": ["kojima-01"],
    "answer_extraction_keys": ["kojima-01"],
    "author": "simon",
    "engine": "text-davinci-002",
    "temperature": 0,
    "max_tokens": 512,
    # presumably the pause between consecutive API calls, in seconds. TODO confirm
    "api_time_interval": 1.0,
    "verbose": False,
}

# Run generation for the "worldtree" dataset of the sampled collection.
random_sampled_collection.generate(name="worldtree", config=config)

You are about to call the openai API which produces costs.
Due to your settings you are about to call the openai API in total 200 times.
Number API calls for CoT generation: n_samples * n_instruction_keys * n_cot_trigger_keys
Number API calls for answer extraction: n_samples * n_instruction_keys * n_cot_trigger_keys * n_answer_extraction_keys
Do you want to continue? y/n



In [10]:
# Write the sampled collection to a file.
dump_path = "asdf.json"
random_sampled_collection.dump(dump_path)

In [11]:
# Inspect the first sampled training item, including its "generated_cot" entry.
worldtree_train = random_sampled_collection["worldtree"]["train"]
worldtree_train[0]

{'id': '1577',
 'question_id': '1577',
 'document_id': '1577',
 'question': 'A parent and a child share several characteristics. Both individuals are tall, have curly hair, are good cooks, and have freckles. Which of these characteristics is a learned behavior?',
 'type': 'multiplechoice',
 'cot_type': 'list',
 'choices': ['being tall',
  'having curly hair',
  'being a good cook',
  'having freckles'],
 'context': '',
 'cot': ['Skills are learned characteristics.',
  'A behavior is a kind of characteristic.',
  'Cooking is a kind of skill for preparing food.'],
 'answer': ['being a good cook'],
 'generated_cot': [{'annotation': [],
   'answers': [{'answer': 'test',
     'answer-extraction': 'kojima-01',
     'correct_answer': None}],
   'author': 'simon',
   'comment': '',
   'cot': 'test',
   'cot-trigger': 'kojima-01',
   'date': '2022/11/03 15:52:01',
   'instruction': None,
   'model': {'max_tokens': 512, 'name': 'text-davinci-002', 'temperature': 0},
   'templates_version': '0.01