In [1]:
import json
import time
import os
import numpy as np

from cortexsubsetloader import CortexSubsetLoader

In [2]:
past_datasets = []
data_loc = 'data'
datasets = os.listdir(data_loc)
most_recent = 0
for f in datasets:
    if 'cortex_' in f and '.json' in f:
        version = int(f.split('cortex_')[-1].split('_')[0].replace('e', ''))
        if version > most_recent:
            most_recent = version
        past_datasets.append(f'{data_loc}/{f}')

def load_old_data(past_datasets):
    global most_recent, agg_old_data
    agg_old_data = []
    for dataset in past_datasets:
        with open(dataset, 'r') as f:
            agg_old_data = agg_old_data + json.load(f)
    return agg_old_data

agg_old_data = [] # load_old_data(past_datasets)
saveable_dataset = []
version_num = most_recent + 1

def save_dataset(version, data, reload_agg=True):
    global past_datasets, agg_old_data
    save_len = (len(data))
    with open(f'data/cortex_{version}_{save_len}.json', 'w') as f:
        json.dump(data[-save_len:], f)
    past_datasets.append(f'data/cortex_{version}_{save_len}.json')
    if reload_agg: agg_old_data = load_old_data(past_datasets)
    print(f'Saved {save_len} samples to data/cortex_{version}_{save_len}.json')


print(f"found {len(past_datasets)} datasets, comprised of {len(agg_old_data)} samples")
print(f"most recent dataset is {most_recent}")

found 8 datasets, comprised of 0 samples
most recent dataset is 686


In [3]:
def make_new_eval_set(size=512):
    global version_num
    eval_set = []
    cortex_subset_loader = CortexSubsetLoader(latest=True, random_seed = None, max_samples=2560, steps=1, progress=True, running=True,
                    retry_limit=5, page_size=400, retry_delay=5)
    for prompt, response in cortex_subset_loader.buffer:
            new = {"instruction": prompt, "response": response}
            if new not in saveable_dataset and new not in agg_old_data and new not in eval_set:
                eval_set.append(new)
    eval_set = np.array(eval_set)[np.random.permutation(len(eval_set))].tolist()

    if len(eval_set) > size:
         save_dataset('e'+str(version_num), eval_set[:size])
         version_num += 1
    else:
        print(f"only found {len(eval_set)} new samples, not saving")


def create_new_datasets(num_new=1, size=2048*4, sleep_minutes=4, max_samples_per_step=1024, ignore_old=False, steps=1):
    global version_num, saveable_dataset
    datasets_so_far = 0
    while datasets_so_far < num_new:
        cortex_subset_loader = CortexSubsetLoader(latest=True, random_seed = None, max_samples=max_samples_per_step, steps=steps, 
                                        progress=True, running=True, retry_limit=5, page_size=400, retry_delay=5)
        print("samples collected:",len(cortex_subset_loader.buffer))
        current_size = len(saveable_dataset)
        for prompt, response in cortex_subset_loader.buffer:
            new = {"instruction": prompt, "response": response}
            if new not in saveable_dataset:
                if ignore_old or new not in agg_old_data:
                    saveable_dataset.append(new)
        samples_added = len(saveable_dataset) - current_size
        print("samples collected:",len(cortex_subset_loader.buffer), "\tdeduped samples added:", samples_added)
        saveable_dataset = np.array(saveable_dataset)
        sp = np.random.permutation(len(saveable_dataset))
        saveable_dataset = saveable_dataset[sp].tolist()
        print(len(saveable_dataset))

        if len(saveable_dataset) >= size:
            save_dataset(version_num, saveable_dataset[:size], reload_agg=not ignore_old)
            version_num += 1
            saveable_dataset = saveable_dataset[size:]
            datasets_so_far += 1

        if datasets_so_far < num_new:
            time.sleep(sleep_minutes * 60)

In [4]:
data = []
CortexSubsetLoader(latest=True, random_seed = None, max_samples=512, steps=1, 
                                        progress=True, running=True, retry_limit=5, page_size=400, retry_delay=5, ignore_list=data)

                                                   

<cortexsubsetloader.CortexSubsetLoader at 0x73493feab9a0>

In [6]:
make_new_eval_set(size=512)

                                                    

Did not collect 2560, only got 2318
Saved 512 samples to data/cortex_e687_512.json


In [22]:
create_new_datasets(num_new=48, size=8192+512, max_samples_per_step=4096, ignore_old=True, sleep_minutes=5, steps=1)

                                                    

Did not collect 4096, only got 2036
samples collected: 2036
samples collected: 2036 	deduped samples added: 2036
2036


KeyboardInterrupt: 

In [7]:
create_new_datasets(num_new=48, size=8144+512, max_samples_per_step=3072, ignore_old=True, sleep_minutes=5)

                                                    

Did not collect 3072, only got 2376
samples collected: 2376
samples collected: 2376 	deduped samples added: 160
2591


                                                    

Did not collect 3072, only got 2205
samples collected: 2205
samples collected: 2205 	deduped samples added: 425
3016


                                                    

Did not collect 3072, only got 2213
samples collected: 2213
samples collected: 2213 	deduped samples added: 219
3235


                                                    

Did not collect 3072, only got 2170
samples collected: 2170
samples collected: 2170 	deduped samples added: 176
3411


                                                    

Did not collect 3072, only got 2189
samples collected: 2189
samples collected: 2189 	deduped samples added: 195
3606


                                                    

Did not collect 3072, only got 1994
samples collected: 1994
samples collected: 1994 	deduped samples added: 0
3606


                                                    

Did not collect 3072, only got 1979
samples collected: 1979
samples collected: 1979 	deduped samples added: 218
3824


                                                    

Did not collect 3072, only got 2188
samples collected: 2188
samples collected: 2188 	deduped samples added: 424
4248


                                                    

Did not collect 3072, only got 2166
samples collected: 2166
samples collected: 2166 	deduped samples added: 619
4867


                                                    

Did not collect 3072, only got 2218
samples collected: 2218
samples collected: 2218 	deduped samples added: 671
5538


                                                    

Did not collect 3072, only got 2003
samples collected: 2003
samples collected: 2003 	deduped samples added: 455
5993


                                                    

Did not collect 3072, only got 2458
samples collected: 2458
samples collected: 2458 	deduped samples added: 686
6679


                                                    

Did not collect 3072, only got 2015
samples collected: 2015
samples collected: 2015 	deduped samples added: 232
6911


                                                    

Did not collect 3072, only got 2232
samples collected: 2232
samples collected: 2232 	deduped samples added: 446
7357


                                                    

Did not collect 3072, only got 2237
samples collected: 2237
samples collected: 2237 	deduped samples added: 664
8021


                                                    

Did not collect 3072, only got 2241
samples collected: 2241
samples collected: 2241 	deduped samples added: 666
8687
Saved 8656 samples to data/cortex_587_8656.json


                                                    

Did not collect 3072, only got 2216
samples collected: 2216
samples collected: 2216 	deduped samples added: 2212
2243


                                                    

Did not collect 3072, only got 2013
samples collected: 2013
samples collected: 2013 	deduped samples added: 222
2465


                                                    

Did not collect 3072, only got 2013
samples collected: 2013
samples collected: 2013 	deduped samples added: 0
2465


                                                    

Did not collect 3072, only got 2211
samples collected: 2211
samples collected: 2211 	deduped samples added: 198
2663


                                                    

Did not collect 3072, only got 2211
samples collected: 2211
samples collected: 2211 	deduped samples added: 195
2858


                                                    

Did not collect 3072, only got 2013
samples collected: 2013
samples collected: 2013 	deduped samples added: 0
2858


                                                    

Did not collect 3072, only got 2447
samples collected: 2447
samples collected: 2447 	deduped samples added: 434
3292


                                                    

Did not collect 3072, only got 1973
samples collected: 1973
samples collected: 1973 	deduped samples added: 183
3475


                                                    

Did not collect 3072, only got 2189
samples collected: 2189
samples collected: 2189 	deduped samples added: 399
3874


                                                    

Did not collect 3072, only got 2165
samples collected: 2165
samples collected: 2165 	deduped samples added: 372
4246


                                                    

Did not collect 3072, only got 2219
samples collected: 2219
samples collected: 2219 	deduped samples added: 427
4673


                                                    

Did not collect 3072, only got 2387
samples collected: 2387
samples collected: 2387 	deduped samples added: 596
5269


                                                    

Did not collect 3072, only got 2609
samples collected: 2609
samples collected: 2609 	deduped samples added: 609
5878


                                                    

Did not collect 3072, only got 2199
samples collected: 2199
samples collected: 2199 	deduped samples added: 417
6295


                                                    

Did not collect 3072, only got 2396
samples collected: 2396
samples collected: 2396 	deduped samples added: 612
6907


                                                    

Did not collect 3072, only got 2407
samples collected: 2407
samples collected: 2407 	deduped samples added: 624
7531


                                                    

Did not collect 3072, only got 2411
samples collected: 2411
samples collected: 2411 	deduped samples added: 606
8137


                                                    

Did not collect 3072, only got 2570
samples collected: 2570
samples collected: 2570 	deduped samples added: 590
8727
Saved 8656 samples to data/cortex_588_8656.json


                                                    

Did not collect 3072, only got 2412
samples collected: 2412
samples collected: 2412 	deduped samples added: 2406
2477


                                                    

Did not collect 3072, only got 2406
samples collected: 2406
samples collected: 2406 	deduped samples added: 203
2680


                                                    

Did not collect 3072, only got 2414
samples collected: 2414
samples collected: 2414 	deduped samples added: 211
2891


                                                    

Did not collect 3072, only got 2403
samples collected: 2403
samples collected: 2403 	deduped samples added: 197
3088


                                                    

Did not collect 3072, only got 2421
samples collected: 2421
samples collected: 2421 	deduped samples added: 216
3304


                                                    

Did not collect 3072, only got 2819
samples collected: 2819
samples collected: 2819 	deduped samples added: 616
3920


                                                    

Did not collect 3072, only got 1999
samples collected: 1999
samples collected: 1999 	deduped samples added: 214
4134


                                                    

Did not collect 3072, only got 2444
samples collected: 2444
samples collected: 2444 	deduped samples added: 659
4793


                                                    

Did not collect 3072, only got 2209
samples collected: 2209
samples collected: 2209 	deduped samples added: 422
5215


                                                    

Did not collect 3072, only got 2402
samples collected: 2402
samples collected: 2402 	deduped samples added: 616
5831


                                                    

Did not collect 3072, only got 2833
samples collected: 2833
samples collected: 2833 	deduped samples added: 849
6680


                                                    

Did not collect 3072, only got 2599
samples collected: 2599
samples collected: 2599 	deduped samples added: 619
7299


                                                    

Did not collect 3072, only got 2416
samples collected: 2416
samples collected: 2416 	deduped samples added: 665
7964


                                                    

Failed to fetch data, retrying. Attempt 1/5


                                                    

Did not collect 3072, only got 2175
samples collected: 2175
samples collected: 2175 	deduped samples added: 424
8388


                                                    

Did not collect 3072, only got 2388
samples collected: 2388
samples collected: 2388 	deduped samples added: 647
9035
Saved 8656 samples to data/cortex_589_8656.json


                                                    

Did not collect 3072, only got 2402
samples collected: 2402
samples collected: 2402 	deduped samples added: 2350
2729


                                                    

Did not collect 3072, only got 2170
samples collected: 2170
samples collected: 2170 	deduped samples added: 804
3533


                                                    

Did not collect 3072, only got 2331
samples collected: 2331
samples collected: 2331 	deduped samples added: 988
4521


                                                    

Did not collect 3072, only got 1954
samples collected: 1954
samples collected: 1954 	deduped samples added: 610
5131


                                                    

Did not collect 3072, only got 2372
samples collected: 2372
samples collected: 2372 	deduped samples added: 1016
6147


                                                    

Did not collect 3072, only got 2564
samples collected: 2564
samples collected: 2564 	deduped samples added: 1006
7153


                                                    

Did not collect 3072, only got 2611
samples collected: 2611
samples collected: 2611 	deduped samples added: 859
8012


                                                    

Did not collect 3072, only got 2389
samples collected: 2389
samples collected: 2389 	deduped samples added: 612
8624


                                                    

Did not collect 3072, only got 2191
samples collected: 2191
samples collected: 2191 	deduped samples added: 657
9281
Saved 8656 samples to data/cortex_590_8656.json


                                                    

Did not collect 3072, only got 2190
samples collected: 2190
samples collected: 2190 	deduped samples added: 2086
2711


                                                    

Did not collect 3072, only got 2402
samples collected: 2402
samples collected: 2402 	deduped samples added: 866
3577


                                                    

Did not collect 3072, only got 2167
samples collected: 2167
samples collected: 2167 	deduped samples added: 633
4210


                                                    

Did not collect 3072, only got 1962
samples collected: 1962
samples collected: 1962 	deduped samples added: 428
4638


                                                    

Did not collect 3072, only got 1950
samples collected: 1950
samples collected: 1950 	deduped samples added: 413
5051


                                                    

Did not collect 3072, only got 1534
samples collected: 1534
samples collected: 1534 	deduped samples added: 0
5051


                                                    

Did not collect 3072, only got 1756
samples collected: 1756
samples collected: 1756 	deduped samples added: 222
5273


                                                    

Did not collect 3072, only got 1925
samples collected: 1925
samples collected: 1925 	deduped samples added: 387
5660


                                                    

Did not collect 3072, only got 1746
samples collected: 1746
samples collected: 1746 	deduped samples added: 212
5872


                                                    

Did not collect 3072, only got 1956
samples collected: 1956
samples collected: 1956 	deduped samples added: 422
6294


                                                    

Did not collect 3072, only got 1958
samples collected: 1958
samples collected: 1958 	deduped samples added: 422
6716


                                                    

Did not collect 3072, only got 2159
samples collected: 2159
samples collected: 2159 	deduped samples added: 622
7338


                                                    

Did not collect 3072, only got 1973
samples collected: 1973
samples collected: 1973 	deduped samples added: 221
7559


                                                    

Did not collect 3072, only got 2163
samples collected: 2163
samples collected: 2163 	deduped samples added: 410
7969


                                                    

Did not collect 3072, only got 1972
samples collected: 1972
samples collected: 1972 	deduped samples added: 217
8186


                                                    

Did not collect 3072, only got 2183
samples collected: 2183
samples collected: 2183 	deduped samples added: 430
8616


                                                    

Did not collect 3072, only got 2175
samples collected: 2175
samples collected: 2175 	deduped samples added: 422
9038
Saved 8656 samples to data/cortex_591_8656.json


                                                    

Did not collect 3072, only got 1752
samples collected: 1752
samples collected: 1752 	deduped samples added: 1674
2056


                                                    

Did not collect 3072, only got 2160
samples collected: 2160
samples collected: 2160 	deduped samples added: 408
2464


                                                    

Did not collect 3072, only got 2158
samples collected: 2158
samples collected: 2158 	deduped samples added: 402
2866


                                                    

Did not collect 3072, only got 2163
samples collected: 2163
samples collected: 2163 	deduped samples added: 212
3078


                                                    

Did not collect 3072, only got 2124
samples collected: 2124
samples collected: 2124 	deduped samples added: 370
3448


                                                    

Did not collect 3072, only got 2185
samples collected: 2185
samples collected: 2185 	deduped samples added: 432
3880


                                                    

Did not collect 3072, only got 1752
samples collected: 1752
samples collected: 1752 	deduped samples added: 0
3880


                                                    

Did not collect 3072, only got 2155
samples collected: 2155
samples collected: 2155 	deduped samples added: 400
4280


                                                    

Did not collect 3072, only got 1968
samples collected: 1968
samples collected: 1968 	deduped samples added: 0
4280


                                                    

Did not collect 3072, only got 2394
samples collected: 2394
samples collected: 2394 	deduped samples added: 423
4703


                                                    

Did not collect 3072, only got 2392
samples collected: 2392
samples collected: 2392 	deduped samples added: 422
5125


                                                    

Did not collect 3072, only got 2375
samples collected: 2375
samples collected: 2375 	deduped samples added: 405
5530


                                                    

Did not collect 3072, only got 2380
samples collected: 2380
samples collected: 2380 	deduped samples added: 410
5940


                                                    

Did not collect 3072, only got 2383
samples collected: 2383
samples collected: 2383 	deduped samples added: 411
6351


                                                    

Did not collect 3072, only got 2601
samples collected: 2601
samples collected: 2601 	deduped samples added: 630
6981


                                                    

Did not collect 3072, only got 2398
samples collected: 2398
samples collected: 2398 	deduped samples added: 642
7623


                                                    

Did not collect 3072, only got 2368
samples collected: 2368
samples collected: 2368 	deduped samples added: 615
8238


                                                    

Did not collect 3072, only got 2589
samples collected: 2589
samples collected: 2589 	deduped samples added: 628
8866
Saved 8656 samples to data/cortex_592_8656.json


                                                    

Did not collect 3072, only got 2386
samples collected: 2386
samples collected: 2386 	deduped samples added: 2335
2545


                                                    

Did not collect 3072, only got 2369
samples collected: 2369
samples collected: 2369 	deduped samples added: 405
2950


                                                    

Did not collect 3072, only got 2353
samples collected: 2353
samples collected: 2353 	deduped samples added: 386
3336


                                                    

Did not collect 3072, only got 2346
samples collected: 2346
samples collected: 2346 	deduped samples added: 172
3508


                                                    

Did not collect 3072, only got 2291
samples collected: 2291
samples collected: 2291 	deduped samples added: 113
3621


                                                    

Did not collect 3072, only got 2385
samples collected: 2385
samples collected: 2385 	deduped samples added: 211
3832


                                                    

Did not collect 3072, only got 2404
samples collected: 2404
samples collected: 2404 	deduped samples added: 230
4062


                                                    

Did not collect 3072, only got 2584
samples collected: 2584
samples collected: 2584 	deduped samples added: 407
4469


                                                    

Did not collect 3072, only got 2387
samples collected: 2387
samples collected: 2387 	deduped samples added: 213
4682


                                                    

Did not collect 3072, only got 2371
samples collected: 2371
samples collected: 2371 	deduped samples added: 404
5086


                                                    

Did not collect 3072, only got 2373
samples collected: 2373
samples collected: 2373 	deduped samples added: 409
5495


                                                    

Did not collect 3072, only got 2380
samples collected: 2380
samples collected: 2380 	deduped samples added: 416
5911


                                                    

Did not collect 3072, only got 2174
samples collected: 2174
samples collected: 2174 	deduped samples added: 209
6120


                                                    

Did not collect 3072, only got 2187
samples collected: 2187
samples collected: 2187 	deduped samples added: 218
6338


                                                    

Did not collect 3072, only got 2356
samples collected: 2356
samples collected: 2356 	deduped samples added: 387
6725


                                                    

Did not collect 3072, only got 2594
samples collected: 2594
samples collected: 2594 	deduped samples added: 629
7354


                                                    

Did not collect 3072, only got 2569
samples collected: 2569
samples collected: 2569 	deduped samples added: 397
7751


                                                    

Did not collect 3072, only got 2170
samples collected: 2170
samples collected: 2170 	deduped samples added: 0
7751


                                                    

Did not collect 3072, only got 2524
samples collected: 2524
samples collected: 2524 	deduped samples added: 354
8105


                                                    

Did not collect 3072, only got 2806
samples collected: 2806
samples collected: 2806 	deduped samples added: 635
8740
Saved 8656 samples to data/cortex_593_8656.json


                                                    

Did not collect 3072, only got 2774
samples collected: 2774
samples collected: 2774 	deduped samples added: 2747
2831


                                                    

Did not collect 3072, only got 2790
samples collected: 2790
samples collected: 2790 	deduped samples added: 620
3451


                                                    

Did not collect 3072, only got 2597
samples collected: 2597
samples collected: 2597 	deduped samples added: 220
3671


                                                    

Did not collect 3072, only got 3012
samples collected: 3012
samples collected: 3012 	deduped samples added: 628
4299


                                                    

Did not collect 3072, only got 2583
samples collected: 2583
samples collected: 2583 	deduped samples added: 206
4505


                                                    

Did not collect 3072, only got 2602
samples collected: 2602
samples collected: 2602 	deduped samples added: 224
4729


                                                    

Did not collect 3072, only got 2377
samples collected: 2377
samples collected: 2377 	deduped samples added: 0
4729


                                                    

Did not collect 3072, only got 2575
samples collected: 2575
samples collected: 2575 	deduped samples added: 198
4927


                                                    

Did not collect 3072, only got 2772
samples collected: 2772
samples collected: 2772 	deduped samples added: 391
5318


                                                    

Did not collect 3072, only got 2801
samples collected: 2801
samples collected: 2801 	deduped samples added: 202
5520


                                                    

Did not collect 3072, only got 3026
samples collected: 3026
samples collected: 3026 	deduped samples added: 428
5948


                                                    

Did not collect 3072, only got 2994
samples collected: 2994
samples collected: 2994 	deduped samples added: 179
6127


                                                    

Did not collect 3072, only got 3018
samples collected: 3018
samples collected: 3018 	deduped samples added: 205
6332


                                                    

Did not collect 3072, only got 2998
samples collected: 2998
samples collected: 2998 	deduped samples added: 184
6516


                                                    

Did not collect 3072, only got 3028
samples collected: 3028
samples collected: 3028 	deduped samples added: 68
6584


                                                    

Did not collect 3072, only got 2987
samples collected: 2987
samples collected: 2987 	deduped samples added: 174
6758


                                                    

Did not collect 3072, only got 3020
samples collected: 3020
samples collected: 3020 	deduped samples added: 206
6964


                                                    

Did not collect 3072, only got 3028
samples collected: 3028
samples collected: 3028 	deduped samples added: 0
6964


                                                    

Did not collect 3072, only got 3030
samples collected: 3030
samples collected: 3030 	deduped samples added: 218
7182


                                                    

Did not collect 3072, only got 2811
samples collected: 2811
samples collected: 2811 	deduped samples added: 0
7182


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 362
7544


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 408
7952


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 370
8322


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 508
8830
Saved 8656 samples to data/cortex_594_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3022
3196


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1323
4519


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 670
5189


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 426
5615


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 404
6019


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 432
6451


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
6451


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 207
6658


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 639
7297


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 302
7599


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 390
7989


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 635
8624


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 647
9271
Saved 8656 samples to data/cortex_595_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2901
3516


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 984
4500


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 717
5217


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
5217


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 190
5407


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 232
5639


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 196
5835


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 70
5905


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 77
5982


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 362
6344


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 393
6737


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 436
7173


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 62
7235


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 210
7445


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 397
7842


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 385
8227


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 216
8443


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 199
8642


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 194
8836
Saved 8656 samples to data/cortex_596_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3004
3184


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1738
4922


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 742
5664


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
5664


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 223
5887


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 354
6241


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 217
6458


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
6458


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 169
6627


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 225
6852


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 213
7065


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
7065


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 309
7374


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 611
7985


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 357
8342


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 191
8533


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 334
8867
Saved 8656 samples to data/cortex_597_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2989
3200


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1670
4870


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 831
5701


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 202
5903


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
5903


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 211
6114


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
6114


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 438
6552


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 394
6946


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 212
7158


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 216
7374


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 300
7674


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 217
7891


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 220
8111


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 232
8343


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 660
9003
Saved 8656 samples to data/cortex_598_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2967
3314


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1879
5193


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1022
6215


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1024
7239


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 656
7895


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 195
8090


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 436
8526


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
8526


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 208
8734
Saved 8656 samples to data/cortex_599_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3041
3119


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1616
4735


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 968
5703


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 282
5985


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 511
6496


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 231
6727


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 427
7154


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 650
7804


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 211
8015


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
8015


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 230
8245


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 69
8314


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 213
8527


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
8527


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 425
8952
Saved 8656 samples to data/cortex_600_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2967
3263


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1646
4909


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1236
6145


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 211
6356


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1003
7359


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 406
7765


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 200
7965


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 173
8138


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 187
8325


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 172
8497


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 0
8497


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 423
8920
Saved 8656 samples to data/cortex_601_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2982
3246


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1779
5025


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1951
6976


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 508
7484


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 794
8278


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 401
8679
Saved 8656 samples to data/cortex_602_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3055
3078


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1739
4817


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1888
6705


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 853
7558


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 206
7764


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 644
8408


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 414
8822
Saved 8656 samples to data/cortex_603_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3023
3189


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1895
5084


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1240
6324


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1021
7345


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 642
7987


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 224
8211


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 426
8637


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 185
8822
Saved 8656 samples to data/cortex_604_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3008
3174


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2097
5271


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1707
6978


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 229
7207


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1473
8680
Saved 8656 samples to data/cortex_605_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 3057
3081


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2216
5297


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 678
5975


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1083
7058


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 1221
8279


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 826
9105
Saved 8656 samples to data/cortex_606_8656.json


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2941
3390


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2087
5477


                                                    

samples collected: 3072
samples collected: 3072 	deduped samples added: 2015
7492


                                                    

Did not collect 3072, only got 2443
samples collected: 2443
samples collected: 2443 	deduped samples added: 1069
8561


                                                    

Did not collect 3072, only got 2421
samples collected: 2421
samples collected: 2421 	deduped samples added: 649
9210
Saved 8656 samples to data/cortex_607_8656.json


                                                    

Did not collect 3072, only got 2201
samples collected: 2201
samples collected: 2201 	deduped samples added: 2061
2615


                                                    

Did not collect 3072, only got 2405
samples collected: 2405
samples collected: 2405 	deduped samples added: 415
3030


                                                    

Did not collect 3072, only got 2220
samples collected: 2220
samples collected: 2220 	deduped samples added: 232
3262


                                                    

Did not collect 3072, only got 2215
samples collected: 2215
samples collected: 2215 	deduped samples added: 227
3489


                                                    

Did not collect 3072, only got 2399
samples collected: 2399
samples collected: 2399 	deduped samples added: 409
3898


                                                    

Did not collect 3072, only got 2209
samples collected: 2209
samples collected: 2209 	deduped samples added: 445
4343


                                                    

Did not collect 3072, only got 2165
samples collected: 2165
samples collected: 2165 	deduped samples added: 401
4744


                                                    

Did not collect 3072, only got 2619
samples collected: 2619
samples collected: 2619 	deduped samples added: 652
5396


                                                    

Did not collect 3072, only got 2418
samples collected: 2418
samples collected: 2418 	deduped samples added: 455
5851


                                                    

Did not collect 3072, only got 1976
samples collected: 1976
samples collected: 1976 	deduped samples added: 456
6307


                                                    

Did not collect 3072, only got 1973
samples collected: 1973
samples collected: 1973 	deduped samples added: 225
6532


                                                    

Did not collect 3072, only got 2206
samples collected: 2206
samples collected: 2206 	deduped samples added: 449
6981


                                                    

Did not collect 3072, only got 2117
samples collected: 2117
samples collected: 2117 	deduped samples added: 369
7350


                                                    

Did not collect 3072, only got 1958
samples collected: 1958
samples collected: 1958 	deduped samples added: 210
7560


                                                    

Did not collect 3072, only got 2202
samples collected: 2202
samples collected: 2202 	deduped samples added: 453
8013


                                                    

Did not collect 3072, only got 2408
samples collected: 2408
samples collected: 2408 	deduped samples added: 657
8670
Saved 8656 samples to data/cortex_608_8656.json


                                                    

Did not collect 3072, only got 2155
samples collected: 2155
samples collected: 2155 	deduped samples added: 2155
2169


                                                    

Did not collect 3072, only got 2206
samples collected: 2206
samples collected: 2206 	deduped samples added: 661
2830


                                                    

Did not collect 3072, only got 1989
samples collected: 1989
samples collected: 1989 	deduped samples added: 446
3276


                                                    

Did not collect 3072, only got 1996
samples collected: 1996
samples collected: 1996 	deduped samples added: 453
3729


                                                    

Did not collect 3072, only got 1767
samples collected: 1767
samples collected: 1767 	deduped samples added: 0
3729


                                                    

Did not collect 3072, only got 2206
samples collected: 2206
samples collected: 2206 	deduped samples added: 437
4166


                                                    

Did not collect 3072, only got 2173
samples collected: 2173
samples collected: 2173 	deduped samples added: 403
4569


                                                    

Did not collect 3072, only got 2198
samples collected: 2198
samples collected: 2198 	deduped samples added: 198
4767


                                                    

Did not collect 3072, only got 2197
samples collected: 2197
samples collected: 2197 	deduped samples added: 194
4961


                                                    

Did not collect 3072, only got 2188
samples collected: 2188
samples collected: 2188 	deduped samples added: 189
5150


                                                    

Did not collect 3072, only got 2667
samples collected: 2667
samples collected: 2667 	deduped samples added: 667
5817


                                                    

Did not collect 3072, only got 2624
samples collected: 2624
samples collected: 2624 	deduped samples added: 622
6439


                                                    

Did not collect 3072, only got 1977
samples collected: 1977
samples collected: 1977 	deduped samples added: 445
6884


                                                    

Did not collect 3072, only got 2175
samples collected: 2175
samples collected: 2175 	deduped samples added: 642
7526


                                                    

Did not collect 3072, only got 2175
samples collected: 2175
samples collected: 2175 	deduped samples added: 652
8178


                                                    

Did not collect 3072, only got 2169
samples collected: 2169
samples collected: 2169 	deduped samples added: 647
8825
Saved 8656 samples to data/cortex_609_8656.json


                                                    

Did not collect 3072, only got 1907
samples collected: 1907
samples collected: 1907 	deduped samples added: 1878
2047


                                                    

Did not collect 3072, only got 1956
samples collected: 1956
samples collected: 1956 	deduped samples added: 435
2482


                                                    

Did not collect 3072, only got 1744
samples collected: 1744
samples collected: 1744 	deduped samples added: 0
2482


                                                    

Did not collect 3072, only got 1911
samples collected: 1911
samples collected: 1911 	deduped samples added: 166
2648


                                                    

Did not collect 3072, only got 1944
samples collected: 1944
samples collected: 1944 	deduped samples added: 198
2846


                                                    

Did not collect 3072, only got 1953
samples collected: 1953
samples collected: 1953 	deduped samples added: 209
3055


                                                    

Did not collect 3072, only got 2194
samples collected: 2194
samples collected: 2194 	deduped samples added: 450
3505


                                                    

Did not collect 3072, only got 2392
samples collected: 2392
samples collected: 2392 	deduped samples added: 647
4152


                                                    

Did not collect 3072, only got 2612
samples collected: 2612
samples collected: 2612 	deduped samples added: 865
5017


                                                    

Did not collect 3072, only got 2877
samples collected: 2877
samples collected: 2877 	deduped samples added: 906
5923


                                                    

Did not collect 3072, only got 2657
samples collected: 2657
samples collected: 2657 	deduped samples added: 676
6599


                                                    

Did not collect 3072, only got 2424
samples collected: 2424
samples collected: 2424 	deduped samples added: 206
6805


                                                    

Did not collect 3072, only got 2429
samples collected: 2429
samples collected: 2429 	deduped samples added: 214
7019


                                                    

Did not collect 3072, only got 2441
samples collected: 2441
samples collected: 2441 	deduped samples added: 226
7245


                                                    

Did not collect 3072, only got 2448
samples collected: 2448
samples collected: 2448 	deduped samples added: 233
7478


                                                    

Did not collect 3072, only got 2651
samples collected: 2651
samples collected: 2651 	deduped samples added: 431
7909


                                                    

Did not collect 3072, only got 2448
samples collected: 2448
samples collected: 2448 	deduped samples added: 0
7909


                                                    

Did not collect 3072, only got 2881
samples collected: 2881
samples collected: 2881 	deduped samples added: 661
8570


                                                    

Did not collect 3072, only got 2404
samples collected: 2404
samples collected: 2404 	deduped samples added: 408
8978
Saved 8656 samples to data/cortex_610_8656.json


                                                    

Did not collect 3072, only got 2436
samples collected: 2436
samples collected: 2436 	deduped samples added: 2357
2679


                                                    

Did not collect 3072, only got 2828
samples collected: 2828
samples collected: 2828 	deduped samples added: 392
3071


                                                    

Did not collect 3072, only got 2628
samples collected: 2628
samples collected: 2628 	deduped samples added: 424
3495


                                                    

Did not collect 3072, only got 2642
samples collected: 2642
samples collected: 2642 	deduped samples added: 436
3931


                                                    

Did not collect 3072, only got 2388
samples collected: 2388
samples collected: 2388 	deduped samples added: 183
4114


                                                    

Did not collect 3072, only got 2437
samples collected: 2437
samples collected: 2437 	deduped samples added: 233
4347


                                                    

Did not collect 3072, only got 2204
samples collected: 2204
samples collected: 2204 	deduped samples added: 0
4347


                                                    

Did not collect 3072, only got 2609
samples collected: 2609
samples collected: 2609 	deduped samples added: 404
4751


                                                    

Did not collect 3072, only got 2618
samples collected: 2618
samples collected: 2618 	deduped samples added: 411
5162


                                                    

Did not collect 3072, only got 2635
samples collected: 2635
samples collected: 2635 	deduped samples added: 431
5593


                                                    

Did not collect 3072, only got 2601
samples collected: 2601
samples collected: 2601 	deduped samples added: 622
6215


                                                    

Did not collect 3072, only got 2414
samples collected: 2414
samples collected: 2414 	deduped samples added: 436
6651


                                                    

Did not collect 3072, only got 2393
samples collected: 2393
samples collected: 2393 	deduped samples added: 202
6853


                                                    

Did not collect 3072, only got 2632
samples collected: 2632
samples collected: 2632 	deduped samples added: 440
7293


                                                    

Did not collect 3072, only got 2398
samples collected: 2398
samples collected: 2398 	deduped samples added: 207
7500


                                                    

Did not collect 3072, only got 2626
samples collected: 2626
samples collected: 2626 	deduped samples added: 226
7726


                                                    

Did not collect 3072, only got 2398
samples collected: 2398
samples collected: 2398 	deduped samples added: 0
7726


                                                    

Did not collect 3072, only got 2615
samples collected: 2615
samples collected: 2615 	deduped samples added: 214
7940


                                                    

Did not collect 3072, only got 2575
samples collected: 2575
samples collected: 2575 	deduped samples added: 408
8348


                                                    

Did not collect 3072, only got 2373
samples collected: 2373
samples collected: 2373 	deduped samples added: 207
8555


                                                    

Did not collect 3072, only got 2842
samples collected: 2842
samples collected: 2842 	deduped samples added: 675
9230
Saved 8656 samples to data/cortex_611_8656.json


                                                    

Did not collect 3072, only got 2586
samples collected: 2586
samples collected: 2586 	deduped samples added: 2465
3039


                                                    

Did not collect 3072, only got 2586
samples collected: 2586
samples collected: 2586 	deduped samples added: 633
3672


                                                    

Did not collect 3072, only got 2134
samples collected: 2134
samples collected: 2134 	deduped samples added: 418
4090


                                                    

Did not collect 3072, only got 2398
samples collected: 2398
samples collected: 2398 	deduped samples added: 665
4755


                                                    

Did not collect 3072, only got 2349
samples collected: 2349
samples collected: 2349 	deduped samples added: 613
5368


                                                    

Did not collect 3072, only got 2391
samples collected: 2391
samples collected: 2391 	deduped samples added: 657
6025


                                                    

Did not collect 3072, only got 2603
samples collected: 2603
samples collected: 2603 	deduped samples added: 644
6669


                                                    

Did not collect 3072, only got 2619
samples collected: 2619
samples collected: 2619 	deduped samples added: 445
7114


                                                    

Did not collect 3072, only got 2615
samples collected: 2615
samples collected: 2615 	deduped samples added: 211
7325


                                                    

Did not collect 3072, only got 2850
samples collected: 2850
samples collected: 2850 	deduped samples added: 448
7773


                                                    

Did not collect 3072, only got 2427
samples collected: 2427
samples collected: 2427 	deduped samples added: 234
8007


                                                    

Did not collect 3072, only got 2853
samples collected: 2853
samples collected: 2853 	deduped samples added: 426
8433


                                                    

Did not collect 3072, only got 2396
samples collected: 2396
samples collected: 2396 	deduped samples added: 429
8862
Saved 8656 samples to data/cortex_612_8656.json


                                                    

Did not collect 3072, only got 2620
samples collected: 2620
samples collected: 2620 	deduped samples added: 2571
2777


                                                    

Did not collect 3072, only got 2645
samples collected: 2645
samples collected: 2645 	deduped samples added: 893
3670


                                                    

Did not collect 3072, only got 2198
samples collected: 2198
samples collected: 2198 	deduped samples added: 448
4118


                                                    

Did not collect 3072, only got 2410
samples collected: 2410
samples collected: 2410 	deduped samples added: 659
4777


                                                    

Did not collect 3072, only got 2630
samples collected: 2630
samples collected: 2630 	deduped samples added: 662
5439


                                                    

Did not collect 3072, only got 2409
samples collected: 2409
samples collected: 2409 	deduped samples added: 441
5880


                                                    

Did not collect 3072, only got 2614
samples collected: 2614
samples collected: 2614 	deduped samples added: 361
6241


                                                    

Did not collect 3072, only got 2584
samples collected: 2584
samples collected: 2584 	deduped samples added: 404
6645


                                                    

Did not collect 3072, only got 2619
samples collected: 2619
samples collected: 2619 	deduped samples added: 439
7084


                                                    

Did not collect 3072, only got 2820
samples collected: 2820
samples collected: 2820 	deduped samples added: 406
7490


                                                    

Did not collect 3072, only got 2599
samples collected: 2599
samples collected: 2599 	deduped samples added: 330
7820


                                                    

Did not collect 3072, only got 2608
samples collected: 2608
samples collected: 2608 	deduped samples added: 419
8239


                                                    

Did not collect 3072, only got 2415
samples collected: 2415
samples collected: 2415 	deduped samples added: 228
8467


                                                    

Did not collect 3072, only got 2640
samples collected: 2640
samples collected: 2640 	deduped samples added: 438
8905
Saved 8656 samples to data/cortex_613_8656.json


                                                    

Did not collect 3072, only got 2638
samples collected: 2638
samples collected: 2638 	deduped samples added: 2582
2831


                                                    

Did not collect 3072, only got 2426
samples collected: 2426
samples collected: 2426 	deduped samples added: 0
2831


                                                    

Did not collect 3072, only got 2634
samples collected: 2634
samples collected: 2634 	deduped samples added: 207
3038


                                                    

Did not collect 3072, only got 2632
samples collected: 2632
samples collected: 2632 	deduped samples added: 206
3244


                                                    

Did not collect 3072, only got 2623
samples collected: 2623
samples collected: 2623 	deduped samples added: 401
3645


                                                    

Did not collect 3072, only got 2418
samples collected: 2418
samples collected: 2418 	deduped samples added: 199
3844


                                                    

Did not collect 3072, only got 3061
samples collected: 3061
samples collected: 3061 	deduped samples added: 842
4686


                                                    

Did not collect 3072, only got 2389
samples collected: 2389
samples collected: 2389 	deduped samples added: 397
5083


                                                    

Did not collect 3072, only got 2429
samples collected: 2429
samples collected: 2429 	deduped samples added: 438
5521


                                                    

Did not collect 3072, only got 2569
samples collected: 2569
samples collected: 2569 	deduped samples added: 593
6114


                                                    

Did not collect 3072, only got 2406
samples collected: 2406
samples collected: 2406 	deduped samples added: 644
6758


                                                    

Did not collect 3072, only got 2164
samples collected: 2164
samples collected: 2164 	deduped samples added: 405
7163


                                                    

Did not collect 3072, only got 2415
samples collected: 2415
samples collected: 2415 	deduped samples added: 668
7831


                                                    

Did not collect 3072, only got 2426
samples collected: 2426
samples collected: 2426 	deduped samples added: 679
8510


                                                    

Did not collect 3072, only got 2193
samples collected: 2193
samples collected: 2193 	deduped samples added: 447
8957
Saved 8656 samples to data/cortex_614_8656.json


                                                    

Did not collect 3072, only got 2176
samples collected: 2176
samples collected: 2176 	deduped samples added: 2098
2399


                                                    

Did not collect 3072, only got 1972
samples collected: 1972
samples collected: 1972 	deduped samples added: 0
2399


                                                    

Did not collect 3072, only got 2374
samples collected: 2374
samples collected: 2374 	deduped samples added: 401
2800


                                                    

Did not collect 3072, only got 1985
samples collected: 1985
samples collected: 1985 	deduped samples added: 228
3028


                                                    

Did not collect 3072, only got 2189
samples collected: 2189
samples collected: 2189 	deduped samples added: 433
3461


                                                    

Did not collect 3072, only got 2208
samples collected: 2208
samples collected: 2208 	deduped samples added: 221
3682


                                                    

Did not collect 3072, only got 1986
samples collected: 1986
samples collected: 1986 	deduped samples added: 0
3682


                                                    

Did not collect 3072, only got 2203
samples collected: 2203
samples collected: 2203 	deduped samples added: 217
3899


                                                    

Did not collect 3072, only got 2202
samples collected: 2202
samples collected: 2202 	deduped samples added: 210
4109


                                                    

Did not collect 3072, only got 1986
samples collected: 1986
samples collected: 1986 	deduped samples added: 0
4109


                                                    

Did not collect 3072, only got 2214
samples collected: 2214
samples collected: 2214 	deduped samples added: 220
4329


                                                    

Did not collect 3072, only got 2192
samples collected: 2192
samples collected: 2192 	deduped samples added: 195
4524


                                                    

Did not collect 3072, only got 2419
samples collected: 2419
samples collected: 2419 	deduped samples added: 428
4952


                                                    

Did not collect 3072, only got 2420
samples collected: 2420
samples collected: 2420 	deduped samples added: 429
5381


                                                    

Did not collect 3072, only got 2636
samples collected: 2636
samples collected: 2636 	deduped samples added: 416
5797


                                                    

Did not collect 3072, only got 2357
samples collected: 2357
samples collected: 2357 	deduped samples added: 370
6167


                                                    

Did not collect 3072, only got 2208
samples collected: 2208
samples collected: 2208 	deduped samples added: 226
6393


                                                    

Did not collect 3072, only got 2431
samples collected: 2431
samples collected: 2431 	deduped samples added: 439
6832


                                                    

Did not collect 3072, only got 2420
samples collected: 2420
samples collected: 2420 	deduped samples added: 153
6985


                                                    

Did not collect 3072, only got 2407
samples collected: 2407
samples collected: 2407 	deduped samples added: 198
7183


                                                    

Did not collect 3072, only got 2207
samples collected: 2207
samples collected: 2207 	deduped samples added: 0
7183


                                                    

Did not collect 3072, only got 2403
samples collected: 2403
samples collected: 2403 	deduped samples added: 196
7379


                                                    

Did not collect 3072, only got 2427
samples collected: 2427
samples collected: 2427 	deduped samples added: 216
7595


                                                    

Did not collect 3072, only got 2411
samples collected: 2411
samples collected: 2411 	deduped samples added: 193
7788


                                                    

Did not collect 3072, only got 2415
samples collected: 2415
samples collected: 2415 	deduped samples added: 206
7994


                                                    

Did not collect 3072, only got 2640
samples collected: 2640
samples collected: 2640 	deduped samples added: 394
8388


                                                    

Did not collect 3072, only got 2404
samples collected: 2404
samples collected: 2404 	deduped samples added: 387
8775
Saved 8656 samples to data/cortex_615_8656.json


                                                    

Did not collect 3072, only got 2422
samples collected: 2422
samples collected: 2422 	deduped samples added: 2389
2508


                                                    

Did not collect 3072, only got 2209
samples collected: 2209
samples collected: 2209 	deduped samples added: 224
2732


                                                    

Did not collect 3072, only got 2403
samples collected: 2403
samples collected: 2403 	deduped samples added: 416
3148


                                                    

Did not collect 3072, only got 2421
samples collected: 2421
samples collected: 2421 	deduped samples added: 421
3569


                                                    

Did not collect 3072, only got 2217
samples collected: 2217
samples collected: 2217 	deduped samples added: 232
3801


                                                    

Did not collect 3072, only got 2416
samples collected: 2416
samples collected: 2416 	deduped samples added: 431
4232


                                                    

Did not collect 3072, only got 2188
samples collected: 2188
samples collected: 2188 	deduped samples added: 193
4425


                                                    

Did not collect 3072, only got 2180
samples collected: 2180
samples collected: 2180 	deduped samples added: 192
4617


                                                    

Did not collect 3072, only got 2209
samples collected: 2209
samples collected: 2209 	deduped samples added: 223
4840


                                                    

Did not collect 3072, only got 2399
samples collected: 2399
samples collected: 2399 	deduped samples added: 406
5246


                                                    

Did not collect 3072, only got 2405
samples collected: 2405
samples collected: 2405 	deduped samples added: 397
5643


                                                    

Did not collect 3072, only got 2224
samples collected: 2224
samples collected: 2224 	deduped samples added: 209
5852


                                                    

Did not collect 3072, only got 2216
samples collected: 2216
samples collected: 2216 	deduped samples added: 211
6063


                                                    

Did not collect 3072, only got 2217
samples collected: 2217
samples collected: 2217 	deduped samples added: 203
6266


                                                    

Did not collect 3072, only got 2210
samples collected: 2210
samples collected: 2210 	deduped samples added: 69
6335


                                                    

Did not collect 3072, only got 2205
samples collected: 2205
samples collected: 2205 	deduped samples added: 195
6530


                                                    

Did not collect 3072, only got 2417
samples collected: 2417
samples collected: 2417 	deduped samples added: 412
6942


                                                    

Did not collect 3072, only got 1984
samples collected: 1984
samples collected: 1984 	deduped samples added: 205
7147


                                                    

Did not collect 3072, only got 2405
samples collected: 2405
samples collected: 2405 	deduped samples added: 625
7772


                                                    

Did not collect 3072, only got 1754
samples collected: 1754
samples collected: 1754 	deduped samples added: 197
7969


                                                    

Did not collect 3072, only got 1973
samples collected: 1973
samples collected: 1973 	deduped samples added: 418
8387


                                                    

Did not collect 3072, only got 2205
samples collected: 2205
samples collected: 2205 	deduped samples added: 648
9035
Saved 8656 samples to data/cortex_616_8656.json


                                                    

Did not collect 3072, only got 2450
samples collected: 2450
samples collected: 2450 	deduped samples added: 2371
2750


                                                    

Did not collect 3072, only got 2015
samples collected: 2015
samples collected: 2015 	deduped samples added: 440
3190


                                                    

Did not collect 3072, only got 2188
samples collected: 2188
samples collected: 2188 	deduped samples added: 613
3803


                                                    

Did not collect 3072, only got 2387
samples collected: 2387
samples collected: 2387 	deduped samples added: 799
4602


                                                    

Did not collect 3072, only got 2248
samples collected: 2248
samples collected: 2248 	deduped samples added: 673
5275


                                                    

Did not collect 3072, only got 2004
samples collected: 2004
samples collected: 2004 	deduped samples added: 405
5680


                                                    

Did not collect 3072, only got 2431
samples collected: 2431
samples collected: 2431 	deduped samples added: 824
6504


                                                    

Did not collect 3072, only got 1996
samples collected: 1996
samples collected: 1996 	deduped samples added: 290
6794


                                                    

Did not collect 3072, only got 2449
samples collected: 2449
samples collected: 2449 	deduped samples added: 857
7651


                                                    

Did not collect 3072, only got 2232
samples collected: 2232
samples collected: 2232 	deduped samples added: 425
8076


                                                    

Did not collect 3072, only got 2184
samples collected: 2184
samples collected: 2184 	deduped samples added: 608
8684
Saved 8656 samples to data/cortex_617_8656.json


                                                    

Did not collect 3072, only got 2640
samples collected: 2640
samples collected: 2640 	deduped samples added: 2628
2656


                                                    

Did not collect 3072, only got 2451
samples collected: 2451
samples collected: 2451 	deduped samples added: 894
3550


                                                    

Did not collect 3072, only got 2220
samples collected: 2220
samples collected: 2220 	deduped samples added: 522
4072


                                                    

Did not collect 3072, only got 2455
samples collected: 2455
samples collected: 2455 	deduped samples added: 879
4951


                                                    

Did not collect 3072, only got 2195
samples collected: 2195
samples collected: 2195 	deduped samples added: 637
5588


                                                    

Did not collect 3072, only got 2006
samples collected: 2006
samples collected: 2006 	deduped samples added: 451
6039


                                                    

Did not collect 3072, only got 2234
samples collected: 2234
samples collected: 2234 	deduped samples added: 678
6717


                                                    

Did not collect 3072, only got 2003
samples collected: 2003
samples collected: 2003 	deduped samples added: 391
7108


                                                    

Did not collect 3072, only got 1791
samples collected: 1791
samples collected: 1791 	deduped samples added: 236
7344


                                                    

Did not collect 3072, only got 1729
samples collected: 1729
samples collected: 1729 	deduped samples added: 174
7518


                                                    

Did not collect 3072, only got 1961
samples collected: 1961
samples collected: 1961 	deduped samples added: 396
7914


                                                    

Did not collect 3072, only got 2192
samples collected: 2192
samples collected: 2192 	deduped samples added: 383
8297


                                                    

Did not collect 3072, only got 2212
samples collected: 2212
samples collected: 2212 	deduped samples added: 654
8951
Saved 8656 samples to data/cortex_618_8656.json


                                                    

Did not collect 3072, only got 2202
samples collected: 2202
samples collected: 2202 	deduped samples added: 2155
2450


                                                    

Did not collect 3072, only got 2211
samples collected: 2211
samples collected: 2211 	deduped samples added: 623
3073


                                                    

Did not collect 3072, only got 2197
samples collected: 2197
samples collected: 2197 	deduped samples added: 872
3945


                                                    

Did not collect 3072, only got 2459
samples collected: 2459
samples collected: 2459 	deduped samples added: 1101
5046


                                                    

Did not collect 3072, only got 1990
samples collected: 1990
samples collected: 1990 	deduped samples added: 878
5924


                                                    

Did not collect 3072, only got 1749
samples collected: 1749
samples collected: 1749 	deduped samples added: 422
6346


                                                    

Did not collect 3072, only got 1783
samples collected: 1783
samples collected: 1783 	deduped samples added: 442
6788


                                                    

Did not collect 3072, only got 1541
samples collected: 1541
samples collected: 1541 	deduped samples added: 217
7005


                                                    

Did not collect 3072, only got 1769
samples collected: 1769
samples collected: 1769 	deduped samples added: 444
7449


                                                    

Did not collect 3072, only got 1764
samples collected: 1764
samples collected: 1764 	deduped samples added: 436
7885


                                                    

Did not collect 3072, only got 1970
samples collected: 1970
samples collected: 1970 	deduped samples added: 642
8527


                                                    

Did not collect 3072, only got 1323
samples collected: 1323
samples collected: 1323 	deduped samples added: 0
8527


                                                    

Did not collect 3072, only got 1925
samples collected: 1925
samples collected: 1925 	deduped samples added: 498
9025
Saved 8656 samples to data/cortex_619_8656.json


                                                    

Did not collect 3072, only got 1981
samples collected: 1981
samples collected: 1981 	deduped samples added: 1926
2295


                                                    

Did not collect 3072, only got 1752
samples collected: 1752
samples collected: 1752 	deduped samples added: 606
2901


                                                    

Did not collect 3072, only got 1760
samples collected: 1760
samples collected: 1760 	deduped samples added: 231
3132


                                                    

Did not collect 3072, only got 1759
samples collected: 1759
samples collected: 1759 	deduped samples added: 336
3468


                                                    

Did not collect 3072, only got 1723
samples collected: 1723
samples collected: 1723 	deduped samples added: 321
3789


                                                    

Did not collect 3072, only got 1735
samples collected: 1735
samples collected: 1735 	deduped samples added: 225
4014


                                                    

Did not collect 3072, only got 1542
samples collected: 1542
samples collected: 1542 	deduped samples added: 151
4165


                                                    

Did not collect 3072, only got 1542
samples collected: 1542
samples collected: 1542 	deduped samples added: 0
4165


                                                    

Did not collect 3072, only got 1762
samples collected: 1762
samples collected: 1762 	deduped samples added: 305
4470


                                                    

Did not collect 3072, only got 1329
samples collected: 1329
samples collected: 1329 	deduped samples added: 232
4702


                                                    

Did not collect 3072, only got 1313
samples collected: 1313
samples collected: 1313 	deduped samples added: 12
4714


                                                    

Did not collect 3072, only got 1767
samples collected: 1767
samples collected: 1767 	deduped samples added: 670
5384


                                                    

Did not collect 3072, only got 1938
samples collected: 1938
samples collected: 1938 	deduped samples added: 602
5986


                                                    

Did not collect 3072, only got 1984
samples collected: 1984
samples collected: 1984 	deduped samples added: 618
6604


                                                    

Did not collect 3072, only got 2184
samples collected: 2184
samples collected: 2184 	deduped samples added: 784
7388


                                                    

Did not collect 3072, only got 1949
samples collected: 1949
samples collected: 1949 	deduped samples added: 441
7829


                                                    

Did not collect 3072, only got 1947
samples collected: 1947
samples collected: 1947 	deduped samples added: 322
8151


                                                    

Did not collect 3072, only got 1963
samples collected: 1963
samples collected: 1963 	deduped samples added: 411
8562


                                                    

Did not collect 3072, only got 2130
samples collected: 2130
samples collected: 2130 	deduped samples added: 391
8953
Saved 8656 samples to data/cortex_620_8656.json


                                                    

Did not collect 3072, only got 1937
samples collected: 1937
samples collected: 1937 	deduped samples added: 1889
2186


                                                    

Did not collect 3072, only got 1741
samples collected: 1741
samples collected: 1741 	deduped samples added: 334
2520


                                                    

Did not collect 3072, only got 1937
samples collected: 1937
samples collected: 1937 	deduped samples added: 344
2864


                                                    

Did not collect 3072, only got 1931
samples collected: 1931
samples collected: 1931 	deduped samples added: 283
3147


                                                    

Did not collect 3072, only got 1956
samples collected: 1956
samples collected: 1956 	deduped samples added: 373
3520


                                                    

Did not collect 3072, only got 1960
samples collected: 1960
samples collected: 1960 	deduped samples added: 327
3847


                                                    

Did not collect 3072, only got 1734
samples collected: 1734
samples collected: 1734 	deduped samples added: 197
4044


                                                    

Did not collect 3072, only got 2191
samples collected: 2191
samples collected: 2191 	deduped samples added: 517
4561


                                                    

Did not collect 3072, only got 2199
samples collected: 2199
samples collected: 2199 	deduped samples added: 650
5211


                                                    

Did not collect 3072, only got 2174
samples collected: 2174
samples collected: 2174 	deduped samples added: 634
5845


                                                    

Did not collect 3072, only got 2162
samples collected: 2162
samples collected: 2162 	deduped samples added: 413
6258


                                                    

Did not collect 3072, only got 2181
samples collected: 2181
samples collected: 2181 	deduped samples added: 432
6690


                                                    

Did not collect 3072, only got 2380
samples collected: 2380
samples collected: 2380 	deduped samples added: 636
7326


                                                    

Did not collect 3072, only got 2363
samples collected: 2363
samples collected: 2363 	deduped samples added: 617
7943


                                                    

Did not collect 3072, only got 2356
samples collected: 2356
samples collected: 2356 	deduped samples added: 571
8514


                                                    

Did not collect 3072, only got 2632
samples collected: 2632
samples collected: 2632 	deduped samples added: 797
9311
Saved 8656 samples to data/cortex_621_8656.json


                                                    

Did not collect 3072, only got 2141
samples collected: 2141
samples collected: 2141 	deduped samples added: 2010
2665


                                                    

Did not collect 3072, only got 2396
samples collected: 2396
samples collected: 2396 	deduped samples added: 453
3118


                                                    

Did not collect 3072, only got 2388
samples collected: 2388
samples collected: 2388 	deduped samples added: 282
3400


                                                    

Did not collect 3072, only got 2366
samples collected: 2366
samples collected: 2366 	deduped samples added: 276
3676


                                                    

Did not collect 3072, only got 2377
samples collected: 2377
samples collected: 2377 	deduped samples added: 354
4030


                                                    

Did not collect 3072, only got 2185
samples collected: 2185
samples collected: 2185 	deduped samples added: 229
4259


                                                    

Did not collect 3072, only got 2156
samples collected: 2156
samples collected: 2156 	deduped samples added: 199
4458


                                                    

Did not collect 3072, only got 2408
samples collected: 2408
samples collected: 2408 	deduped samples added: 451
4909


                                                    

Did not collect 3072, only got 2160
samples collected: 2160
samples collected: 2160 	deduped samples added: 421
5330


                                                    

Did not collect 3072, only got 2371
samples collected: 2371
samples collected: 2371 	deduped samples added: 626
5956


                                                    

Did not collect 3072, only got 2641
samples collected: 2641
samples collected: 2641 	deduped samples added: 862
6818


                                                    

Did not collect 3072, only got 2616
samples collected: 2616
samples collected: 2616 	deduped samples added: 652
7470


                                                    

Did not collect 3072, only got 1729
samples collected: 1729
samples collected: 1729 	deduped samples added: 0
7470


                                                    

Did not collect 3072, only got 2403
samples collected: 2403
samples collected: 2403 	deduped samples added: 673
8143


                                                    

Did not collect 3072, only got 2166
samples collected: 2166
samples collected: 2166 	deduped samples added: 430
8573


                                                    

Did not collect 3072, only got 2198
samples collected: 2198
samples collected: 2198 	deduped samples added: 434
9007
Saved 8656 samples to data/cortex_622_8656.json


                                                    

Did not collect 3072, only got 2184
samples collected: 2184
samples collected: 2184 	deduped samples added: 2110
2461


                                                    

Did not collect 3072, only got 1983
samples collected: 1983
samples collected: 1983 	deduped samples added: 0
2461


                                                    

Did not collect 3072, only got 2187
samples collected: 2187
samples collected: 2187 	deduped samples added: 203
2664


                                                    

Did not collect 3072, only got 2151
samples collected: 2151
samples collected: 2151 	deduped samples added: 48
2712


                                                    

Did not collect 3072, only got 1763
samples collected: 1763
samples collected: 1763 	deduped samples added: 0
2712


                                                    

Did not collect 3072, only got 1961
samples collected: 1961
samples collected: 1961 	deduped samples added: 121
2833


                                                    

Did not collect 3072, only got 2198
samples collected: 2198
samples collected: 2198 	deduped samples added: 400
3233


                                                    

Did not collect 3072, only got 2169
samples collected: 2169
samples collected: 2169 	deduped samples added: 500
3733


                                                    

Did not collect 3072, only got 2419
samples collected: 2419
samples collected: 2419 	deduped samples added: 480
4213


                                                    

Did not collect 3072, only got 2202
samples collected: 2202
samples collected: 2202 	deduped samples added: 292
4505


                                                    

Did not collect 3072, only got 2201
samples collected: 2201
samples collected: 2201 	deduped samples added: 375
4880


                                                    

Did not collect 3072, only got 1765
samples collected: 1765
samples collected: 1765 	deduped samples added: 0
4880


                                                    

Did not collect 3072, only got 2225
samples collected: 2225
samples collected: 2225 	deduped samples added: 525
5405


                                                    

Did not collect 3072, only got 2224
samples collected: 2224
samples collected: 2224 	deduped samples added: 236
5641


                                                    

Did not collect 3072, only got 2426
samples collected: 2426
samples collected: 2426 	deduped samples added: 874
6515


                                                    

Did not collect 3072, only got 2357
samples collected: 2357
samples collected: 2357 	deduped samples added: 762
7277


                                                    

Did not collect 3072, only got 2454
samples collected: 2454
samples collected: 2454 	deduped samples added: 831
8108


                                                    

Did not collect 3072, only got 2378
samples collected: 2378
samples collected: 2378 	deduped samples added: 990
9098
Saved 8656 samples to data/cortex_623_8656.json


                                                    

Did not collect 3072, only got 1994
samples collected: 1994
samples collected: 1994 	deduped samples added: 1920
2362


                                                    

Did not collect 3072, only got 1968
samples collected: 1968
samples collected: 1968 	deduped samples added: 194
2556


                                                    

Did not collect 3072, only got 2200
samples collected: 2200
samples collected: 2200 	deduped samples added: 322
2878


                                                    

Did not collect 3072, only got 2178
samples collected: 2178
samples collected: 2178 	deduped samples added: 143
3021


                                                    

Did not collect 3072, only got 1982
samples collected: 1982
samples collected: 1982 	deduped samples added: 0
3021


                                                    

Did not collect 3072, only got 2177
samples collected: 2177
samples collected: 2177 	deduped samples added: 5
3026


                                                    

Did not collect 3072, only got 2202
samples collected: 2202
samples collected: 2202 	deduped samples added: 215
3241


                                                    

Did not collect 3072, only got 2210
samples collected: 2210
samples collected: 2210 	deduped samples added: 214
3455


                                                    

Did not collect 3072, only got 2405
samples collected: 2405
samples collected: 2405 	deduped samples added: 413
3868


                                                    

Did not collect 3072, only got 1960
samples collected: 1960
samples collected: 1960 	deduped samples added: 180
4048


                                                    

Did not collect 3072, only got 2376
samples collected: 2376
samples collected: 2376 	deduped samples added: 596
4644


                                                    

Did not collect 3072, only got 1304
samples collected: 1304
samples collected: 1304 	deduped samples added: 221
4865


                                                    

Did not collect 3072, only got 1941
samples collected: 1941
samples collected: 1941 	deduped samples added: 841
5706


                                                    

Did not collect 3072, only got 1685
samples collected: 1685
samples collected: 1685 	deduped samples added: 347
6053


                                                    

Did not collect 3072, only got 1731
samples collected: 1731
samples collected: 1731 	deduped samples added: 438
6491


                                                    

Did not collect 3072, only got 1730
samples collected: 1730
samples collected: 1730 	deduped samples added: 434
6925


                                                    

Did not collect 3072, only got 1917
samples collected: 1917
samples collected: 1917 	deduped samples added: 606
7531


                                                    

Did not collect 3072, only got 1663
samples collected: 1663
samples collected: 1663 	deduped samples added: 370
7901


                                                    

Did not collect 3072, only got 1915
samples collected: 1915
samples collected: 1915 	deduped samples added: 406
8307


                                                    

Did not collect 3072, only got 2148
samples collected: 2148
samples collected: 2148 	deduped samples added: 636
8943
Saved 8656 samples to data/cortex_624_8656.json


                                                    

Did not collect 3072, only got 1743
samples collected: 1743
samples collected: 1743 	deduped samples added: 1690
1977


                                                    

Did not collect 3072, only got 1957
samples collected: 1957
samples collected: 1957 	deduped samples added: 653
2630


                                                    

Did not collect 3072, only got 1926
samples collected: 1926
samples collected: 1926 	deduped samples added: 580
3210


                                                    

Did not collect 3072, only got 1962
samples collected: 1962
samples collected: 1962 	deduped samples added: 649
3859


                                                    

Did not collect 3072, only got 1898
samples collected: 1898
samples collected: 1898 	deduped samples added: 591
4450


                                                    

Did not collect 3072, only got 2185
samples collected: 2185
samples collected: 2185 	deduped samples added: 664
5114


                                                    

Did not collect 3072, only got 1738
samples collected: 1738
samples collected: 1738 	deduped samples added: 393
5507


                                                    

Did not collect 3072, only got 1736
samples collected: 1736
samples collected: 1736 	deduped samples added: 212
5719


                                                    

Did not collect 3072, only got 1522
samples collected: 1522
samples collected: 1522 	deduped samples added: 222
5941


                                                    

Did not collect 3072, only got 1735
samples collected: 1735
samples collected: 1735 	deduped samples added: 328
6269


                                                    

Did not collect 3072, only got 1523
samples collected: 1523
samples collected: 1523 	deduped samples added: 158
6427


In [None]:
create_new_datasets(num_new=48, size=6144+512, max_samples_per_step=1600, ignore_old=True, sleep_minutes=5)

                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1600
1600


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1190
2790


                                                    

Did not collect 1600, only got 1483
samples collected: 1483
samples collected: 1483 	deduped samples added: 860
3650


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1174
4824


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1181
6005


                                                    

Did not collect 1600, only got 1478
samples collected: 1478
samples collected: 1478 	deduped samples added: 841
6846
Saved 6656 samples to data/cortex_481_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1589
1779


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1160
2939


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1187
4126


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1186
5312


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1180
6492


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1044
7536
Saved 6656 samples to data/cortex_482_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1560
2440


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1189
3629


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1163
4792


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1188
5980


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1000
6980
Saved 6656 samples to data/cortex_483_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1575
1899


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1159
3058


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1194
4252


                                                    

Did not collect 1600, only got 1440
samples collected: 1440
samples collected: 1440 	deduped samples added: 669
4921


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 996
5917


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 995
6912
Saved 6656 samples to data/cortex_484_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1569
1825


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 985
2810


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 929
3739


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 977
4716


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 905
5621


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 977
6598


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 984
7582
Saved 6656 samples to data/cortex_485_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1509
2435


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 773
3208


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 948
4156


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 765
4921


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 771
5692


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 776
6468


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 774
7242
Saved 6656 samples to data/cortex_486_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1536
2122


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 603
2725


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 555
3280


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 568
3848


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 615
4463


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 566
5029


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 566
5595


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
6223


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 565
6788
Saved 6656 samples to data/cortex_487_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1580
1712


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 633
2345


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 567
2912


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 638
3550


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 562
4112


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
4740


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 565
5305


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 567
5872


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
6500


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 565
7065
Saved 6656 samples to data/cortex_488_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1543
1952


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 600
2552


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 635
3187


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 562
3749


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 622
4371


                                                    

Did not collect 1600, only got 1479
samples collected: 1479
samples collected: 1479 	deduped samples added: 706
5077


                                                   

Failed to fetch data, retrying. Attempt 1/5


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 777
5854


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 782
6636


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
7403
Saved 6656 samples to data/cortex_489_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1525
2272


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 878
3150


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1024
4174


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1002
5176


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 750
5926


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 635
6561


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 754
7315
Saved 6656 samples to data/cortex_490_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1527
2186


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 812
2998


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 796
3794


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 752
4546


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 758
5304


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 600
5904


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 663
6567


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 583
7150
Saved 6656 samples to data/cortex_491_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1538
2032


                                                    

Did not collect 1600, only got 1429
samples collected: 1429
samples collected: 1429 	deduped samples added: 415
2447


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
3041


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 618
3659


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 642
4301


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 622
4923


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 591
5514


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
6108


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 621
6729
Saved 6656 samples to data/cortex_492_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1595
1668


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 582
2250


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 593
2843


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 418
3261


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 637
3898


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 578
4476


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 622
5098


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
5726


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 592
6318


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
6912
Saved 6656 samples to data/cortex_493_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1564
1820


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 613
2433


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 601
3034


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 604
3638


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 593
4231


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 624
4855


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 588
5443


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
6037


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 601
6638


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 595
7233
Saved 6656 samples to data/cortex_494_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1517
2094


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 619
2713


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 611
3324


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 645
3969


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 639
4608


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 593
5201


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 600
5801


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
6395


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 587
6982
Saved 6656 samples to data/cortex_495_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1553
1879


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
2473


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 633
3106


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 590
3696


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 798
4494


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 848
5342


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 649
5991


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 634
6625


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 587
7212
Saved 6656 samples to data/cortex_496_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1525
2081


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 806
2887


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 762
3649


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 751
4400


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
4954


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 726
5680


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 756
6436


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 632
7068
Saved 6656 samples to data/cortex_497_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1542
1954


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 584
2538


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 646
3184


                                                    

Did not collect 1600, only got 1491
samples collected: 1491
samples collected: 1491 	deduped samples added: 427
3611


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 635
4246


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 601
4847


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 545
5392


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 541
5933


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 543
6476


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 536
7012
Saved 6656 samples to data/cortex_498_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1544
1900


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 656
2556


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 624
3180


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 617
3797


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 600
4397


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 579
4976


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 574
5550


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 541
6091


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 623
6714
Saved 6656 samples to data/cortex_499_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1596
1654


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 607
2261


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 550
2811


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 545
3356


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 595
3951


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 608
4559


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 544
5103


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 753
5856


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 823
6679
Saved 6656 samples to data/cortex_500_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1598
1621


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 839
2460


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 737
3197


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 834
4031


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 759
4790


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 759
5549


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 760
6309


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 748
7057
Saved 6656 samples to data/cortex_501_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1541
1942


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 747
2689


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 814
3503


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 755
4258


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 854
5112


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 754
5866


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 603
6469


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 752
7221
Saved 6656 samples to data/cortex_502_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1531
2096


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 614
2710


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 564
3274


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 612
3886


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 563
4449


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 640
5089


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 772
5861


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
6628


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
7395
Saved 6656 samples to data/cortex_503_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1514
2253


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 762
3015


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 606
3621


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 618
4239


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 607
4846


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 625
5471


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 617
6088


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
6855
Saved 6656 samples to data/cortex_504_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1573
1772


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 828
2600


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 770
3370


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 768
4138


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 765
4903


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 766
5669


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 844
6513


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 815
7328
Saved 6656 samples to data/cortex_505_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1521
2193


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 833
3026


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 820
3846


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 768
4614


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 553
5167


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 549
5716


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 620
6336


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 601
6937
Saved 6656 samples to data/cortex_506_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1550
1831


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
2459


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 630
3089


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 608
3697


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 551
4248


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 552
4800


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 612
5412


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 641
6053


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 550
6603


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 551
7154
Saved 6656 samples to data/cortex_507_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1532
2030


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 607
2637


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 597
3234


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 616
3850


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
4404


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 659
5063


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
5617


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 631
6248


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
6802
Saved 6656 samples to data/cortex_508_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1570
1716


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 555
2271


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 677
2948


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 611
3559


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 614
4173


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
4727


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 556
5283


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 645
5928


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 554
6482


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 605
7087
Saved 6656 samples to data/cortex_509_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1541
1972


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 610
2582


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 639
3221


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 770
3991


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 577
4568


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 670
5238


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 628
5866


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 768
6634


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 857
7491
Saved 6656 samples to data/cortex_510_6656.json


                                                    

Did not collect 1600, only got 1450
samples collected: 1450
samples collected: 1450 	deduped samples added: 1350
2185


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 771
2956


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 770
3726


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 855
4581


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 619
5200


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 605
5805


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 441
6246


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 548
6794
Saved 6656 samples to data/cortex_511_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1577
1715


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 647
2362


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 578
2940


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 638
3578


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 641
4219


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 800
5019


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 804
5823


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 630
6453


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 544
6997
Saved 6656 samples to data/cortex_512_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1564
1905


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 841
2746


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 823
3569


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 755
4324


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 754
5078


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 620
5698


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 631
6329


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 546
6875
Saved 6656 samples to data/cortex_513_6656.json


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 1566
1785


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 568
2353


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 662
3015


                                                   

samples collected: 1600
samples collected: 1600 	deduped samples added: 578
3593


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 557
4150


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 555
4705


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 663
5368


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 768
6136


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 769
6905
Saved 6656 samples to data/cortex_514_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1566
1815


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 769
2584


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 771
3355


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
4122


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 548
4670


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 547
5217


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 548
5765


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 535
6300


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 548
6848
Saved 6656 samples to data/cortex_515_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1569
1761


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 549
2310


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 546
2856


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 549
3405


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 549
3954


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 576
4530


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 754
5284


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 550
5834


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 632
6466


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 760
7226
Saved 6656 samples to data/cortex_516_6656.json


                                                    

Did not collect 1600, only got 1476
samples collected: 1476
samples collected: 1476 	deduped samples added: 1422
1992


                                                    

Did not collect 1600, only got 1487
samples collected: 1487
samples collected: 1487 	deduped samples added: 852
2844


                                                    

Did not collect 1600, only got 1416
samples collected: 1416
samples collected: 1416 	deduped samples added: 783
3627


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 999
4626


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1016
5642


                                                    

Did not collect 1600, only got 1415
samples collected: 1415
samples collected: 1415 	deduped samples added: 579
6221


                                                    

Did not collect 1600, only got 1490
samples collected: 1490
samples collected: 1490 	deduped samples added: 629
6850
Saved 6656 samples to data/cortex_517_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1583
1777


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 778
2555


                                                    

Did not collect 1600, only got 1472
samples collected: 1472
samples collected: 1472 	deduped samples added: 468
3023


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 601
3624


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 621
4245


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
5012


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 760
5772


                                                    

Did not collect 1600, only got 1275
samples collected: 1275
samples collected: 1275 	deduped samples added: 430
6202


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 940
7142
Saved 6656 samples to data/cortex_518_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1544
2030


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 857
2887


                                                    

Did not collect 1600, only got 1448
samples collected: 1448
samples collected: 1448 	deduped samples added: 675
3562


                                                    

Did not collect 1600, only got 1474
samples collected: 1474
samples collected: 1474 	deduped samples added: 629
4191


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 824
5015


                                                    

Did not collect 1600, only got 1468
samples collected: 1468
samples collected: 1468 	deduped samples added: 623
5638


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 979
6617


                                                    

Did not collect 1600, only got 1469
samples collected: 1469
samples collected: 1469 	deduped samples added: 622
7239
Saved 6656 samples to data/cortex_519_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1524
2107


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 755
2862


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 768
3630


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
4397


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 559
4956


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 348
5304


                                                    

Did not collect 1600, only got 1468
samples collected: 1468
samples collected: 1468 	deduped samples added: 263
5567


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 561
6128


                                                    

Did not collect 1600, only got 1474
samples collected: 1474
samples collected: 1474 	deduped samples added: 629
6757
Saved 6656 samples to data/cortex_520_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1588
1689


                                                    

Did not collect 1600, only got 1464
samples collected: 1464
samples collected: 1464 	deduped samples added: 707
2396


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 824
3220


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 594
3814


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 588
4402


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 625
5027


                                                    

Did not collect 1600, only got 1454
samples collected: 1454
samples collected: 1454 	deduped samples added: 400
5427


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 766
6193


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 978
7171
Saved 6656 samples to data/cortex_521_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1513
2028


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 566
2594


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 569
3163


                                                    

Did not collect 1600, only got 1465
samples collected: 1465
samples collected: 1465 	deduped samples added: 273
3436


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 570
4006


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 395
4401


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 567
4968


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 393
5361


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 569
5930


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 570
6500


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 566
7066
Saved 6656 samples to data/cortex_522_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1536
1946


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 569
2515


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 560
3075


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 571
3646


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 593
4239


                                                    

Did not collect 1600, only got 1456
samples collected: 1456
samples collected: 1456 	deduped samples added: 439
4678


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 764
5442


                                                    

Did not collect 1600, only got 1271
samples collected: 1271
samples collected: 1271 	deduped samples added: 205
5647


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 767
6414


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 673
7087
Saved 6656 samples to data/cortex_523_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1539
1970


                                                    

Did not collect 1600, only got 1256
samples collected: 1256
samples collected: 1256 	deduped samples added: 279
2249


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 640
2889


                                                    

Did not collect 1600, only got 1485
samples collected: 1485
samples collected: 1485 	deduped samples added: 429
3318


                                                    

Did not collect 1600, only got 1459
samples collected: 1459
samples collected: 1459 	deduped samples added: 613
3931


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 818
4749


                                                    

Did not collect 1600, only got 1399
samples collected: 1399
samples collected: 1399 	deduped samples added: 555
5304


                                                    

Did not collect 1600, only got 1492
samples collected: 1492
samples collected: 1492 	deduped samples added: 647
5951


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 812
6763
Saved 6656 samples to data/cortex_524_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1590
1697


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 833
2530


                                                    

Did not collect 1600, only got 1459
samples collected: 1459
samples collected: 1459 	deduped samples added: 644
3174


                                                    

Did not collect 1600, only got 1485
samples collected: 1485
samples collected: 1485 	deduped samples added: 639
3813


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 829
4642


                                                    

Did not collect 1600, only got 1231
samples collected: 1231
samples collected: 1231 	deduped samples added: 388
5030


                                                    

Did not collect 1600, only got 1462
samples collected: 1462
samples collected: 1462 	deduped samples added: 617
5647


                                                    

Did not collect 1600, only got 1491
samples collected: 1491
samples collected: 1491 	deduped samples added: 642
6289


                                                    

Did not collect 1600, only got 1470
samples collected: 1470
samples collected: 1470 	deduped samples added: 623
6912
Saved 6656 samples to data/cortex_525_6656.json


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 1570
1826


                                                    

Did not collect 1600, only got 1462
samples collected: 1462
samples collected: 1462 	deduped samples added: 701
2527


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 817
3344


                                                    

Did not collect 1600, only got 1454
samples collected: 1454
samples collected: 1454 	deduped samples added: 415
3759


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 639
4398


                                                    

Did not collect 1600, only got 1254
samples collected: 1254
samples collected: 1254 	deduped samples added: 215
4613


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 639
5252


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 623
5875


                                                    

samples collected: 1600
samples collected: 1600 	deduped samples added: 405
6280


                                                    

Did not collect 1600, only got 1431
samples collected: 1431
samples collected: 1431 	deduped samples added: 207
6487


                                                    

Did not collect 1600, only got 1413
samples collected: 1413
samples collected: 1413 	deduped samples added: 189
6676
Saved 6656 samples to data/cortex_526_6656.json


                                                    

Did not collect 1600, only got 1587
samples collected: 1587
samples collected: 1587 	deduped samples added: 1582
1602


                                                    

Did not collect 1600, only got 1439
samples collected: 1439
samples collected: 1439 	deduped samples added: 422
2024


                                                    

Did not collect 1600, only got 1224
samples collected: 1224
samples collected: 1224 	deduped samples added: 208
2232


                                                    

Did not collect 1600, only got 1455
samples collected: 1455
samples collected: 1455 	deduped samples added: 435
2667


                                                    

Did not collect 1600, only got 1435
samples collected: 1435
samples collected: 1435 	deduped samples added: 416
3083


                                                    

Did not collect 1600, only got 1456
samples collected: 1456
samples collected: 1456 	deduped samples added: 436
3519


                                                    

Did not collect 1600, only got 1461
samples collected: 1461
samples collected: 1461 	deduped samples added: 442
3961


                                                    

Did not collect 1600, only got 1378
samples collected: 1378
samples collected: 1378 	deduped samples added: 157
4118


                                                    

Did not collect 1600, only got 1442
samples collected: 1442
samples collected: 1442 	deduped samples added: 421
4539


                                                    

Did not collect 1600, only got 1241
samples collected: 1241
samples collected: 1241 	deduped samples added: 418
4957


                                                    

Did not collect 1600, only got 1225
samples collected: 1225
samples collected: 1225 	deduped samples added: 402
5359


                                                    

Did not collect 1600, only got 1217
samples collected: 1217
samples collected: 1217 	deduped samples added: 195
5554


                                                    

Did not collect 1600, only got 1189
samples collected: 1189
samples collected: 1189 	deduped samples added: 367
5921


                                                    

Did not collect 1600, only got 1215
samples collected: 1215
samples collected: 1215 	deduped samples added: 392
6313


                                                    

Did not collect 1600, only got 1444
samples collected: 1444
samples collected: 1444 	deduped samples added: 620
6933
Saved 6656 samples to data/cortex_527_6656.json


                                                    

Did not collect 1600, only got 1251
samples collected: 1251
samples collected: 1251 	deduped samples added: 1211
1488


                                                    

Did not collect 1600, only got 1493
samples collected: 1493
samples collected: 1493 	deduped samples added: 450
1938


                                                    

Did not collect 1600, only got 1465
samples collected: 1465
samples collected: 1465 	deduped samples added: 421
2359


                                                    

Did not collect 1600, only got 1470
samples collected: 1470
samples collected: 1470 	deduped samples added: 425
2784


                                                    

Did not collect 1600, only got 1238
samples collected: 1238
samples collected: 1238 	deduped samples added: 4
2788


                                                    

Did not collect 1600, only got 1446
samples collected: 1446
samples collected: 1446 	deduped samples added: 402
3190


                                                    

Did not collect 1600, only got 1467
samples collected: 1467
samples collected: 1467 	deduped samples added: 422
3612


                                                    

Did not collect 1600, only got 1479
samples collected: 1479
samples collected: 1479 	deduped samples added: 434
4046


                                                    

Did not collect 1600, only got 1455
samples collected: 1455
samples collected: 1455 	deduped samples added: 412
4458


                                                    

Did not collect 1600, only got 1041
samples collected: 1041
samples collected: 1041 	deduped samples added: 0
4458


                                                    

Did not collect 1600, only got 1435
samples collected: 1435
samples collected: 1435 	deduped samples added: 390
4848


                                                    

Did not collect 1600, only got 1445
samples collected: 1445
samples collected: 1445 	deduped samples added: 601
5449


                                                    

Did not collect 1600, only got 1430
samples collected: 1430
samples collected: 1430 	deduped samples added: 401
5850


                                                    

Did not collect 1600, only got 1449
samples collected: 1449
samples collected: 1449 	deduped samples added: 416
6266


                                                    

Did not collect 1600, only got 1410
samples collected: 1410
samples collected: 1410 	deduped samples added: 566
6832
Saved 6656 samples to data/cortex_528_6656.json


In [2]:
with open('data/cortex_43_131072.json', 'r') as f:
    big_dataset = json.load(f)

In [3]:
big_dataset[0]

{'instruction': 'How does exposure to toxic substances impact human health over time?',
 'response': 'Exposure to toxic substances can have a wide range of effects on human health, and the impact over time can vary greatly depending on the type of substance, the level of exposure, the duration of exposure, and individual susceptibility. Here are some of the ways that toxic substances can impact health over time:\n\n1. **Acute Effects**: Some toxic substances can cause immediate health effects, such as irritation, nausea, dizziness, or acute poisoning, which can be severe and even life-threatening.\n\n2. **Chronic Effects**: Long-term exposure to certain toxins can lead to chronic health conditions. For example, exposure to asbestos can cause mesothelioma, a type of lung cancer, and exposure to lead can result in neurological damage and cognitive impairments.\n\n3. **Cancer**: Many toxic substances are carcinogens, meaning they can cause cancer. The development of cancer typically invol

In [4]:
big_sentences = []
for conv in big_dataset:
    big_sentences.append(conv['instruction'] + ' ' + conv['response'])

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

  return self.fget.__get__(instance, owner)()


In [6]:
embeddings = model.encode(big_sentences, show_progress_bar=True, batch_size=64)

Batches:   0%|          | 0/2048 [00:00<?, ?it/s]

In [3]:
# numpy save embeddings
# np.save('data/cortex_43_131072_embeddings.npy', embeddings)
# numpy load embeddings
embeddings = np.load('data/cortex_43_131072_embeddings.npy')

In [4]:
embeddings.shape

(131072, 768)

In [4]:
# run pca to get explained variance
from sklearn.decomposition import PCA
pca = PCA(n_components=256)
pca.fit(embeddings)
print(pca.explained_variance_ratio_.cumsum())

[0.04161603 0.07500422 0.10545201 0.13115078 0.15446855 0.17644582
 0.19645111 0.21355434 0.23053718 0.24664475 0.26172035 0.27602487
 0.28913602 0.3021988  0.31459717 0.32611907 0.33735315 0.3481372
 0.35838842 0.36826377 0.37781801 0.38725323 0.39614332 0.40465277
 0.4129363  0.42091202 0.42873161 0.43625778 0.44357414 0.45072301
 0.45760817 0.46433947 0.47088953 0.47724658 0.4835029  0.48959251
 0.49560119 0.50150837 0.50724732 0.51285522 0.51833798 0.52372669
 0.52904185 0.53419638 0.53929064 0.54432483 0.54928434 0.55407916
 0.55880942 0.56345253 0.56801047 0.57250069 0.57696082 0.58123531
 0.58549071 0.58962495 0.59368873 0.59771049 0.60167826 0.60562206
 0.60954342 0.61338836 0.61719923 0.62091934 0.62460185 0.62822375
 0.63177549 0.63527651 0.63873708 0.64216197 0.64554361 0.64887523
 0.6521141  0.65531605 0.65849627 0.66163976 0.66472084 0.66775257
 0.67075244 0.67370301 0.67662345 0.67952911 0.68242249 0.68529409
 0.68813402 0.69092322 0.69368414 0.6963964  0.69908451 0.70176

In [4]:
from sklearn.cluster import MiniBatchKMeans

In [27]:
kmeans = MiniBatchKMeans(n_clusters=256, max_iter=1000)
kmeans.fit(embeddings)

# get size of each cluster
unique, counts = np.unique(kmeans.labels_, return_counts=True)
percentages = counts / np.sum(counts)
print(counts)
print(unique)
print()
# get percent of total in each cluster, sorted
order = np.argsort(percentages)
print(percentages[order][::-1])

[ 582  474  694 1100  750  442  661  625  555  473  712  818 1108 1073
  874  472  823  454  736  642  426  752  810  318 1201  397  505  609
  952 1022  531  938  494 1879 1586  886  213  555  595  312  588  657
  849  248 1279 1034  558  435  427  863  212  760  292  843  160  355
  349  211  300  340  391  193  165  906  301  785  408  681  350  568
 1056  140  524  197 1642  922  574  280  461  295  241  629  654  816
  179  409 1153  402  522  437  357  407  714   77  752  270  662  186
  782 1037  694  589  308  556  254  122  483  388  399   69  572  615
  421  384  598  176  669  900  505  326  700  374  477 1048  240  374
 1036  544  100  331 1189  729  332  624  267   50  192  460  114 1019
  839  990  324  531  340  803  313  101  480  215 1254 1220  584  358
  332  549  339  185  112  213  686  339  192 1283  693  441  896  600
  955  157  129  207  470  431  521  577  313  676  874  177  415  252
  342  975  177  494  416  103  464  234  396  301  281  561  229  167
 1334 

In [26]:
# create function that derives x 'perfect' batches from the data, where each batch has a number of samples from each cluster according to the percentages
def get_perfect_batches(data, labels, percentages, batch_size, n_batches):
    unique, counts = np.unique(labels, return_counts=True)
    percentages = counts / np.sum(counts)
    n_samples_per_batch_per_cluster = percentages * batch_size
    perfect_batches = []
    for i in range(n_batches):
        batch = []
        for j in range(len(n_samples_per_batch_per_cluster)):
            n = int(np.ceil(n_samples_per_batch_per_cluster[j]))
            boo = labels == unique[j]
            arang = np.arange(len(data))[boo]
            p = np.random.permutation(len(arang))
            cluster_samples_idx = arang[p]
            x = 0
            for sample_idx in cluster_samples_idx:
                sample = data[sample_idx]
                if sample not in batch and sample not in perfect_batches:
                    x += 1
                    batch.append(sample)
                if x >= n:
                    break
        p = np.random.permutation(len(batch))
        batch = np.array(batch)[p[:batch_size]].tolist() # in case we overshot
        perfect_batches += batch
    return np.array(perfect_batches).reshape(n_batches, batch_size, -1)

def get_random_batches(data, batch_size, n_batches):
    random_batches = []
    for i in range(n_batches):
        p = np.random.permutation(len(data))
        batch = data[p[:batch_size]]
        random_batches.append(batch)
    return np.stack(random_batches)

In [30]:
kmeans = MiniBatchKMeans(n_clusters=32, max_iter=1000)
kmeans.fit(embeddings)

# get size of each cluster
unique, counts = np.unique(kmeans.labels_, return_counts=True)
percentages = counts / np.sum(counts)

# test embedding variances for different batch sizes
sample_size = 256
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
variances = []
random_variances = []
for batch_size in batch_sizes:
    perfect_batches = get_perfect_batches(embeddings, kmeans.labels_, percentages, batch_size, sample_size)
    random_batches = get_random_batches(embeddings, batch_size, sample_size)
    variances.append(perfect_batches.mean(axis=1).std(axis=0).mean())
    random_variances.append(random_batches.mean(axis=1).std(axis=0).mean())
    print(f"batch size: {batch_size}\tperfect variance: {variances[-1]}\trandom variance: {random_variances[-1]}")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [15]:
kmeans = MiniBatchKMeans(n_clusters=256, max_iter=1000)
kmeans.fit(embeddings)

# get size of each cluster
unique, counts = np.unique(kmeans.labels_, return_counts=True)
percentages = counts / np.sum(counts)

# test embedding variances for different batch sizes
sample_size = 256
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
variances = []
random_variances = []
for batch_size in batch_sizes:
    perfect_batches = get_perfect_batches(embeddings, kmeans.labels_, percentages, batch_size, sample_size)
    random_batches = get_random_batches(embeddings, batch_size, sample_size)
    variances.append(perfect_batches.mean(axis=1).std(axis=0).mean())
    random_variances.append(random_batches.mean(axis=1).std(axis=0).mean())
    print(f"batch size: {batch_size}\tperfect variance: {variances[-1]}\trandom variance: {random_variances[-1]}")

batch size: 1	perfect variance: 0.03315884247422218	random variance: 0.03285451605916023
batch size: 2	perfect variance: 0.023576922714710236	random variance: 0.0232460405677557
batch size: 4	perfect variance: 0.016451319679617882	random variance: 0.01637638360261917
batch size: 8	perfect variance: 0.011713641695678234	random variance: 0.01170428842306137
batch size: 16	perfect variance: 0.008184890262782574	random variance: 0.008278141729533672
batch size: 32	perfect variance: 0.005673990119248629	random variance: 0.005831896793097258
batch size: 64	perfect variance: 0.003845666768029332	random variance: 0.004118447657674551
batch size: 128	perfect variance: 0.002521816873922944	random variance: 0.0029091264586895704
batch size: 256	perfect variance: 0.0016723753651604056	random variance: 0.0020459347870200872
batch size: 512	perfect variance: 0.0011182003654539585	random variance: 0.0014570127241313457
batch size: 1024	perfect variance: 0.0007626183796674013	random variance: 0.001029

In [28]:
new_dataset = get_perfect_batches(big_dataset, kmeans.labels_, percentages, 256, (16384+512)//256)

In [29]:
new_dataset = new_dataset.reshape(-1).tolist()
with open('data/perf_43_16896_256.json', 'w') as f:
    json.dump(new_dataset, f)

In [31]:
new_dataset = get_perfect_batches(big_dataset, kmeans.labels_, percentages, 32, (16384+512)//32)
new_dataset = new_dataset.reshape(-1).tolist()
with open('data/perf_43_16896_32.json', 'w') as f:
    json.dump(new_dataset, f)

In [27]:
from sklearn.cluster import MiniBatchKMeans

def kmeans_percent_attack(n_clusters=2048, n_attack_clusters=128, percent=0.75):
    init = 'k-means++' if n_attack_clusters <= 1024 else 'random'
    n_init = 3 if n_attack_clusters <= 1024 else 10
    kmeans_model = MiniBatchKMeans(n_clusters=n_attack_clusters, init=init, n_init=n_init, batch_size=1024, verbose=True, max_iter=1000)
    kmeans_model.fit(embeddings)
    # print the min max and mean cluster counts
    cluster_counts = np.bincount(kmeans_model.labels_)
    print(f'Min cluster size: {cluster_counts.min()}')
    print(f'Max cluster size: {cluster_counts.max()}')
    print(f'Mean cluster size: {cluster_counts.mean()}')
    # select the n largest clusters that make up percent of the data
    cluster_counts = np.bincount(kmeans_model.labels_)
    sorted_cluster_counts = np.argsort(cluster_counts)

    print(kmeans_model.labels_[:10])
    print(cluster_counts[:10])
    print(sorted_cluster_counts[-10:])


    n = 0
    total = 0
    for i in range(1, len(cluster_counts)):
        total += cluster_counts[sorted_cluster_counts[-i]]
        if total > len(embeddings) * percent:
            n = i
            break
    print(f'Using {n} clusters to make up {percent*100}% of the data, {total} samples')
    largest_clusters = sorted_cluster_counts[-n:]

    # get samples from the largest clusters
    subset_embeds = []
    subset_samples = []
    for c in largest_clusters:
        start = 0
        for boo in kmeans_model.labels_ == c:
            if boo:
                subset_embeds.append(embeddings[start])
                subset_samples.append(big_dataset[start])
            start += 1
    subset_embeds = np.array(subset_embeds)
    subset_samples = np.array(subset_samples)

    # save a random 8192+512 subset of subset_samples
    sp = np.random.permutation(len(subset_samples))
    with open(f'data/atk_43_{8192+512}_0.json', 'w') as f:
        json.dump(subset_samples[sp][:8192+512].tolist(), f)
    if len(subset_samples) > (8192+512)*2:
        with open(f'data/atk_43_{8192+512}_1.json', 'w') as f:
            json.dump(subset_samples[sp][8192+512:8192+512*2].tolist(), f)

    create_kmeans_subset_dataset(n_clusters, subset_samples, subset_embeds, name=f'atk_43_{n_clusters}')
    if n_clusters >= 8192:
        create_kmeans_subset_dataset(2048, subset_samples, subset_embeds, name=f'atk_43_{2048}')
    if n_clusters >= 2048:
        create_kmeans_subset_dataset(512, subset_samples, subset_embeds, name=f'atk_43_{512}')



def create_kmeans_subset_dataset(n_clusters, dataset, embeddings, name='ddc_43'):
    # Initialize the KMeans model
    init = 'k-means++' if n_clusters <= 1024 else 'random'
    n_init = 3 if n_clusters <= 1024 else 10
    kmeans_model = MiniBatchKMeans(n_clusters=n_clusters, init=init, n_init=n_init, batch_size=1024, verbose=True, max_iter=1000)

    # Fit the model to the training embeddings
    kmeans_model.fit(embeddings)

    # print the min max and mean cluster counts
    cluster_counts = np.bincount(kmeans_model.labels_)
    print(f'Min cluster size: {cluster_counts.min()}')
    print(f'Max cluster size: {cluster_counts.max()}')
    print(f'Mean cluster size: {cluster_counts.mean()}')

    # Get the cluster centers
    cluster_centers = kmeans_model.cluster_centers_

    # get the closest sentence to each cluster center
    print('Finding closest sentences to cluster centers')
    closest_sentences = []
    n = 0
    for center in cluster_centers:
        closest = np.argmin(np.linalg.norm(embeddings - center, axis=1))
        closest_sentences.append(dataset[closest])
        n += 1
        if n % (n_clusters  // 32) == 0:
            print('.', end='')


    # save as a new dataset
    with open(f'data/{name}_{n_clusters}.json', 'w') as f:
        json.dump(closest_sentences, f)
    print(f'Saved {n_clusters} samples to data/ddc_43_{n_clusters}.json')

In [22]:
embeddings[[0,1,2,3,4,5,6,7,8,9]]

array([[-0.04811173,  0.02071453,  0.00627764, ..., -0.03249869,
        -0.04462671, -0.01749344],
       [-0.01551625,  0.02464623,  0.02394607, ..., -0.01448496,
         0.02658922,  0.02581609],
       [ 0.03815261,  0.08042033,  0.03174035, ..., -0.03234309,
         0.00742697,  0.01021684],
       ...,
       [ 0.04100377, -0.04117557,  0.00125982, ...,  0.05581007,
         0.0278706 , -0.01357377],
       [ 0.0198206 ,  0.0703216 ,  0.01119775, ...,  0.00825845,
         0.02207336, -0.03420079],
       [ 0.03043545, -0.04029011,  0.0157929 , ...,  0.01468168,
         0.00628204, -0.02334514]], dtype=float32)

In [28]:
kmeans_percent_attack(n_clusters=8192, n_attack_clusters=512, percent=0.75)

Init 1/3 with method k-means++
Inertia for init 1/3: 1797.5601806640625
Init 2/3 with method k-means++
Inertia for init 2/3: 1794.9169921875
Init 3/3 with method k-means++
Inertia for init 3/3: 1757.19921875
[MiniBatchKMeans] Reassigning 128 cluster centers.
Minibatch step 1/128000: mean batch inertia: 0.5463858842849731
Minibatch step 2/128000: mean batch inertia: 0.5274631381034851, ewa inertia: 0.5274631381034851
Minibatch step 3/128000: mean batch inertia: 0.45249855518341064, ewa inertia: 0.5262918254317654
Minibatch step 4/128000: mean batch inertia: 0.45633962750434875, ewa inertia: 0.5251988306780377
Minibatch step 5/128000: mean batch inertia: 0.421335369348526, ewa inertia: 0.5235759764761592
Minibatch step 6/128000: mean batch inertia: 0.40969088673591614, ewa inertia: 0.5217965355250249
Minibatch step 7/128000: mean batch inertia: 0.4173332750797272, ewa inertia: 0.5201643095334633
Minibatch step 8/128000: mean batch inertia: 0.40994003415107727, ewa inertia: 0.518442068370

KeyboardInterrupt: 

In [36]:
n_attack_clusters = 128
percent = 0.75
n_clusters = 8192

init = 'k-means++' if n_attack_clusters <= 1024 else 'random'
n_init = 3 if n_attack_clusters <= 1024 else 10
kmeans_model = MiniBatchKMeans(n_clusters=n_attack_clusters, init=init, n_init=n_init, batch_size=1024, verbose=True, max_iter=1000)
kmeans_model.fit(embeddings)
# print the min max and mean cluster counts
cluster_counts = np.bincount(kmeans_model.labels_)
print(f'Min cluster size: {cluster_counts.min()}')
print(f'Max cluster size: {cluster_counts.max()}')
print(f'Mean cluster size: {cluster_counts.mean()}')
# select the n largest clusters that make up percent of the data
print(np.unique(kmeans_model.labels_, return_counts=True))
cluster_counts = np.unique(kmeans_model.labels_, return_counts=True)[-1]
sorted_cluster_counts = cluster_counts[np.argsort(cluster_counts)]

print(kmeans_model.labels_[:10])
print(cluster_counts[:10])
print(sorted_cluster_counts[-10:])

sorted_clusters = np.arange(n_attack_clusters)[np.argsort(cluster_counts)]

n = 0
total = 0
for i in range(1, len(cluster_counts)):
    total += sorted_cluster_counts[-i]
    print(total)
    print(len(embeddings) * percent)
    if total > len(embeddings) * percent:
        n = i
        print(i)
        break
print(f'Using {n} clusters to make up {percent*100}% of the data, {total} samples')
largest_clusters = sorted_clusters[-n:]

# get samples from the largest clusters
subset_embeds = []
subset_samples = []
for c in largest_clusters:
    start = 0
    for boo in kmeans_model.labels_ == c:
        if boo:
            subset_embeds.append(embeddings[start])
            subset_samples.append(big_dataset[start])
        start += 1
subset_embeds = np.array(subset_embeds)
subset_samples = np.array(subset_samples)

# save a random 8192+512 subset of subset_samples
sp = np.random.permutation(len(subset_samples))
with open(f'data/atk_43_{8192+512}_0.json', 'w') as f:
    json.dump(subset_samples[sp][:8192+512].tolist(), f)
if len(subset_samples) > (8192+512)*2:
    with open(f'data/atk_43_{8192+512}_1.json', 'w') as f:
        json.dump(subset_samples[sp][8192+512:8192+512*2].tolist(), f)

Init 1/3 with method k-means++
Inertia for init 1/3: 2575.1611328125
Init 2/3 with method k-means++
Inertia for init 2/3: 2581.126708984375
Init 3/3 with method k-means++
Inertia for init 3/3: 2579.510498046875
[MiniBatchKMeans] Reassigning 1 cluster centers.
Minibatch step 1/128000: mean batch inertia: 0.8609110713005066
Minibatch step 2/128000: mean batch inertia: 0.5855225920677185, ewa inertia: 0.5855225920677185
Minibatch step 3/128000: mean batch inertia: 0.5509884357452393, ewa inertia: 0.5849829999919406
Minibatch step 4/128000: mean batch inertia: 0.5340808629989624, ewa inertia: 0.5841876601693866
[MiniBatchKMeans] Reassigning 1 cluster centers.
Minibatch step 5/128000: mean batch inertia: 0.5416024327278137, ewa inertia: 0.5835222710671281
Minibatch step 6/128000: mean batch inertia: 0.5301661491394043, ewa inertia: 0.5826885880225042
Minibatch step 7/128000: mean batch inertia: 0.5324215888977051, ewa inertia: 0.5819031721534268
Minibatch step 8/128000: mean batch inertia: 

In [37]:
subset_samples

array([{'instruction': 'Analyze the impact of colonization on indigenous cultures and societies around the world.', 'response': "The impact of colonization on indigenous cultures and societies around the world has been profound and multifaceted, with long-lasting effects that are still felt today. The following points provide an analysis of these impacts:\n\n1. Loss of Land and Sovereignty: Indigenous peoples often suffered from the expropriation of their ancestral lands, which were central to their livelihoods, culture, and identity. This loss of land meant a loss of sovereignty and self-determination, as colonial powers imposed new political and legal systems.\n\n2. Cultural Erasure and Assimilation: Colonizers frequently attempted to erase indigenous cultures, languages, and religions, imposing their own cultural norms and practices. This was often done through policies of assimilation, such as residential schools in Canada or the Stolen Generations in Australia, where indigenous ch

In [39]:
subset_embeds

array([[-0.05884134,  0.05181276, -0.00371584, ...,  0.01626306,
        -0.01463386,  0.07387857],
       [-0.01399518,  0.03927134, -0.00403549, ...,  0.00412287,
         0.00315406,  0.04916049],
       [-0.06582557,  0.06758276, -0.00333497, ...,  0.04329142,
         0.00593905,  0.06452539],
       ...,
       [ 0.02079013,  0.05840173,  0.01824759, ...,  0.06583481,
         0.04182463,  0.04024681],
       [-0.0162323 ,  0.05460132, -0.00421669, ...,  0.03332208,
         0.03374774, -0.01321412],
       [-0.00055052,  0.04371648, -0.0277775 , ...,  0.05154691,
         0.00015421,  0.01283839]], dtype=float32)

In [9]:
with open('data/alpaca_data_cleaned.json', 'r') as f:
    alpaca_data = json.load(f)

reformatted_alpaca_data = []
for item in alpaca_data:
    if item['input'] == "":
        reformatted_alpaca_data.append({"instruction": item['instruction'], "response": item['output']})
reformatted_alpaca_data = np.array(reformatted_alpaca_data)

p = np.random.permutation(len(reformatted_alpaca_data))
# save the dataset as a json file
with open('data/alpaca_2048.json', 'w') as f:
    json.dump(reformatted_alpaca_data[p][:2048].tolist(), f)

In [31]:
with open('data/ExpertRevisionDataset.json', 'r') as f:
    ERD = json.load(f)

ref_ERD = []
for item in ERD:
    if item['Revised Instruction'] != "" and item['Revised Response'] != "":
        if item['Revised Input'] == "":
            ref_ERD.append({"instruction": item['Revised Instruction'], "response": item['Revised Response']})
        else:
            ref_ERD.append({"instruction": item['Revised Instruction'] + " " + item['Revised Input'], "response": item['Revised Response']})
ref_ERD = np.array(ref_ERD)

with open('data/ERD.json', 'w') as f:
    json.dump(ref_ERD.tolist(), f)
with open('data/ERD_2048.json', 'w') as f:
    json.dump(ref_ERD[:2048].tolist(), f)

In [27]:
import pandas as pd
train_unp = pd.read_json(path_or_buf='data/lima_train.jsonl', lines=True)
test_unp = pd.read_json(path_or_buf='data/lima_test.jsonl', lines=True)

lima_train = []
for conv in train_unp['conversations']:
    if len(conv) == 2:
        inst, resp = conv
    else:
        inst = conv[0]
        resp = conv[1]
    lima_train.append({"instruction": inst, "response": resp})
lima_train = np.array(lima_train)
len(lima_train)

with open('data/lima_train_trunc.json', 'w') as f:
    json.dump(lima_train.tolist(), f)

In [5]:
import pandas as pd
train_unp = pd.read_json(path_or_buf='data/openorcaselect.jsonl', lines=True)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
train_unp['conversations']

0         [{'from': 'system', 'value': 'You are an AI as...
1         [{'from': 'system', 'value': 'You are an AI as...
2         [{'from': 'system', 'value': 'You are an AI as...
3         [{'from': 'system', 'value': 'You are a helpfu...
4         [{'from': 'system', 'value': 'You are an AI as...
                                ...                        
517977    [{'from': 'system', 'value': 'You are an AI as...
517978    [{'from': 'system', 'value': 'You are an AI as...
517979    [{'from': 'system', 'value': 'You are an AI as...
517980    [{'from': 'system', 'value': 'You are an AI as...
517981    [{'from': 'system', 'value': 'You are an AI as...
Name: conversations, Length: 517982, dtype: object

In [15]:
max_len_inst = 0
max_len_resp = 0
shortest_inst = 99999
shortest_resp = 99999
for conv in agg_old_data:
    if len(conv['instruction']) > max_len_inst:
        max_len_inst = len(conv['instruction'])
    if len(conv['response']) > max_len_resp:
        max_len_resp = len(conv['response'])
    if len(conv['instruction']) < shortest_inst:
        shortest_inst = len(conv['instruction'])
    if len(conv['response']) < shortest_resp:
        shortest_resp = len(conv['response'])

max_len_inst, max_len_resp, shortest_inst, shortest_resp

(642, 18253, 10, 3)

In [14]:
train = []
for conv in train_unp['conversations']:
    if len(conv) > 2:
        if conv[0]['from'] == 'system':
            conv = conv[1:3]
        else:
            conv = conv[0:2]
    inst, resp = conv
    inst = inst['value']
    resp = resp['value']

    if len(inst) > 640:
        continue
    if len(resp) + len(inst) > 1024 * 3:
        continue

    train.append({"instruction": inst, "response": resp})
len(train)

# slice out a random subset and save it
train = np.array(train)
p = np.random.permutation(len(train))
with open('data/openorcaselect_2048.json', 'w') as f:
    json.dump(train[p][:2048].tolist(), f)

302654

In [40]:
create_kmeans_subset_dataset(n_clusters, subset_samples, subset_embeds, name=f'atk_43_{n_clusters}')
if n_clusters >= 8192:
    create_kmeans_subset_dataset(2048, subset_samples, subset_embeds, name=f'atk_43_{2048}')
if n_clusters >= 2048:
    create_kmeans_subset_dataset(512, subset_samples, subset_embeds, name=f'atk_43_{512}')

Init 1/10 with method random
Inertia for init 1/10: 6253.80517578125
Init 2/10 with method random
Inertia for init 2/10: 6228.8203125
Init 3/10 with method random
Inertia for init 3/10: 6267.46142578125
Init 4/10 with method random
Inertia for init 4/10: 6265.96826171875
Init 5/10 with method random
Inertia for init 5/10: 6281.0732421875
Init 6/10 with method random
Inertia for init 6/10: 6300.67919921875
Init 7/10 with method random
Inertia for init 7/10: 6238.0849609375
Init 8/10 with method random
Inertia for init 8/10: 6237.6806640625
Init 9/10 with method random
Inertia for init 9/10: 6235.14306640625
Init 10/10 with method random
Inertia for init 10/10: 6296.681640625
[MiniBatchKMeans] Reassigning 512 cluster centers.
Minibatch step 1/96230: mean batch inertia: 0.2569865882396698
[MiniBatchKMeans] Reassigning 512 cluster centers.
Minibatch step 2/96230: mean batch inertia: 0.25935330986976624, ewa inertia: 0.25935330986976624
[MiniBatchKMeans] Reassigning 512 cluster centers.
Min