In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import shutil
import json as json
import pandas as pd
import re
import random
import numpy as np
import pickle

In [2]:
def format_array(array_str):
    """Collapse one pretty-printed array onto a single line.

    Intended as the replacement callback of ``re.sub``: ``array_str`` is the
    match object, and the returned string is what gets substituted for the
    matched text.
    """
    compact = array_str.group()
    # Applied in this exact order: strip newlines and tabs, drop every pair
    # of spaces (the indentation), then put one space after each comma.
    for old, new in (('\n', ''), ('\t', ''), ('  ', ''), (',', ', ')):
        compact = compact.replace(old, new)
    return compact

def min_close_to_second_min(exec_list, base_exec, threshold=0.10):
    """Check that the two fastest measurements of a schedule agree.

    The fastest and second-fastest times in ``exec_list`` are turned into
    speedups relative to ``base_exec``. The measurement is accepted when the
    relative drop from the best speedup to the second best does not exceed
    ``threshold``.

    Args:
        exec_list: execution times measured for one schedule.
        base_exec: reference execution time used to compute the speedups.
        threshold: largest tolerated relative gap between the two speedups.

    Returns:
        True when there are fewer than 4 measurements (no check is applied)
        or when the gap is within ``threshold``. False when the fastest time
        is zero or negative, or when the gap exceeds ``threshold``.
    """
    sorted_exec_list = sorted(exec_list)
    if len(sorted_exec_list) < 4:
        return True

    smallest, second_smallest = sorted_exec_list[0], sorted_exec_list[1]
    # A zero time cannot be turned into a speedup (it used to raise
    # ZeroDivisionError), so it is rejected together with negative times.
    if smallest <= 0:
        return False

    if base_exec == 0:
        # With a zero reference both speedups are 0 and the ratio below is
        # undefined. (sp1 - sp2) / sp1 simplifies to 1 - smallest / second,
        # which does not depend on the reference, so use that form instead.
        relative_gap = 1 - smallest / second_smallest
    else:
        sp1 = base_exec / smallest
        sp2 = base_exec / second_smallest
        relative_gap = (sp1 - sp2) / sp1

    if relative_gap > threshold:
        return False
    return True

In [3]:
# Dataset locations. `file_type` selects both the file extension and the
# (de)serializer used to read the source dataset and write the two splits.
file_type = 'pkl'
dataset_filename = '/data/kb4083/datasets/sample_20m_test.'+file_type
val_dataset_filename = '/data/kb4083/datasets/sample_20m_test_even_smaller_val.'+file_type
train_dataset_filename = '/data/kb4083/datasets/sample_20m_test_even_smaller_train.'+file_type

if file_type == 'json':
    with open(dataset_filename, 'r') as f:
        dataset_str = f.read()
    programs_dict = json.loads(dataset_str)
elif file_type == 'pkl':
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at dataset files produced by a trusted pipeline.
    with open(dataset_filename, 'rb') as f:
        programs_dict = pickle.load(f)
else:
    # Fail here instead of leaving `programs_dict` undefined (or stale from a
    # previous run of this cell) for the cells below.
    raise ValueError(f"Unsupported file_type {file_type!r}; expected 'json' or 'pkl'")

In [4]:
# Hold out a fixed fraction of the functions for validation. The dedicated
# Random(42) instance makes the selection reproducible without touching the
# global random state.
split_ratio = 0.2
func_names = list(programs_dict)
val_funcs = random.Random(42).sample(
    func_names,
    k=int(split_ratio * len(func_names)),
)

In [5]:
def _drop_noisy_schedules(schedules, base_exec):
    """Return the schedules whose measurements look reliable.

    A schedule is dropped when it has no execution times or when
    `min_close_to_second_min` rejects them against `base_exec`.
    """
    return [
        sched for sched in schedules
        if sched['execution_times'] is not None
        and min_close_to_second_min(sched['execution_times'], base_exec)
    ]


val_dict = dict()
train_dict = dict()
nb_anom = 0  # schedules dropped as noisy or lacking execution times
nb_norm = 0  # schedules kept
cpt = 0      # functions skipped because they have no schedule at all

# Set membership keeps the split O(1) per function instead of scanning the
# `val_funcs` list for every program.
val_func_set = set(val_funcs)

for func_name in tqdm(programs_dict):
    program = programs_dict[func_name]
    schedules = program['schedules_list']
    if len(schedules) < 1:
        cpt += 1
        continue

    # Reference time: best measurement of the function's first schedule.
    # NOTE(review): this raises if the first schedule has no execution times
    # (None) -- confirm the first schedule is always measured.
    base_exec = min(schedules[0]['execution_times'])

    kept_schedules = _drop_noisy_schedules(schedules, base_exec)
    nb_anom += len(schedules) - len(kept_schedules)
    nb_norm += len(kept_schedules)

    # Shallow copy so `programs_dict` keeps its unfiltered schedules; mutating
    # it in place made `base_exec` (and thus the result) change on a re-run.
    cleaned_program = dict(program)
    cleaned_program['schedules_list'] = kept_schedules

    if func_name in val_func_set:
        val_dict[func_name] = cleaned_program
    else:
        train_dict[func_name] = cleaned_program

print(len(val_dict),len(train_dict))
print("number of functions with no schedules: ", cpt)

splits_to_write = (
    (val_dataset_filename, val_dict),
    (train_dataset_filename, train_dict),
)

if file_type == 'json':
    for split_filename, split_dict in splits_to_write:
        # Serialize before opening so a failure does not truncate the file.
        dataset_dumps = json.dumps(split_dict, indent = 4)
        # Put each array back on one line to keep the file readable.
        formated_dataset_json = re.sub(r'\[[\s*\w*,\"]*\]', format_array, dataset_dumps)
        with open(split_filename, 'w') as f:
            f.write(formated_dataset_json)

elif file_type == 'pkl':
    for split_filename, split_dict in splits_to_write:
        with open(split_filename, 'wb') as f:
            pickle.dump(split_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/1836 [00:00<?, ?it/s]

367 1469
number of functions with no schedules:  0


In [3]:
def merge_datasets(datasets_filenames, output_file):
    """Merge several dataset files (json or pkl) into a single one.

    Each input is loaded as a mapping and merged in the order given. When
    the same key appears in more than one input, the value from the later
    file wins.

    Args:
        datasets_filenames: paths of the datasets to merge; each must end
            with 'json' or 'pkl'.
        output_file: destination path; its 'json' or 'pkl' ending selects
            the output format.

    Raises:
        ValueError: if an input or the output has an unsupported extension.
    """
    # Validate the destination before loading anything, so a bad output name
    # does not cost a full load and then report a file that was never written.
    if not output_file.endswith(('json', 'pkl')):
        raise ValueError(f"Unsupported output file type: {output_file!r} (expected json or pkl)")

    full_dataset_dict = dict()
    for dataset_filename in datasets_filenames:
        if dataset_filename.endswith('json'):
            with open(dataset_filename, 'r') as f:
                programs_dict = json.load(f)
        elif dataset_filename.endswith('pkl'):
            # NOTE(review): pickle.load can execute arbitrary code; only merge
            # files that come from a trusted source.
            with open(dataset_filename, 'rb') as f:
                programs_dict = pickle.load(f)
        else:
            # Without this branch `programs_dict` would be undefined for the
            # first file, or silently reuse the previous file's content.
            raise ValueError(f"Unsupported dataset file type: {dataset_filename!r} (expected json or pkl)")
        # In-place update avoids rebuilding the whole accumulator per file.
        full_dataset_dict.update(programs_dict)

    if output_file.endswith('json'):
        dataset_dumps = json.dumps(full_dataset_dict, indent = 4)
        # Put each array back on one line to keep the file readable.
        formated_dataset_json = re.sub(r'\[[\s*\w*,\"]*\]', format_array, dataset_dumps)
        with open(output_file, 'w') as f:
            f.write(formated_dataset_json)
    else:
        with open(output_file, 'wb') as f:
            pickle.dump(full_dataset_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'Created dataset file: {output_file}')

In [4]:
# Combine the three dataset batches below into one pickle file.
merge_datasets(
    datasets_filenames=[
        '/data/mm12191/datasets/dataset_batch730000-759999.pkl',
        '/data/mm12191/datasets/dataset_batch760000-780130.pkl',
        '/data/kb4083/Filter_bad_programs/dataset_batch550000-716507_filtered_bad_programs.pkl',
    ],
    output_file='/data/kb4083/datasets/dataset_12mSingleCompFiltered_4mMatrices_2.6mMultiDataRaw.pkl',
)

Created dataset file: /data/kb4083/datasets/dataset_12mSingleCompFiltered_4mMatrices_2.6mMultiDataRaw.pkl
