### Specify the path to the dataset you want to modify and where to save it


In [1]:
# NOTE: run the helper function cells (under "Helper functions") first;
# the cells below call functions defined there.
# Alternative dataset path, currently disabled:
# dataset_path = "/data/mm12191/datasets/dataset_batch550000-838143.pkl"
# Path of the dataset to read; its extension (json or pkl) selects how it is loaded.
dataset_path = "/data/kb4083/datasets/cleaned_str/dataset_expr_batch550000-838143_train.pkl"
# Path where the modified dataset should be saved, given as the file name
# WITHOUT the extension ("." + new_extension is appended when saving).
save_path = "/data/kb4083/datasets/active_learning/dataset_expr_batch550000-838143_train"
# Extension, and therefore format, of the saved file: "json" or "pkl".
new_extension = "pkl"

### Read the current dataset

In [2]:
import json
import pickle
# Load the dataset into `programs_dict`, choosing the parser from the file
# extension of `dataset_path`.
if dataset_path.endswith("json"):
    # JSON datasets are plain text; parse directly from the open file handle.
    with open(dataset_path, "r") as f:
        programs_dict = json.load(f)
elif dataset_path.endswith("pkl"):
    # NOTE(security): unpickling can execute arbitrary code. Only load pickle
    # files that come from a trusted source.
    with open(dataset_path, "rb") as f:
        programs_dict = pickle.load(f)
else:
    # Fail here instead of with a NameError on `programs_dict` in a later cell.
    raise ValueError(
        f"Unsupported dataset format for {dataset_path!r}: "
        "expected a path ending in 'json' or 'pkl'"
    )

### Apply modifications

In [None]:
import numpy as np
from tqdm import tqdm
# Error metrics to record for every schedule. Each selected metric adds a
# "model_error_<metric>" entry to the schedule's dictionary.
metrics = ["abs_diff", "sq_diff"]
functions_list = list(programs_dict.keys())
new_programs_dict = {}
model = load_model_weights("/data/kb4083/cost_model/weights/best_model_release_code_model_555f.pt")
for function_name in tqdm(functions_list):
    program_dict = programs_dict[function_name]

    # Initial execution time of the program, used to compute the measured
    # speedups (initial exec time / transformed exec time).
    program_exec_time = program_dict["initial_execution_time"]

    # Shallow-copy the program entry and give the copy a fresh schedule list,
    # so the schedules stored in the loaded dataset are not modified.
    new_program_dict = program_dict.copy()
    new_program_dict["schedules_list"] = []
    new_programs_dict[function_name] = new_program_dict

    # For each schedule (sequence of transformations) collected for this function
    for schedule_index, schedule in enumerate(program_dict["schedules_list"]):
        # Work on a copy of the schedule's JSON representation
        schedule_json = schedule.copy()

        # Transformed execution time: the minimum of the recorded measurements
        sched_exec_time = np.min(schedule_json["execution_times"])
        assert sched_exec_time != 0

        # Measured speedup of this schedule over the initial program
        speed_up = program_exec_time / sched_exec_time
        # The first schedule is expected to have a speedup of exactly 1
        # (presumably it is the untransformed program).
        if schedule_index == 0:
            assert speed_up == 1

        # Get the prediction of the model for this schedule
        predicted_speedup = get_model_prediction(model, program_dict, schedule_json)
        schedule_json["model_prediction"] = predicted_speedup

        # Calculate the error we want the active learning model to predict
        if "abs_diff" in metrics:
            schedule_json["model_error_abs_diff"] = abs(predicted_speedup - speed_up)
        if "sq_diff" in metrics:
            schedule_json["model_error_sq_diff"] = (predicted_speedup - speed_up) ** 2

        # Save the new datapoint
        new_program_dict["schedules_list"].append(schedule_json)
        

  1%|          | 1367/131385 [2:18:57<280:53:36,  7.78s/it] 

In [None]:
# Completion marker: presumably queued right after the long-running cell above
# so that this output signals that it has finished.
print("done")

### Save the modified dataset

In [None]:
# Write the modified dataset to "<save_path>.<new_extension>" in the format
# selected by `new_extension`.
path = save_path + "." + new_extension
if new_extension == "json":
    with open(path, "w") as outfile:
        json.dump(new_programs_dict, outfile)
elif new_extension == "pkl":
    with open(path, "wb") as handle:
        pickle.dump(new_programs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Without this branch an unsupported extension would silently save nothing.
    raise ValueError(
        f"Unsupported output extension {new_extension!r}: expected 'json' or 'pkl'"
    )

### Helper functions

In [3]:
import sys
import torch
sys.path.insert(0, '/data/kb4083/tiramisu/tutorials/tutorial_autoscheduler/model/')
from json_to_tensor import get_representation_template, get_schedule_representation, seperate_vector
from hier_lstm import Model_Recursive_LSTM_v2

In [4]:
from hydra import initialize, compose
from omegaconf import OmegaConf

def load_model_weights(model_weights_path):
    """Build the cost model from the Hydra config and load trained weights.

    Args:
        model_weights_path: Path of the saved state dict, passed to
            ``torch.load``.

    Returns:
        The ``Model_Recursive_LSTM_v2`` instance with the weights loaded
        (mapped to the CPU) and switched to evaluation mode.
    """
    # The model hyperparameters are read from the cost_model repo's config.
    with initialize(config_path="../cost_model/conf/"):
        cfg = compose(config_name='config.yaml')

        # Instantiate the network with the configured layer sizes.
        network = Model_Recursive_LSTM_v2(
            input_size=cfg.model.input_size,
            comp_embed_layer_sizes=list(cfg.model.comp_embed_layer_sizes),
            drops=list(cfg.model.drops),
            loops_tensor_size=8,
        )

        # Map the stored tensors to the CPU while loading the trained weights.
        cpu = torch.device('cpu')
        state_dict = torch.load(model_weights_path, map_location=cpu)
        network.load_state_dict(state_dict)
        network.eval()

    return network

In [5]:
# Maximum number of nested loops
MAX_DEPTH = 5


def get_model_prediction(model, program_dict,  sched_json):
    """Return the model's predicted speedup for one schedule of a program.

    Args:
        model: The loaded cost model; it is called on the assembled input tuple.
        program_dict: Dataset entry of the program. Its "program_annotation"
            and the first element of its "schedules_list" are used to build
            the representation template.
        sched_json: The schedule to evaluate.

    Returns:
        The first element of the model output, converted to a Python scalar.
    """
    program_json = program_dict["program_annotation"]
    # The first schedule of the program serves as the reference ("no schedule")
    # entry for the representation helpers.
    no_sched_json = program_dict["schedules_list"][0]
    (
        prog_tree,
        comps_repr_templates_list,
        loops_repr_templates_list,
        comps_placeholders_indices_dict,
        loops_placeholders_indices_dict,
        comps_expr_tensor,
        comps_expr_lengths,
    ) = get_representation_template(program_json, no_sched_json, MAX_DEPTH)
    comps_tensor, loops_tensor = get_schedule_representation(
        program_json,
        no_sched_json,
        sched_json,
        comps_repr_templates_list,
        loops_repr_templates_list,
        comps_placeholders_indices_dict,
        loops_placeholders_indices_dict,
        MAX_DEPTH,
    )

    # comps_tensor is 3-D; only its first two dimensions are needed below.
    batch_size, num_comps, _ = comps_tensor.shape

    # Flatten the first two dimensions so each computation vector can be split.
    flat_comps = comps_tensor.view(batch_size * num_comps, -1)

    (first_part, vectors, third_part) = seperate_vector(
        flat_comps, num_transformations=4, pad=False
    )

    # Restore the (batch, computation) leading dimensions on the outer parts.
    first_part = first_part.view(batch_size, num_comps, -1)
    third_part = third_part.view(batch_size, num_comps, -1)

    tree_tensor = (
        prog_tree,
        first_part,
        vectors,
        third_part,
        loops_tensor,
        comps_expr_tensor,
        comps_expr_lengths,
    )

    # Inference only: skip autograd bookkeeping, and call the module itself
    # rather than .forward() so that any registered hooks run.
    with torch.no_grad():
        speedup = model(tree_tensor)
    return speedup.detach().numpy()[0].item()