# Import

In [1]:
import json
import os
import numpy as np
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor

from bb_energy_prediction import models, evaluate, train, data_utils, sklearn_regressors

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report whether PyTorch can use a CUDA device from this kernel.
torch.cuda.is_available()

True

# Setup

In [3]:
# Load the data frame and derive the instruction vocabulary from it, then add an
# `encoded_bb` column with every basic block encoded against that vocabulary.
data_df = data_utils.get_data_df(data_path="../energy_data/data.pkl")
vocab = data_utils.get_inst_vocab(data_df)


def _encode_bb(bb):
    """Encode a single basic block with the notebook-level `vocab` (max_insts=20)."""
    return data_utils.encode_bb_from_vocab(bb, vocab, max_insts=20)


data_df["encoded_bb"] = data_df["bb"].apply(_encode_bb)

# Show a few random rows as a sanity check of the new column.
data_df.sample(3)

Unnamed: 0,bb,energy,program_name,bb_embeddings,encoded_bb
456649,"[mov %r15d %ebx, and $0x2 %ebx, jnz 0x425]",0.129805,variable_name,"[[2.5854802, -0.6781732, 0.8800236, 0.4501543,...","[346, 347, 348]"
548810,"[sub $0x1 %rax, cmpb $0xa (%rax), jz 0x111]",0.151438,variable_name,"[[1.4023652, 1.6352037, 2.4805996, 1.5314025, ...","[9, 11, 10]"
423764,"[mov %rbp %rdx, mov %r12 %rsi, mov %rbx %rdi, ...",0.757193,variable_name,"[[2.5854805, -0.6781729, 0.8800227, 0.45015368...","[504, 66, 292, 14, 3, 35, 431, 24, 21, 16, 68,..."


In [4]:
# Total number of rows (basic blocks) in data_df.
len(data_df)

561654

In [5]:
# Number of rows per program; used below to size the held-out test groups.
program_data_size = data_df["program_name"].value_counts()

In [6]:
# Display the per-program row counts (value_counts sorts them in descending order).
program_data_size

variable_name       208472
find_biggest         54529
find_dyn_sum         49427
ip_get               41078
ip                   39246
time                 23687
function_pointer     19034
quicksort            17043
print_N              15514
polinominal          14909
random               13761
bubblesort           10560
declare              10304
simple_sort           7440
parrallel_ask2_b      6412
parrallel_ask2_a      6351
reverse_number        6230
zombie                5198
game_of_life          3562
bin_to_dec            3203
swap                  2573
faa                   1228
binary_search          982
count_file             911
Name: program_name, dtype: int64

In [7]:
# Held-out groups: each inner list is the set of programs used as the test set
# for one train/evaluate round in the cells below.
test_programs_groups = [
    ["find_biggest", "binary_search", "count_file"],
    ["variable_name"],
    ["find_dyn_sum", "faa", "swap"],
    ["bin_to_dec", "ip_get", "game_of_life", "random"],
    ["ip", "zombie", "reverse_number", "parrallel_ask2_a", "parrallel_ask2_b"],
    ["simple_sort", "declare", "bubblesort", "time"],
    ["function_pointer", "quicksort", "print_N", "polinominal"],
]

# Row count of every program, kept in the same nested layout as the groups.
test_sizes = [
    [program_data_size[name] for name in group]
    for group in test_programs_groups
]

# Total number of test rows per group.
group_size = list(map(sum, test_sizes))
group_size

[56422, 208472, 53228, 61604, 63437, 51991, 66500]

In [8]:
# Seed the results store: one entry per program, initially holding only the
# program's `energy` values under the "labels" key. Predictions are added to
# these entries by the model cells further down.
program_names = np.unique(data_df.program_name)

results_dict = {
    program: {
        "labels": data_df.loc[data_df.program_name == program, "energy"].tolist()
    }
    for program in program_names
}

In [9]:
# Path of the JSON file that accumulates labels and per-model predictions.
# NOTE: despite the `_dir` suffix this is a *file* path; the name is kept because
# every cell below refers to it.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable project root.
results_dict_dir = "/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/results/program_results.json"

# Write the freshly seeded results only when nothing exists at that path yet, so
# predictions saved by earlier runs are never overwritten. The existence check no
# longer lists the parent directory (which raised FileNotFoundError when the
# results folder had not been created), and the folder is created on demand.
if not os.path.exists(results_dict_dir):
    os.makedirs(os.path.dirname(results_dict_dir), exist_ok=True)
    with open(results_dict_dir, 'w') as json_file:
        json.dump(results_dict, json_file, indent=4)

# Program Preds

## Palmtree

### LSTM

In [None]:
# Leave-one-group-out evaluation of the LSTM regressor on Palmtree embeddings.
# For every group in `test_programs_groups` a new model is trained on all other
# programs and its predictions for each held-out program are stored in the
# results JSON under "lstm_palmtree_preds".

# Model and training hyper-parameters come from the checkpoint's attribute file.
with open("/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/lstm_palmtree_models/init_lstm_model/additional_attributes.json") as config_file:
    model_config = json.load(config_file)

model_name = "lstm_palmtree"
model_params, train_params = model_config["model_params"], model_config["train_params"]

for test_programs in test_programs_groups:

    # Re-read the results file so that everything already saved in it is kept.
    with open(results_dict_dir) as results_file:
        results_dict = json.load(results_file)

    print(f"Evaluating {test_programs} benchmarks.")
    print(f"Total test data: {sum(program_data_size[prog] for prog in test_programs)}")

    # Rows of the held-out programs form the test set; the rest is train/val.
    held_out = data_df.program_name.isin(test_programs)
    test_data_df = data_df[held_out]
    train_val_df = data_df[~held_out]

    data_loaders = data_utils.get_data_dict(
        data_df=train_val_df,
        enc_type="palmtree",
        mean=False,
        batch_size=train_params["batch_size"],
        split=0.9,
    )
    train_loader, val_loader = data_loaders["train_loader"], data_loaders["val_loader"]

    # The embedding width is taken from the last axis of the first training batch.
    first_batch = next(iter(train_loader))
    embedding_size = first_batch[0].shape[-1]

    # A fresh model per held-out group.
    model = models.LSTM_Regressor(embedding_size=embedding_size, **model_params)

    train_results = train.train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        **train_params,
        verbose=False,
    )

    for test_program in test_programs:
        prog_df = test_data_df[test_data_df.program_name == test_program]
        test_bbs = [emb.tolist() for emb in prog_df.bb_embeddings]
        test_preds = evaluate.predict(model=model, test_bbs=test_bbs)
        results_dict[test_program][f"{model_name}_preds"] = test_preds.tolist()

        # Persist after every program so finished predictions survive an interruption.
        with open(results_dict_dir, 'w') as results_file:
            json.dump(results_dict, results_file, indent=4)

### Simple

In [42]:
# Development leftover: re-import the local `evaluate` module so that edits made
# to it on disk are picked up without restarting the kernel.
# NOTE(review): not needed on a fresh "Restart & Run All"; consider `%autoreload 2`
# in the setup section instead of a mid-notebook import.
from importlib import reload
evaluate=reload(evaluate)

In [43]:
# Leave-one-group-out evaluation of the simple regressor on Palmtree embeddings
# (loaders are built with mean=True). Predictions for each held-out program are
# stored in the results JSON under "simple_palmtree_preds".

# Model and training hyper-parameters come from the checkpoint's attribute file.
with open("/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/simple_palmtree_models/init_simple_model/additional_attributes.json") as config_file:
    model_config = json.load(config_file)

model_name = "simple_palmtree"
model_params, train_params = model_config["model_params"], model_config["train_params"]

# NOTE(review): only the FIRST group is evaluated here ([:1]) — this looks like a
# partial/resumed run; iterate over the full list to cover every program.
for test_programs in test_programs_groups[:1]:

    # Re-read the results file so that everything already saved in it is kept.
    with open(results_dict_dir) as results_file:
        results_dict = json.load(results_file)

    print(f"Evaluating {test_programs} benchmarks.")
    print(f"Total test data: {sum(program_data_size[prog] for prog in test_programs)}")

    # Rows of the held-out programs form the test set; the rest is train/val.
    held_out = data_df.program_name.isin(test_programs)
    test_data_df = data_df[held_out]
    train_val_df = data_df[~held_out]

    data_loaders = data_utils.get_data_dict(
        data_df=train_val_df,
        enc_type="palmtree",
        mean=True,
        batch_size=train_params["batch_size"],
        split=0.9,
    )
    train_loader, val_loader = data_loaders["train_loader"], data_loaders["val_loader"]

    # The embedding width is taken from the last axis of the first training batch.
    first_batch = next(iter(train_loader))
    embedding_size = first_batch[0].shape[-1]

    # A fresh model per held-out group.
    model = models.Simple_Regressor(embedding_size=embedding_size, **model_params)

    train_results = train.train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        **train_params,
        verbose=False,
    )

    for test_program in test_programs:
        prog_df = test_data_df[test_data_df.program_name == test_program]
        test_embs = [emb.tolist() for emb in prog_df.bb_embeddings]
        test_preds = evaluate.predict(model=model, test_bbs=test_embs)
        results_dict[test_program][f"{model_name}_preds"] = test_preds.tolist()

        # Persist after every program so finished predictions survive an interruption.
        with open(results_dict_dir, 'w') as results_file:
            json.dump(results_dict, results_file, indent=4)

Evaluating ['find_biggest', 'binary_search', 'count_file'] benchmarks.
Total test data: 56422


## Vocab

In [10]:
# Leave-one-group-out evaluation of the LSTM regressor with learned (vocabulary)
# embeddings. Predictions for each held-out program are stored in the results
# JSON under "lstm_vocab_preds".

# Model and training hyper-parameters come from the checkpoint's attribute file.
with open("/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/lstm_vocab_models/base_model/additional_attributes.json") as json_file:
    model_config = json.load(json_file)

model_params = model_config["model_params"]
train_params = model_config["train_params"]

model_name = "lstm_vocab"

# NOTE(review): the first group is skipped here ([1:]) — this looks like a resumed
# run; iterate over the full list to (re)compute predictions for every program.
for test_programs in test_programs_groups[1:]:

    # Re-read the results file so that everything already saved in it is kept.
    with open(results_dict_dir) as json_file:
        results_dict = json.load(json_file)

    print(f"Evaluating {test_programs} benchmarks.")
    print(f"Total test data: {sum(program_data_size[prog] for prog in test_programs)}")

    # Rows of the held-out programs form the test set; the rest is train/val.
    test_data_df = data_df[data_df.program_name.isin(test_programs)]
    train_val_df = data_df[~data_df.program_name.isin(test_programs)]

    data_loaders = data_utils.get_data_dict(
        data_df=train_val_df,
        enc_type="vocab",
        batch_size=train_params["batch_size"],
        split=0.9,
    )

    train_loader = data_loaders["train_loader"]
    val_loader = data_loaders["val_loader"]

    # The model is sized by the vocabulary, so no batch is drawn from the loader
    # to probe an embedding size (the value was computed here but never used).
    model = models.LSTM_Regressor(vocab_size=len(vocab), custom_embs=True, **model_params)

    train_results = train.train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        **train_params,
        verbose=False,
    )

    for test_program in test_programs:
        prog_df = test_data_df[test_data_df.program_name == test_program]
        # Raw basic blocks are passed together with the vocabulary to evaluate.predict.
        test_preds = evaluate.predict(model=model, test_bbs=prog_df.bb.values, vocab=vocab)
        results_dict[test_program][f"{model_name}_preds"] = test_preds.tolist()

        # Persist after every program so finished predictions survive an interruption.
        with open(results_dict_dir, 'w') as json_file:
            json.dump(results_dict, json_file, indent=4)

Evaluating ['variable_name'] benchmarks.
Total test data: 208472


## Regressors

In [None]:
# Names of the scikit-learn baselines to evaluate. Each name is used by the next
# cell both to locate its `model_checkpoints/regressors/<name>/` attribute file
# and to select the estimator class to build.
regressors = [
    "linear",
    "lasso",
    "ridge",
    "elasticnet",
    "sgd",
    "hist_gradient_boosting",
    "svr",
]

In [None]:
# Leave-one-group-out evaluation of the scikit-learn baselines. For every
# regressor name and every held-out group a new estimator is trained on the
# remaining programs, and its predictions for each held-out program are stored in
# the results JSON under "<regressor name>_preds".

# Estimator class for each supported regressor name. An unknown name now fails
# with a KeyError instead of silently passing the name string on as the estimator.
regressor_classes = {
    "linear": LinearRegression,
    "lasso": Lasso,
    "ridge": Ridge,
    "elasticnet": ElasticNet,
    "sgd": SGDRegressor,
    "svr": SVR,
    "hist_gradient_boosting": HistGradientBoostingRegressor,
}

for regressor in regressors:
    with open(f"/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/regressors/{regressor}/additional_attributes.json") as json_file:
        model_config = json.load(json_file)

    pipe_params = model_config["pipe_params"]
    # LinearRegression is built with its defaults; every other estimator takes
    # the parameters stored next to its checkpoint.
    regressor_params = {} if regressor == "linear" else model_config["regressor_params"]

    for test_programs in test_programs_groups:

        # Re-read the results file so that everything already saved in it is kept.
        with open(results_dict_dir) as json_file:
            results_dict = json.load(json_file)

        print(f"Evaluating {test_programs} benchmarks.")
        print(f"Total test data: {sum(program_data_size[prog] for prog in test_programs)}")

        # Rows of the held-out programs form the test set; the rest is training data.
        train_val_df = data_df[~data_df.program_name.isin(test_programs)]
        test_data_df = data_df[data_df.program_name.isin(test_programs)]

        # Each basic block becomes one space-joined string of its instructions.
        X_train = np.array([" ".join(bb) for bb in train_val_df.bb.tolist()])
        y_train = train_val_df.energy.values

        # Build a NEW estimator for every held-out group and keep it in its own
        # variable. Previously the loop variable `regressor` was overwritten with
        # the estimator instance, so from the second group onward no branch matched,
        # the first group's estimator object was reused, and the result key was
        # built from the estimator's repr instead of the regressor name.
        estimator = regressor_classes[regressor](**regressor_params)

        pipe = sklearn_regressors.train_pipe(estimator, X_train, y_train, **pipe_params)

        for test_program in test_programs:
            prog_df = test_data_df[test_data_df.program_name == test_program]
            prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
            test_preds = pipe.predict(prog_bbs)
            results_dict[test_program][f"{regressor}_preds"] = test_preds.tolist()

            # Persist after every program so finished predictions survive an interruption.
            with open(results_dict_dir, 'w') as json_file:
                json.dump(results_dict, json_file, indent=4)