# Imports

In [61]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

from collections import Counter

sys.path.append("../../src")

import embedder
import utils
import matplotlib.pyplot as plt

# Data

In [43]:
data_files = os.listdir("../../dataset/bb_data/")

if "energy_dataset.pkl" not in data_files:
    result_files = [f for f in os.listdir("../../../energy_dataset/") if f.endswith("results")] 
    data_df = pd.DataFrame()

    for file in result_files:
        file_df = utils.read_bb_data(f"../../../energy_dataset/{file}/breaker_code.txt", f"../../../energy_dataset/{file}/breaker_final_energy.txt")
        file_df = utils.preprocess_bb_df(file_df)
        data_df = pd.concat([data_df, file_df], ignore_index=True)

    data_df["bb_embeddings"] = data_df.bb.apply(lambda x: embedder.encode(x))
    data_df.to_pickle("../../dataset/bb_data/energy_dataset.pkl")
    
else:
    data_df = pd.read_pickle("../../dataset/bb_data/energy_dataset.pkl")
    if "bb_embeddings" not in data_df.columns:
        data_df["bb_embeddings"] = data_df.bb.apply(lambda x: embedder.encode(x))
        data_df.to_pickle("../../dataset/bb_data/energy_dataset.pkl")

In [44]:
counts = Counter(inst for bb in data_df.bb.tolist() for inst in set(bb))

vocab = {inst: i for i, (inst, _) in enumerate(counts.most_common(20000), start=1)}
vocab["UNK"] = 0
vocab["PAD"] = vocab[max(vocab, key=vocab.get)] + 1

data_df["encoded_bb"] = data_df.bb.apply(lambda x: utils.encode_bb_from_vocab(x, vocab, max_insts=20))

In [45]:
data_df.head(5)

Unnamed: 0,bb,energy,bb_embeddings,encoded_bb
0,[movl $0x0],0.645165,"[[1.0268462, 0.028481035, 2.7335312, 0.5859081...","[62, 1808, 1808, 1808, 1808, 1808, 1808, 1808,..."
1,"[callq, pushq %rbp, mov %rsp %rbp]",2.910575,"[[0.11681142, -0.7584041, 2.0799158, 0.3227231...","[11, 20, 97, 1808, 1808, 1808, 1808, 1808, 180..."
2,"[popq %rbp, retq]",0.997157,"[[1.0268458, 0.028480439, 2.7335315, 0.5859076...","[14, 6, 1808, 1808, 1808, 1808, 1808, 1808, 18..."
3,[movl $0x0],0.664772,"[[1.0268462, 0.028481035, 2.7335312, 0.5859081...","[62, 1808, 1808, 1808, 1808, 1808, 1808, 1808,..."
4,"[cmpl $0x64, jnl 0x2c]",1.400952,"[[1.0268458, 0.028480439, 2.7335315, 0.5859076...","[555, 556, 1808, 1808, 1808, 1808, 1808, 1808,..."


# Regression

In [70]:
train_df, test_df = train_test_split(data_df, test_size=0.1, random_state=42)

x_train, y_train = np.array(list(train_df.encoded_bb)), train_df.energy
x_test, y_test = np.array(list(test_df.encoded_bb)), test_df.energy

In [73]:
regressors = [LinearRegression(), SGDRegressor(), ElasticNet(), BayesianRidge(), SVR(), GradientBoostingRegressor()]
regressor_names = ["Linear Regressor", "SGD Regressor", "ElasticNet Regressor", "BayesianRidge Regressor", "SV Regressor", "GradientBoosting Regressor"]

for reg, reg_name in zip(regressors, regressor_names):
    
    reg = reg.fit(x_train, y_train)
    preds = reg.predict(x_test)

    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    print(f"\033[1m{reg_name} \033[0m")
    print(f"Test mse: {round(mse, 3)}")
    print(f"Test mae: {round(mae, 3)}\n\n")

[1mLinear Regressor [0m
Test mse: 0.436
Test mae: 0.324


[1mSGD Regressor [0m
Test mse: 2.6819470720846393e+31
Test mae: 5107716443180545.0


[1mElasticNet Regressor [0m
Test mse: 0.436
Test mae: 0.325


[1mBayesianRidge Regressor [0m
Test mse: 0.436
Test mae: 0.324


