# Basic Block Energy Consumption Prediction Demo Notebook

This notebook contains the core guidelines and structure that were followed for the implementation of the basic block energy consumption prediction system. The source code resides inside the bb_energy_prediction directory.

The distinct machine learning models that were developed are the following:
* LSTM with PalmTree embeddings as input
* Simple Dense network with PalmTree embeddings as input
* LSTM with custom vocabulary and embedding layer 
* Linear Regression
* Lasso Regression
* Ridge Regression
* ElasticNet
* SGD
* SVR
* Hist-Gradient Boosting

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import joblib
import sys
import os
import logging
from dotenv import load_dotenv

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import optuna

from collections import Counter
from typing import Optional

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

from bb_energy_prediction import models, data_utils, train, evaluate, sklearn_regressors

load_dotenv()

In [None]:
# Display the torch device chosen by CUDA availability: 'cuda:1' when CUDA is
# available, otherwise 'cpu'.
# NOTE(review): the result is not assigned to a name, so this cell only shows
# the device; it does not by itself move any model or tensor onto it.
torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

## Data 

Read the dataset. If the dataset does not exist then it is created automatically and saved inside the data path. The dataset columns include:
* bb: The basic blocks
* energy: The energy label
* program_name: The benchmark program from which the basic block originates
* bb_embeddings: The pre-computed PalmTree embeddings

Warning: When the data file has to be created, computing the PalmTree embeddings requires several hours.

In [None]:
# Obtain the dataset DataFrame via data_utils.get_data_df, using the given
# data_path (presumably a pickled DataFrame, judging by the .pkl suffix).
data_df = data_utils.get_data_df(data_path="../energy_data/data.pkl")

For the custom embedding approach, a vocabulary must be created and the dataset must be tokenized:

In [None]:
# Build the instruction vocabulary from the dataset and report its size.
vocab = data_utils.get_inst_vocab(data_df)
print(f"size of vocabulary: {len(vocab)}")

# Encode every basic block with the vocabulary (max_insts=20 is forwarded to
# the encoder) and store the result in a new "encoded_bb" column.
data_df["encoded_bb"] = data_df["bb"].apply(
    lambda basic_block: data_utils.encode_bb_from_vocab(
        basic_block, vocab, max_insts=20
    )
)

In [None]:
# Preview 10 randomly sampled rows of the dataset.
data_df.sample(10)

Split the dataset into test and train_val sets:

In [None]:
# Shuffle all rows and hold out the last 10% as the test set.
# NOTE: no random_state is passed to sample(), so the split differs on every run.
shuffled_data_df = data_df.sample(frac=1).reset_index(drop=True)
test_size = int(0.1 * len(shuffled_data_df))

# Split on an explicit positional index. Negative slicing breaks when
# test_size == 0 (fewer than 10 rows): df[-0:] is the whole frame and df[:-0]
# is empty, which would swap the roles of the two sets.
split_idx = len(shuffled_data_df) - test_size
test_df = shuffled_data_df.iloc[split_idx:]
train_val_df = shuffled_data_df.iloc[:split_idx]

print(f"Test data size: {len(test_df)}")

## Model Training

The model_checkpoints directory contains the best attributes for the implemented models, obtained after hyperparameter optimization using Optuna.

For the deep learning approaches follow:

In [None]:
# Save and load flags:
#   load=True  -> restore the stored weights and recorded loss curves from exp_dir
#   load=False -> train a fresh model; with save=True the result is written to exp_dir
load = True
save = False

# Specify the desired model path
exp_dir = "../model_checkpoints/lstm_vocab_models/base_model"
with open(f"{exp_dir}/additional_attributes.json") as json_file:
    model_config = json.load(json_file)

# Hyperparameters stored alongside the checkpoint
model_params = model_config["model_params"]
train_params = model_config["train_params"]
batch_size = model_config["batch_size"]

# enc_type can be "vocab" or "palmtree" depending on model choice
data_loaders = data_utils.get_data_dict(
    data_df=train_val_df, batch_size=batch_size, enc_type="vocab"
)
train_loader = data_loaders["train_loader"]
val_loader = data_loaders["val_loader"]

model = models.LSTM_Regressor(vocab_size=len(vocab), custom_embs=True, **model_params)

if load:
    # Use the GPU when present, otherwise fall back to the CPU. map_location
    # remaps the stored tensors, so a checkpoint written on a GPU can still be
    # restored on a CPU-only machine (or one with a different GPU layout).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(f"{exp_dir}/model", map_location=device))
    model.to(device)
    # Reuse the loss curves recorded when the checkpoint was produced.
    train_results = {
        "train_loss": model_config["train_loss"],
        "val_loss": model_config["val_loss"],
    }
else:
    train_results = train.train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        verbose=True,
        **train_params,
    )
    if save:
        torch.save(model.state_dict(), f"{exp_dir}/model")
        additional_attributes = {
            "model_params": model_params,
            "train_params": train_params,
            "loss": "RMSE",
            "batch_size": batch_size,
            # NOTE(review): batches * batch_size; presumably an upper bound on
            # the sample count if the last batch is partial - confirm.
            "number of data": len(train_loader) * batch_size,
            "train_loss": train_results["train_loss"],
            "val_loss": train_results["val_loss"],
        }
        with open(f"{exp_dir}/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [None]:
# Plot the recorded training and validation loss curves on one figure.
plt.figure(figsize=(10, 5))
for curve_name in ("train_loss", "val_loss"):
    # The dictionary key doubles as the legend label.
    plt.plot(train_results[curve_name], label=curve_name)
plt.title("Train and Val losses")
plt.xlabel("Labels")
plt.legend()
plt.show()

For the sklearn approaches choose among the available regressors and follow:

In [None]:
# Each basic block is a sequence of strings; join it into one whitespace-separated
# document so it can be consumed by the text vectorizers.
X_train = np.array([" ".join(bb) for bb in train_val_df.bb.tolist()])
y_train = train_val_df.energy.values

X_test = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
y_test = test_df.energy.values

# Only the learned vocabulary is needed here, so fit() is enough; fit_transform()
# would also build a document-term matrix that is immediately thrown away.
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
vocab_len = len(cnt_vect.vocabulary_)
print(f"Vocab length: {vocab_len}")

In [None]:
# Flags: load a previously stored pipeline, or fit a new one and optionally save it.
load = True
save = False

# Specify the desired sklearn regressor path
exp_dir = "../model_checkpoints/regressors/svr"

# Pipeline parameters stored next to the serialized pipeline
with open(f"{exp_dir}/additional_attributes.json") as config_file:
    model_config = json.load(config_file)
pipe_params = model_config["pipe_params"]

if not load:
    # change for the desired sklearn regressor
    regressor = SVR()

    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {"pipe_params": pipe_params}

        joblib.dump(pipe, f"{exp_dir}/pipe")
        with open(f"{exp_dir}/additional_attributes.json", "w") as out_file:
            json.dump(additional_attributes, out_file, indent=4)
else:
    # joblib.load unpickles the file, so only load pipelines from a trusted source.
    pipe = joblib.load(f"{exp_dir}/pipe")

## Test prediction

Produce the energy predictions of the test set and create the evaluation visualizations:

In [None]:
# Use flag to choose between sklearn and deep learning models
deep_learning = True

if not deep_learning:
    # sklearn pipeline: predict on whitespace-joined basic blocks
    test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
    test_preds = pipe.predict(test_bbs)
elif model.custom:
    # Deep learning model with model.custom set: pass the raw basic blocks
    # together with the vocabulary
    test_preds = evaluate.predict(
        model=model, test_bbs=test_df.bb.tolist(), vocab=vocab
    )
else:
    # Deep learning model without model.custom: pass the pre-computed
    # embeddings, converted to plain lists
    test_embs = [emb.tolist() for emb in test_df.bb_embeddings.tolist()]
    test_preds = evaluate.predict(model=model, test_bbs=test_embs)

In [None]:
# Compare the predictions against the test-set energy labels.
true_energies = test_df["energy"].values
raw_mae = mean_absolute_error(true_energies, test_preds)
mae = round(raw_mae, 3)  # reported with three decimals

print(f"Mean absolute error: {mae} (*61μJ)")

In [None]:
# Side-by-side histograms of the true labels and the model predictions,
# both restricted to the range 0-10 with 50 bins.
fig, axs = plt.subplots(1, 2, figsize=(20, 5))
plt.suptitle("Basic blocks' energy histogram label data vs model preds for test set")

panels = (
    (axs[0], true_energies, 'labels'),
    (axs[1], test_preds, 'Model preds'),
)
for panel_ax, energies, panel_title in panels:
    panel_ax.hist(energies, range=(0, 10), bins=50)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Energy (*61μJ)')

plt.show()