New, cleaned-up notebook to generate figures in the paper.

Figures:
    - Accumulated Cost Histogram 
    - MFBO Search Dynamics: S_max vs experiment and accumulated cost vs experiment
    - Model Comparison Plot: S_max vs Cost for each type of search (SFBO, MFBO, random)
    - Search Distribution plot
    - PCA MFBO Acquisition Dynamics: MFBO Acquisitions represented in 2D PCA space
    - Feature Radar Plot

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.text as mpltxt
import matplotlib.cm as cm
import pickle # for .pkl files
import h5py   # for .jld2 files
import os
import torch
from sklearn.decomposition import PCA
import seaborn as sns
from mpl_toolkits.axes_grid1 import ImageGrid


# Global seaborn theme for every figure: 'ticks' style, 'Set2' palette,
# enlarged fonts (font_scale=1.5) and thicker default lines.
sns.set(style='ticks', palette='Set2', font_scale=1.5, rc={"lines.linewidth": 3})
# NOTE(review): despine() is called here with no arguments and before any plot
# has been drawn; it presumably only acts on the figure that is current at call
# time, so it may have no effect on figures created later -- confirm.
sns.despine()

# Default figure resolution. This must come after sns.set(), which rewrites rcParams.
plt.rcParams['figure.dpi'] = 1200
# NOTE(review): not referenced in this section; presumably read by the plotting
# cells to decide whether figures are written to disk -- confirm.
save_plots = True

In [None]:
# set of discrete fidelities (in ascending order) to select from
# NOTE(review): not referenced in this section; presumably consumed by the
# MFBO plotting cells further down -- confirm.
discrete_fidelities = [1/3, 2/3]

## Load Data

In [None]:
###
#  construct a single dictionary from those of all the runs
###
def get_bo_res(which_results: str, nb_run: int, norm_type):
    """Merge the per-run result dictionaries of one search type into one dict.

    Each run is stored as a pickled dict at
    ``search_results/<norm_type>/<which_results>/<which_results>_run_<n>.pkl``.
    The returned dict has the keys of run 0; each key maps to a list whose
    n-th entry is the value that key had in run ``n``.

    Parameters
    ----------
    which_results : str
        Name of the result set (e.g. ``'mfbo_results'``); must contain
        the substring ``"results"``.
    nb_run : int
        Number of runs to load (runs ``0 .. nb_run - 1``).
    norm_type
        Name of the normalization sub-directory the results live in.

    Returns
    -------
    dict
        ``{key: [value_run_0, value_run_1, ...]}``.  With ``nb_run == 0``
        every list is empty (run 0 is still read to obtain the keys).

    Raises
    ------
    AssertionError
        If ``which_results`` does not contain ``"results"``.
    FileNotFoundError
        If a run file is missing.
    KeyError
        If a later run contains a key that run 0 does not have.
    """
    assert "results" in which_results

    def _load_run(run_id):
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at result files generated by this project.
        path = 'search_results/{}/{}/{}_run_{}.pkl'.format(norm_type,
                                                           which_results,
                                                           which_results, run_id)
        # context manager closes the file handle as soon as loading is done
        with open(path, 'rb') as run_file:
            return pickle.load(run_file)

    # load every requested run exactly once
    runs = [_load_run(n) for n in range(nb_run)]

    # run 0 defines the set of keys; when no runs are requested it is read
    # only for its keys so that the result keeps the same structure
    key_source = runs[0] if runs else _load_run(0)
    bo_res = {key: [] for key in key_source}

    # iterate through the runs, appending each run's value to the end of the list
    for run_res in runs:
        for key, key_res in run_res.items():
            bo_res[key].append(key_res)

    return bo_res

In [None]:
###
#  features and molecular simulation data
###
# JLD2 file (read through h5py) holding the features and simulation targets;
# the handle is kept open under the name `file`
file = h5py.File("targets_and_{}_features.jld2".format(normalization), "r")

# feature matrix: read fully into memory, transposed, then wrapped as a
# torch tensor ... Needs to be tensor?
X = torch.from_numpy(np.transpose(file["X"][:]))

# simulation data, one transposed array per dataset, in the order
# [henry_y, gcmc_y]
y = [np.transpose(file[target_key][:]) for target_key in ("henry_y", "gcmc_y")]

# total number of COFs in data set (size of the first axis of X)
nb_COFs = X.shape[0]

###
#  bayesian optimization data
###
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened inside a `with` block so its handle is closed as soon
    as loading finishes.  Only use on trusted result files produced by this
    project: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# COF IDs used for initialization
init_cof_ids_file = _load_pickle(
    'search_results/{}/initializing_cof_ids_{}.pkl'.format(normalization, normalization))
init_cof_ids = init_cof_ids_file['init_cof_ids']

# init_cof_ids holds one collection of COF IDs per run; the size of the
# first run's collection is taken as the initialization size
nb_COFs_initialization = len(init_cof_ids[0])
nb_runs = len(init_cof_ids)

# random search
random_search_res = _load_pickle(
    'search_results/{}/random_search_results.pkl'.format(normalization))

# multi-fidelity search
mfbo_res = get_bo_res('mfbo_results', nb_runs, normalization)

# single-fidelity search
sfbo_res = get_bo_res('sfbo_results', nb_runs, normalization)

# number of iterations per run (measured on the first MFBO run)
nb_iters = len(mfbo_res['acquired_set'][0])

###
#  quick checks
###
# structure of data: the first SFBO run and the first random-search run must
# contain exactly nb_iters acquisitions (nb_iters is measured on the first
# MFBO run), and SFBO must report the same initialization size as the
# initializing-ID file. Only run 0 is inspected by these three checks.
assert len(sfbo_res['ids_acquired'][0]) == nb_iters
assert len(random_search_res['ids_acquired'][0]) == nb_iters
assert sfbo_res['nb_COFs_initialization'][0] == nb_COFs_initialization

# each run has the correct initializing COFs: the first nb_COFs_initialization
# acquired IDs of every SFBO run must equal that run's initializing IDs.
# NOTE(review): the inner all(...) iterates over the result of `==`, so this
# presumably relies on the IDs being numpy arrays / tensors (element-wise
# comparison); with plain lists `==` yields a single bool and all() would
# raise TypeError -- confirm the stored types.
assert all([all(sfbo_res['ids_acquired'][r][:nb_COFs_initialization] == init_cof_ids[r]) 
            for r in range(nb_runs)])

In [None]:
def _max_cost_within(costs_per_run, last_iter):
    """Return the largest accumulated cost over all runs, considering only
    iterations ``0 .. last_iter`` (inclusive) of each run.

    `costs_per_run` holds one sequence of accumulated costs per run
    (assumed 1-D per run -- TODO confirm).
    """
    # truncate *each run* before taking the maximum across runs
    return np.max([np.max(run_costs[:last_iter + 1]) for run_costs in costs_per_run])


# the max number of iterations needed for any of the runs
max_SFBO_iters = max(sfbo_res['BO_iter_top_cof_acquired'])
max_MFBO_iters = max(mfbo_res['BO_iter_top_cof_acquired'])

# the highest accumulated cost up to the max number of iterations needed.
# The truncation has to happen inside every run: chaining `[:][:k]` on the
# per-run list only selects the first k *runs* and leaves each run's
# iterations untruncated.
max_SFBO_cost = _max_cost_within(sfbo_res['accumulated_cost'], max_SFBO_iters)
max_MFBO_cost = _max_cost_within(mfbo_res['accumulated_cost'], max_MFBO_iters)


print("The max number of iterations needed for any of the runs -")
print("\tSFBO: {}".format(max_SFBO_iters))
print("\tMFBO: {}".format(max_MFBO_iters))
print("The highest accumulated cost up to the max number of iterations needed -")
print("\tSFBO: {} [hr]".format(max_SFBO_cost))
print("\tMFBO: {} [hr]".format(max_MFBO_cost))