In [1]:
using JLD2
using PyPlot
using StatsBase # Statistics
using Distributions

using PyCall
@pyimport torch 
@pyimport gpytorch
@pyimport botorch


# config plot settings
# Apply matplotlib's "ggplot" style sheet to subsequent figures.
PyPlot.matplotlib.style.use("ggplot")
# Wrap matplotlib's rcParams in a PyDict and set the default font size through it.
# NOTE(review): presumably the PyDict wrapper is used so the assignment reaches the
# Python-side dictionary rather than a converted Julia copy -- confirm against the
# PyPlot/PyCall documentation.
rcParams = PyPlot.PyDict(PyPlot.matplotlib."rcParams")
rcParams["font.size"] = 16;

In [2]:
###
#  load data
#
#  Reads the variables `X`, `henry_y` and `gcmc_y` from the JLD2 file
#  "targets_and_normalized_features.jld2" located in the current working
#  directory and binds them at top level (see the Symbol list printed below).
#  No tensors are constructed in this cell.
###
@load joinpath(pwd(), "targets_and_normalized_features.jld2") X henry_y gcmc_y

3-element Vector{Symbol}:
 :X
 :henry_y
 :gcmc_y

In [None]:
"""
# Arguments
- `X`: feature matrix (rows are indexed by COF ID)
- `y1`: low-fidelity target vector
- `y2`: high-fidelity target vector
- `nb_iterations`: maximum number of BO iterations (experiment budget)
- `nb_COFs_initialization`: number of COFs used to train the initial GP
- `which_acquisition`: which acquisition function to implement
- `store_explore_exploit_terms`: whether or not to keep track of the explore and exploit 
                                 terms from the acquisition for the acquired material at each iteration
- `sample_gp`: whether or not to store sample GP functions
- `initialize_with`: specify which and/or how many materials to initialize the search
- `kwargs`: dictionary of optional keyword arguments
"""
function run_bayesian_optimization(X, y1, y2, nb_iterations::Int,
                                   nb_COFs_initialization::Int;
                                   which_acquisition::Symbol=:EI,
                                   store_explore_exploit_terms::Bool=false,
                                   sample_gp::Bool=false,
                                   initialize_with::Union{Array{Int, 1}, Nothing}=nothing,
                                   kwargs::Dict{Symbol, Any}=Dict{Symbol, Any}())
    # Purpose: pick the initial set of COFs, build the (standardized) training
    # tensors for them and construct a BoTorch MultiTaskGP on that data.
    # Only this initialization step exists so far; no acquisition loop is run,
    # and `sample_gp` / `kwargs` are accepted but not used yet.
    #
    # Returns the constructed MultiTaskGP model object.

    # quick checks
    @assert nb_iterations > nb_COFs_initialization "More initializations than itterations not allowed."
    @assert which_acquisition in [:EI] "Acquisition function not supported:\t $(which_acquisition)"

    # create array to store explore-exploit terms if needed
    # (placeholder: nothing is pushed to it until the acquisition loop exists)
    if store_explore_exploit_terms
        # store as (explore, exploit, fidelity)
        explore_exploit_balance = Tuple{Float64, Float64, Int64}[]
    end

    ###
    #  1. select COF IDs for training the initial GP
    ###
    # The candidate pool is every row of the feature matrix; derive its size
    # from `X` rather than relying on a variable defined outside the function.
    nb_COFs = size(X, 1)
    if isnothing(initialize_with)
        # random initialization, without replacement
        ids_acquired = StatsBase.sample(1:nb_COFs, nb_COFs_initialization, replace=false)
    else
        # initialize using a specified set of indices
        ids_acquired = initialize_with
    end
    # in both cases the IDs must be distinct and match the requested count
    @assert length(unique(ids_acquired)) == nb_COFs_initialization

    ###
    #  2. build training tensors from the acquired COFs
    ###
    train_X = torch.from_numpy(X[ids_acquired, :])  # feature vectors
    # initialize using ONLY the high-fidelity results: both task columns are
    # read from the high-fidelity argument `y2` (not from a global array).
    # NOTE(review): `y1` is not consumed yet -- confirm whether the first
    # column should come from the low-fidelity targets instead.
    y_task_1 = torch.from_numpy(y2[ids_acquired])
    y_task_2 = torch.from_numpy(y2[ids_acquired])
    train_Y = torch.stack([y_task_1, y_task_2], -1)
    # standardize outputs using *only currently acquired data*
    # (a single mean and std over the whole stacked tensor)
    train_Y = (train_Y - torch.mean(train_Y)) / torch.std(train_Y)

    ###
    #  3. construct the multi-task GP
    ###
    # uses ICM [here](https://botorch.org/api/models.html#multitaskgp)
    # botorch.models.multitask.MultiTaskGP(train_X, train_Y, task_feature,
    #     task_covar_prior=None, output_tasks=None,
    #     rank=None, input_transform=None, outcome_transform=None)
    # NOTE(review): task_feature=-1 designates the last column of `train_X` as
    # the task index -- confirm that the feature matrix is laid out that way.
    model = botorch.models.multitask.MultiTaskGP(train_X, train_Y, -1)

    return model
end

In [5]:
# Draw 5 distinct indices from 1:600 to act as the initially acquired samples.
# NOTE(review): 600 is hard-coded; presumably it is the number of rows in X -- confirm.
ids_acquired = StatsBase.sample(1:600, 5, replace=false)

# Rows of the feature matrix for the acquired samples, handed to torch.
x = X[ids_acquired, :]
train_X = torch.from_numpy(x)       # feature vectors
# NOTE(review): both columns below are read from gcmc_y, so they are identical
# (see the printed tensor); henry_y is loaded but not used here -- confirm intent.
y1 = torch.from_numpy(gcmc_y[ids_acquired])  # low-fidelity
y2 = torch.from_numpy(gcmc_y[ids_acquired])  # high-fidelity
# Stack the two target vectors along a new last dimension (one column per fidelity).
y_acquired = torch.stack([y1, y2], -1)
# standardize outputs using *only currently acquired data*
# (one mean and one std taken over the whole stacked tensor, not per column)
y_acquired = (y_acquired - torch.mean(y_acquired)) / torch.std(y_acquired)

PyObject tensor([[ 1.1141,  1.1141],
        [-0.5547, -0.5547],
        [ 0.9686,  0.9686],
        [-1.4142, -1.4142],
        [-0.1138, -0.1138]], dtype=torch.float64)

In [6]:
# Construct a BoTorch MultiTaskGP from the acquired features and the standardized
# targets; the third positional argument (-1) is the `task_feature`.
# NOTE(review): task_feature=-1 presumably points at the last column of `train_X`
# as the task index -- confirm that the feature matrix is laid out that way.
model = botorch.models.multitask.MultiTaskGP(train_X, y_acquired, -1)

PyObject MultiTaskGP(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (noise_prior): GammaPrior()
      (raw_noise_constraint): GreaterThan(1.000E-04)
    )
  )
  (mean_module): ConstantMean()
  (covar_module): ScaleKernel(
    (base_kernel): MaternKernel(
      (lengthscale_prior): GammaPrior()
      (raw_lengthscale_constraint): Positive()
    )
    (outputscale_prior): GammaPrior()
    (raw_outputscale_constraint): Positive()
  )
  (task_covar_module): IndexKernel(
    (raw_var_constraint): Positive()
  )
)

## RUN MTBO

In [7]:
# ###
# #  randomly select points and fidelities to initialize with
# ###
# nb_COFs_initialization = 1
# ids_acquired = []
# for i in 1:nb_COFs_initialization
#     # index of test sample 
#     ind = StatsBase.sample(1:length(henry_y), 1, replace=false)
#     # fidelity of test
#     fid =  StatsBase.sample(1:2, 1, replace=false)
#     push!(ids_acquired, (ind[1], fid[1]))
# end
    
# if ! (in((26, 1), ids_acquired))
#     pushfirst!(ids_acquired, (26, 1))
# end
# ids_acquired

In [None]:
# run_bayesian_optimization(X, y1, y2, nb_iterations::Int, 
#                            nb_COFs_initialization::Int;
#                            which_acquisition::Symbol=:EI,
#                            store_explore_exploit_terms::Bool=false,
#                            sample_gp::Bool=false,
#                            initialize_with::Union{Array{Int, 1}, Nothing}=nothing,
#                            kwargs::Dict{Symbol, Any}=Dict{Symbol, Any}())

In [None]:
# class MultitaskGPModel(gpytorch.models.ExactGP):
#     def __init__(self, train_x, train_y, likelihood):
#         super(MultitaskGPModel, self).__init__(train_x, train_y, likelihood)
#         self.mean_module = gpytorch.means.MultitaskMean(
#             gpytorch.means.ConstantMean(), num_tasks=2
#         )
#         self.covar_module = gpytorch.kernels.MultitaskKernel(
#             gpytorch.kernels.RBFKernel(), num_tasks=2, rank=1
#         )

#     def forward(self, x):
#         mean_x = self.mean_module(x)
#         covar_x = self.covar_module(x)
#         return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)


# likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=2)
# model = MultitaskGPModel(train_x, train_y, likelihood)