In [1]:
using CSV
using DataFrames
using PyPlot
using StatsBase # Statistics
using Formatting # string formatting
using ScikitLearn # machine learning package
using ScikitLearn.CrossValidation: train_test_split
@sk_import gaussian_process : GaussianProcessRegressor
@sk_import gaussian_process.kernels : Matern

@sk_import inspection : permutation_importance

PyObject <function permutation_importance at 0x7f6a0ee858c8>

## Import and Prepare Data

In [3]:
# read the COF descriptor table and rename its identifier column to "xtal"
filename = joinpath(pwd(), "descriptors/cof_descriptors.csv")
descriptors = rename!(CSV.read(filename, DataFrame), "crystal_name" => "xtal")

# WARNING: these crystals have "⟨N⟩ (mmol/g)" = [0.0, 0.0], which makes the
# selectivity NaN, so they are dropped from the descriptor table for now.
skip_for_now = [
    "13030N2_ddec.cif",
    "16371N2_ddec.cif",
    "20565N3_ddec.cif",
    "21090N3_ddec.cif",
]
filter!("xtal" => (xtal_name -> !(xtal_name in skip_for_now)), descriptors)

# read the target data and preview its first five rows
target_data = CSV.read(joinpath(pwd(), "target_and_cost_data.csv"), DataFrame)
target_data[1:5, :]

Unnamed: 0_level_0,xtal,henry_selectivity,gcmc_selectivity
Unnamed: 0_level_1,String,Float64,Float64
1,05000N2_ddec.cif,1.58051,1.69624
2,05001N2_ddec.cif,3.27135,3.2725
3,07000N2_ddec.cif,2.9979,2.99372
4,07001N2_ddec.cif,9.01379,9.65039
5,07002N2_ddec.cif,3.60253,3.64401


In [4]:
###
#  construct feature matrix and target vectors for GP
###
# every descriptor column except the crystal identifier is used as a feature
x_cols = [name for name in names(descriptors) if name != "xtal"]
n_xtals = nrow(descriptors)

# Targets are looked up by crystal name, not by row position: rows were filtered
# out of `descriptors` but not out of `target_data`, so row i of one table is not
# guaranteed to describe the same crystal as row i of the other.
target_row_of = Dict(xtal_name => k for (k, xtal_name) in enumerate(target_data[:, "xtal"]))

X = zeros(n_xtals, length(x_cols))
henry_y = zeros(n_xtals)
gcmc_y  = zeros(n_xtals)

for (i, row) in enumerate(eachrow(descriptors))
    # features: copy the descriptor values of crystal i into row i of X
    for (j, col_name) in enumerate(x_cols)
        X[i, j] = row[col_name]
    end
    # fail loudly rather than silently pairing features with the wrong target
    xtal_name = row["xtal"]
    haskey(target_row_of, xtal_name) || error("no target data found for crystal " * string(xtal_name))
    k = target_row_of[xtal_name]
    # Xe/Kr Selectivity
    henry_y[i] = target_data[k, :henry_selectivity]
    gcmc_y[i]  = target_data[k, :gcmc_selectivity]
end
X # look at y too!

608×14 Matrix{Float64}:
  3.84928  0.2  4175.64  1049.37   0.0292303   …  0.00974344   0.0  0.0  0.0
 19.7302   0.6  3156.96   585.477  0.0204808      0.00227564   0.0  0.0  0.0
 26.1625   0.8  3407.23   458.054  0.0171524      0.00142936   0.0  0.0  0.0
  7.82083  0.4  2565.59  1100.22   0.037349       0.00466863   0.0  0.0  0.0
 14.8469   0.4  3345.75   703.114  0.0271906      0.00194218   0.0  0.0  0.0
  9.33026  0.7  5261.66   421.68   0.0149858   …  0.00239773   0.0  0.0  0.0
  9.94867  0.7  5452.36   383.117  0.0125929      0.00209882   0.0  0.0  0.0
 18.946    0.9  5335.37   180.981  0.00653913     0.000544927  0.0  0.0  0.0
 27.193    0.9  5190.06   171.075  0.00643935     0.000525661  0.0  0.0  0.0
 10.0414   0.6  5593.85   519.979  0.0156241      0.00175223   0.0  0.0  0.0
  4.7838   0.2  3845.42  1143.15   0.0374755   …  0.00468444   0.0  0.0  0.0
  7.51918  0.4  3597.55   988.327  0.0313415      0.00447735   0.0  0.0  0.0
  9.8332   0.5  3440.67   874.7    0.0265802      0.

In [5]:
# Min-max normalize every feature column of X in place.
for j in axes(X, 2)
    lo, hi = extrema(X[:, j])
    if hi == lo
        # constant column: (x - min) / (max - min) would be 0/0 = NaN for every
        # entry, so map the whole column to 0 instead
        X[:, j] .= 0.0
    else
        X[:, j] = (X[:, j] .- lo) / (hi - lo)
    end
end
# sanity check on one column: expect 0.0 and 1.0
println(minimum(X[:, 2]))
println(maximum(X[:, 2]))

0.0
1.0


## BO Function

In [13]:
"""
    run_bayesian_optimization(X, y, nb_iterations, nb_COFs_initialization;
                              which_acquisition=:UCB,
                              store_explore_exploit_terms=false)

Run a Bayesian-optimization search over the rows of `X` and return the row
indices of the acquired COFs, in order of acquisition.

Procedure:
1. randomly select `nb_COFs_initialization` distinct row ids to train the first GP
2. until `nb_iterations` COFs have been acquired in total:
   a. construct a GP model with a Matern kernel
   b. fit the GP to the data of the COFs acquired so far => gives ŷ(x) and σ(x)
   c. evaluate the acquisition function A(x) on every row of `X`
   d. pick the not-yet-acquired row with the largest A(x)
   e. append that row id to the list of acquired COFs
3. return the ids of the acquired COFs

# Arguments
- `X`: feature matrix, one row per candidate COF.
- `y`: target values, one entry per row of `X`.
- `nb_iterations::Int`: total number of COFs in the returned list (this count
  includes the initial random selection).
- `nb_COFs_initialization::Int`: number of COFs selected at random to start.

# Keywords
- `which_acquisition::Symbol=:UCB`: acquisition function; only `:UCB` is supported.
- `store_explore_exploit_terms::Bool=false`: accepted but currently unused.

# Returns
A vector of `nb_iterations` distinct row indices into `X`.
"""
function run_bayesian_optimization(X, y, nb_iterations::Int, 
                                   nb_COFs_initialization::Int;
                                   which_acquisition::Symbol=:UCB,
                                   store_explore_exploit_terms::Bool=false)
    # quick checks
    @assert nb_iterations > nb_COFs_initialization "More initializations than itterations not allowed."
    @assert which_acquisition in [:UCB] "Acquisition function not supported:\t $(which_acquisition)"

    # the candidate pool is defined by the data passed in, not by a global
    nb_COFs = size(X, 1)
    @assert length(y) == nb_COFs "X and y must describe the same number of COFs."
    @assert nb_iterations <= nb_COFs "Cannot acquire more COFs than there are candidates."

    ###
    #  1. randomly select COF IDs for training initial GP
    ###
    ids_acquired = StatsBase.sample(1:nb_COFs, nb_COFs_initialization, replace=false)
    @assert (length(unique(ids_acquired)) == nb_COFs_initialization)

    ###
    #  2. iterate until the budgeted number of COFs has been acquired;
    #     `i` is the number of COFs acquired at the start of each pass
    ###
    for i in nb_COFs_initialization:(nb_iterations - 1)
        ###
        #  a-b. construct and fit GP model
        ###
        kernel = Matern(nu=2.5, length_scale=0.25)
        model = GaussianProcessRegressor(kernel=kernel, normalize_y=true, n_restarts_optimizer=5)
        model.fit(X[ids_acquired, :], y[ids_acquired])

        ###
        #  c. evaluate acquisition function (only :UCB passes the check above)
        ###
        # UCB: A(x) = ŷ(x) + βσ(x)
        # where β controls the exploitation-exploration trade-off
        # TODO (:EI, not implemented):
        #   A(x) = integral{-∞}{+∞}[I(x)N(y|ŷ(x), σ²(x))dy]
        #   where I(x) = max(0, f(x) - maxᵢ(yᵢ))
        β = 2.0
        ŷ, σ = model.predict(X, return_std=true)
        acquisition_values = ŷ .+ β * σ

        ###
        #  d. determine which COF to acquire next
        ###
        # sortperm gives row ids ordered by decreasing acquisition value
        # (sort would give the values themselves, which are not ids)
        ids_sorted_by_aquisition = sortperm(acquisition_values, rev=true)
        # a candidate always exists because fewer than nb_COFs are acquired here
        position = findfirst(id -> !(id in ids_acquired), ids_sorted_by_aquisition)
        id_max_aquisition = ids_sorted_by_aquisition[position]

        ###
        #  e. acquire this COF (i.e. update list)
        ###
        push!(ids_acquired, id_max_aquisition)
        @assert length(ids_acquired) == i + 1
    end

    # quick check
    @assert length(ids_acquired) == nb_iterations

    ###
    #  3. return the IDs for the set of acquired COFs
    ###
    return ids_acquired
end

run_bayesian_optimization (generic function with 1 method)

## Apply BO

In [None]:
# number of candidate COFs: one per row of the descriptor table
nb_COFs = nrow(descriptors)
# BO budget and number of COFs used for the initial GP
nb_iterations, nb_COFs_initialization = 300, 10
# acquisition function to use (alternative under consideration: :EI)
which_acquisition = :UCB

## search efficiency curve

In [None]:
# make a plot of ŷ and σ as a function of iterations to see exploitation vs. exploration