In [1]:
# Implementing freeze-thaw bayesian optimization with two-level Gaussian processes
# A global GP models the asymptotic mean of the learning curves of each HP-config
# Local GPs model the learning curves of each HP-config

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, Kernel, Sum, WhiteKernel
import numpy as np
import matplotlib.pyplot as plt


# Scalar settings; none of the cells visible in this notebook read them yet.
# NOTE(review): ALPHA/BETA presumably parameterise the exponential-decay kernel
# (Equations 5 & 6) and NOISE the observation noise -- confirm once they are used.
ALPHA = 1
BETA = 0.5
NOISE = 0.1

# Basket sizes: how many already-observed ("old") and brand-new configs are kept
# in the candidate basket. The trailing numbers are the values quoted from the
# paper further down in this notebook (Bold = 10, Bnew = 3).
B_OLD = 2  # 10
B_NEW = 2  # 3

# Training-length settings (not read by the cells visible here).
MAX_EPOCHS = 100
EPOCH_STEP = 1

N_SAMPLES_MC = 10  # number of samples for Monte Carlo integration
N_FANT = 5  # number of observations we fantasize for each point in the basket

In [2]:
'''
Equation 3: Expected Improvement-formula
Equation 4: Entropy Search-formula
Equation 5&6: Exponential Decay-kernel for Gaussian Process
Equation 12,13,17 & 18: Posterior distribution of global Gaussian Process
Equation 14 & 19: Posterior predictive distribution of global Gaussian Process
Equation 15 & 20: Posterior predictive distribution of a local Gaussian Process with existing observations of this HP-config
Equation 16 & 21: Posterior predictive distribution of a local Gaussian Process without existing observations of this HP-config
'''

'\nEquation 3: Expected Improvement-formula\nEquation 4: Entropy Search-formula\nEquation 5&6: Exponential Decay-kernel for Gaussian Process\nEquation 12,13,17 & 18: Posterior distribution of global Gaussian Process\nEquation 14 & 19: Posterior predictive distribution of global Gaussian Process\nEquation 15 & 20: Posterior predictive distribution of a local Gaussian Process with existing observations of this HP-config\nEquation 16 & 21: Posterior predictive distribution of a local Gaussian Process without existing observations of this HP-config\n'

In [23]:
from botorch.models import SingleTaskGP
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.means import ConstantMean
import torch

# Hyperparameter configs that already have observations, kept in two parallel
# forms: a list of dicts (used for membership tests) and a 2-D float array with
# one row per config (used as kernel input).
observed_configs_dicts = [{'HP1': 0.}, {'HP1': 1.}]
observed_configs_list = np.array([[cfg['HP1']] for cfg in observed_configs_dicts])

# Kernel for the global GP: Matern with nu=2.5, library-default hyperparameters.
kernel_global = MaternKernel(nu=2.5)

# Evaluate the kernel on the observed configs and show both the input tensor
# and the resulting kernel matrix as a NumPy array.
configs_tensor = torch.from_numpy(observed_configs_list)
print(configs_tensor)
k_x = kernel_global(configs_tensor).numpy()
print(k_x)




tensor([[0.],
        [1.]], dtype=torch.float64)
[[1.         0.30562244]
 [0.30562244 1.        ]]


In [7]:

'''
Our Bayesian optimization strategy proceeds by maintaining a basket of B = Bold + Bnew candidate
models. Bold represents some number of models that have already been trained to some degree, while Bnew
represents some number of brand new models. In practice, we set Bold = 10 and Bnew = 3. The entire
basket is chosen using models with the maximum EI at the asymptote, which is computed using Equations
19 and 3. Each round, after a new observation has been collected, the basket is re-built using possibly
different models. This step is essentially standard Bayesian optimization using EI.
'''
# Fill the basket with configs.
# basket_new collects configs without any observation yet, basket_old collects
# configs that already appear in observed_configs_dicts.
basket_new = []
basket_old = []

# An "old" slot can only ever be filled by an already-observed config, so the
# number of old slots we wait for must not exceed the number of observed configs.
# Without this cap the loop below never terminates when
# B_OLD > len(observed_configs_dicts).
n_old_target = min(B_OLD, len(observed_configs_dicts))

# Greedily choose the best HP-config using Equation 19 & 3 until the old and new
# parts of the basket are full.
# NOTE(review): termination also relies on the candidate stream eventually
# producing every observed config; the integer placeholder below does so only
# for the integer-valued observed configs used in this notebook.
iterator = 0
while len(basket_new) < B_NEW or len(basket_old) < n_old_target:
    # Sample another config from EI and try to add it to the basket
    # If it is already in the basket, or the basket is full, skip it
    # TODO: Implement equations 19 & 3 here
    # Get the iterator best configs from EI
    sampled_EI_config = {'HP1': iterator}  # Take the iterator-th config from EI
    already_observed = sampled_EI_config in observed_configs_dicts
    if not already_observed and sampled_EI_config not in basket_new and len(basket_new) < B_NEW:
        basket_new.append(sampled_EI_config)
    elif already_observed and sampled_EI_config not in basket_old and len(basket_old) < n_old_target:
        basket_old.append(sampled_EI_config)
    iterator += 1


print(basket_new, basket_old)

# Acquisition accumulator: one slot per basket entry, new configs first, then old ones.
a = np.zeros(len(basket_new) + len(basket_old))


[{'HP1': 2}, {'HP1': 3}] [{'HP1': 0}, {'HP1': 1}]


In [4]:
# Compute p_min over the basket using Monte Carlo simulation and Equation 19.
#
# p_min is the posterior predictive distribution over the minimum of the global GP.
# We will use its entropy to observe configs with results that make the p_min
# distribution spikier.
#
# Monte Carlo simulation:
#   - using the global GP, create a discrete grid of the N_SAMPLES_MC highest-EI configs
#   - from the grid, compute P_min and the entropy of P_min, H(P_min)
#
# Stand-in constant: the Monte Carlo estimate described above is not computed in this cell.
h_p_min = 1

In [5]:
# For each config in the basket, fantasize an observation N_FANT times, recompute
# the entropy of p_min with that observation included, and collect the average
# entropy change per config in the acquisition vector `a`
# (new configs first, then old configs).

# Start from a fresh accumulator: the loops below use `+=`, so reusing the array
# left over from an earlier cell (or an earlier run of this cell) would keep
# adding onto stale values every time the cell is re-executed.
a = np.zeros(len(basket_new) + len(basket_old))

for k, config in enumerate(basket_new):
    for f_n in range(N_FANT):
        # Fantasize an observation using Equation 21
        # TODO fantasize an observation

        # Compute p_min_y, the new minimum distribution, including the fantasized observation y, again using Monte Carlo and Equation 19
        # TODO compute p_min_y

        # Compute the new entropy of p_min_y, H(P_min_y)
        # TODO compute H(P_min_y)
        h_p_min_y = 0.5  # stand-in value until the TODOs above are implemented

        # Running mean over the N_FANT fantasies
        a[k] += (h_p_min_y - h_p_min) / N_FANT

# Old configs are stored after the new ones; take the offset from the actual
# basket length rather than from B_NEW so the indexing always matches `a`.
old_offset = len(basket_new)
for k, config in enumerate(basket_old):
    for f_n in range(N_FANT):
        # Fantasize an observation using Equation 20
        # TODO fantasize an observation

        # Compute p_min_y, the new minimum distribution, including the fantasized observation y, again using Monte Carlo and Equation 19
        # TODO compute p_min_y

        # Compute the new entropy of p_min_y, H(P_min_y)
        # TODO compute H(P_min_y)
        h_p_min_y = 0  # stand-in value until the TODOs above are implemented

        a[old_offset + k] += (h_p_min_y - h_p_min) / N_FANT


# Select the config with the highest accumulated value in `a`
# (np.argmax returns the first index on ties).
best_config = (basket_new + basket_old)[np.argmax(a)]
print(best_config)


{'HP1': 2}
