In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
import pandas as pd
from capstone_library import *

# Hints
## Fast, but Inaccurate Modelling
This example is for a particular business relying heavily on online sales. It can run very accurate calculations to figure out the optimal placement of its products across warehouses. Unfortunately, the calculations are extremely expensive (computationally) to run, so they can only be done once every two weeks. Instead, the business proposes using a machine learning model which approximates the solution quickly (in a few hours). The model has four hyper-parameters you need to tune, and the output corresponds to the difference between the expensive calculation and the model. Since you are modelling a dynamical system, expect a lot of local optima!

# Let's go!

Let's load the data.

In [2]:
# Initial observations for function 4: inputs go to X, matching outputs to y.
X, y = (
    np.load(npy_path)
    for npy_path in (
        'initial_data/function_4/initial_inputs.npy',
        'initial_data/function_4/initial_outputs.npy',
    )
)

In [3]:
# Read the queries submitted so far and the observations returned for them.
# The second argument (4) is passed to the helper unchanged for both files;
# presumably it selects function 4 -- confirm against capstone_library.
new_queries, new_observ = (
    get_function_data_from_file(txt_path, 4)
    for txt_path in ('new_data/queries.txt', 'new_data/observations.txt')
)

In [4]:
# Normalise the freshly loaded data: queries become an (n, 4) matrix,
# observations a flat vector.
new_queries = np.reshape(np.array(new_queries), (-1, 4))
new_observ = np.reshape(np.array(new_observ), -1)

# Append the new rows below the existing ones (axis 0 is the default).
# NOTE: re-running this cell appends the same rows again.
X = np.concatenate((X, new_queries))
y = np.concatenate((y, new_observ))


## Visualizing the data and thinking of the problem

In [5]:
# Show all observations as a table: the four inputs followed by the output.
df = pd.DataFrame(
    np.column_stack((X, y)),
    columns=['x1', 'x2', 'x3', 'x4', 'y'],
)
df.head(100)


Unnamed: 0,x1,x2,x3,x4,y
0,0.896981,0.725628,0.175404,0.701694,-22.108288
1,0.889356,0.499588,0.539269,0.508783,-14.601397
2,0.250946,0.033693,0.14538,0.494932,-11.699932
3,0.346962,0.00625,0.760564,0.613024,-16.053765
4,0.124871,0.12977,0.3844,0.287076,-10.069633
5,0.801303,0.500231,0.706645,0.195103,-15.487083
6,0.247708,0.060445,0.042186,0.441324,-12.681685
7,0.746702,0.757092,0.369353,0.206566,-16.0264
8,0.400665,0.072574,0.886768,0.243842,-17.049235
9,0.626071,0.586751,0.438806,0.778858,-12.741766


In [6]:
# Order rows by output, largest y first, so the best observation is on top.
df = df.sort_values('y', ascending=False)
df.head(100)

Unnamed: 0,x1,x2,x3,x4,y
30,0.433333,0.366667,0.366667,0.4,0.49643
39,0.433333,0.4,0.366667,0.4,0.404832
38,0.4,0.433333,0.366667,0.4,0.404832
31,0.4,0.366667,0.333333,0.433333,0.205961
42,0.4,0.4,0.4,0.4,0.194702
36,0.4,0.333333,0.433333,0.4,0.126169
35,0.4,0.333333,0.433333,0.4,0.126169
34,0.4,0.333333,0.433333,0.4,0.126169
33,0.4,0.333333,0.433333,0.4,0.126169
32,0.4,0.333333,0.433333,0.4,0.126169


Let's start with Bayesian Optimization with an Upper Confidence Bound acquisition function that uses a Gaussian Process as a surrogate model.

In [7]:
def get_next_query(kappa, X, y, grid_size=30):
    """Suggest the next query point by maximising a UCB acquisition function.

    A Gaussian-process surrogate is fitted to the observations, the
    acquisition ``mean + kappa * std`` is evaluated on a regular grid over
    the 4-D unit hypercube, and the grid point with the highest value is
    returned.

    Parameters
    ----------
    kappa : float
        Weight of the predictive standard deviation in the acquisition.
        Larger values favour uncertain regions (exploration); smaller
        values favour regions with a high predicted mean (exploitation).
    X : array-like of shape (n_samples, 4)
        Inputs observed so far.
    y : array-like of shape (n_samples,)
        Outputs observed at ``X``.
    grid_size : int, default=30
        Number of grid points per dimension. The grid has
        ``grid_size ** 4`` rows, so memory and time grow quickly.

    Returns
    -------
    next_query : numpy.ndarray of shape (4,)
        Grid point with the highest acquisition value.
    X_grid : numpy.ndarray of shape (grid_size ** 4, 4)
        Every grid point that was evaluated; the last coordinate (x4)
        varies fastest.
    ucb : numpy.ndarray
        Acquisition value for each row of ``X_grid``.

    Raises
    ------
    ValueError
        If ``grid_size`` is smaller than 1.
    """
    if grid_size < 1:
        raise ValueError("grid_size must be a positive integer")

    # Surrogate model with scikit-learn's default settings.
    # NOTE(review): per the scikit-learn docs the default kernel has fixed
    # hyper-parameters and y is not normalised; consider an explicit kernel
    # and normalize_y=True -- left unchanged here so results stay the same.
    gpr = GaussianProcessRegressor()
    gpr.fit(X, y)

    # Regular grid on [0, 1) in every dimension (endpoint=False, so 1.0 itself
    # is never proposed). indexing='ij' followed by a C-order reshape yields
    # the same row order as four nested loops over x1, x2, x3, x4.
    axis_points = np.linspace(0, 1, grid_size, endpoint=False)
    X_grid = np.stack(
        np.meshgrid(axis_points, axis_points, axis_points, axis_points, indexing='ij'),
        axis=-1,
    ).reshape(-1, 4)

    # Upper Confidence Bound: predicted mean plus kappa standard deviations.
    mean, std = gpr.predict(X_grid, return_std=True)
    ucb = mean + kappa * std

    next_query = X_grid[np.argmax(ucb)]
    return next_query, X_grid, ucb


In [9]:
# Ask for the next point with kappa = 1 and print it via format_query.
next_query, X_grid, ucb = get_next_query(kappa=1, X=X, y=y)
print(format_query(next_query))

0.733333-0.000000-0.533333-0.500000


### Observations with original data
With a high kappa, the algorithm explores the space (it favours points where the model is uncertain). With a low kappa, the algorithm exploits the space (it favours points with a high predicted mean).
Next query: 0.433333-0.366667-0.366667-0.400000

### Observations after query 1
Next query, still with kappa=1: 0.400000-0.366667-0.333333-0.433333

### After observation 23 (13th query)
Let's go with the suggested query: 0.733333-0.000000-0.533333-0.500000