In [1]:
import GPy
import GPyOpt
import numpy as np
import math

In [2]:
def ReadDataExercise3(filepath):
    """Read the last two comma-separated fields of every data row in a file.

    The first line of the file is treated as a header and discarded. Blank
    (or whitespace-only) lines are skipped, and surrounding whitespace is
    stripped from each returned field so that stray padding does not leak
    into downstream parsing.

    Parameters
    ----------
    filepath : str or path-like
        Location of the comma-separated text file.

    Returns
    -------
    list[list[str]]
        One entry per non-blank data line, holding that line's last two
        fields as strings (fewer if the line has fewer than two fields).
    """
    data = []
    with open(filepath) as f:
        next(f, None)  # discard the header line; tolerate an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise produce a bogus [''] row.
                continue
            data.append([field.strip() for field in line.split(',')[-2:]])
    return data

In [3]:
# Relative path to the exercise CSV; it is opened as-is, so it is resolved
# against the kernel's current working directory.
filepath = 'ex3_data.csv'
# Rows returned by ReadDataExercise3: the last two comma-separated fields of
# each line after the header, kept as strings.
data = ReadDataExercise3(filepath)

In [4]:
def OneHotEncodePeptide(sequence):
    """One-hot encode a peptide given as one-letter amino-acid codes.

    Each residue becomes a 20-wide indicator row whose columns follow the
    alphabetical order of the 20 amino-acid letters; the rows are then
    concatenated into a single flat vector.

    Parameters
    ----------
    sequence : str
        Peptide written with upper-case one-letter codes.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(sequence) * 20``.

    Raises
    ------
    KeyError
        If ``sequence`` contains a character that is not one of the 20
        amino-acid letters.
    """
    alphabet = sorted("GASPVTCILNDKQEMHFRYW")
    column_of = dict(zip(alphabet, range(len(alphabet))))

    # Look up every residue's column first, then set all the ones in one go.
    columns = np.array([column_of[residue] for residue in sequence], dtype=np.intp)
    encoded = np.zeros((len(sequence), len(alphabet)))
    encoded[np.arange(len(sequence)), columns] = 1
    return encoded.reshape(-1)
    

In [5]:
# One-hot encode the first field of every raw row (the peptide sequence).
oh_data = [OneHotEncodePeptide(str(row[0])) for row in data]
# Each record is laid out as: one-hot encoding, aa_sequence, label.
processed_data = [[encoding, *row] for encoding, row in zip(oh_data, data)]
# Feature matrix: one flattened one-hot vector per peptide.
x = np.array([record[0] for record in processed_data])
# Original sequences, kept alongside the features.
x_seq = [record[1] for record in processed_data]
# Labels parsed from their string form into floats.
y = np.array([float(record[2]) for record in processed_data])

In [None]:
# We're going to use the GP in the same way multiple times, so let's create a function
def GP_analysis(X, Y, X_grid, max_iters=10):
    """Fit a sparse GP regression with an RBF kernel and predict over a grid.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Training inputs.
    Y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        Training targets. A 1-D vector is promoted to a single column,
        because GPy models expect targets as a 2-D array.
    X_grid : numpy.ndarray, shape (n_grid, n_features)
        Locations at which the fitted GP is evaluated.
    max_iters : int, default 10
        Iteration cap handed to the BFGS hyper-parameter optimiser.

    Returns
    -------
    mean : numpy.ndarray
        Predictive mean at ``X_grid``, as returned by ``m.predict``.
    Cov : numpy.ndarray
        Full predictive covariance at ``X_grid``, as returned by ``m.predict``.
    variance : numpy.ndarray
        Diagonal of ``Cov`` (the per-point predictive variance).
    m : GPy.models.SparseGPRegression
        The fitted model.
    """
    Y = np.asarray(Y)
    if Y.ndim == 1:
        # Callers naturally hold labels as a flat vector; GPy wants a column.
        Y = Y[:, None]

    # Use GP regression to fit the data
    k = GPy.kern.RBF(X.shape[1])
    m = GPy.models.SparseGPRegression(X, Y, k)
    m.optimize('bfgs', max_iters=max_iters)

    # Predict the mean and covariance of the GP fit over the grid
    mean, Cov = m.predict(X_grid, full_cov=True)
    variance = np.diag(Cov)
    return mean, Cov, variance, m

In [None]:
def GaussianProcess(x, y, split_percent, seed, num_iterations=200, target_label=9.0):
    """Pool-based sample selection driven by a GP upper-confidence-bound rule.

    The data are shuffled with a seeded generator and split into an initial
    labelled set and an unlabelled pool. On every iteration a GP is fitted to
    the labelled set through ``GP_analysis``, the pool point with the highest
    upper confidence bound is selected, its label is revealed, and the point
    is moved from the pool into the labelled set.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix.
    y : array-like, shape (n_samples,)
        Labels aligned with the rows of ``x``.
    split_percent : float
        Fraction (between 0 and 1, despite the name) of the shuffled data
        used as the initial labelled set.
    seed : int or None
        Seed for ``numpy.random.default_rng`` controlling the shuffle.
    num_iterations : int, default 200
        Maximum number of selections; capped at the size of the pool so the
        loop never runs on an empty pool.
    target_label : float, default 9.0
        Label value that counts as a hit.

    Returns
    -------
    count : int
        Number of selected samples whose label equals ``target_label``.
    percentages : list[float]
        Running hit rate in percent after each selection.
    samples : list[numpy.ndarray]
        Feature rows of the selected samples that were hits.

    Raises
    ------
    ValueError
        If the split leaves no labelled points to fit the first GP on.
    """
    x = np.asarray(x)
    y = np.asarray(y).ravel()

    # Split data into labelled and unlabelled parts. Generator.shuffle works
    # in place and returns None, so a permutation is drawn instead.
    rng = np.random.default_rng(seed)
    shuffled_idx = rng.permutation(x.shape[0])
    shuffled_x = x[shuffled_idx]
    shuffled_y = y[shuffled_idx]

    n_lab = int(len(y) * split_percent)
    if n_lab == 0:
        raise ValueError(
            "split_percent leaves no labelled samples to fit the GP on"
        )
    x_lab, x_unlab = shuffled_x[:n_lab], shuffled_x[n_lab:]
    y_lab, y_unlab = shuffled_y[:n_lab], shuffled_y[n_lab:]

    Dsize = len(x)
    BO_lambda = .1

    count = 0
    percentages = []
    samples = []
    for i in range(min(num_iterations, len(y_unlab))):

        # Use GP regression to fit the data. Targets are handed over as a
        # column because GPy expects 2-D targets.
        mean, _, variance, _ = GP_analysis(x_lab, y_lab[:, None], x_unlab)

        # Compute UCB: mean + sqrt(beta_t) * standard deviation, with beta_t
        # growing with the 1-based iteration number t.
        t = i + 1
        BO_beta = 2 * math.log(Dsize * t ** 2 * math.pi ** 2 / (6 * BO_lambda))
        # Clip guards against tiny negative variances from round-off.
        std = np.sqrt(np.clip(variance, 0.0, None))
        alpha_full = np.ravel(mean) + math.sqrt(BO_beta) * std

        # Find the next sample
        next_sample_index = int(np.argmax(alpha_full))

        # Logging metrics.
        if y_unlab[next_sample_index] == target_label:
            count += 1
            samples.append(x_unlab[next_sample_index])
        percentages.append(count / (i + 1) * 100)

        # Remove chosen sample from unlabelled and add to labelled
        x_lab = np.vstack((x_lab, x_unlab[next_sample_index]))
        y_lab = np.append(y_lab, y_unlab[next_sample_index])

        x_unlab = np.delete(x_unlab, next_sample_index, axis=0)
        y_unlab = np.delete(y_unlab, next_sample_index)

    return count, percentages, samples


In [None]:
Dsize = len(x)
num_measurements = x.shape[0]
BO_lambda = .1
BO_number_of_iterations = 1

# Random sample locations
X_samples = np.random.uniform(-np.pi, np.pi * 3/4, (num_measurements, 1))

# Get the function value
Y_samples = f(X_samples)

for i in range(0, 10):

    # Use GP regression to fit the data
    mean, Cov, variance, m = GP_analysis(X_samples, Y_samples, x)

    # Compute UCB
    BO_beta = 2 * math.log(Dsize * math.pow(BO_number_of_iterations,2) * math.pow(np.pi,2) / (6 * BO_lambda) )
    alpha_full = mean + math.sqrt(BO_beta) * variance[:,None]

    # Find the next sample
    next_sample_index = np.argmax(alpha_full)