In [1]:
import sys
import os
import pandas as pd
sys.path.append('..')  # to import from GP.kernels and property_predition.data_utils

In [2]:
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import print_summary
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

from property_prediction.data_utils import transform_data
from GP.kernel_modules.random_walk import RandomWalk
from GP.kernel_modules.kernel_utils import pad_tensors
from GP.kernel_modules.kernel_utils import preprocess_adjacency_matrix_inputs

In [3]:
# Load the ESOL table and keep only its first 50 rows.
esol_csv_path = "../datasets/ESOL.csv"
df = pd.read_csv(esol_csv_path).iloc[:50]

# Pull the SMILES strings and the solubility targets out as NumPy arrays.
smiles, y = (
    df[column].to_numpy()
    for column in ("smiles", "measured log solubility in mols per litre")
)

In [4]:
# Build one adjacency-matrix tensor per molecule.
# The loop variable is deliberately NOT called `smiles`, so it cannot be
# confused with the array being iterated over.
adjacency_matrices = []
for smiles_string in smiles:
    molecule = MolFromSmiles(smiles_string)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of erroring inside RDKit later.
        # Skipping the molecule would silently misalign X with y.
        raise ValueError(f"RDKit could not parse SMILES string: {smiles_string!r}")
    adjacency_matrices.append(tf.convert_to_tensor(GetAdjacencyMatrix(molecule)))

# Hand the per-molecule matrices to the project helper, which returns the
# tensor used as model input (its `.numpy()` is taken when splitting the data).
X = preprocess_adjacency_matrix_inputs(adjacency_matrices)

We define the Gaussian Process Regression training objective: the negative log marginal likelihood of the model.

In [6]:
def objective_closure():
    """Return the negative log marginal likelihood of the global model ``m``.

    ``m`` is looked up when this function is called, not when it is defined,
    so the model only has to exist by the time the optimiser invokes it.
    """
    log_evidence = m.log_marginal_likelihood()
    return -log_evidence

In [7]:
# Hold out 20% of the molecules for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.numpy(),
    y,
    test_size=0.2,
    random_state=0,
)

# Reshape both target vectors into single-column 2-D arrays.
y_train, y_test = (targets.reshape(-1, 1) for targets in (y_train, y_test))

We standardise the outputs but leave the inputs unchanged.

In [8]:
# Only the transformed targets and the fitted target scaler are kept; the
# transformed inputs (positions 0 and 2 of the result) are thrown away so
# X_train / X_test stay as they were.
# NOTE: this overwrites y_train / y_test, so re-running the cell transforms
# the already-transformed targets again.
transformed = transform_data(X_train, y_train, X_test, y_test)
_, y_train, _, y_test, y_scaler = transformed

In [9]:
# GP regression model over the adjacency-matrix inputs: RandomWalk kernel,
# constant mean initialised at the mean of the training targets, and an
# initial noise variance of 1.
k = RandomWalk()
initial_mean = Constant(np.mean(y_train))
m = gpflow.models.GPR(
    data=(X_train, y_train),
    kernel=k,
    mean_function=initial_mean,
    noise_variance=1,
)

Optimise the kernel variance and noise level by maximising the marginal likelihood.

In [None]:
# Minimise the negative log marginal likelihood over the model's trainable
# variables, capping the optimiser at 100 iterations.
optimiser_options = {"maxiter": 100}
opt = gpflow.optimizers.Scipy()
opt.minimize(
    objective_closure,
    m.trainable_variables,
    options=optimiser_options,
)

# Model summary
print_summary(m)