## Gaussian Process Regression with a String Kernel

The string kernel (SSK) operates directly on the raw SMILES string representations of the molecules.

In [1]:
import sys
import os
# Add the parent directory to the module search path so that the project-local
# packages imported at the bottom of this cell can be resolved.
sys.path.append('..')  # to import from GP.kernels and property_prediction.data_utils

# Third-party dependencies: GPflow for the GP model, scikit-learn for the
# train/test split, target scaling and evaluation metrics.
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import print_summary
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project-local modules (these rely on the sys.path tweak above).
from GP.kernels import SSK
from property_prediction.data_utils import transform_data, TaskDataLoader, featurise_mols

We load the SMILES strings and the corresponding property values from the photoswitch dataset.

In [2]:
# Read the photoswitch CSV through the project's loader; it hands back the
# SMILES strings together with the property values to regress on.
data_loader = TaskDataLoader('Photoswitch', '../datasets/photoswitches.csv')
smiles_list, y = data_loader.load_property_data()

# Arrange the strings as an (N, 1) object-dtype column: one molecule per row.
smiles = np.array(smiles_list, dtype=object)
smiles = smiles.reshape(-1, 1)

# Quick sanity check: show the first five SMILES strings.
print(smiles_list[:5])

['C[N]1N=NC(=N1)N=NC2=CC=CC=C2', 'C[N]1C=NC(=N1)N=NC2=CC=CC=C2', 'C[N]1C=CC(=N1)N=NC2=CC=CC=C2', 'C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2', 'C[N]1C=C(C=N1)N=NC2=CC=CC=C2']


In [3]:
# Targets as an (N, 1) column vector, matching the (N, 1) SMILES array.
y = y.reshape(-1, 1)

# Fixed seed so that the 80/20 split -- and therefore every metric reported
# further down -- is identical on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train_raw, y_test = train_test_split(
    smiles, y, test_size=0.2, random_state=SPLIT_SEED
)

# Fit the standardiser on the *training* targets only. Fitting it on the full
# target vector before splitting would leak the test-set mean and variance
# into the training data.
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train_raw)

# Whole target vector in the scaler's units; not needed by the cells below,
# kept so that any code still referring to `y_scaled` continues to work.
y_scaled = y_scaler.transform(y)

# y_test is deliberately left in the original units: predictions are mapped
# back with y_scaler.inverse_transform before they are compared against it.

In [None]:
# kernel choices
max_subsequence_length = 5

# Alphabet = every distinct character that occurs in the training SMILES
# (each row of X_train holds a single string in column 0). Sorting gives the
# characters a deterministic order; iterating a bare set of strings can differ
# between interpreter sessions because of string-hash randomisation.
alphabet = sorted(set("".join(row[0] for row in X_train)))

# NOTE(review): maxlen=85 is presumably the longest string the kernel will
# handle -- confirm that no SMILES in the dataset exceeds it.
k = SSK(
    batch_size=4000,
    gap_decay=0.46,
    match_decay=0.99,
    alphabet=alphabet,
    max_subsequence_length=max_subsequence_length,
    maxlen=85,
)

# Constant kernel multiplying the string kernel.
cst = gpflow.kernels.Constant(2.75)

# GP regression model on (SMILES, scaled target) pairs. The numeric arguments
# are the initial values; whichever of them are trainable are refined below.
m = gpflow.models.GPR(
    data=(X_train, y_train),
    mean_function=Constant(-1.7),
    kernel=cst * k,
    noise_variance=0.056,
)

# fit model
# The previous, unused `loss = m.log_marginal_likelihood()` evaluation was
# dropped: its result was never read and it cost a full kernel-matrix build.
# compile=False asks GPflow not to wrap the objective in a tf.function.
optimizer = gpflow.optimizers.Scipy()
optimizer.minimize(m.training_loss, m.trainable_variables, options=dict(ftol=0.00001), compile=False)

Instructions for updating:
`AffineScalar` bijector is deprecated; please use `tfb.Shift(loc)(tfb.Scale(...))` instead.


In [None]:
# Query the fitted GP at the held-out SMILES; predict_f yields a pair that is
# unpacked into the predictive mean and variance (both in the scaler's units).
posterior_mean, y_var = m.predict_f(X_test)

# Undo the target scaling on the mean so predictions are comparable with
# y_test. y_var is left untouched.
y_pred = y_scaler.inverse_transform(posterior_mean)

In [None]:
# Test-set accuracy of the predictions against the held-out targets:
# mean absolute error, root-mean-squared error and R^2.
mae = mean_absolute_error(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

score = r2_score(y_test, y_pred)

In [None]:
# Report each metric to three decimal places, one per line.
for template, value in (
    ("\nR^2: {:.3f}", score),
    ("RMSE: {:.3f}", rmse),
    ("MAE: {:.3f}", mae),
):
    print(template.format(value))