To open on Google Colab\
https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/main/Course_Notes/Week5/gpytorch_molecules.ipynb

## Data an original model
Load data from the [paper](th.fhi-berlin.mpg.de/site/uploads/Publications/QM-NJP_20130315.pdf)

model was first introduce in 2012
[paper](https://www.mrupp.info/Data/2012rtmvl_prl.pdf)

In [108]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#load data
data_url = "https://github.com/RodrigoAVargasHdz/CHEM-4PB3/raw/main/Course_Notes/data/qm7.csv"
data = pd.read_csv(data_url)
# print(data.head)
print(data.columns)

Xtot = data.drop(['Unnamed: 0','AtomizationEnergy'], axis=1).to_numpy()
ytot = data['AtomizationEnergy'].to_numpy()  # [:,np.newaxis]
Ctot = Xtot.reshape(Xtot.shape[0],23,23)

Index(['Unnamed: 0', 'C Matrix 0', 'C Matrix 1', 'C Matrix 2', 'C Matrix 3',
       'C Matrix 4', 'C Matrix 5', 'C Matrix 6', 'C Matrix 7', 'C Matrix 8',
       ...
       'C Matrix 520', 'C Matrix 521', 'C Matrix 522', 'C Matrix 523',
       'C Matrix 524', 'C Matrix 525', 'C Matrix 526', 'C Matrix 527',
       'C Matrix 528', 'AtomizationEnergy'],
      dtype='object', length=531)


From Eq. 2 in the [paper](https://www.mrupp.info/Data/2012rtmvl_prl.pdf), we see that the authors proposed the eigenvalues of the Coulomb matrix as the 
features of our model. 
1. $\mathbf{\epsilon}$ -> eigenvalues of the Coulomb matrix
2. $C$ -> Coulomb matrix

The elements of the C matrix are given by,\
$C_{ij} = \Big\{ \begin{matrix} 0.5 Z_i^{2.4} \;\;\;\;\text{if }\;\; i=j\\ \frac{Z_iZ_j}{|R_i - R_j|} \;\;\;\;\text{if }\;\; i\neq j \end{matrix}$,
where,
1. $R_{i}$ is the position of atom-i
2. $Z_{i}$ is the atomic number of atom-i


To compute the eigenvalues of C we can use numpy ``` np.linalg.eigh(C)```

In [109]:
Ctot_eig = []
for c in Ctot:
    e,_ = np.linalg.eigh(c)
    Ctot_eig.append(e)

Ctot_eig = np.array(Ctot_eig)

## Train a GP using GPyTorch
The kernel function proposed in the [paper](https://www.mrupp.info/Data/2012rtmvl_prl.pdf) is, (below if Equation (3))\
$K_{ij}=\exp^{-\frac{d(\mathbf{C}_i,\mathbf{C}_j)}{2\ell^2}}$,\
where,\
$d(\mathbf{C}_i,\mathbf{C}_j) = d(\mathbf{\epsilon}_i,\mathbf{\epsilon}_j) = \sqrt{\sum_\kappa |\epsilon^\kappa_i - \epsilon^\kappa_j|^2}$,\
where,
* $\epsilon^\kappa_i$ is the $\kappa$-th eigenvalue of the C matrix from molecule-i.

$d(\mathbf{C}_i,\mathbf{C}_j)$ is simply an isotropic RBF kernel (can you see it?)

In [110]:
import torch
import gpytorch

In [111]:
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

In [112]:
# # data
N = 1500
Nval = 250
X_train, X_test, y_train, y_test = train_test_split(
    Ctot_eig, ytot, test_size=ytot.shape[0] - N, random_state=0)
X_test, y_test = X_test[:Nval], y_test[:Nval]
Xtr, Xtst, ytr, ytst = X_train, X_test, y_train, y_test

if torch.cuda.is_available():
    Xtr = torch.from_numpy(Xtr).cuda()
    ytr = torch.from_numpy(ytr).cuda()
    Xtst = torch.from_numpy(Xtst).cuda()
    ytst = torch.from_numpy(ytst).cuda()
else:
    Xtr = torch.from_numpy(Xtr)
    ytr = torch.from_numpy(ytr)
    Xtst = torch.from_numpy(Xtst)
    ytst = torch.from_numpy(ytst)

Xtot = torch.from_numpy(Xtot)
ytot = torch.from_numpy(ytot)

### training

In [113]:
# initialize likelihood and model
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(Xtr, ytr, likelihood)

if torch.cuda.is_available():
  likelihood.cuda()
  model.cuda()

In [114]:
# Find optimal model hyperparameters using ADAM

# Use the adam optimizer
# Includes GaussianLikelihood parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

training_iter = 150
mll_trajectory = []
mll_trajectory_tst = []
for i in range(training_iter):
    model.train()
    likelihood.train()
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(Xtr)
    # Calc loss and backprop gradients
    loss = -mll(output, ytr)
    loss.backward()
    print('Iter %d/%d - Loss: %.3f  noise: %.6f' % (
        i + 1, training_iter, loss.item(),
        model.likelihood.noise.item()
    ))
    print('lengthscale: ', model.covar_module.base_kernel.lengthscale[0])
    mll_trajectory.append(loss.item())
    optimizer.step()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        model.eval()
        likelihood.eval()
        ypred = likelihood(model(Xtst))
        mse = torch.sqrt(torch.sum((ypred.mean - ytst)**2))
        mll_trajectory_tst.append(mse.cpu())
        

Iter 1/150 - Loss: 841314.359  noise: 0.693247
lengthscale:  tensor([0.6931], grad_fn=<SelectBackward0>)


RuntimeError: expected scalar type Double but found Float

# Single molecule test case after training a GP model with GPytorch

In [96]:
# Benzene
molec = '''
  C        0.00000        1.40272        0.00000
  H        0.00000        2.49029        0.00000
  C       -1.21479        0.70136        0.00000
  H       -2.15666        1.24515        0.00000
  C       -1.21479       -0.70136        0.00000
  H       -2.15666       -1.24515        0.00000
  C        0.00000       -1.40272        0.00000
  H        0.00000       -2.49029        0.00000
  C        1.21479       -0.70136        0.00000
  H        2.15666       -1.24515        0.00000
  C        1.21479        0.70136        0.00000
  H        2.15666        1.24515        0.00000
  '''
molec = molec.split()
molec = np.array(molec)
molec = molec.reshape(12,4)
print(molec)

[['C' '0.00000' '1.40272' '0.00000']
 ['H' '0.00000' '2.49029' '0.00000']
 ['C' '-1.21479' '0.70136' '0.00000']
 ['H' '-2.15666' '1.24515' '0.00000']
 ['C' '-1.21479' '-0.70136' '0.00000']
 ['H' '-2.15666' '-1.24515' '0.00000']
 ['C' '0.00000' '-1.40272' '0.00000']
 ['H' '0.00000' '-2.49029' '0.00000']
 ['C' '1.21479' '-0.70136' '0.00000']
 ['H' '2.15666' '-1.24515' '0.00000']
 ['C' '1.21479' '0.70136' '0.00000']
 ['H' '2.15666' '1.24515' '0.00000']]


In [48]:
atomic_numbers = {'H':1.,'C':6}

Z = []
for x in molec:
    print(x,)
    zi = atomic_numbers[x[0]]
    Z.append(zi)
print(Z)
Z = np.array(Z)

['C' '0.00000' '1.40272' '0.00000']
['H' '0.00000' '2.49029' '0.00000']
['C' '-1.21479' '0.70136' '0.00000']
['H' '-2.15666' '1.24515' '0.00000']
['C' '-1.21479' '-0.70136' '0.00000']
['H' '-2.15666' '-1.24515' '0.00000']
['C' '0.00000' '-1.40272' '0.00000']
['H' '0.00000' '-2.49029' '0.00000']
['C' '1.21479' '-0.70136' '0.00000']
['H' '2.15666' '-1.24515' '0.00000']
['C' '1.21479' '0.70136' '0.00000']
['H' '2.15666' '1.24515' '0.00000']
[6, 1.0, 6, 1.0, 6, 1.0, 6, 1.0, 6, 1.0, 6, 1.0]


In [68]:
XYZ = np.array(molec[:,1:],dtype=np.float32)
diff =  XYZ[:,np.newaxis] - XYZ[np.newaxis,:]
R = np.linalg.norm(diff,axis=-1)
Rdiag = np.eye(R.shape[0])
R = R + Rdiag
R_inv = 1/R
Zij = Z[:,np.newaxis] * Z[np.newaxis,:]
C = Zij * R_inv
C_diag = 0.5*np.power(Z, 2.4)
C[np.diag_indices(R.shape[0])] = C_diag
print(C)
print((6**(2.4)/2))

[[36.8581052   5.51688651 25.66444132  2.77468376 14.81736583  1.75694151
  12.83221194  1.54122393 14.81736583  1.75694151 25.66444132  2.77468376]
 [ 5.51688651  0.5         2.77469141  0.40155929  1.75694568  0.2318402
   1.54122393  0.20077983  1.75694568  0.2318402   2.77469141  0.40155929]
 [25.66444132  2.77469141 36.8581052   5.51684358 25.66442387  2.77468101
  14.81736583  1.75694568 12.83222066  1.54122138 14.81737601  1.75694273]
 [ 2.77468376  0.40155929  5.51684358  0.5         2.77468101  0.40155805
   1.75694151  0.2318402   1.54122138  0.20077924  1.75694273  0.23183997]
 [14.81736583  1.75694568 25.66442387  2.77468101 36.8581052   5.51684358
  25.66444132  2.77469141 14.81737601  1.75694273 12.83222066  1.54122138]
 [ 1.75694151  0.2318402   2.77468101  0.40155805  5.51684358  0.5
   2.77468376  0.40155929  1.75694273  0.23183997  1.54122138  0.20077924]
 [12.83221194  1.54122393 14.81736583  1.75694151 25.66444132  2.77468376
  36.8581052   5.51688651 25.66444132  2