In [None]:
!pip install py3Dmol
!pip install rdkit
!pip install pyscf
!pip install botorch

<a target="_blank" href="https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/w2024/Course_Notes/Week%208/BayesOpt_H2O_PES.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import numpy as np
import pyscf
from pyscf import dft

import py3Dmol
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, rdDetermineBonds, MolFromXYZBlock
from rdkit.Chem import rdDetermineBonds
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.ipython_3d = True

# Introduction to Bayesian Optimization using [BOTorch](https://botorch.org/) #

There are many libraries capable of doing Bayesian Optimization (BO). <br>
In this tutorial, we will learn how to use [BOTorch](https://botorch.org/), a BO library built on top of PyTorch and GPyTorch.


## Generate a water molecule dataset ##
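A water molecule has three internal coordinates (two O-H distances and the H-O-H angle). In this tutorial the first O-H distance is kept fixed at 1.2 Å in the Z-matrix, and we scan the remaining two variables,
1. O-H distance
2. H-O-H angle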

We will use PySCF to generate DFT-level data for different water geometries using the Z-matrix notation.


In [None]:
def get_z_matrix(angle, dist, n_atoms = 3):
    """Build a water geometry from a Z-matrix and return it as an XYZ-format string.

    Despite its name, this function returns Cartesian coordinates (an XYZ
    block), not a Z-matrix; the Z-matrix is only the input given to PySCF.

    Parameters
    ----------
    angle : float
        H-O-H angle inserted into the Z-matrix (formatted with 3 decimals).
    dist : float
        Length of the second O-H bond inserted into the Z-matrix
        (formatted with 3 decimals).
    n_atoms : int, optional
        Number of atoms written to the XYZ block (3 for water).

    Returns
    -------
    str
        XYZ block: atom count, a comment line, then one
        ``symbol x y z`` line per atom with coordinates in Angstrom.
    """
    mol = pyscf.gto.Mole()
    # NOTE(review): the first O-H bond is fixed at 1.2; only the second O-H
    # distance and the angle vary -- confirm this is the intended scan.
    mol.atom = '''
      O
      H  1  1.2
      H  1  %.3f  2 %.3f
    '''%(dist,angle)
    mol.unit = 'Angstrom'
    mol.build()

    xyz_lines = ['%s\n Generated by PySCF\n'%(n_atoms)]
    for i in range(n_atoms):
        # atom_coord() returns Bohr unless a unit is requested; XYZ files are in Angstrom.
        x, y, z = mol.atom_coord(i, unit='Angstrom').tolist()
        xyz_lines.append('%s     %.4f     %.4f     %.4f\n'%(mol.atom_symbol(i), x, y, z))
    return ''.join(xyz_lines)

In [None]:
def energy_water_calculation(angle,dist):
    """Run a B3LYP/STO-3G single-point calculation for one water geometry.

    Parameters
    ----------
    angle : float
        H-O-H angle inserted into the Z-matrix.
    dist : float
        Length of the second O-H bond inserted into the Z-matrix.

    Returns
    -------
    tuple
        ``(energy, xyz)`` where ``energy`` is the value returned by the RKS
        kernel and ``xyz`` is the string produced by ``get_z_matrix``.
    """
    # Z-matrix: the first O-H bond is hard-coded to 1.2, the second uses `dist`.
    zmatrix = '''
      O
      H  1  	1.2
      H  1  %.3f  2 %.3f
    '''%(dist,angle)

    water = pyscf.gto.Mole()
    water.atom = zmatrix
    water.unit = 'Angstrom'
    water.basis = 'sto-3g'  # basis set level
    water.build()

    mf = dft.RKS(water)
    mf.xc = 'b3lyp'  # dft model
    total_energy = mf.kernel()

    geometry = get_z_matrix(angle,dist)
    return total_energy, geometry

In [None]:
# Scan grid for the water geometries: 12 evenly spaced values per coordinate.
# angle_list -> H-O-H angles, dist_list -> O-H distances (inputs to the Z-matrix).
angle_list = np.linspace(start=40., stop=140., num=12)
dist_list = np.linspace(start=0.8, stop=1.6, num=12)


In [None]:
# Evaluate the DFT energy at every (angle, distance) point of the grid.
X,Y = np.meshgrid(angle_list,dist_list)
grid_points = zip(X.flatten(),Y.flatten())

pes = []      # one energy per grid point, in flattened-grid order
xyz_all = []  # matching XYZ strings
for angle,dist in grid_points:
    pes_i, xyz_i = energy_water_calculation(angle,dist)
    pes.append(pes_i)
    xyz_all.append(xyz_i)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Collect the scan into a dataset: energies plus the matching (angle, distance) rows.
energies = np.asarray(pes)
D = {
    'Energy': energies,
    'X': np.column_stack((X.flatten(),Y.flatten())),
}

# Energies reshaped back onto the meshgrid for plotting.
Z = np.asarray(pes).reshape(X.shape)

fig, ax = plt.subplots(figsize=(10,10))
ax.contourf(X,Y,Z,levels=10)
ax.set_xlabel('H-O-H Angle',fontsize=15)
ax.set_ylabel('H-O Bond distance',fontsize=15)

###  BOTorch for geometry optimization ###

The goal is to find the geometry of water that has the lowest energy,

$$
{\cal M}^* = \arg\min_{{\cal M}} E({\cal M}),
$$
where ${\cal M}$ is a molecular geometry and $E(\cdot)$ is the quantum chemistry methodology to compute the energy.

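For this example, ${\cal M} = [\theta^{HOH}, r^{OH}]$, i.e. the H-O-H angle followed by the O-H distance (the column order used for the data arrays below).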

Tutorial based on [link](https://botorch.org/docs/getting_started)

In [None]:
import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_mll
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition import UpperConfidenceBound
from botorch.optim import optimize_acqf

In [None]:
# utility functions
def scale_data(y,bool_min=True):
    """Return a function that standardizes values using the statistics of ``y``.

    Parameters
    ----------
    y : array-like
        Reference data; its mean and standard deviation define the scaling.
    bool_min : bool, optional
        If True (default) the standardized values are multiplied by -1, so
        that maximizing the scaled quantity minimizes the original one.

    Returns
    -------
    callable
        ``scale(x)`` computing ``sign * (x - mean(y)) / std(y)``.
    """
    mu = np.mean(y)
    std = np.std(y)
    if std == 0:
        # Constant reference data: avoid dividing by zero (NaN/inf); only center.
        std = 1.0
    sign = -1 if bool_min else 1

    def scale(x):
        return sign * ((x - mu) / std)
    return scale


The geometry representation of our water system is (angle first, then distance, matching the column order of `D['X']`),
$$
X =  \begin{bmatrix}
 {\cal M}_0 \\
 \vdots\\
 {\cal M}_N
\end{bmatrix} = \begin{bmatrix}
[\theta^{HOH}, r^{OH}]_0 \\
 \vdots\\
 [\theta^{HOH}, r^{OH}]_N
\end{bmatrix}
$$

In [None]:
Xtot = D['X'] # all geometries, one (angle, distance) row per grid point
ytot = D['Energy']

# f_scale = scale_data(ytot)
# ytot = f_scale(ytot)


# random initial geometries
# Drawn WITHOUT replacement so the initial training set never contains the
# same geometry twice, and with a fixed seed so the notebook is reproducible.
Ninit = 5
rng = np.random.default_rng(0)
i0 = rng.choice(Xtot.shape[0], size=Ninit, replace=False)
X_train = torch.tensor(Xtot[i0])
y_train = torch.tensor(ytot[i0]).unsqueeze(-1)  # column vector of energies

print(X_train.shape,y_train.shape)

### Step 1: Define a Gaussian Process for BO ###

In [None]:
# Step 1: build a single-output GP surrogate on the initial observations and
# fit its hyperparameters by maximizing the exact marginal log-likelihood.

gp = SingleTaskGP(train_X=X_train, train_Y=y_train)
mll = ExactMarginalLogLikelihood(likelihood=gp.likelihood, model=gp)
fit_gpytorch_mll(mll);

### Step 2: Define an Acquisition function ($\alpha$) ###

**UpperConfidenceBound**
$$
\alpha({\cal M}) = \mu({\cal M}) + \kappa \sigma({\cal M}),
$$
where $\mu({\cal M})$ is the mean of the GP, and $\sigma({\cal M})$ is the standard deviation. In BoTorch the exploration weight is set through `beta`, with $\kappa = \sqrt{\beta}$.

In [None]:
# Construct an acquisition function
# BoTorch maximizes the acquisition function, but the target here is the raw
# energy, which must be MINIMIZED. `maximize=False` makes UCB use
# -mean + sqrt(beta) * std, so its maximum points to low-energy geometries.

UCB = UpperConfidenceBound(gp, beta=0.1, maximize=False)

### Step 3: Optimize the Acquisition function ###

$$
{\cal M}^* = \arg\max_{{\cal M}} \alpha({\cal M}),
$$
where ${\cal M}^*$ is the proposed point by the acquisition function.

In [None]:
# Optimize the acquisition function
# bounds: first row = lower limits, second row = upper limits, columns = (angle, distance).
# Double precision matches the float64 training data built from the NumPy arrays.

bounds = torch.tensor([[20., 0.5], [180., 2.]], dtype=torch.double)
candidate, acq_value = optimize_acqf(
    UCB, bounds=bounds, q=1, num_restarts=5, raw_samples=20,
)
print(candidate)

In [None]:
# PES contour map with the geometry proposed by the acquisition function (white cross).
Z = np.asarray(pes).reshape(X.shape)
x_candidate = candidate.detach()

fig, ax = plt.subplots(figsize=(10,10))
ax.contourf(X,Y,Z,levels=10)
ax.scatter(x_candidate[:,0], x_candidate[:,1], marker='x',c='w',s=100)
ax.set_xlabel('H-O-H Angle',fontsize=15)
ax.set_ylabel('H-O Bond distance',fontsize=15)

## BO algorithm ##

In [None]:
candidates = []  # geometry proposed at each BO iteration, as (angle, distance)
X_train_bo = X_train
y_train_bo = y_train
for i in range(20):
    # step 1: refit the GP surrogate on every observation gathered so far
    gp = SingleTaskGP(X_train_bo, y_train_bo)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_mll(mll);

    # step 2: rebuild the acquisition function on the refitted GP. Reusing an
    # acquisition function built earlier would keep querying the old model.
    # maximize=False because the energy is being minimized.
    UCB = UpperConfidenceBound(gp, beta=0.1, maximize=False)

    # step 3: propose the next geometry
    candidate, acq_value = optimize_acqf(
        UCB, bounds=bounds, q=1, num_restarts=5, raw_samples=20,
    )
    candidate = candidate.detach().to(dtype=X_train_bo.dtype)
    xi = candidate.numpy()[0].copy()
    candidates.append(xi)

    # evaluate the true (DFT) energy at the proposed geometry
    angle = xi[0]
    dist = xi[1]
    y_energy,_ = energy_water_calculation(angle,dist)

    # append the new observation to the training data
    X_train_bo = torch.vstack((X_train_bo,candidate))
    y_new = torch.tensor([[float(y_energy)]], dtype=y_train_bo.dtype)
    y_train_bo = torch.vstack((y_train_bo,y_new))
    print(i,xi,y_energy)


In [None]:
# Energy of every evaluated geometry, in the order the points were added
# to the training set.
energy_trace = y_train_bo.detach()
point_index = np.arange(energy_trace.shape[0])
plt.plot(point_index, energy_trace)
plt.xlabel('Iterations')
plt.ylabel('Energy of the candidate point')

In [None]:
candidates = np.asarray(candidates)
X_obs = X_train_bo.detach().numpy()
# X_train_bo stores the random initial points first, followed by one row per
# BO candidate, so the number of initial points is the difference in lengths.
n_init = X_obs.shape[0] - len(candidates)

for i in range(len(candidates)):
    plt.figure(figsize=(5,5))
    plt.contourf(X,Y,Z,levels=10)
    # squares: everything evaluated BEFORE iteration i (initial points + earlier candidates)
    n_seen = n_init + i
    plt.scatter(X_obs[:n_seen,0],X_obs[:n_seen,1],color='w',s=30,marker='s')
    # cross: geometry proposed at iteration i
    plt.scatter(candidates[i,0],candidates[i,1],color='w',s=30,marker='x')
    plt.title('BO iteration %d'%i)
    plt.xlabel('H-O-H Angle',fontsize=15)
    plt.ylabel('H-O Bond distance',fontsize=15)

### Plot a molecule ###

In [None]:
def draw_with_spheres(xyz):
    """Render a molecule given as an XYZ-format string in an interactive 3D view.

    Parameters
    ----------
    xyz : str
        XYZ block (atom count, comment line, one ``symbol x y z`` line per atom).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``xyz`` as an XYZ block.
    """
    raw_mol = Chem.MolFromXYZBlock(xyz)
    if raw_mol is None:
        # RDKit signals a parse failure by returning None; fail with a clear message.
        raise ValueError('could not parse an XYZ block from the given string')

    # Work on a copy and let RDKit infer which atoms are bonded from the coordinates.
    conn_mol = Chem.Mol(raw_mol)
    rdDetermineBonds.DetermineConnectivity(conn_mol)

    v = py3Dmol.view(width=400, height=400)
    IPythonConsole.addMolToView(conn_mol, v)
    v.zoomTo()
    v.setStyle({'sphere': {'radius': 0.35}, 'stick': {'radius': 0.1}})
    v.show()

In [None]:
# Example geometry in XYZ format: atom count, a comment line, then one
# "symbol x y z" line per atom (two H atoms and one O atom).
xyz = '''3
* (null), Energy   -1000.0000000
H     0.00000     0.7554     -0.47116
H     0.00000    -0.75545     -0.4711
O     0.00000     0.00000     0.11779
'''

# Show the example molecule in the 3D viewer.
draw_with_spheres(xyz)