Load NumPy and models

In [18]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model

Get representation library

In [None]:
!pip install qml

In [3]:
import qml

Load and inspect data

In [None]:
!wget https://pubs.acs.org/doi/suppl/10.1021/acs.jctc.8b00832/suppl_file/ct8b00832_si_001.zip
!unzip ct8b00832_si_001.zip

In [None]:
!ls supplementary/geometry

In [None]:
!cat supplementary/geometry/frag_0001.xyz

Generate representations.

In [None]:
print("Loading molecule data...")
N=500
mols = []
for i in range(1,N+1):
    filename="supplementary/geometry/frag_%04d.xyz"
    mol = qml.Compound(xyz=(filename%(i)))
    mols.append(mol)

print("Generating representations...")
i=0
for mol in mols: 
    mol.generate_coulomb_matrix(size=23)
    i=i+1
    if i % 100 == 0:
        print(i)

X = np.array([mol.representation for mol in mols])


In [8]:
X.shape

(500, 276)

Load outputs.

In [None]:
!cat supplementary/E_ccpvdz.txt

In [10]:
Y=np.loadtxt("supplementary/E_ccpvdz.txt")
Y = Y[:500,1]

In [13]:
X_train = X[:400,:]
Y_train = Y[:400]

X_val = X[400:,:]
Y_val = Y[400:]

(100, 276)


In [41]:
k = 3
neigh = KNeighborsRegressor(n_neighbors=k)
neigh.fit(X_train, Y_train)

Y_train_pred = neigh.predict(X_train)

training_error = np.sum((Y_train_pred-Y_train)**2)/X_train.shape[0]
print(training_error)

Y_val_pred = neigh.predict(X_val)
generalization_error = np.sum((Y_val_pred-Y_val)**2)/X_val.shape[0]
print(generalization_error)

85.23490591666666
272.118891


In [42]:
lin_regr = linear_model.LinearRegression()
lin_regr.fit(X, Y)

Y_train_pred = lin_regr.predict(X_train)

training_error = np.sum((Y_train_pred-Y_train)**2)/X_train.shape[0]
print(training_error)

Y_val_pred = lin_regr.predict(X_val)
generalization_error = np.sum((Y_val_pred-Y_val)**2)/X_val.shape[0]
print(generalization_error)

87.40559576788478
45.25276038705653
