In [1]:
# import packages
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# import qubit
#!pip install git+https://github.com/Xergon-sci/Qubit.git@development
from qubit.preprocessing.descriptors import tensorise_coulomb_matrix
from qubit.preprocessing.matrix_operations import pad_matrix

sys.path.append(r'C:\Users\Michiel Jacobs\Research\Master Thesis\Experimental-Reactivity-Prediction\code\utilities')

from data_utility import loaddata

In [2]:
# import the data
data = loaddata(r'C:\Users\Michiel Jacobs\Research\Master Thesis\Experimental-Reactivity-Prediction\data')
data = data[['coulomb_matrix', 'zero_point_energy']]
# pandas loads data as list convert it to numpy arrays
data['coulomb_matrix'] = data['coulomb_matrix'].apply(lambda x: np.array(x))
# shuffle the data
data = data.sample(frac=1)

# for development
develop = True
if develop:
    data = data.iloc[:100,:]

In [3]:
# The data set contains molecules with maximum 20 heavy atoms
# So XnH2n+2 can be used to calculate the maximum atoms that can be present and thus
# the maximum shape of our molecules
n = 20
maxsize = n + ((2*n)+2)

# formalize the data to a constant size
data['coulomb_matrix'] = data['coulomb_matrix'].apply(pad_matrix, size=maxsize)

In [4]:
# tensorize the data ===== takes long time since arrays are padded and so bigger
data['tensors'] = data['coulomb_matrix'].apply(tensorise_coulomb_matrix, negative_dimensions=5)

In [5]:
# wrap the matrix to provide channels
data['features'] = data['tensors'].apply(np.expand_dims, axis=3)

In [6]:
DATASETLENGHT = data.shape[0]
train_size = int(0.75 * DATASETLENGHT)
# test size is whatever is left after the 0.75 split

# split into train,val and test sets.
train = data.iloc[:train_size,:]
test = data.iloc[train_size+1:,:]

In [7]:
# attempt to make a np array filled with tensorflow tensors
train_tensor_list = []
for t in train['features'].values:
    t = tf.convert_to_tensor(t)
    train_tensor_list.append(t)

test_tensor_list = []
for t in test['features'].values:
    t = tf.convert_to_tensor(t)
    test_tensor_list.append(t)

In [8]:
input_shape = (6,62,62,1)
batch_size = 128
kernel_size = (1,3,3)
pool_size = (2,2,2)
filters = 64
dropout = 0.2

model = keras.Sequential()
model.add(layers.Conv3D(
    filters=filters,
    kernel_size=kernel_size,
    activation = 'relu',
    input_shape = input_shape))
model.add(layers.MaxPool3D(pool_size))
model.add(layers.Conv3D(
    filters=filters,
    kernel_size=kernel_size,
    activation = 'relu'))
model.add(layers.MaxPool3D(pool_size))
model.add(layers.Conv3D(
    filters=filters,
    kernel_size=kernel_size,
    activation = 'relu'))
model.add(layers.Flatten())
model.add(layers.Dropout(dropout))
model.add(layers.Dense(1))

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d (Conv3D)              (None, 6, 60, 60, 64)     640       
_________________________________________________________________
max_pooling3d (MaxPooling3D) (None, 3, 30, 30, 64)     0         
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 3, 28, 28, 64)     36928     
_________________________________________________________________
max_pooling3d_1 (MaxPooling3 (None, 1, 14, 14, 64)     0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 1, 12, 12, 64)     36928     
_________________________________________________________________
flatten (Flatten)            (None, 9216)              0         
_________________________________________________________________
dropout (Dropout)            (None, 9216)              0

In [10]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
    )

zpe = train['zero_point_energy'].values.astype(np.float).tolist()
zpe = np.array(zpe)

history = model.fit(
    np.array(train_tensor_list),
    zpe,
    batch_size=512,
    epochs=2,
    validation_split=0.2)

test_zpe = test['zero_point_energy'].values.astype(np.float).tolist()
test_zpe = np.array(test_zpe)

test_scores = model.evaluate(
    np.array(test_tensor_list),
    test_zpe,
    verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/2
Epoch 2/2
1/1 - 1s - loss: 0.0000e+00 - accuracy: 0.0000e+00
Test loss: 0.0
Test accuracy: 0.0


In [22]:
history.history

{'loss': [0.0, 0.0],
 'accuracy': [0.0, 0.0],
 'val_loss': [0.0, 0.0],
 'val_accuracy': [0.0, 0.0]}

In [12]:
test_zpe.shape

(24,)

In [13]:
#model.save("path_to_my_model")
#del model