This is my project for predicting latent heats by encoding molecules as molecular fingerprints and using them as the input to an artificial neural network (ANN). The project compares the performance of the ANN when it is trained on topological fingerprints, ECFP4 fingerprints and Morgan fingerprints. Previously established methods instead use a combination of molecular descriptors in regression models to predict latent heats of vaporization.

In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
import pandas as pd

# Load the spreadsheet; it must contain a 'SMILES' column (used as the key below).
df = pd.read_excel('Data_SMILES.xlsx')
# head must be *called*: printing the bare attribute only shows the bound
# method's repr instead of a preview of the first rows.
print(df.head())
# Map every SMILES string to the list of its remaining column values, in
# column order: {smiles: [col_1, col_2, ...]}.
# NOTE(review): the first remaining column is presumably the latent heat
# (it is the value used as the regression target downstream) -- confirm.
XY = df.set_index('SMILES').T.to_dict('list')

In [None]:
# Build the feature matrix X (one Morgan fingerprint bit list per molecule)
# and the target vector y (first value stored for each SMILES in XY).
y = []
X = []
skipped = []  # keys of XY that could not be turned into a (features, target) pair
for smile, Hlist in XY.items():
    # Chem.MolFromSmiles returns None for an unparsable SMILES string instead of
    # raising, and it rejects non-string keys (e.g. NaN from an empty cell).
    # Check both explicitly rather than hiding every error behind a bare except.
    mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
    if mol is None or not Hlist:
        skipped.append(smile)
        continue
    # NOTE(review): radius 10 is kept from the original code; it is far larger
    # than the radius 2 that corresponds to ECFP4 -- confirm it is intended.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 10)
    # Append features and target together so X and y can never get out of step.
    X.append(list(fingerprint))
    y.append(Hlist[0])
if skipped:
    print("Skipped", len(skipped), "entries with an invalid SMILES or no target value")
print(len(X), len(y))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)
# Report how many samples ended up in the training partition.
print(len(Xtrain))

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search for the MLP regressor on the training split.
# The base estimator is seeded so that weight initialisation (and therefore the
# selected "best" parameters) is reproducible, consistent with the seeded
# train/test split and the seeded final model used elsewhere in this notebook.
train_nn = MLPRegressor(max_iter=1000, random_state=1)
param_grid = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    # A single hidden layer with 1 to 10 units.
    'hidden_layer_sizes': [(units,) for units in range(1, 11)],
    # L2 regularisation strength.
    'alpha': [0.0001, 0.05],
}

# 3-fold cross-validation over every parameter combination, using all cores.
# With no explicit `scoring`, candidates are ranked by the estimator's own
# score method (R^2 for regressors).
clf = GridSearchCV(train_nn, param_grid, n_jobs=-1, cv=3)
clf.fit(Xtrain, Ytrain)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
# Final model. The hyper-parameters are written out explicitly here rather than
# read back from the grid-search object. fit() returns the estimator itself, so
# construction and training can be chained.
nn = MLPRegressor(
    hidden_layer_sizes=(10,),
    activation='logistic',
    solver='lbfgs',
    alpha=0.05,
    max_iter=10000,
    random_state=1,
).fit(Xtrain, Ytrain)
# Predictions for the held-out samples.
pred = nn.predict(Xtest)
# Print the score on the test split first, then on the training split,
# one value per line.
print(nn.score(Xtest, Ytest), nn.score(Xtrain, Ytrain), sep='\n')