In [17]:
from util import load_model
random_forest = load_model("adaboost")

import pandas as pd
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit import Chem

from sklearn.metrics import mean_squared_error, r2_score

from pickle import load
descriptors = load(open("./descriptors.pkl", "rb"))
descriptor = MolecularDescriptorCalculator(descriptors)

from rdkit import rdBase
blocker = rdBase.BlockLogs()

In [3]:
train_df = pd.read_csv("../data/aqsoldb.csv")

df = pd.read_csv("../data/delaney.csv")
df.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [6]:
train_smiles = train_df["SMILES"].values
new_smiles = df["SMILES"].values

train_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(x)) for x in train_smiles]
new_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(x)) for x in new_smiles]

non_similar = [x for x in new_smiles if x not in train_smiles]
non_similar_indicies = [i for i, x in enumerate(new_smiles) if x not in train_smiles]
print(f"{len(non_similar)} of {len(new_smiles)} are unique")

mols = [Chem.MolFromSmiles(x) for x in non_similar]

18 of 1144 are unique


In [7]:
smiles = df["SMILES"]
feats = [descriptor.CalcDescriptors(mol) for mol in mols]

In [18]:
preds = random_forest.predict(feats)
real = df["ESOL predicted log(solubility:mol/L)"].values[non_similar_indicies]

print(f"RMSE: {mean_squared_error(real, preds, squared=False)}")
print(f"R2: {r2_score(real, preds)}")
print()

for i, (p, r) in enumerate(zip(preds, real)):
    smiles = df["SMILES"][i]
    print(f"{f'Pred: {p:.4f}'.ljust(13)} | {f'Real: {r:.4f}'.ljust(13)} | {f'Error: {(r - p):.4f}'.ljust(14)} | {smiles}")

RMSE: 0.8835414216745759
R2: 0.41423122402220314

Pred: -1.9240 | Real: -1.4040 | Error: 0.5200  | ClCC(Cl)(Cl)Cl
Pred: -1.2630 | Real: -1.6550 | Error: -0.3920 | CC(Cl)(Cl)Cl
Pred: -1.2630 | Real: -1.6550 | Error: -0.3920 | ClC(Cl)C(Cl)Cl
Pred: -1.9757 | Real: -2.7250 | Error: -0.7493 | ClCC(Cl)Cl
Pred: -1.6637 | Real: -0.8400 | Error: 0.8237  | FC(F)(Cl)C(F)(Cl)Cl
Pred: -1.2630 | Real: -0.6740 | Error: 0.5890  | CC(Cl)Cl
Pred: -1.4393 | Real: -2.2100 | Error: -0.7707 | ClC(=C)Cl
Pred: -1.2995 | Real: 0.0510  | Error: 1.3505  | CCOC(C)OCC
Pred: -1.9057 | Real: -3.5700 | Error: -1.6643 | Clc1ccc(Cl)c(Cl)c1Cl
Pred: -1.9770 | Real: -2.7320 | Error: -0.7550 | C1CCc2ccccc2C1
Pred: -1.6422 | Real: -0.6700 | Error: 0.9722  | Clc1cc(Cl)c(Cl)c(Cl)c1
Pred: -1.6341 | Real: -0.6560 | Error: 0.9781  | Clc1cccc(Cl)c1Cl
Pred: -1.9770 | Real: -0.8340 | Error: 1.1430  | Cc1cccc(C)c1C
Pred: -1.9057 | Real: -1.7400 | Error: 0.1657  | Brc1cc(Br)c(Br)cc1Br
Pred: -3.5814 | Real: -4.1180 | Error: -0.5366 | 