# Use solprop environment for this script

Install the `env_solprop` conda environment by following option 1 of the installation guide at https://github.com/fhvermei/SolProp_ML before you run this notebook.

If you are unable to install the `env_solprop` environment, you can obtain the Abraham features (E, S, A, B, L) using the web-based tool at https://rmg.mit.edu/database/solvation/soluteSearch/.

In [1]:
import pandas as pd
from chemprop_solvation.solvation_estimator import load_SoluteML_estimator



## Load the ML model files

In [2]:
# Build the SoluteML estimator object. It is called further down with the
# SMILES inputs to obtain the Abraham feature predictions.
abraham_model = load_SoluteML_estimator()

Loading pretrained parameter "encoder.encoder.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.W_i.weight".
Loading pretrained parameter "encoder.encoder.W_h.weight".
Loading pretrained parameter "encoder.encoder.W_o.weight".
Loading pretrained parameter "encoder.encoder.W_o.bias".
Loading pretrained parameter "ffn.1.weight".
Loading pretrained parameter "ffn.1.bias".
Loading pretrained parameter "ffn.4.weight".
Loading pretrained parameter "ffn.4.bias".
Loading pretrained parameter "ffn.7.weight".
Loading pretrained parameter "ffn.7.bias".
Loading pretrained parameter "ffn.10.weight".
Loading pretrained parameter "ffn.10.bias".
Loading pretrained parameter "encoder.encoder.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.W_i.weight".
Loading pretrained parameter "encoder.encoder.W_h.weight".
Loading pretrained parameter "encoder.encoder.W_o.weight".
Loading pretrained parameter "encoder.encoder.W_o.bias".
Loading pretrained parameter "ffn.1.weight".

## Load the sample csv file with SMILES strings

In [3]:
# Read the sample dataset; the SMILES strings live in its 'smiles' column.
df_sample = pd.read_csv('crit_prop_sample_dataset.csv')

# Pull that column out as a plain Python list.
smiles_list = df_sample.loc[:, 'smiles'].tolist()

## Get the Abraham feature predictions

In [4]:
# Wrap every SMILES string in its own single-item list before passing the
# batch to the estimator.
smiles_input_list = list(map(lambda smiles: [smiles], smiles_list))

# The estimator call returns three values, unpacked here in order.
estimator_output = abraham_model(smiles_input_list)
average_prediction, epistemic_uncertainty, valid_indices = estimator_output

100%|██████████| 50/50 [00:03<00:00, 16.56it/s]
100%|██████████| 50/50 [00:00<00:00, 534.41it/s]
100%|██████████| 50/50 [00:00<00:00, 531.31it/s]
100%|██████████| 50/50 [00:00<00:00, 535.10it/s]
100%|██████████| 50/50 [00:00<00:00, 531.89it/s]
100%|██████████| 50/50 [00:00<00:00, 543.80it/s]
100%|██████████| 50/50 [00:00<00:00, 537.45it/s]
100%|██████████| 50/50 [00:00<00:00, 534.59it/s]
100%|██████████| 50/50 [00:00<00:00, 535.38it/s]
100%|██████████| 50/50 [00:00<00:00, 535.47it/s]
100%|██████████| 50/50 [00:00<00:00, 536.61it/s]
100%|██████████| 50/50 [00:00<00:00, 539.82it/s]
100%|██████████| 50/50 [00:00<00:00, 531.50it/s]
100%|██████████| 50/50 [00:00<00:00, 535.41it/s]
100%|██████████| 50/50 [00:00<00:00, 539.00it/s]
100%|██████████| 50/50 [00:00<00:00, 529.73it/s]
100%|██████████| 50/50 [00:00<00:00, 531.55it/s]
100%|██████████| 50/50 [00:00<00:00, 525.33it/s]
100%|██████████| 50/50 [00:00<00:00, 524.28it/s]
100%|██████████| 50/50 [00:00<00:00, 526.21it/s]
100%|██████████| 50/5

## Save the results as a CSV file in the Chemprop additional-features format

In [5]:
# Names of the Abraham parameters; they become the column headers, in this order.
abraham_param_name_list = ['E', 'S', 'A', 'B', 'L']

# One column per parameter, pre-filled with None so that any input row that
# is not listed in valid_indices stays empty.
n_rows = len(smiles_list)
abraham_columns = {label: [None] * n_rows for label in abraham_param_name_list}

# The prediction at position `pos` is written to the row given by the
# valid index at the same position.
for pos, row in enumerate(valid_indices):
    for label, value in zip(abraham_param_name_list, average_prediction[pos]):
        abraham_columns[label][row] = value

# Assemble the table and write it to disk without the index column.
df_abraham = pd.DataFrame(abraham_columns)
df_abraham.to_csv('Abraham_features.csv', index=False)


## Double check that the values are correct

In [6]:
# Reference values for the first ten rows, kept as a plain-text table so they
# can be compared by eye with the DataFrame preview displayed in the next cell.
results_text = '''
E 		S 		A 		B 		L
0.315230 	0.270512 	0.125660 	0.114181 	1.581773
0.002610 	-0.002287 	0.000544 	-0.000556 	3.343804
0.234744 	0.568298 	0.002672 	0.435590 	2.161170
0.121986 	0.645907 	-0.001568 	0.517071 	3.217184
1.420635 	0.999431 	0.001300 	0.220972 	6.537796
0.003992 	-0.005317 	0.000185 	-0.001430 	3.136133
-0.048397 	0.127866 	0.001210 	0.069718 	0.987169
0.267236 	0.118395 	0.000508 	-0.004405 	3.833012
0.235421 	0.406712 	0.003452 	0.101534 	1.656188
0.162136 	0.667995 	0.000785 	0.518607 	2.780620'''

# Print a short header followed by the reference table.
print('The first ten rows should have the following values:')
print(results_text)

The first ten rows should have the following values:

E 		S 		A 		B 		L
0.315230 	0.270512 	0.125660 	0.114181 	1.581773
0.002610 	-0.002287 	0.000544 	-0.000556 	3.343804
0.234744 	0.568298 	0.002672 	0.435590 	2.161170
0.121986 	0.645907 	-0.001568 	0.517071 	3.217184
1.420635 	0.999431 	0.001300 	0.220972 	6.537796
0.003992 	-0.005317 	0.000185 	-0.001430 	3.136133
-0.048397 	0.127866 	0.001210 	0.069718 	0.987169
0.267236 	0.118395 	0.000508 	-0.004405 	3.833012
0.235421 	0.406712 	0.003452 	0.101534 	1.656188
0.162136 	0.667995 	0.000785 	0.518607 	2.780620


In [7]:
# Display the first ten rows for comparison with the reference values printed above.
df_abraham.head(10)

Unnamed: 0,E,S,A,B,L
0,0.31523,0.270512,0.12566,0.114181,1.581773
1,0.00261,-0.002287,0.000544,-0.000556,3.343804
2,0.234744,0.568298,0.002672,0.43559,2.16117
3,0.121986,0.645907,-0.001568,0.517071,3.217184
4,1.420635,0.999431,0.0013,0.220972,6.537796
5,0.003992,-0.005317,0.000185,-0.00143,3.136133
6,-0.048397,0.127866,0.00121,0.069718,0.987169
7,0.267236,0.118395,0.000508,-0.004405,3.833012
8,0.235421,0.406712,0.003452,0.101534,1.656188
9,0.162136,0.667995,0.000785,0.518607,2.78062
