# FreeSolv dataset preparation

Original data repository: https://github.com/MobleyLab/FreeSolv

# Imports

In [46]:
import pandas as pd
import requests
import yaml

# Data processing

## Download data

In [47]:
data_path = "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt"

In [48]:
fn_data_original = "data_original.txt"

In [49]:
data = requests.get(data_path)
with open(fn_data_original, "wb") as f:
    f.write(data.content)

In [None]:
!ls -lh

## Load original data

In [4]:
!head -n 5 {fn_data_original}

#Hydration free energy datbase v0.52, 6/11/17.
#Semicolon-delimited text file with fields in the following format:
# compound id (and file prefix); SMILES; iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem); experimental value (kcal/mol); experimental uncertainty (kcal/mol); Mobley group calculated value (GAFF) (kcal/mol); calculated uncertainty (kcal/mol); experimental reference (original or paper this value was taken from); calculated reference; text notes.
mobley_1017962; CCCCCC(=O)OC; methyl hexanoate; -2.49; 0.60; -3.30; 0.03; 10.1021/ct050097l; 10.1021/acs.jced.7b00104; Experimental uncertainty not presently available, so assigned a default value.  
mobley_1019269; CCCCO; butan-1-ol; -4.72; 0.60; -3.23; 0.03; 10.1021/ct050097l; 10.1021/acs.jced.7b00104; Experimental uncertainty not presently available, so assigned a default value.  


In [5]:
df = pd.read_csv(fn_data_original, delimiter=";", skiprows=2)

In [6]:
df.head()

Unnamed: 0,# compound id (and file prefix),SMILES,iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem),experimental value (kcal/mol),experimental uncertainty (kcal/mol),Mobley group calculated value (GAFF) (kcal/mol),calculated uncertainty (kcal/mol),experimental reference (original or paper this value was taken from),calculated reference,text notes.
0,mobley_1017962,CCCCCC(=O)OC,methyl hexanoate,-2.49,0.6,-3.3,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
1,mobley_1019269,CCCCO,butan-1-ol,-4.72,0.6,-3.23,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
2,mobley_1034539,c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...",-3.04,0.1,-1.08,0.04,10.1007/s10822-012-9568-8,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
3,mobley_1036761,C1CCC(CC1)N,cyclohexanamine,-4.59,0.6,-3.95,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
4,mobley_1046331,c1ccc(cc1)OC=O,phenyl formate,-3.82,0.6,-5.44,0.03,"J. Peter Guthrie, unpublished data, as provid...",10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [7]:
fields_orig = df.columns.tolist()
fields_orig

['# compound id (and file prefix)',
 ' SMILES',
 ' iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem)',
 ' experimental value (kcal/mol)',
 ' experimental uncertainty (kcal/mol)',
 ' Mobley group calculated value (GAFF) (kcal/mol)',
 ' calculated uncertainty (kcal/mol)',
 ' experimental reference (original or paper this value was taken from)',
 ' calculated reference',
 ' text notes.']

In [8]:
fields_clean = [
    "compound_id",
    "SMILES",
    "iupac_name",
    "exp_value",
    "exp_uncertainty",
    "GAFF",
    "calc_uncertainty",
    "exp_ref",
    "calc_reference",
    "notes",
]

In [9]:
df.columns = fields_clean

In [10]:
df.head()

Unnamed: 0,compound_id,SMILES,iupac_name,exp_value,exp_uncertainty,GAFF,calc_uncertainty,exp_ref,calc_reference,notes
0,mobley_1017962,CCCCCC(=O)OC,methyl hexanoate,-2.49,0.6,-3.3,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
1,mobley_1019269,CCCCO,butan-1-ol,-4.72,0.6,-3.23,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
2,mobley_1034539,c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...",-3.04,0.1,-1.08,0.04,10.1007/s10822-012-9568-8,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
3,mobley_1036761,C1CCC(CC1)N,cyclohexanamine,-4.59,0.6,-3.95,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...
4,mobley_1046331,c1ccc(cc1)OC=O,phenyl formate,-3.82,0.6,-5.44,0.03,"J. Peter Guthrie, unpublished data, as provid...",10.1021/acs.jced.7b00104,Experimental uncertainty not presently availa...


## Data cleaning

In [11]:
df.notes = df.notes.str.strip()  # remove leading and trailing white space characters

## Save to csv

In [14]:
fn_data_csv = "data_clean.csv"

In [15]:
df.to_csv(fn_data_csv, index=False)

In [None]:
!ls -lh {fn_data_csv}

In [17]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,iupac_name,exp_value,exp_uncertainty,GAFF,calc_uncertainty,exp_ref,calc_reference,notes
mobley_1017962, CCCCCC(=O)OC, methyl hexanoate,-2.49,0.6,-3.3,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,"Experimental uncertainty not presently available, so assigned a default value."
mobley_1019269, CCCCO, butan-1-ol,-4.72,0.6,-3.23,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,"Experimental uncertainty not presently available, so assigned a default value."
mobley_1034539, c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl," 1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)benzene",-3.04,0.1,-1.08,0.04, 10.1007/s10822-012-9568-8, 10.1021/acs.jced.7b00104,"Experimental uncertainty not presently available, so assigned a default value."
mobley_1036761, C1CCC(CC1)N, cyclohexanamine,-4.59,0.6,-3.95,0.03, 10.1021/ct050097l, 10.1021/acs.jced.7b00104,"Experimental uncertainty not presently available, so assigned a default value."


In [18]:
df.head()

Unnamed: 0,compound_id,SMILES,iupac_name,exp_value,exp_uncertainty,GAFF,calc_uncertainty,exp_ref,calc_reference,notes
0,mobley_1017962,CCCCCC(=O)OC,methyl hexanoate,-2.49,0.6,-3.3,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
1,mobley_1019269,CCCCO,butan-1-ol,-4.72,0.6,-3.23,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
2,mobley_1034539,c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...",-3.04,0.1,-1.08,0.04,10.1007/s10822-012-9568-8,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
3,mobley_1036761,C1CCC(CC1)N,cyclohexanamine,-4.59,0.6,-3.95,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
4,mobley_1046331,c1ccc(cc1)OC=O,phenyl formate,-3.82,0.6,-5.44,0.03,"J. Peter Guthrie, unpublished data, as provid...",10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...


## Load from csv

In [59]:
fn_data_csv = "data_clean.csv"

In [60]:
df = pd.read_csv(fn_data_csv)

In [61]:
df.head()

Unnamed: 0,compound_id,SMILES,iupac_name,exp_value,exp_uncertainty,GAFF,calc_uncertainty,exp_ref,calc_reference,notes
0,mobley_1017962,CCCCCC(=O)OC,methyl hexanoate,-2.49,0.6,-3.3,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
1,mobley_1019269,CCCCO,butan-1-ol,-4.72,0.6,-3.23,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
2,mobley_1034539,c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)ben...",-3.04,0.1,-1.08,0.04,10.1007/s10822-012-9568-8,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
3,mobley_1036761,C1CCC(CC1)N,cyclohexanamine,-4.59,0.6,-3.95,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
4,mobley_1046331,c1ccc(cc1)OC=O,phenyl formate,-3.82,0.6,-5.44,0.03,"J. Peter Guthrie, unpublished data, as provid...",10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...


# meta YAML

In [98]:
meta = {
    "name": "freesolv",  # unique identifier, we will also use this for directory names
    "description": "Experimental and calculated small molecule hydration free energies",
    "targets": [
        {
            "id": "exp_value",  # name of the column in a tabular dataset
            "description": "experimental hydration free energy value",  # description of what this column means
            "units": "kcal/mol",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  #  names for the property (to sample from for building the prompts)
                "hydration free energy",
            ],
        },
        {
            "id": "exp_uncertainty",
            "description": "experimental hydration free energy uncertainty",
            "units": "kcal/mol",
            "type": "continuos",
            "names": [
                "hydration free energy uncertainty",
            ],
        },
        {
            "id": "GAFF",  # name of the column in a tabular dataset
            "description": "mobley group calculated value",  # description of what this column means
            "units": "kcal/mol",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "GAFF",
                "mobley group calculated value",
            ],
        },
        {
            "id": "calc_uncertainty",
            "description": "mobley group calculated value calculated uncertainty",
            "units": "kcal/mol",
            "type": "continuos",
            "names": [
                "GAFF uncertainty",
                "mobley group calculated value uncertainty",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "SMILES",  # description (optional, except for "OTHER")
        },
        {
            "id": "iupac_name",
            "type": "IUPAC",
            "description": "IUPAC",
        },
    ],
    "license": "CC BY-NC-SA 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        "https://github.com/MobleyLab/FreeSolv",
        "https://escholarship.org/uc/item/6sd403pz",
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://github.com/MobleyLab/FreeSolv",
    "bibtex": [
        """@article{mobley2013experimental,
        title={Experimental and calculated small molecule hydration free energies},
        author={Mobley, David L},
        year={2013}""",
    ],
}

In [41]:
fn_meta = "meta.yaml"

In [42]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [None]:
!ls -lh {fn_meta}

In [62]:
!cat {fn_meta}

bibtex: "@article{mobley2013experimental,\n    title={Experimental and calculated\
  \ small molecule hydration free energies},\n    author={Mobley, David L},\n    year={2013}\n\
  \    "
description: Experimental and calculated small molecule hydration free energies
identifiers:
- description: SMILES
  id: SMILES
  type: SMILES
- description: IUPAC
  id: iupac_name
  type: IUPAC
license: CC BY-NC-SA 4.0
links:
- https://github.com/MobleyLab/FreeSolv
- https://escholarship.org/uc/item/6sd403pz
name: freesolv
num_points: 642
targets:
- description: experimental hydration free energy value
  id: exp_value
  names:
  - hydration free energy
  type: continuous
  units: kcal/mol
- description: experimental hydration free energy uncertainty
  id: exp_uncertainty
  names:
  - hydration free energy uncertainty
  type: continuos
  units: kcal/mol
- description: mobley group calculated value
  id: GAFF
  names:
  - GAFF
  - mobley group calculated value
  type: continuous
  units: kcal/mol
- des

# create transform.py

In [51]:
path_file = "transform.py"

In [101]:
%%writefile $path_file
import pandas as pd
import requests
import yaml


def get_and_transform_data():
    # get raw data
    data_path = (
        "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt"
    )
    fn_data_original = "data_original.txt"
    data = requests.get(data_path)
    with open(fn_data_original, "wb") as f:
        f.write(data.content)

    # create dataframe
    df = pd.read_csv(fn_data_original, delimiter=";", skiprows=2)

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "# compound id (and file prefix)",
        " SMILES",
        " iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem)",
        " experimental value (kcal/mol)",
        " experimental uncertainty (kcal/mol)",
        " Mobley group calculated value (GAFF) (kcal/mol)",
        " calculated uncertainty (kcal/mol)",
        " experimental reference (original or paper this value was taken from)",
        " calculated reference",
        " text notes.",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "iupac_name",
        "exp_value",
        "exp_uncertainty",
        "GAFF",
        "calc_uncertainty",
        "exp_ref",
        "calc_reference",
        "notes",
    ]
    df.columns = fields_clean

    # data cleaning
    df.notes = (
        df.notes.str.strip()  # remove leading and trailing white space characters
    )

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "freesolv",  # unique identifier, we will also use this for directory names
        "description": "Experimental and calculated small molecule hydration free energies",
        "targets": [
            {
                "id": "exp_value",  # name of the column in a tabular dataset
                "description": "experimental hydration free energy value",  # description of what this column means
                "units": "kcal/mol",  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  #  names for the property (to sample from for building the prompts)
                    "hydration free energy",
                ],
            },
            {
                "id": "exp_uncertainty",
                "description": "experimental hydration free energy uncertainty",
                "units": "kcal/mol",
                "type": "continuos",
                "names": [
                    "hydration free energy uncertainty",
                ],
            },
            {
                "id": "GAFF",  # name of the column in a tabular dataset
                "description": "mobley group calculated value",  # description of what this column means
                "units": "kcal/mol",  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "GAFF",
                    "mobley group calculated value",
                ],
            },
            {
                "id": "calc_uncertainty",
                "description": "mobley group calculated value calculated uncertainty",
                "units": "kcal/mol",
                "type": "continuos",
                "names": [
                    "GAFF uncertainty",
                    "mobley group calculated value uncertainty",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
                "description": "SMILES",  # description (optional, except for "OTHER")
            },
            {
                "id": "iupac_name",
                "type": "IUPAC",
                "description": "IUPAC",
            },
        ],
        "license": "CC BY-NC-SA 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            "https://github.com/MobleyLab/FreeSolv",
            "https://escholarship.org/uc/item/6sd403pz",
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://github.com/MobleyLab/FreeSolv",
        "bibtex": [
            """@article{mobley2013experimental,
            title={Experimental and calculated small molecule hydration free energies},
            author={Mobley, David L},
            year={2013}""",
        ],
    }
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [102]:
!python3 transform.py

Finished processing freesolv dataset!


In [None]:
ls -lh  # fmt: skip

# End