# Lipophilicity dataset preparation

Original data repository: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv

# Imports

In [1]:
import requests
import pandas as pd
import yaml

# Data processing

## Download data

In [2]:
data_path = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"

In [3]:
fn_data_original = "data_original.csv"

In [4]:
# Fetch the raw CSV. Fail loudly on an HTTP error status instead of silently
# writing the server's error page to disk as if it were the dataset.
data = requests.get(data_path)
data.raise_for_status()
with open(fn_data_original, "wb") as f:
    f.write(data.content)

In [None]:
!ls -lh

## Load original data

In [6]:
!head -n 5 {fn_data_original}

CMPD_CHEMBLID,exp,smiles
CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23
CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3


In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [9]:
df.columns.tolist()

['CMPD_CHEMBLID', 'exp', 'smiles']

In [11]:
assert df.columns.tolist() == ['CMPD_CHEMBLID', 'exp', 'smiles']

In [12]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


## Set column names (fields)
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [13]:
fields_orig = df.columns.tolist()
fields_orig

['CMPD_CHEMBLID', 'exp', 'smiles']

In [14]:
# Cleaned column names, listed in the same order as the original columns
# because they are assigned positionally. Only `smiles` changes (to `SMILES`).
fields_clean = [
    "CMPD_CHEMBLID",
    "exp",
    "SMILES",
]

In [15]:
df.columns = fields_clean

In [16]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


## Data cleaning

In [17]:
df.duplicated().sum()

0

In [18]:
# Guard: the table must not contain any fully duplicated rows.
assert not df.duplicated().any()

## Save to csv

In [19]:
fn_data_csv = "data_clean.csv"

In [20]:
df.to_csv(fn_data_csv, index=False)

In [None]:
!ls -lh {fn_data_csv}

In [22]:
!head -n 5 {fn_data_csv}

CMPD_CHEMBLID,exp,SMILES
CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23
CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3


In [23]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


## Load from csv

In [24]:
fn_data_csv = "data_clean.csv"

In [25]:
df = pd.read_csv(fn_data_csv)

In [26]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


# meta YAML

In [56]:
# Metadata describing the cleaned dataset (targets, identifiers, license, links, citation).
meta = {
    "name": "lipophilicity",  # unique identifier, we will also use this for directory names
    "description": "Experimental results of octanol/water distribution coefficient (logD at pH 7.4).",
    "targets": [
        {
            "id": "exp",  # name of the column in a tabular dataset
            "description": "experimental results of octanol/water distribution coefficient (logD at pH 7.4)",  # description of what this column means
            "units": "logD",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "octanol/water distribution coefficient (logD at pH 7.4)",
                "octanol/water distribution coefficient",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "SMILES",  # description (optional, except for "OTHER")
        },
    ],
    "license": "CC BY-SA 3.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
        "https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml",
        "https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html",
        "https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/",
        "https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing",
        "https://creativecommons.org/licenses/by-sa/3.0/",
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
    # The backslash is doubled so the literal yields a single backslash before the
    # underscore without relying on an invalid escape sequence.
    "bibtex": ["""@techreport{hersey2015chembl,
    title={ChEMBL Deposited Data Set-AZ\\_dataset},
    author={Hersey, Anne},
    year={2015},
    institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk~…}}"""],
}

In [57]:
fn_meta = "meta.yaml"

In [58]:
# Serialize the metadata to YAML; sort_keys=False keeps the keys in the order
# they were written in `meta` instead of sorting them alphabetically.
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [None]:
!ls -lh {fn_meta}

In [59]:
!cat {fn_meta}

name: lipophilicity
description: Experimental results of octanol/water distribution coefficient (logD
  at pH 7.4).
targets:
- id: exp
  description: experimental results of octanol/water distribution coefficient (logD
    at pH 7.4)
  units: logD
  type: continuous
  names:
  - octanol/water distribution coefficient (logD at pH 7.4)
  - octanol/water distribution coefficient
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY-SA 3.0
links:
- https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv
- https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml
- https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html
- https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/
- https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing
- https://creativecommons.org/licenses/by-sa/3.0/
num_points: 4200
url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.

# Create transform.py

In [33]:
path_file = "transform.py"

In [71]:
%%writefile $path_file
import pandas as pd
import requests
import yaml


def get_and_transform_data():
    """Download the Lipophilicity CSV, validate it, and write the dataset files.

    Files written to the current working directory:
      * ``data_original.txt``: the raw download, written byte-for-byte.
      * ``data_clean.csv``: the same table with the ``smiles`` column renamed
        to ``SMILES`` (no index column).
      * ``meta.yaml``: metadata describing the dataset.

    Raises:
        requests.HTTPError: if the download responds with an HTTP error status.
        AssertionError: if the downloaded columns differ from the expected
            ones, or if the table contains fully duplicated rows.
    """
    # get raw data
    data_path = (
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
    )
    fn_data_original = "data_original.txt"
    response = requests.get(data_path)
    # Stop here on a 4xx/5xx response rather than saving an error page as data.
    response.raise_for_status()
    with open(fn_data_original, "wb") as f:
        f.write(response.content)

    # create dataframe
    df = pd.read_csv(fn_data_original, delimiter=",")

    # check that the upstream file still has the expected fields
    expected_fields = ["CMPD_CHEMBLID", "exp", "smiles"]
    assert (
        df.columns.tolist() == expected_fields
    ), f"unexpected columns: {df.columns.tolist()}"

    # check that there are no duplicated rows
    assert not df.duplicated().any(), "dataset contains duplicated rows"

    # overwrite column names = fields (positional, same order as expected_fields)
    fields_clean = [
        "CMPD_CHEMBLID",
        "exp",
        "SMILES",
    ]
    df.columns = fields_clean

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "lipophilicity",  # unique identifier, we will also use this for directory names
        "description": "Experimental results of octanol/water distribution coefficient (logD at pH 7.4).",
        "targets": [
            {
                "id": "exp",  # name of the column in a tabular dataset
                "description": "experimental results of octanol/water distribution coefficient (logD at pH 7.4)",  # description of what this column means
                "units": "logD",  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "octanol/water distribution coefficient (logD at pH 7.4)",
                    "octanol/water distribution coefficient",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
                "description": "SMILES",  # description (optional, except for "OTHER")
            },
        ],
        "license": "CC BY-SA 3.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
            "https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml",
            "https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html",
            "https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/",
            "https://chembl.gitbook.io/chembl-interface-documentation/about#data-licensing",
            "https://creativecommons.org/licenses/by-sa/3.0/",
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
        # The backslash is doubled so the literal yields a single backslash before
        # the underscore without relying on an invalid escape sequence.
        "bibtex": [
            """@techreport{hersey2015chembl,
            title={ChEMBL Deposited Data Set-AZ\\_dataset},
            author={Hersey, Anne},
            year={2015},
            institution={Technical Report, Technical report, EMBL-EBI, 2015. https://www. ebi. ac. uk~…}}"""
        ],
    }
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        # sort_keys=False keeps the keys in the order written above
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == '__main__':
    get_and_transform_data()

Overwriting transform.py


In [69]:
!python3 transform.py

Finished processing lipophilicity dataset!


In [None]:
ls -lh

# End