# hERG blockers

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#herg-blockers

# Imports

In [154]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [155]:
fn_data_original = "data_original.csv"

In [156]:
data = Tox(name = 'hERG')

Downloading...
100%|██████████████████████████████████████| 50.2k/50.2k [00:00<00:00, 354kiB/s]
Loading...
Done!


In [157]:
data.get_data().to_csv(fn_data_original, index=False)

In [158]:
!ls -lh

total 104K
drwxrwxr-x 2 melo melo 4.0K مار  1 22:50 data
-rw-rw-r-- 1 melo melo  47K مار  1 22:50 data_original.csv
-rw-rw-r-- 1 melo melo  40K مار  1 22:26 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.7K مار  1 22:25 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  1 22:25 transform.py


## Load original data

In [159]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2ccc(F)cc2)cc1,1.0
LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.O=P([O-])([O-])[O-],1.0


In [160]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [161]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
3,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
4,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [162]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [163]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [164]:
fields_clean = [
    "compound_name",
    "SMILES",
    "hERG_blocker",
]

In [165]:
df.columns = fields_clean

In [166]:
assert fields_orig != fields_clean

In [167]:
df.head()

Unnamed: 0,compound_name,SMILES,hERG_blocker
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
3,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
4,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0


## Data cleaning

In [168]:
df.compound_name = (
    df.compound_name.str.strip()
)  # remove leading and trailing white space characters

In [169]:
assert not df.duplicated().sum()

## Save to csv

In [170]:
fn_data_csv = "data_clean.csv"

In [171]:
df.to_csv(fn_data_csv, index=False)

In [172]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 47K مار  1 22:50 data_clean.csv


In [173]:
!head -n 5 {fn_data_csv}

compound_name,SMILES,hERG_blocker
DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2ccc(F)cc2)cc1,1.0
LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.O=P([O-])([O-])[O-],1.0


In [174]:
df.head()

Unnamed: 0,compound_name,SMILES,hERG_blocker
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
3,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
4,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0


## Load from csv

In [175]:
fn_data_csv = "data_clean.csv"

In [176]:
df = pd.read_csv(fn_data_csv)

In [177]:
df.head()

Unnamed: 0,compound_name,SMILES,hERG_blocker
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
3,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
4,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0


# meta YAML

In [178]:
df.head()

Unnamed: 0,compound_name,SMILES,hERG_blocker
0,DEMETHYLASTEMIZOLE,Oc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1,1.0
1,GBR-12909,Fc1ccc(C(OCC[NH+]2CC[NH+](CCCc3ccccc3)CC2)c2cc...,1.0
2,LY-97241,CCCCCCCN(CC)CCCCc1ccc([N+](=O)[O-])cc1,1.0
3,CLOFILIUM PHOSPHATE,CCCCCCC[N+](CC)(CC)CCCCc1ccc(Cl)cc1.CCCCCCC[N+...,1.0
4,FLUSPIRILENE,O=C1NCN(c2ccccc2)C12CC[NH+](CCCC(c1ccc(F)cc1)c...,1.0


In [179]:
meta = {
    "name": "hERG",  # unique identifier, we will also use this for directory names
    "description": """Human ether-à-go-go related gene (hERG) is crucial for the coordination of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe adverse effects. Therefore, reliable prediction of hERG liability in the early stages of drug design is quite important to reduce the risk of cardiotoxicity-related attritions in the later development stages.""",
    "targets": [
        {
            "id": "hERG_blocker",  # name of the column in a tabular dataset
            "description": "hERG active compound(blocker)",  # description of what this column means
            "units": "ld50",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "hERG activity",
                "hERG active compound",
                "hERG blocker",
                "Human ether-à-go-go related gene (hERG) blocker",
                "Activity against Human ether-à-go-go related gene (hERG)",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
        {
            "id": "compound_name",
            "type": "Other",
            "description": "compound name",
            "names": [
                "compound",
                "compound name",
                "drug",
            ],
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1021/acs.molpharmaceut.6b00471",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-blockers",
    "bibtex": [
        """@article{Wang2016,
  doi = {10.1021/acs.molpharmaceut.6b00471},
  url = {https://doi.org/10.1021/acs.molpharmaceut.6b00471},
  year = {2016},
  month = jul,
  publisher = {American Chemical Society ({ACS})},
  volume = {13},
  number = {8},
  pages = {2855--2866},
  author = {Shuangquan Wang and Huiyong Sun and Hui Liu and Dan Li and Youyong Li and Tingjun Hou},
  title = {{ADMET} Evaluation in Drug Discovery. 16. Predicting {hERG} Blockers by Combining Multiple Pharmacophores and Machine Learning Approaches},
  journal = {Molecular Pharmaceutics}}""",
    ],
}

In [180]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [181]:
fn_meta = "meta.yaml"

In [182]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [183]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.7K مار  1 22:50 meta.yaml


In [184]:
!cat {fn_meta}

name: hERG
description: "Human ether-\xE0-go-go related gene (hERG) is crucial for the coordination\
  \ of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe\
  \ adverse effects. Therefore, reliable prediction of hERG liability in the early\
  \ stages of drug design is quite important to reduce the risk of cardiotoxicity-related\
  \ attritions in the later development stages."
targets:
- id: hERG_blocker
  description: hERG active compound(blocker)
  units: ld50
  type: categorical
  names:
  - hERG activity
  - hERG active compound
  - hERG blocker
  - "Human ether-\xE0-go-go related gene (hERG) blocker"
  - "Activity against Human ether-\xE0-go-go related gene (hERG)"
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
- id: compound_name
  type: Other
  description: compound name
  names:
  - compound
  - compound name
  - drug
license: CC BY 4.0
links:
- url: https://doi.org/10.1021/acs.molpharmaceut.6b00471
 

# create transform.py

In [185]:
path_file = "transform.py"

In [186]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'hERG')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_name",
        "SMILES",
        "hERG_blocker",
    ]
    df.columns = fields_clean

    # data cleaning
    df.compound_name = (
        df.compound_name.str.strip()
    )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "hERG",  # unique identifier, we will also use this for directory names
        "description": """Human ether-à-go-go related gene (hERG) is crucial for the coordination of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe adverse effects. Therefore, reliable prediction of hERG liability in the early stages of drug design is quite important to reduce the risk of cardiotoxicity-related attritions in the later development stages.""",
        "targets": [
            {
                "id": "hERG_blocker",  # name of the column in a tabular dataset
                "description": "hERG active compound(blocker)",  # description of what this column means
                "units": "ld50",  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "hERG activity",
                    "hERG active compound",
                    "hERG blocker",
                    "Human ether-à-go-go related gene (hERG) blocker",
                    "Activity against Human ether-à-go-go related gene (hERG)",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "description": "SMILES",  # description (optional, except for "Other")
            },
            {
                "id": "compound_name",
                "type": "Other",
                "description": "compound name",
                "names": [
                    "compound",
                    "compound name",
                    "drug",
                ],
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1021/acs.molpharmaceut.6b00471",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-blockers",
        "bibtex": [
            """@article{Wang2016,
      doi = {10.1021/acs.molpharmaceut.6b00471},
      url = {https://doi.org/10.1021/acs.molpharmaceut.6b00471},
      year = {2016},
      month = jul,
      publisher = {American Chemical Society ({ACS})},
      volume = {13},
      number = {8},
      pages = {2855--2866},
      author = {Shuangquan Wang and Huiyong Sun and Hui Liu and Dan Li and Youyong Li and Tingjun Hou},
      title = {{ADMET} Evaluation in Drug Discovery. 16. Predicting {hERG} Blockers by Combining Multiple Pharmacophores and Machine Learning Approaches},
      journal = {Molecular Pharmaceutics}}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [187]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing hERG dataset!


In [188]:
ls -lh  # fmt: skip

total 152K
drwxrwxr-x 2 melo melo 4.0K مار  1 22:50 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo  47K مار  1 22:50 data_clean.csv
-rw-rw-r-- 1 melo melo  47K مار  1 22:50 data_original.csv
-rw-rw-r-- 1 melo melo  40K مار  1 22:26 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.7K مار  1 22:50 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  1 22:50 transform.py


# End