# hERG Karim et al.

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#herg-karim-et-al

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = Tox(name = 'hERG_Karim')

Downloading...
100%|████████████████████████████████████████| 885k/885k [00:01<00:00, 766kiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 896K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:55 data
-rw-rw-r-- 1 melo melo 839K مار  2 16:55 data_original.csv
-rw-rw-r-- 1 melo melo  38K مار  2 16:55 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.7K مار  2 06:06 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  2 06:06 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1CCCC1,0
2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=O)c4ccc(C(F)(F)F)cc34)C2)CC1,0
3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c(n2)NC(=O)CO3)CC1,0


In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [10]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [11]:
fields_clean = [
    "compound_id",
    "SMILES",
    "hERG_blocker",
]

In [12]:
df.columns = fields_clean

In [13]:
assert fields_orig != fields_clean

In [14]:
df.head()

Unnamed: 0,compound_id,SMILES,hERG_blocker
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0


## Data cleaning

In [15]:
assert not df.duplicated().sum()

## Save to csv

In [16]:
fn_data_csv = "data_clean.csv"

In [17]:
df.to_csv(fn_data_csv, index=False)

In [18]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 839K مار  2 16:55 data_clean.csv


In [19]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,hERG_blocker
0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1CCCC1,0
2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=O)c4ccc(C(F)(F)F)cc34)C2)CC1,0
3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c(n2)NC(=O)CO3)CC1,0


In [20]:
df.head()

Unnamed: 0,compound_id,SMILES,hERG_blocker
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0


## Load from csv

In [21]:
fn_data_csv = "data_clean.csv"

In [22]:
df = pd.read_csv(fn_data_csv)

In [23]:
df.head()

Unnamed: 0,compound_id,SMILES,hERG_blocker
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0


# meta YAML

In [24]:
df.head()

Unnamed: 0,compound_id,SMILES,hERG_blocker
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0


In [25]:
meta = {
    "name": "hERG_Karim_et_al",  # unique identifier, we will also use this for directory names
    "description": """A integrated Ether-a-go-go-related gene (hERG) dataset consisting of molecular structures labelled as hERG (<10uM) and non-hERG (>=10uM) blockers in the form of SMILES strings was obtained from the DeepHIT, the BindingDB database, ChEMBL bioactivity database, and other literature.""",
    "targets": [
        {
            "id": "hERG_blocker",  # name of the column in a tabular dataset
            "description": "whether it blocks (1, <10uM) or not blocks (0, >=10uM)",  # description of what this column means
            "units": "activity",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "hERG blocker",
                "hERG active compound",
                "hERG blocker",
                "hERG active compound <10uM",
                "Human ether-à-go-go related gene (hERG) blocker",
                "Activity against Human ether-à-go-go related gene (hERG)",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1186/s13321-021-00541-z",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-karim-et-al",
    "bibtex": [
        """@article{Karim2021,
      doi = {10.1186/s13321-021-00541-z},
      url = {https://doi.org/10.1186/s13321-021-00541-z},
      year = {2021},
      month = aug,
      publisher = {Springer Science and Business Media {LLC}},
      volume = {13},
      number = {1},
      author = {Abdul Karim and Matthew Lee and Thomas Balle and Abdul Sattar},
      title = {{CardioTox} net: a robust predictor for {hERG} channel blockade based on deep learning meta-feature ensembles},
      journal = {Journal of Cheminformatics}}""",
    ],
}

In [26]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [27]:
fn_meta = "meta.yaml"

In [28]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [29]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.5K مار  2 16:55 meta.yaml


In [30]:
!cat {fn_meta}

name: hERG_Karim_et_al
description: A integrated Ether-a-go-go-related gene (hERG) dataset consisting of
  molecular structures labelled as hERG (<10uM) and non-hERG (>=10uM) blockers in
  the form of SMILES strings was obtained from the DeepHIT, the BindingDB database,
  ChEMBL bioactivity database, and other literature.
targets:
- id: hERG_blocker
  description: whether it blocks (1, <10uM) or not blocks (0, >=10uM)
  units: activity
  type: categorical
  names:
  - hERG blocker
  - hERG active compound
  - hERG blocker
  - hERG active compound <10uM
  - "Human ether-\xE0-go-go related gene (hERG) blocker"
  - "Activity against Human ether-\xE0-go-go related gene (hERG)"
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY 4.0
links:
- url: https://doi.org/10.1186/s13321-021-00541-z
  description: corresponding publication
num_points: 13445
url: https://tdcommons.ai/single_pred_tasks/tox/#herg-karim-et-al
bibtex:
- |-
  @article{K

# create transform.py

In [31]:
path_file = "transform.py"

In [32]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'hERG_Karim')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "hERG_blocker",
    ]
    df.columns = fields_clean

    # data cleaning
#     df.compound_name = (
#         df.compound_name.str.strip()
#     )  
    # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "hERG_Karim_et_al",  # unique identifier, we will also use this for directory names
        "description": """A integrated Ether-a-go-go-related gene (hERG) dataset consisting of molecular structures labelled as hERG (<10uM) and non-hERG (>=10uM) blockers in the form of SMILES strings was obtained from the DeepHIT, the BindingDB database, ChEMBL bioactivity database, and other literature.""",
        "targets": [
            {
                "id": "hERG_blocker",  # name of the column in a tabular dataset
                "description": "whether it blocks (1, <10uM) or not blocks (0, >=10uM)",  # description of what this column means
                "units": "activity",  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "hERG blocker",
                    "hERG active compound",
                    "hERG blocker",
                    "hERG active compound <10uM",
                    "Human ether-à-go-go related gene (hERG) blocker",
                    "Activity against Human ether-à-go-go related gene (hERG)",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "description": "SMILES",  # description (optional, except for "Other")
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1186/s13321-021-00541-z",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://tdcommons.ai/single_pred_tasks/tox/#herg-karim-et-al",
        "bibtex": [
            """@article{Karim2021,
          doi = {10.1186/s13321-021-00541-z},
          url = {https://doi.org/10.1186/s13321-021-00541-z},
          year = {2021},
          month = aug,
          publisher = {Springer Science and Business Media {LLC}},
          volume = {13},
          number = {1},
          author = {Abdul Karim and Matthew Lee and Thomas Balle and Abdul Sattar},
          title = {{CardioTox} net: a robust predictor for {hERG} channel blockade based on deep learning meta-feature ensembles},
          journal = {Journal of Cheminformatics}}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [33]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing hERG_Karim_et_al dataset!


In [34]:
ls -lh  # fmt: skip

total 1.7M
drwxrwxr-x 2 melo melo 4.0K مار  2 16:55 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo 839K مار  2 16:55 data_clean.csv
-rw-rw-r-- 1 melo melo 839K مار  2 16:55 data_original.csv
-rw-rw-r-- 1 melo melo  38K مار  2 16:55 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.6K مار  2 16:55 meta.yaml
-rw-rw-r-- 1 melo melo 4.5K مار  2 16:55 transform.py


# End