# ClinTox

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#clintox

# Imports

In [2]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [3]:
fn_data_original = "data_original.csv"

In [4]:
data = Tox(name = 'ClinTox')

Downloading...
100%|████████████████████████████████████████| 110k/110k [00:00<00:00, 292kiB/s]
Loading...
Done!


In [5]:
data.get_data().to_csv(fn_data_original, index=False)

In [6]:
!ls -lh

total 160K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:52 data
-rw-rw-r-- 1 melo melo 102K مار  2 16:52 data_original.csv
-rw-rw-r-- 1 melo melo  37K مار  2 16:52 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.4K مار  2 07:55 meta.yaml
-rw-rw-r-- 1 melo melo 4.5K مار  2 07:53 transform.py


## Load original data

In [7]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl,0
Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-],0
Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH3+])cc2)C=C1,0


In [8]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [9]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
1,Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...,0
2,Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...,0
3,Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH...,0
4,Drug 4,[H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...,0


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [10]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [11]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [12]:
fields_clean = [
    "compound_id",
    "SMILES",
    "clinical_toxicity",
]

In [13]:
df.columns = fields_clean

In [14]:
assert fields_orig != fields_clean

In [15]:
df.head()

Unnamed: 0,compound_id,SMILES,clinical_toxicity
0,Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
1,Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...,0
2,Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...,0
3,Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH...,0
4,Drug 4,[H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...,0


In [16]:
assert not df.duplicated().sum()

## Save to csv

In [17]:
fn_data_csv = "data_clean.csv"

In [18]:
df.to_csv(fn_data_csv, index=False)

In [19]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 102K مار  2 16:52 data_clean.csv


In [20]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,clinical_toxicity
Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H]1Cl,0
Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-],0
Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH3+])cc2)C=C1,0


In [21]:
df.head()

Unnamed: 0,compound_id,SMILES,clinical_toxicity
0,Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
1,Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...,0
2,Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...,0
3,Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH...,0
4,Drug 4,[H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...,0


## Load from csv

In [22]:
fn_data_csv = "data_clean.csv"

In [23]:
df = pd.read_csv(fn_data_csv)

In [24]:
df.head()

Unnamed: 0,compound_id,SMILES,clinical_toxicity
0,Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
1,Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...,0
2,Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...,0
3,Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH...,0
4,Drug 4,[H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...,0


# meta YAML

In [25]:
df.head()

Unnamed: 0,compound_id,SMILES,clinical_toxicity
0,Drug 0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,0
1,Drug 1,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@@H](Cl)[C@H](Cl)...,0
2,Drug 2,O=C([O-])[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(...,0
3,Drug 3,[H]/[NH+]=C(\N)C1=CC(=O)/C(=C\C=c2ccc(=C(N)[NH...,0
4,Drug 4,[H]/[NH+]=C(\N)c1ccc(OCCCCCOc2ccc(/C(N)=[NH+]/...,0


In [26]:
meta = {
    "name": "ClinTox",  # unique identifier, we will also use this for directory names
    "description": """The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful trials.""",
    "targets": [
        {
            "id": "clinical_toxicity",  # name of the column in a tabular dataset
            "description": "whether it can cause clinical toxicity (1) or not (0).",  # description of what this column means
            "units": "clinical_toxicity",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "clinical toxicity",
                "toxicity",
                "drug Induced clinical toxicity",
                "drug failed in clinical trials"
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1016/j.chembiol.2016.07.023",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#clintox",
    "bibtex": [
        """@article{Gayvert2016,
          doi = {10.1016/j.chembiol.2016.07.023},
          url = {https://doi.org/10.1016/j.chembiol.2016.07.023},
          year = {2016},
          month = oct,
          publisher = {Elsevier {BV}},
          volume = {23},
          number = {10},
          pages = {1294--1301},
          author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento},
          title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials},
          journal = {Cell Chemical Biology}}""",
    ],
}

In [27]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [28]:
fn_meta = "meta.yaml"

In [29]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [30]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.3K مار  2 16:52 meta.yaml


In [31]:
!cat {fn_meta}

name: ClinTox
description: The ClinTox dataset includes drugs that have failed clinical trials for
  toxicity reasons and also drugs that are associated with successful trials.
targets:
- id: clinical_toxicity
  description: whether it can cause clinical toxicity (1) or not (0).
  units: clinical_toxicity
  type: categorical
  names:
  - clinical toxicity
  - toxicity
  - drug Induced clinical toxicity
  - drug failed in clinical trials
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY 4.0
links:
- url: https://doi.org/10.1016/j.chembiol.2016.07.023
  description: corresponding publication
num_points: 1478
url: https://tdcommons.ai/single_pred_tasks/tox/#clintox
bibtex:
- |-
  @article{Gayvert2016,
            doi = {10.1016/j.chembiol.2016.07.023},
            url = {https://doi.org/10.1016/j.chembiol.2016.07.023},
            year = {2016},
            month = oct,
            publisher = {Elsevier {BV}},
            volume =

# create transform.py

In [32]:
path_file = "transform.py"

In [33]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'ClinTox')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean =[
        "compound_id",
        "SMILES",
        "clinical_toxicity",
    ]
    df.columns = fields_clean

    # data cleaning
    df.compound_id = (
        df.compound_id.str.strip()
    )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta =  {"name": "ClinTox",  # unique identifier, we will also use this for directory names
        "description": """The ClinTox dataset includes drugs that have failed clinical trials for toxicity reasons and also drugs that are associated with successful trials.""",
        "targets": [
            {
                "id": "clinical_toxicity",  # name of the column in a tabular dataset
                "description": "whether it can cause clinical toxicity (1) or not (0).",  # description of what this column means
                "units": "clinical_toxicity",  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "clinical toxicity",
                    "toxicity",
                    "drug Induced clinical toxicity",
                    "drug failed in clinical trials"
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "description": "SMILES",  # description (optional, except for "Other")
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1016/j.chembiol.2016.07.023",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://tdcommons.ai/single_pred_tasks/tox/#clintox",
        "bibtex": [
            """@article{Gayvert2016,
              doi = {10.1016/j.chembiol.2016.07.023},
              url = {https://doi.org/10.1016/j.chembiol.2016.07.023},
              year = {2016},
              month = oct,
              publisher = {Elsevier {BV}},
              volume = {23},
              number = {10},
              pages = {1294--1301},
              author = {Kaitlyn~M. Gayvert and Neel~S. Madhukar and Olivier Elemento},
              title = {A Data-Driven Approach to Predicting Successes and Failures of Clinical Trials},
              journal = {Cell Chemical Biology}}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [34]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing ClinTox dataset!


In [35]:
ls -lh  # fmt: skip

total 264K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:52 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo 102K مار  2 16:52 data_clean.csv
-rw-rw-r-- 1 melo melo 102K مار  2 16:52 data_original.csv
-rw-rw-r-- 1 melo melo  37K مار  2 16:52 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.3K مار  2 16:52 meta.yaml
-rw-rw-r-- 1 melo melo 4.2K مار  2 16:52 transform.py


# End