# Acute_Toxicity_LD50

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = Tox(name = 'LD50_Zhu')

Downloading...
100%|████████████████████████████████████████| 707k/707k [00:00<00:00, 998kiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 720K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:58 data
-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_original.csv
-rw-rw-r-- 1 melo melo  39K مار  1 22:23 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.5K مار  1 22:23 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  1 22:23 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
"Methane, tribromo-",BrC(Br)Br,2.343
Bromoethene (9CI),C=CBr,2.33
"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465


In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
1,"Methane, tribromo-",BrC(Br)Br,2.343
2,Bromoethene (9CI),C=CBr,2.33
3,"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465
4,"Isothiocyanic acid, p-bromophenyl ester",S=C=Nc1ccc(Br)cc1,2.729


## Rename columns (field names)
Keep the original column names in `fields_orig` and replace them with the cleaned names in `fields_clean`.

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [10]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [11]:
fields_clean = [
    "compound_name",
    "SMILES",
    "acute_toxicity",
]

In [12]:
df.columns = fields_clean

In [13]:
assert fields_orig != fields_clean

In [14]:
df.head()

Unnamed: 0,compound_name,SMILES,acute_toxicity
0,"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
1,"Methane, tribromo-",BrC(Br)Br,2.343
2,Bromoethene (9CI),C=CBr,2.33
3,"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465
4,"Isothiocyanic acid, p-bromophenyl ester",S=C=Nc1ccc(Br)cc1,2.729


## Data cleaning

In [15]:
# Strip leading and trailing whitespace from every compound name.
df["compound_name"] = df["compound_name"].str.strip()

In [16]:
assert not df.duplicated().sum()

## Save to csv

In [17]:
fn_data_csv = "data_clean.csv"

In [18]:
df.to_csv(fn_data_csv, index=False)

In [19]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_clean.csv


In [20]:
!head -n 5 {fn_data_csv}

compound_name,SMILES,acute_toxicity
"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
"Methane, tribromo-",BrC(Br)Br,2.343
Bromoethene (9CI),C=CBr,2.33
"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465


In [21]:
df.head()

Unnamed: 0,compound_name,SMILES,acute_toxicity
0,"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
1,"Methane, tribromo-",BrC(Br)Br,2.343
2,Bromoethene (9CI),C=CBr,2.33
3,"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465
4,"Isothiocyanic acid, p-bromophenyl ester",S=C=Nc1ccc(Br)cc1,2.729


## Load from csv

In [22]:
fn_data_csv = "data_clean.csv"

In [23]:
df = pd.read_csv(fn_data_csv)

In [24]:
df.head()

Unnamed: 0,compound_name,SMILES,acute_toxicity
0,"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
1,"Methane, tribromo-",BrC(Br)Br,2.343
2,Bromoethene (9CI),C=CBr,2.33
3,"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465
4,"Isothiocyanic acid, p-bromophenyl ester",S=C=Nc1ccc(Br)cc1,2.729


# meta YAML

In [25]:
df.head()

Unnamed: 0,compound_name,SMILES,acute_toxicity
0,"Diazene, diphenyl-, 1-oxide",[O-][N+](=Nc1ccccc1)c1ccccc1,2.505
1,"Methane, tribromo-",BrC(Br)Br,2.343
2,Bromoethene (9CI),C=CBr,2.33
3,"1,1'-Biphenyl, hexabromo-",Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465
4,"Isothiocyanic acid, p-bromophenyl ester",S=C=Nc1ccc(Br)cc1,2.729


In [26]:
# Dataset metadata that is written to meta.yaml later in this notebook.
# The dump uses sort_keys=False, so the key order below is the order in the file.
meta = {
    "name": "ld50_zhu",  # unique identifier, we will also use this for directory names
    # NOTE(review): for an LD50 a lower lethal dose normally means a more toxic
    # compound — confirm the wording of the second sentence against the source.
    "description": """Acute toxicity LD50 measures the most conservative dose that can lead to lethal adverse effects. The higher the dose, the more lethal of a drug.""",
    "targets": [
        {
            "id": "acute_toxicity",  # name of the column in a tabular dataset
            "description": "Acute Toxicity LD50.",  # description of what this column means
            # NOTE(review): "ld50" names the endpoint rather than a physical unit;
            # the upstream values look like log(1/(mol/kg)) — confirm and update.
            "units": "ld50",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "Acute Toxicity LD50",
                "ld50",
                "conservative dose that can lead to lethal adverse effects.",
                "Rat Acute Toxicity by Oral Exposure",
                "Toxicity",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
        {
            "id": "compound_name",  # column name
            # NOTE(review): "Synonyms" is not among the types listed in the comment
            # on the SMILES identifier above — confirm that it is an accepted value.
            "type": "Synonyms",
            "description": "compound name",
            "names": [  # alternative names for this identifier
                "compound",
                "compound name",
                "drug",
            ],
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1021/tx900189p",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset (rows of df)
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50",
    # The BibTeX entry is a multi-line string; its line breaks and indentation are
    # part of the value that ends up in meta.yaml.
    "bibtex": [
        """@article{Zhu2009,
      doi = {10.1021/tx900189p},
      url = {https://doi.org/10.1021/tx900189p},
      year = {2009},
      month = oct,
      publisher = {American Chemical Society ({ACS})},
      volume = {22},
      number = {12},
      pages = {1913--1921},
      author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander Sedykh and Douglas M. Young and Alexander Tropsha},
      title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure},
      journal = {Chemical Research in Toxicology}
}""",
    ],
}

In [27]:
def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, using block style for multiline text.

    Strings that contain at least one newline are emitted with the literal
    block style ("|"); all other strings use the dumper's default style.

    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    str_tag = "tag:yaml.org,2002:str"
    if "\n" not in data:
        # Single-line string: let the dumper pick the scalar style.
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style="|")


# Use str_presenter for every str value dumped through the yaml module ...
yaml.add_representer(str, str_presenter)
# ... and register it on the safe representer too, for use with safe_dump.
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

In [28]:
fn_meta = "meta.yaml"

In [29]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [30]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.4K مار  2 16:58 meta.yaml


In [31]:
!cat {fn_meta}

name: ld50_zhu
description: Acute toxicity LD50 measures the most conservative dose that can lead
  to lethal adverse effects. The higher the dose, the more lethal of a drug.
targets:
- id: acute_toxicity
  description: Acute Toxicity LD50.
  units: ld50
  type: continuous
  names:
  - Acute Toxicity LD50
  - ld50
  - conservative dose that can lead to lethal adverse effects.
  - Rat Acute Toxicity by Oral Exposure
  - Toxicity
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
- id: compound_name
  type: Synonyms
  description: compound name
  names:
  - compound
  - compound name
  - drug
license: CC BY 4.0
links:
- url: https://doi.org/10.1021/tx900189p
  description: corresponding publication
num_points: 7385
url: https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50
bibtex:
- |-
  @article{Zhu2009,
        doi = {10.1021/tx900189p},
        url = {https://doi.org/10.1021/tx900189p},
        year = {2009},
        month = o

# Create transform.py

In [32]:
path_file = "transform.py"

In [33]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    """Fetch the LD50_Zhu dataset, clean it, and write the CSV and YAML outputs.

    Writes three files into the current working directory:
    ``data_original.csv`` (the table as returned by ``Tox``),
    ``data_clean.csv`` (renamed columns, stripped compound names) and
    ``meta.yaml`` (dataset metadata). Prints a completion message at the end.

    Raises:
        AssertionError: if the loaded columns are not ``Drug_ID``, ``Drug``,
            ``Y``, or if the renamed table contains duplicated rows.
    """
    raw_csv = "data_original.csv"
    clean_csv = "data_clean.csv"
    meta_yaml = "meta.yaml"

    # Fetch the raw table and store it unchanged.
    Tox(name="LD50_Zhu").get_data().to_csv(raw_csv, index=False)

    # Read the file back: not strictly needed, but it proves the saved CSV loads.
    df = pd.read_csv(raw_csv, delimiter=",")

    # Guard against upstream schema changes before renaming anything.
    assert list(df.columns) == ["Drug_ID", "Drug", "Y"]

    # Replace the original field names with the cleaned ones.
    df.columns = ["compound_name", "SMILES", "acute_toxicity"]

    # Cleaning: drop leading/trailing whitespace in names, then require unique rows.
    df["compound_name"] = df["compound_name"].str.strip()
    assert not df.duplicated().any()

    df.to_csv(clean_csv, index=False)

    # Metadata for meta.yaml; key order is preserved by sort_keys=False below.
    meta = {
        "name": "ld50_zhu",  # unique identifier, also used for directory names
        "description": "Acute toxicity LD50 measures the most conservative dose that can lead to lethal adverse effects. The higher the dose, the more lethal of a drug.",
        "targets": [
            {
                "id": "acute_toxicity",  # column name in the tabular dataset
                "description": "Acute Toxicity LD50.",  # meaning of the column
                "units": "ld50",  # units of the column values (empty if unitless)
                "type": "continuous",  # "categorical", "ordinal" or "continuous"
                "names": [  # property names to sample from when building prompts
                    "Acute Toxicity LD50",
                    "ld50",
                    "conservative dose that can lead to lethal adverse effects.",
                    "Rat Acute Toxicity by Oral Exposure",
                    "Toxicity",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # "SMILES", "SELFIES", "IUPAC" or "Other"
                "description": "SMILES",  # optional, except for "Other"
            },
            {
                "id": "compound_name",
                "type": "Synonyms",
                "description": "compound name",
                "names": [
                    "compound",
                    "compound name",
                    "drug",
                ],
            },
        ],
        "license": "CC BY 4.0",  # license of the original dataset
        "links": [  # relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1021/tx900189p",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of rows in the cleaned table
        "url": "https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50",
        # The continuation lines of the BibTeX string keep their exact leading
        # whitespace: it is part of the value written to meta.yaml.
        "bibtex": [
            """@article{Zhu2009,
              doi = {10.1021/tx900189p},
              url = {https://doi.org/10.1021/tx900189p},
              year = {2009},
              month = oct,
              publisher = {American Chemical Society ({ACS})},
              volume = {22},
              number = {12},
              pages = {1913--1921},
              author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander Sedykh and Douglas M. Young and Alexander Tropsha},
              title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure},
              journal = {Chemical Research in Toxicology}}""",
        ],
    }

    def str_presenter(dumper, data):
        """Emit multiline strings in YAML block style, other strings as default.

        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        str_tag = "tag:yaml.org,2002:str"
        if "\n" not in data:
            return dumper.represent_scalar(str_tag, data)
        return dumper.represent_scalar(str_tag, data, style="|")

    # Register the presenter for yaml.dump and, for use with safe_dump, on the
    # safe representer as well.
    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

    with open(meta_yaml, "w") as fh:
        yaml.dump(meta, fh, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


# Run the whole download/clean/export pipeline only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [34]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing ld50_zhu dataset!


In [35]:
ls -lh  # fmt: skip

total 1.4M
drwxrwxr-x 2 melo melo 4.0K مار  2 16:58 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_clean.csv
-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_original.csv
-rw-rw-r-- 1 melo melo  39K مار  1 22:23 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.5K مار  2 16:58 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  2 16:58 transform.py


# End