# DILI (Drug Induced Liver Injury)

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#dili-drug-induced-liver-injury

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = Tox(name = 'DILI')

Downloading...
100%|██████████████████████████████████████| 26.7k/26.7k [00:00<00:00, 198kiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 84K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:53 data
-rw-rw-r-- 1 melo melo  26K مار  2 16:53 data_original.csv
-rw-rw-r-- 1 melo melo  37K مار  2 16:53 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.7K مار  2 06:31 meta.yaml
-rw-rw-r-- 1 melo melo 5.0K مار  2 06:30 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
187.0,CC(=O)OCC[N+](C)(C)C,0.0
247.0,C[N+](C)(C)CC(=O)[O-],0.0
298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
338.0,O=C(O)c1ccccc1O,0.0


In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,187.0,CC(=O)OCC[N+](C)(C)C,0.0
1,247.0,C[N+](C)(C)CC(=O)[O-],0.0
2,298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
3,338.0,O=C(O)c1ccccc1O,0.0
4,444.0,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0.0


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [10]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [11]:
fields_clean = [
    "compound_id",
    "SMILES",
    "liver_injury",
]

In [12]:
df.columns = fields_clean

In [13]:
assert fields_orig != fields_clean

In [14]:
df.head()

Unnamed: 0,compound_id,SMILES,liver_injury
0,187.0,CC(=O)OCC[N+](C)(C)C,0.0
1,247.0,C[N+](C)(C)CC(=O)[O-],0.0
2,298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
3,338.0,O=C(O)c1ccccc1O,0.0
4,444.0,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0.0


In [15]:
assert not df.duplicated().sum()

## Save to csv

In [16]:
fn_data_csv = "data_clean.csv"

In [17]:
df.to_csv(fn_data_csv, index=False)

In [18]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 26K مار  2 16:53 data_clean.csv


In [19]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,liver_injury
187.0,CC(=O)OCC[N+](C)(C)C,0.0
247.0,C[N+](C)(C)CC(=O)[O-],0.0
298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
338.0,O=C(O)c1ccccc1O,0.0


In [20]:
df.head()

Unnamed: 0,compound_id,SMILES,liver_injury
0,187.0,CC(=O)OCC[N+](C)(C)C,0.0
1,247.0,C[N+](C)(C)CC(=O)[O-],0.0
2,298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
3,338.0,O=C(O)c1ccccc1O,0.0
4,444.0,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0.0


## Load from csv

In [21]:
fn_data_csv = "data_clean.csv"

In [22]:
df = pd.read_csv(fn_data_csv)

In [23]:
df.head()

Unnamed: 0,compound_id,SMILES,liver_injury
0,187.0,CC(=O)OCC[N+](C)(C)C,0.0
1,247.0,C[N+](C)(C)CC(=O)[O-],0.0
2,298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
3,338.0,O=C(O)c1ccccc1O,0.0
4,444.0,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0.0


# meta YAML

In [24]:
df.head()

Unnamed: 0,compound_id,SMILES,liver_injury
0,187.0,CC(=O)OCC[N+](C)(C)C,0.0
1,247.0,C[N+](C)(C)CC(=O)[O-],0.0
2,298.0,O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl,0.0
3,338.0,O=C(O)c1ccccc1O,0.0
4,444.0,CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1,0.0


In [25]:
meta = {
    "name": "Drug_Induced_Liver_Injury",  # unique identifier, we will also use this for directory names
    "description": """Drug-induced liver injury (DILI) is fatal liver disease caused by drugs and it has been the single most frequent cause of safety-related drug marketing withdrawals for the past 50 years (e.g. iproniazid, ticrynafen, benoxaprofen). This dataset is aggregated from U.S. FDA’s National Center for Toxicological Research.""",
    "targets": [
        {
            "id": "mutagenic",  # name of the column in a tabular dataset
            "description": "whether it can cause liver injury (1) or not (0).",  # description of what this column means
            "units": "liver_injury",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "DILI",
                "liver injury",
                "Drug Induced Liver Injury",
                "fatal liver disease caused by drugs",
                "liver toxicity"
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1021/acs.jcim.5b00238",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#dili-drug-induced-liver-injury",
    "bibtex": [
        """@article{Xu2015,
          doi = {10.1021/acs.jcim.5b00238},
          url = {https://doi.org/10.1021/acs.jcim.5b00238},
          year = {2015},
          month = oct,
          publisher = {American Chemical Society ({ACS})},
          volume = {55},
          number = {10},
          pages = {2085--2093},
          author = {Youjun Xu and Ziwei Dai and Fangjin Chen and Shuaishi Gao and Jianfeng Pei and Luhua Lai},
          title = {Deep Learning for Drug-Induced Liver Injury},
          journal = {Journal of Chemical Information and Modeling}}""",
    ],
}

In [26]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [27]:
fn_meta = "meta.yaml"

In [28]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [29]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.5K مار  2 16:53 meta.yaml


In [30]:
!cat {fn_meta}

name: Drug_Induced_Liver_Injury
description: "Drug-induced liver injury (DILI) is fatal liver disease caused by drugs\
  \ and it has been the single most frequent cause of safety-related drug marketing\
  \ withdrawals for the past 50 years (e.g. iproniazid, ticrynafen, benoxaprofen).\
  \ This dataset is aggregated from U.S. FDA\u2019s National Center for Toxicological\
  \ Research."
targets:
- id: mutagenic
  description: whether it can cause liver injury (1) or not (0).
  units: liver_injury
  type: categorical
  names:
  - DILI
  - liver injury
  - Drug Induced Liver Injury
  - fatal liver disease caused by drugs
  - liver toxicity
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY 4.0
links:
- url: https://doi.org/10.1021/acs.jcim.5b00238
  description: corresponding publication
num_points: 475
url: https://tdcommons.ai/single_pred_tasks/tox/#dili-drug-induced-liver-injury
bibtex:
- |-
  @article{Xu2015,
            doi = 

# create transform.py

In [31]:
path_file = "transform.py"

In [32]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'DILI')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "liver_injury",
    ]
    df.columns = fields_clean

    # data cleaning
#     df.compound_id = (
#         df.compound_id.str.strip()
#     )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta =  {
            "name": "Drug_Induced_Liver_Injury",  # unique identifier, we will also use this for directory names
            "description": """Drug-induced liver injury (DILI) is fatal liver disease caused by drugs and it has been the single most frequent cause of safety-related drug marketing withdrawals for the past 50 years (e.g. iproniazid, ticrynafen, benoxaprofen). This dataset is aggregated from U.S. FDA’s National Center for Toxicological Research.""",
            "targets": [
                {
                    "id": "mutagenic",  # name of the column in a tabular dataset
                    "description": "whether it can cause liver injury (1) or not (0).",  # description of what this column means
                    "units": "liver_injury",  # units of the values in this column (leave empty if unitless)
                    "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                    "names": [  # names for the property (to sample from for building the prompts)
                        "DILI",
                        "liver injury",
                        "Drug Induced Liver Injury",
                        "fatal liver disease caused by drugs",
                        "liver toxicity"
                    ],
                },
            ],
            "identifiers": [
                {
                    "id": "SMILES",  # column name
                    "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                    "description": "SMILES",  # description (optional, except for "Other")
                },
            ],
            "license": "CC BY 4.0",  # license under which the original dataset was published
            "links": [  # list of relevant links (original dataset, other uses, etc.)
                {
                    "url": "https://doi.org/10.1021/acs.jcim.5b00238",
                    "description": "corresponding publication",
                },
            ],
            "num_points": len(df),  # number of datapoints in this dataset
            "url": "https://tdcommons.ai/single_pred_tasks/tox/#dili-drug-induced-liver-injury",
            "bibtex": [
                """@article{Xu2015,
                  doi = {10.1021/acs.jcim.5b00238},
                  url = {https://doi.org/10.1021/acs.jcim.5b00238},
                  year = {2015},
                  month = oct,
                  publisher = {American Chemical Society ({ACS})},
                  volume = {55},
                  number = {10},
                  pages = {2085--2093},
                  author = {Youjun Xu and Ziwei Dai and Fangjin Chen and Shuaishi Gao and Jianfeng Pei and Luhua Lai},
                  title = {Deep Learning for Drug-Induced Liver Injury},
                  journal = {Journal of Chemical Information and Modeling}}""",
            ],
        }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [33]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing Drug_Induced_Liver_Injury dataset!


In [34]:
ls -lh  # fmt: skip

total 112K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:53 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo  26K مار  2 16:53 data_clean.csv
-rw-rw-r-- 1 melo melo  26K مار  2 16:53 data_original.csv
-rw-rw-r-- 1 melo melo  37K مار  2 16:53 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.6K مار  2 16:53 meta.yaml
-rw-rw-r-- 1 melo melo 4.7K مار  2 16:53 transform.py


# End