# Ames Mutagenicity

Original data repository: https://tdcommons.ai/single_pred_tasks/tox/#ames-mutagenicity

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.single_pred import Tox

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = Tox(name = 'AMES')

Downloading...
100%|████████████████████████████████████████| 344k/344k [00:00<00:00, 630kiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 364K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:49 data
-rw-rw-r-- 1 melo melo 308K مار  2 16:49 data_original.csv
-rw-rw-r-- 1 melo melo  38K مار  2 16:49 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.9K مار  2 06:19 meta.yaml
-rw-rw-r-- 1 melo melo 4.9K مار  2 06:18 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccccc5c1c2c34,1
Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4ccccc4c(=O)c3c3[nH]c4c(ccc5c(=O)c6ccccc6c(=O)c54)c3c12,0
Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1


In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [10]:
assert fields_orig == ['Drug_ID', 'Drug', 'Y']

In [11]:
fields_clean = [
    "compound_id",
    "SMILES",
    "mutagenic",
]

In [12]:
df.columns = fields_clean

In [13]:
assert fields_orig != fields_clean

In [14]:
df.head()

Unnamed: 0,compound_id,SMILES,mutagenic
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


## Data cleaning

In [15]:
df.compound_id = (
    df.compound_id.str.strip()
)  # remove leading and trailing white space characters

In [16]:
assert not df.duplicated().sum()

## Save to csv

In [17]:
fn_data_csv = "data_clean.csv"

In [18]:
df.to_csv(fn_data_csv, index=False)

In [19]:
!ls -lh {fn_data_csv}

-rw-rw-r-- 1 melo melo 308K مار  2 16:49 data_clean.csv


In [20]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,mutagenic
Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccccc5c1c2c34,1
Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4ccccc4c(=O)c3c3[nH]c4c(ccc5c(=O)c6ccccc6c(=O)c54)c3c12,0
Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1


In [21]:
df.head()

Unnamed: 0,compound_id,SMILES,mutagenic
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


## Load from csv

In [22]:
fn_data_csv = "data_clean.csv"

In [23]:
df = pd.read_csv(fn_data_csv)

In [24]:
df.head()

Unnamed: 0,compound_id,SMILES,mutagenic
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


# meta YAML

In [25]:
df.head()

Unnamed: 0,compound_id,SMILES,mutagenic
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


In [26]:
meta = {
    "name": "Ames_Mutagenicity",  # unique identifier, we will also use this for directory names
    "description": """Mutagenicity means the ability of a drug to induce genetic alterations. Drugs that can cause damage to the DNA can result in cell death or other severe adverse effects. Nowadays, the most widely used assay for testing the mutagenicity of compounds is the Ames experiment which was invented by a professor named Ames. The Ames test is a short-term bacterial reverse mutation assay detecting a large number of compounds which can induce genetic damage and frameshift mutations. The dataset is aggregated from four papers.""",
    "targets": [
        {
            "id": "mutagenic",  # name of the column in a tabular dataset
            "description": "whether it is mutagenic (1) or not mutagenic (0)",  # description of what this column means
            "units": "ld50",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "mutagenic",
                "Mutagenicity",
                "Ames Mutagenicity",
                "ability of a drug to induce genetic alterations",
                "mutagens"
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1021/ci300400a",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#ames-mutagenicity",
    "bibtex": [
        """@article{Xu2012,
          doi = {10.1021/ci300400a},
          url = {https://doi.org/10.1021/ci300400a},
          year = {2012},
          month = oct,
          publisher = {American Chemical Society ({ACS})},
          volume = {52},
          number = {11},
          pages = {2840--2847},
          author = {Congying Xu and Feixiong Cheng and Lei Chen and Zheng Du and Weihua Li and Guixia Liu and Philip W. Lee and Yun Tang},
          title = {In silico Prediction of Chemical Ames Mutagenicity},
          journal = {Journal of Chemical Information and Modeling}}""",
    ],
}

In [27]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [28]:
fn_meta = "meta.yaml"

In [29]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [30]:
!ls -lh {fn_meta}

-rw-rw-r-- 1 melo melo 1.7K مار  2 16:49 meta.yaml


In [31]:
!cat {fn_meta}

name: Ames_Mutagenicity
description: Mutagenicity means the ability of a drug to induce genetic alterations.
  Drugs that can cause damage to the DNA can result in cell death or other severe
  adverse effects. Nowadays, the most widely used assay for testing the mutagenicity
  of compounds is the Ames experiment which was invented by a professor named Ames.
  The Ames test is a short-term bacterial reverse mutation assay detecting a large
  number of compounds which can induce genetic damage and frameshift mutations. The
  dataset is aggregated from four papers.
targets:
- id: mutagenic
  description: whether it is mutagenic (1) or not mutagenic (0)
  units: ld50
  type: categorical
  names:
  - mutagenic
  - Mutagenicity
  - Ames Mutagenicity
  - ability of a drug to induce genetic alterations
  - mutagens
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY 4.0
links:
- url: https://doi.org/10.1021/ci300400a
  description: correspond

# create transform.py

In [32]:
path_file = "transform.py"

In [33]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'AMES')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "mutagenic",
    ]
    df.columns = fields_clean

    # data cleaning
    df.compound_id = (
        df.compound_id.str.strip()
    )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta =  {
        "name": "Ames_Mutagenicity",  # unique identifier, we will also use this for directory names
        "description": """Mutagenicity means the ability of a drug to induce genetic alterations. Drugs that can cause damage to the DNA can result in cell death or other severe adverse effects. Nowadays, the most widely used assay for testing the mutagenicity of compounds is the Ames experiment which was invented by a professor named Ames. The Ames test is a short-term bacterial reverse mutation assay detecting a large number of compounds which can induce genetic damage and frameshift mutations. The dataset is aggregated from four papers.""",
        "targets": [
            {
                "id": "mutagenic",  # name of the column in a tabular dataset
                "description": "whether it is mutagenic (1) or not mutagenic (0)",  # description of what this column means
                "units": "ld50",  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "mutagenic",
                    "Mutagenicity",
                    "Ames Mutagenicity",
                    "ability of a drug to induce genetic alterations",
                    "mutagens"
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "description": "SMILES",  # description (optional, except for "Other")
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://doi.org/10.1021/ci300400a",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://tdcommons.ai/single_pred_tasks/tox/#ames-mutagenicity",
        "bibtex": [
            """@article{Xu2012,
              doi = {10.1021/ci300400a},
              url = {https://doi.org/10.1021/ci300400a},
              year = {2012},
              month = oct,
              publisher = {American Chemical Society ({ACS})},
              volume = {52},
              number = {11},
              pages = {2840--2847},
              author = {Congying Xu and Feixiong Cheng and Lei Chen and Zheng Du and Weihua Li and Guixia Liu and Philip W. Lee and Yun Tang},
              title = {In silico Prediction of Chemical Ames Mutagenicity},
              journal = {Journal of Chemical Information and Modeling}}""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [34]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing Ames_Mutagenicity dataset!


In [35]:
ls -lh  # fmt: skip

total 672K
drwxrwxr-x 2 melo melo 4.0K مار  2 16:49 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo 308K مار  2 16:49 data_clean.csv
-rw-rw-r-- 1 melo melo 308K مار  2 16:49 data_original.csv
-rw-rw-r-- 1 melo melo  38K مار  2 16:49 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.7K مار  2 16:49 meta.yaml
-rw-rw-r-- 1 melo melo 4.6K مار  2 16:49 transform.py


# End