# PAMPA Permeability, NCATS

Original data repository: https://tdcommons.ai/single_pred_tasks/adme/#pampa-permeability-ncats

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.single_pred import ADME

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = ADME(name="PAMPA_NCATS")

Downloading...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 144k/144k [00:00<00:00, 582kiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [None]:
!ls -lh

## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
2466,CCCCOC1=CC=C(C=C1)CC(=O)NO,1
1259573,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
1275864,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3=CC=CO3,1
4878,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1


In [22]:
df = pd.read_csv(fn_data_original, delimiter=",")

## Rename columns (field names)
Replace the original column names (kept in `fields_orig`) with cleaned names (`fields_clean`).

In [23]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [24]:
fields_clean = [
    "compound_id",
    "SMILES",
    "permeability",
]

In [25]:
df.columns = fields_clean

In [26]:
df.head()

Unnamed: 0,compound_id,SMILES,permeability
0,2466,CCCCOC1=CC=C(C=C1)CC(=O)NO,1
1,1259573,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
2,1275864,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...,1
3,4878,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1
4,2030130,CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...,1


## Data cleaning

In [27]:
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so this cell can be re-executed without raising a KeyError.
df = df.drop(columns=["compound_id"], errors="ignore")

In [28]:
# The cleaned table must not contain any fully duplicated row.
assert not df.duplicated().any()

## Save to csv

In [29]:
fn_data_csv = "data_clean.csv"

In [30]:
df.to_csv(fn_data_csv, index=False)

In [None]:
!ls -lh {fn_data_csv}

In [32]:
!head -n 5 {fn_data_csv}

SMILES,permeability
CCCCOC1=CC=C(C=C1)CC(=O)NO,1
COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3=CC=CO3,1
CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1


In [33]:
df.head()

Unnamed: 0,SMILES,permeability
0,CCCCOC1=CC=C(C=C1)CC(=O)NO,1
1,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
2,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...,1
3,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1
4,CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...,1


## Load from csv

In [34]:
fn_data_csv = "data_clean.csv"

In [35]:
df = pd.read_csv(fn_data_csv)

In [36]:
df.head()

Unnamed: 0,SMILES,permeability
0,CCCCOC1=CC=C(C=C1)CC(=O)NO,1
1,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
2,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...,1
3,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1
4,CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...,1


# meta YAML

In [37]:
df.head()

Unnamed: 0,SMILES,permeability
0,CCCCOC1=CC=C(C=C1)CC(=O)NO,1
1,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,0
2,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...,1
3,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,1
4,CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...,1


In [38]:
# Dataset metadata that is serialized to meta.yaml further down.
# The description is assembled from adjacent string literals (one per line of
# text) so that the indentation of this cell never leaks into the stored value.
meta = {
    "name": "pampa_ncats",  # unique identifier, we will also use this for directory names
    "description": (
        "PAMPA (parallel artificial membrane permeability assay) is a commonly\n"
        "employed assay to evaluate drug permeability across the cellular membrane. PAMPA is a\n"
        "non-cell-based, low-cost and high-throughput alternative to cellular models. Although\n"
        "PAMPA does not model active and efflux transporters, it still provides permeability values\n"
        "that are useful for absorption prediction because the majority of drugs are absorbed by\n"
        "passive diffusion through the membrane."
    ),
    "targets": [
        {
            "id": "permeability",  # name of the column in a tabular dataset
            "description": "Binary permeability in PAMPA assay.",  # description of what this column means
            "units": "Bool",  # units of the values in this column (leave empty if unitless)
            "type": "categorical",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "binary permeability in PAMPA assay",
                "permeability in PAMPA assay",
                "PAMPA permeability",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://tdcommons.ai/single_pred_tasks/adme/#pampa-permeability-ncats",
            "description": "original dataset link",
        },
        {
            "url": "https://journals.sagepub.com/doi/full/10.1177/24725552211017520",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "bibtex": [
        """@article{siramshetty2021validating,
title={Validating ADME QSAR Models Using Marketed Drugs},
author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall,
Noel and Math'e, Ewy and Xu, Xin and Shah, Pranav},
journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery},
volume={26},
number={10},
pages={1326--1336},
year={2021},
publisher={SAGE Publications Sage CA: Los Angeles, CA}
}""",
    ],
}

In [39]:
fn_meta = "meta.yaml"

In [None]:
def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, using block style for multiline text.

    Strings containing at least one line break are emitted with the literal
    block style ("|"); all other strings use the dumper's default scalar style.
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    str_tag = "tag:yaml.org,2002:str"
    if "\n" not in data:
        # single-line text: no explicit style requested
        return dumper.represent_scalar(str_tag, data)
    # multiline text: request the literal block style
    return dumper.represent_scalar(str_tag, data, style="|")


# Register str_presenter as the representer for `str` values: first through
# PyYAML's module-level hook, then on SafeRepresenter as well.
yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dump

In [40]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [None]:
!ls -lh {fn_meta}

In [42]:
!cat {fn_meta}

name: pampa_ncats
description: "PAMPA (parallel artificial membrane permeability assay) is a commonly\
  \ employed assay to\n    evaluate drug permeability across the cellular membrane.\
  \ PAMPA is a non-cell-based, low-cost and high-throughput\n    alternative to cellular\
  \ models. Although PAMPA does not model active and efflux transporters, it still\
  \ provides\n    permeability values that are useful for absorption prediction because\
  \ the majority of drugs are absorbed by passive\n    diffusion through the membrane."
targets:
- id: permeability
  description: Binary permeability in PAMPA assay.
  units: Bool
  type: categorical
  names:
  - binary permeability in PAMPA assay
  - permeability in PAMPA assay
  - PAMPA permeability
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
license: CC BY 4.0
links:
- url: https://tdcommons.ai/single_pred_tasks/adme/#pampa-permeability-ncats
  description: tdcommons.ai dataset url
- url: https://journals.sagepub.com/doi

# create transform.py

In [1]:
path_file = "transform.py"

In [2]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import ADME


def get_and_transform_data():
    """Download the PAMPA_NCATS dataset, clean it and write the output files.

    Files written to the current working directory:
      * ``data_original.csv`` - the raw table returned by TDC
      * ``data_clean.csv`` - the ``SMILES`` and ``permeability`` columns only
      * ``meta.yaml`` - the dataset metadata

    Raises:
        AssertionError: if the raw columns are not ``Drug_ID``, ``Drug``, ``Y``
            or if the cleaned table contains duplicated rows.
    """
    # get raw data
    data = ADME(name="PAMPA_NCATS")
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    expected_fields = ["Drug_ID", "Drug", "Y"]
    assert (
        fields_orig == expected_fields
    ), f"Unexpected columns in raw data: {fields_orig}"

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "permeability",
    ]
    df.columns = fields_clean

    # data cleaning: the compound id is not part of the cleaned dataset
    df = df.drop(columns=["compound_id"])
    n_duplicates = int(df.duplicated().sum())
    assert not n_duplicates, f"Found {n_duplicates} duplicated rows"

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    # Multi-line texts are assembled from adjacent string literals (one per
    # line of text) so that the indentation of this function body never ends
    # up inside the values written to meta.yaml.
    meta = {
        "name": "pampa_ncats",  # unique identifier, we will also use this for directory names
        "description": (
            "PAMPA (parallel artificial membrane permeability assay) is a commonly\n"
            "employed assay to evaluate drug permeability across the cellular membrane. PAMPA is a\n"
            "non-cell-based, low-cost and high-throughput alternative to cellular models. Although\n"
            "PAMPA does not model active and efflux transporters, it still provides permeability values\n"
            "that are useful for absorption prediction because the majority of drugs are absorbed by\n"
            "passive diffusion through the membrane."
        ),
        "targets": [
            {
                "id": "permeability",  # name of the column in a tabular dataset
                "description": "Binary permeability in PAMPA assay.",  # description of what this column means
                "units": "Bool",  # units of the values in this column (leave empty if unitless)
                "type": "categorical",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "binary permeability in PAMPA assay",
                    "permeability in PAMPA assay",
                    "PAMPA permeability",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                "description": "SMILES",  # description (optional, except for "Other")
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://tdcommons.ai/single_pred_tasks/adme/#pampa-permeability-ncats",
                "description": "original dataset link",
            },
            {
                "url": "https://journals.sagepub.com/doi/full/10.1177/24725552211017520",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "bibtex": [
            (
                "@article{siramshetty2021validating,\n"
                "title={Validating ADME QSAR Models Using Marketed Drugs},\n"
                "author={Siramshetty, Vishal and Williams, Jordan and Nguyen, DHac-Trung and Neyra, Jorge and Southall,\n"
                "Noel and Math'e, Ewy and Xu, Xin and Shah, Pranav},\n"
                "journal={SLAS DISCOVERY: Advancing the Science of Drug Discovery},\n"
                "volume={26},\n"
                "number={10},\n"
                "pages={1326--1336},\n"
                "year={2021},\n"
                "publisher={SAGE Publications Sage CA: Los Angeles, CA}\n"
                "}"
            ),
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [None]:
!python3 transform.py

In [None]:
ls -lh  # fmt: skip

# End