# Caco-2 (Cell Effective Permeability), Wang et al.

Original data repository: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al

# Imports

In [11]:
from tdc.single_pred import ADME
import pandas as pd
import yaml

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = ADME(name = 'Caco2_Wang')

Downloading...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82.5k/82.5k [00:00<00:00, 374kiB/s]
Loading...
Done!


In [7]:
data.get_data().to_csv(fn_data_original, index=False)

In [None]:
!ls -lh

## Load original data

In [9]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Y
(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.2199998
"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -ynamide",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.8599999
codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2)N(C)CC[C@]314,-4.0900002
creatinine,CN1CC(=O)NC1=N,-5.935409099999998


In [20]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [21]:
df.head()

Unnamed: 0,Drug_ID,Drug,Y
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,creatinine,CN1CC(=O)NC1=N,-5.935409
4,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84


## Rename columns
Replace the original column names (kept in `fields_orig`) with cleaned names (`fields_clean`).

In [22]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Y']

In [23]:
fields_clean = [
    "compound_id",
    "SMILES",
    "permeability",
]

In [24]:
df.columns = fields_clean

In [25]:
df.head()

Unnamed: 0,compound_id,SMILES,permeability
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,creatinine,CN1CC(=O)NC1=N,-5.935409
4,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84


## Data cleaning

In [26]:
# Trim leading/trailing whitespace from the compound names.
df["compound_id"] = df["compound_id"].str.strip()

In [29]:
# Guard: the table must not contain fully duplicated rows.
assert df.duplicated().sum() == 0

## Save to csv

In [30]:
fn_data_csv = "data_clean.csv"

In [31]:
df.to_csv(fn_data_csv, index=False)

In [None]:
!ls -lh {fn_data_csv}

In [33]:
!head -n 5 {fn_data_csv}

compound_id,SMILES,permeability
(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.2199998
"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -ynamide",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.8599999
codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2)N(C)CC[C@]314,-4.0900002
creatinine,CN1CC(=O)NC1=N,-5.935409099999998


In [34]:
df.head()

Unnamed: 0,compound_id,SMILES,permeability
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,creatinine,CN1CC(=O)NC1=N,-5.935409
4,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84


## Load from csv

In [35]:
fn_data_csv = "data_clean.csv"

In [36]:
df = pd.read_csv(fn_data_csv)

In [37]:
df.head()

Unnamed: 0,compound_id,SMILES,permeability
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,creatinine,CN1CC(=O)NC1=N,-5.935409
4,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84


# meta YAML

In [38]:
df.head()

Unnamed: 0,compound_id,SMILES,permeability
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,creatinine,CN1CC(=O)NC1=N,-5.935409
4,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84


In [57]:
# Dataset metadata that is later serialised to meta.yaml.
# Keyword order below is the key order of the resulting dict.
meta = dict(
    # unique identifier, also used for directory names
    name="caco2_wang",
    description="The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model to simulate the human intestinal tissue. The experimental result on the rate of drug passing through the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.",
    targets=[
        dict(
            id="permeability",  # column name in the tabular dataset
            description="Caco-2 cell effective permeability.",  # meaning of the column
            units="?",  # units of the column values; "?" is presumably a placeholder - TODO confirm
            type="continuous",  # "categorical", "ordinal" or "continuous"
            # property names to sample from when building prompts
            names=[
                "Caco-2 cell effective permeability",
                "Caco-2 cell permeability",
                "Caco-2 permeability",
                "permeability",
            ],
        ),
    ],
    identifiers=[
        dict(
            id="SMILES",  # column name
            type="SMILES",  # "SMILES", "SELFIES", "IUPAC" or "Other"
            description="SMILES",  # optional, except for "Other"
        ),
        dict(
            id="compound_id",
            type="Other",
            description="Compound id / name",
            names=[
                "compound id",
                "compound name",
            ],
        ),
    ],
    license="CC BY 4.0",  # license of the original dataset
    # relevant links (original dataset, other uses, etc.)
    links=[
        "https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al",
        "https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642",
    ],
    num_points=len(df),  # number of datapoints in this dataset
    url="https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al",
    bibtex=[
        """@article{wang2016adme,
title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability using a combination of NSGA-II and boosting},
author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng},
journal={Journal of Chemical Information and Modeling},
volume={56},
number={4},
pages={763--773},
year={2016},
publisher={ACS Publications}
}""",
    ],
)

In [40]:
fn_meta = "meta.yaml"

In [41]:
# Serialise the metadata to YAML, keeping the insertion order of the keys.
with open(fn_meta, mode="w") as meta_file:
    yaml.dump(meta, stream=meta_file, sort_keys=False)

In [None]:
!ls -lh {fn_meta}

In [52]:
!cat {fn_meta}

name: caco2_wang
description: The human colon epithelial cancer cell line, Caco-2, is used as an in
  vitro model to simulate the human intestinal tissue. The experimental result on
  the rate of drug passing through the Caco-2 cells can approximate the rate at which
  the drug permeates through the human intestinal tissue.
targets:
- id: permeability
  description: Caco-2 cell effective permeability.
  units: '?'
  type: continuous
  names:
  - Caco-2 cell effective permeability
  - Caco-2 cell permeability
  - Caco-2 permeability
  - permeability
identifiers:
- id: SMILES
  type: SMILES
  description: SMILES
- id: compound_id
  type: OTHER
  description: Compound id / name
license: CC BY 4.0
links:
- https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al
- https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642
num_points: 910
url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al
bibtex:
- "@article{wang2016adme,

# create transform.py

In [45]:
path_file = "transform.py"

In [58]:
%%writefile $path_file
from tdc.single_pred import ADME
import pandas as pd
import yaml


def get_and_transform_data():
    """Download the Caco2_Wang ADME dataset, clean it and write the dataset files.

    Files written to the current working directory:
      * ``data_original.csv`` - the raw table returned by ``ADME(...).get_data()``
      * ``data_clean.csv``    - same rows with renamed columns and stripped compound ids
      * ``meta.yaml``         - dataset metadata

    Returns:
        None. A completion message is printed at the end.

    Raises:
        AssertionError: if the raw column names are not ``Drug_ID, Drug, Y`` or
            if the cleaned table contains fully duplicated rows.
    """
    # get raw data
    fn_data_original = "data_original.csv"
    data = ADME(name="Caco2_Wang")
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    # (re-reading is not strictly necessary but ensures the saved file can be loaded)
    df = pd.read_csv(fn_data_original, delimiter=",")

    # check if fields are the same
    fields_expected = ["Drug_ID", "Drug", "Y"]
    fields_orig = df.columns.tolist()
    assert (
        fields_orig == fields_expected
    ), f"Unexpected columns {fields_orig}, expected {fields_expected}"

    # overwrite column names = fields
    fields_clean = [
        "compound_id",
        "SMILES",
        "permeability",
    ]
    df.columns = fields_clean

    # data cleaning: remove leading and trailing white space characters
    df["compound_id"] = df["compound_id"].str.strip()

    n_duplicates = int(df.duplicated().sum())
    assert not n_duplicates, f"Found {n_duplicates} duplicated rows"

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # The BibTeX entry is assembled from explicit lines so that the indentation
    # of this function does not leak into the string written to meta.yaml.
    bibtex_wang2016 = "\n".join(
        [
            "@article{wang2016adme,",
            "title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability using a combination of NSGA-II and boosting},",
            "author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng},",
            "journal={Journal of Chemical Information and Modeling},",
            "volume={56},",
            "number={4},",
            "pages={763--773},",
            "year={2016},",
            "publisher={ACS Publications}",
            "}",
        ]
    )

    # create meta yaml
    meta = {
        "name": "caco2_wang",  # unique identifier, we will also use this for directory names
        "description": "The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model to simulate the human intestinal tissue. The experimental result on the rate of drug passing through the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.",
        "targets": [
            {
                "id": "permeability",  # name of the column in a tabular dataset
                "description": "Caco-2 cell effective permeability.",  # description of what this column means
                "units": "?",  # units of the values in this column (leave empty if unitless)
                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "Caco-2 cell effective permeability",
                    "Caco-2 cell permeability",
                    "Caco-2 permeability",
                    "permeability",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # kind of identifier stored in this column
                "description": "SMILES",  # free-text description of the identifier
            },
            {
                "id": "compound_id",
                "type": "Other",  # free-form identifier (not a structural notation)
                "description": "Compound id / name",
                "names": [
                    "compound id",
                    "compound name",
                ],
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            "https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al",
            "https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642",
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "url": "https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al",
        "bibtex": [
            bibtex_wang2016,
        ],
    }
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        # sort_keys=False keeps the key order defined above
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [None]:
# Run the script written by the %%writefile cell to check that it executes end to end.
!python3 {path_file}

In [None]:
ls -lh  # fmt: skip

# End