# Drug-Target Interaction, Liu et al.

Original data repository: https://tdcommons.ai/multi_pred_tasks/dti/

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.multi_pred import DTI

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = DTI(name = 'BindingDB_Kd')

Downloading...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54.4M/54.4M [00:15<00:00, 3.41MiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 82856
drwxr-xr-x  3 cody  staff    96B Mar 18 19:00 [34mdata[m[m
-rw-r--r--  1 cody  staff    39M Mar 18 19:00 data_original.csv
-rw-r--r--  1 cody  staff    46K Mar 18 18:58 example_processing_and_templates.ipynb
-rw-r--r--@ 1 cody  staff   1.8K Mar 15 22:47 meta.yaml
-rw-r--r--@ 1 cody  staff   5.1K Mar 18 19:00 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

Drug_ID,Drug,Target_ID,Target,Y
444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.46
4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.49
4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.83

In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['Drug_ID', 'Drug', 'Target_ID', 'Target', 'Y']

In [10]:
fields_clean = [
        "compound_name",
        "SMILES",
        "target_name",
        "Target_aa",
        "binding",
    ]

In [11]:
df.columns = fields_clean

In [12]:
df.head()

Unnamed: 0,compound_name,SMILES,target_name,Target_aa,binding
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16


## Data cleaning

In [13]:
df.columns = fields_clean


In [14]:
assert not df.duplicated().sum()

## Save to csv

In [15]:
fn_data_csv = "data_clean.csv"

In [16]:
df.to_csv(fn_data_csv, index=False)

In [17]:
!ls -lh {fn_data_csv}

-rw-r--r--  1 cody  staff    39M Mar 18 19:00 data_clean.csv


In [18]:
!head -n 5 {fn_data_csv}

compound_name,SMILES,target_name,Target_aa,binding
444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.46
4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK,0.49
4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPA

In [19]:
df.head()

Unnamed: 0,compound_name,SMILES,target_name,Target_aa,binding
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16


## Load from csv

In [20]:
fn_data_csv = "data_clean.csv"

In [21]:
df = pd.read_csv(fn_data_csv)

In [22]:
df.head()

Unnamed: 0,compound_name,SMILES,target_name,Target_aa,binding
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16


# meta YAML

In [23]:
df.head()

Unnamed: 0,compound_name,SMILES,target_name,Target_aa,binding
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16


In [24]:
meta = {
    "name": "Drug-Target Interaction",  # unique identifier, we will also use this for directory names
    "description": """The activity of a small-molecule drug is measured by its binding affinity with the target protein.
    Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.
    Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.
    However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates
    Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.""",
    "targets": [
        {
            "id": "binding",  # name of the column in a tabular dataset
            "description": "small-molecule protein interaction.",  # description of what this column means
            "units": "Kd",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "uris" : ["	http://purl.obolibrary.org/obo/NCIT_C20604"],
            "names": [  # names for the property (to sample from for building the prompts)
                "Drug-Target Interaction"
                "small-molecule binding affinity",
                "small-molecule binding",
                "protein-ligand binding",
                "protein-ligand"
                "binding affinity",
                "binding",

            ],
        },
    ],
    "identifiers": [
        {
            "id": "SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "small-molecule",  # description (optional, except for "OTHER")
        },
        {
            "id": "Target",
            "type": "Other",
            "description": "Target amino acid sequence",

        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://tdcommons.ai/multi_pred_tasks/dti/",
            "description": "original data set link",
        },
        {
            "url": "https://doi.org/10.1093/nar/gkl999",
            "description": "corresponding publication",
        },
    ],
    "split_col": "split",
    "num_points": len(df),  # number of datapoints in this dataset
    "bibtex": [
        """@article{Liu2006bindingdb,
        title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities},
        author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. Gilson},
        journal={Journal of Chemical Information and Modeling},
        volume={35},
        number={4},
        pages={D198-D201},
        year={2006},
        publisher={Oxford Academic}
        }""",
    ],
}

In [25]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [26]:
fn_meta = "meta.yaml"

In [27]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [28]:
!ls -lh {fn_meta}

-rw-r--r--@ 1 cody  staff   1.8K Mar 18 19:00 meta.yaml


In [29]:
!cat {fn_meta}

name: Drug-Target Interaction
description: |-
  The activity of a small-molecule drug is measured by its binding affinity with the target protein.
      Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.
      Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.
      However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates
      Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.
targets:
- id: binding
  description: small-molecule protein interaction.
  units: Kd
  type: continuous
  uris:
  - "\thttp://purl.obolibrary.org/obo/NCIT_C20604"
  names:
  - Drug-Target Interactionsmall-molecule binding affinity
  - small-molecule binding
  - protein-ligand binding
  - 

# create transform.py

In [30]:
path_file = "transform.py"

In [33]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.multi_pred import DTI

def get_and_transform_data():
    # get raw data
    data = DTI(name="BindingDB_Kd")
    splits = data.get_split()
    df_train = splits["train"]
    df_valid = splits["valid"]
    df_test = splits["test"]
    df_train["split"] = "train"
    df_valid["split"] = "valid"
    df_test["split"] = "test"

    df = pd.concat([df_train, df_valid, df_test], axis=0)

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Target_ID",
        "Target",
        "Y",
        "split",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_name",
        "SMILES",
        "target_name",
        "Target_aa",
        "binding",
        "split"
    ]
    df.columns = fields_clean

    # data cleaning
    '''
    df.compound_name = (
        df.compound_name.str.strip()
    )  # remove leading and trailing white space characters
    '''
    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "Drug-Target Interaction",  # unique identifier, we will also use this for directory names
        "description": """The activity of a small-molecule drug is measured by its binding affinity with the target protein.
        Given a new target protein, the very first step is to screen a set of potential compounds to find their activity.
        Traditional method to gauge the affinities are through high-throughput screening wet-lab experiments.
        However, they are very expensive and are thus restricted by their abilities to search over a large set of candidates
        Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound structural information and protein amino acid sequence.""",
        "targets": [
            {
                "id": "binding",  # name of the column in a tabular dataset
                "description": "small-molecule protein interaction.",  # description of what this column means
                "units": "Kd",  # units of the values in this column (leave empty if unitless)
                "type": "regression",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "Drug-Target Interaction"
                    "small-molecule binding affinity",
                    "small-molecule binding",
                    "protein-ligand binding",
                    "protein-ligand"
                    "binding affinity",
                    "binding",

                ],
            },
        ],
        "identifiers": [
            {
                "id": "SMILES",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
                "description": "small-molecule",  # description (optional, except for "OTHER")
            },
            {
                "id": "Target",
                "type": "Other",
                "description": "Target amino acid sequence",
    
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://tdcommons.ai/multi_pred_tasks/dti/",
                "description": "original data set link",
            },
            {
                "url": "https://doi.org/10.1093/nar/gkl999",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "bibtex": [
            """@article{Liu2006bindingdb,
            title={BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities},
            author={Tiqing Liu, Yuhmei Lin, Xin Wen, Robert N. Jorissen, Micahel, K. Gilson},
            journal={Journal of Chemical Information and Modeling},
            volume={35},
            number={4},
            pages={D198-D201},
            year={2006},
            publisher={Oxford Academic}
            }""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """

        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [34]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing Drug-Target Interaction dataset!


In [70]:
ls -lh  # fmt: skip

total 163984
drwxr-xr-x  3 cody  staff    96B Mar 11 11:42 [34mdata[m[m/
-rw-r--r--  1 cody  staff    39M Mar 11 11:43 data_clean.csv
-rw-r--r--  1 cody  staff    39M Mar 11 11:43 data_original.csv
-rw-r--r--  1 cody  staff    36K Mar 11 11:42 example_processing_and_templates.ipynb
-rw-r--r--@ 1 cody  staff   1.8K Mar 11 11:43 meta.yaml
-rw-r--r--  1 cody  staff   1.7K Mar 11 11:15 meta_.yaml
-rw-r--r--@ 1 cody  staff   5.0K Mar 11 11:43 transform.py


# End