# TCR-Epitope binding, Weber et al.

Original data repository: https://tdcommons.ai/multi_pred_tasks/tcrepitope/

# Imports

In [1]:
import pandas as pd
import yaml
from tdc.multi_pred import TCREpitopeBinding 

# Data processing

## Download data

In [2]:
fn_data_original = "data_original.csv"

In [3]:
data = TCREpitopeBinding(name = 'weber', path = './data')

Downloading...
100%|████████████████████████████████████████████████| 16.0M/16.0M [00:02<00:00, 5.98MiB/s]
Loading...
Done!


In [4]:
data.get_data().to_csv(fn_data_original, index=False)

In [5]:
!ls -lh

total 30672
drwxr-xr-x  3 cody  staff    96B Mar  4 10:09 [34mdata[m[m
-rw-r--r--  1 cody  staff    15M Mar  4 10:10 data_original.csv
-rw-r--r--  1 cody  staff    45K Mar  4 10:06 example_processing_and_templates.ipynb
-rw-r--r--@ 1 cody  staff   1.5K Mar  4 10:05 meta.yaml
-rw-r--r--@ 1 cody  staff   4.8K Mar  4 10:05 transform.py


## Load original data

In [6]:
!head -n 5 {fn_data_original}

epitope_aa,epitope_smi,tcr,tcr_full,label
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@

In [7]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [8]:
df.head()

Unnamed: 0,epitope_aa,epitope_smi,tcr,tcr_full,label
0,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
1,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
2,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
3,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
4,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CASSFDREVTGELFF,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,1


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [9]:
fields_orig = df.columns.tolist()
fields_orig

['epitope_aa', 'epitope_smi', 'tcr', 'tcr_full', 'label']

In [10]:
    fields_clean = [
        "epitope_aa",
        "epitope_smiles",
        "tcr",
        "tcr_full",
        "binding",
    ]

In [11]:
df.columns = fields_clean

In [12]:
df.head()

Unnamed: 0,epitope_aa,epitope_smiles,tcr,tcr_full,binding
0,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
1,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
2,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
3,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
4,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CASSFDREVTGELFF,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,1


## Data cleaning

In [13]:
df.epitope_aa = (
    df.epitope_aa.str.strip()
)  # remove leading and trailing white space characters

In [14]:
assert not df.duplicated().sum()

## Save to csv

In [15]:
fn_data_csv = "data_clean.csv"

In [16]:
df.to_csv(fn_data_csv, index=False)

In [17]:
!ls -lh {fn_data_csv}

-rw-r--r--  1 cody  staff    15M Mar  4 10:10 data_clean.csv


In [18]:
!head -n 5 {fn_data_csv}

epitope_aa,epitope_smiles,tcr,tcr_full,binding
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1
FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O

In [19]:
df.head()

Unnamed: 0,epitope_aa,epitope_smiles,tcr,tcr_full,binding
0,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
1,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
2,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
3,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
4,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CASSFDREVTGELFF,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,1


## Load from csv

In [20]:
fn_data_csv = "data_clean.csv"

In [21]:
df = pd.read_csv(fn_data_csv)

In [22]:
df.head()

Unnamed: 0,epitope_aa,epitope_smiles,tcr,tcr_full,binding
0,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
1,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
2,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
3,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
4,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CASSFDREVTGELFF,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,1


# meta YAML

In [23]:
df.head()

Unnamed: 0,epitope_aa,epitope_smiles,tcr,tcr_full,binding
0,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
1,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...,1
2,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
3,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...,1
4,FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...,CASSFDREVTGELFF,GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...,1


In [24]:
meta = {
    "name": "tcr_epitope_binding",  # unique identifier, we will also use this for directory names
    "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation
    and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).
    A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.
    This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""",
    "targets": [
        {
            "id": "binding",  # name of the column in a tabular dataset
            "description": "TCR epitope binding.",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "binary classification",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "tcr binding affinity",
                "binding affinity",
                "binding",

            ],
        },
    ],
    "identifiers": [
        {
            "id": "epitope_smiles",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
            "description": "epitope smiles ",  # description (optional, except for "OTHER")
        },
        {
            "id": "epitope_aa",
            "type": "amino acid",
            "description": "epitope amino acid sequence",

        },
        {
            "id": "tcr_aa",
            "type": "amino acid",
            "description": "tcr amino acid sequence",

        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://tdcommons.ai/multi_pred_tasks/tcrepitope/",
            "description": "original data set link",
        },
        {
            "url": "https://doi.org/10.1093/bioinformatics/btab294",
            "description": "corresponding publication",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "bibtex": [
        """@article{weber2021titan,
        title={TITAN: T-cell receptor specificity prediction with bimodal attention network},
        author={Weber Anna,Born Janis, Martinez Maria Rodriguez},
        journal={Bioinformatics},
        volume={56},
        number={4},
        pages={i237-i234},
        year={2021},
        publisher={Oxford Academic}
        }""",
    ],
}

In [25]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [26]:
fn_meta = "meta.yaml"

In [27]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [28]:
!ls -lh {fn_meta}

-rw-r--r--@ 1 cody  staff   1.5K Mar  4 10:10 meta.yaml


In [29]:
!cat {fn_meta}

name: tcr_epitope_binding
description: |-
  T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation
      and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).
      A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.
      This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.
targets:
- id: binding
  description: TCR epitope binding.
  units: ''
  type: binary classification
  names:
  - tcr binding affinity
  - binding affinity
  - binding
identifiers:
- id: epitope_smiles
  type: SMILES
  description: 'epitope smiles '
- id: epitope_aa
  type: amino acid
  description: epitope amino acid sequence
- id: tcr_aa
  type: amino acid
  description: tcr amino acid sequence
license: CC BY 4.0
links:
- url: https://tdcommons.ai/mu

# create transform.py

In [30]:
path_file = "transform.py"

In [36]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.multi_pred import TCREpitopeBinding 

def get_and_transform_data():
    # get raw data
    data = TCREpitopeBinding(name = 'weber', path = './data')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "epitope_aa",
        "epitope_smi",
        "tcr",
        "tcr_full",
        "label",
    ]

    # overwrite column names = fields
    fields_clean = [
        "epitope_aa",
        "epitope_smiles",
        "tcr",
        "tcr_full_aa",
        "binding",
    ]
    df.columns = fields_clean

    # data cleaning
    df.epitope_aa = (
        df.epitope_aa.str.strip()
    )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
        "name": "tcr_epitope_binding",  # unique identifier, we will also use this for directory names
        "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation
        and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).
        A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.
        This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""",
        "targets": [
            {
                "id": "binding",  # name of the column in a tabular dataset
                "description": "TCR epitope binding.",  # description of what this column means
                "units": "",  # units of the values in this column (leave empty if unitless)
                "type": "binary classification",  # can be "categorical", "ordinal", "continuous"
                "names": [  # names for the property (to sample from for building the prompts)
                    "tcr binding affinity",
                    "binding affinity",
                    "binding",

                ],
            },
        ],
        "identifiers": [
            {
                "id": "epitope_smiles",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
                "description": "epitope smiles",  # description (optional, except for "OTHER")
            },
            {
                "id": "epitope_aa",
                "type": "Other",
                "description": "epitope amino acid sequence",
    
            },
            {
                "id": "tcr_full_aa",
                "type": "Other",
                "description": "tcr amino acid sequence",
    
            },
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://tdcommons.ai/multi_pred_tasks/tcrepitope/",
                "description": "original data set link",
            },
            {
                "url": "https://doi.org/10.1093/bioinformatics/btab294",
                "description": "corresponding publication",
            },
        ],
        "num_points": len(df),  # number of datapoints in this dataset
        "bibtex": [
            """@article{weber2021titan,
            title={TITAN: T-cell receptor specificity prediction with bimodal attention network},
            author={Weber Anna,Born Janis, Martinez Maria Rodriguez},
            journal={Bioinformatics},
            volume={56},
            number={4},
            pages={i237-i234},
            year={2021},
            publisher={Oxford Academic}
            }""",
        ],
    }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """

        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [32]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing tcr_epitope_binding dataset!


In [33]:
ls -lh  # fmt: skip

total 61592
drwxr-xr-x  3 cody  staff    96B Mar  4 10:09 [34mdata[m[m/
-rw-r--r--  1 cody  staff    15M Mar  4 10:10 data_clean.csv
-rw-r--r--  1 cody  staff    15M Mar  4 10:10 data_original.csv
-rw-r--r--  1 cody  staff    45K Mar  4 10:06 example_processing_and_templates.ipynb
-rw-r--r--@ 1 cody  staff   1.5K Mar  4 10:10 meta.yaml
-rw-r--r--@ 1 cody  staff   4.8K Mar  4 10:10 transform.py


# End