# Preprocessing of TCR databases

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np 
import seaborn as sb
import matplotlib.pyplot as plt

import os
import sys
sys.path.append('../tcr_benchmark')

from utils.config import col_va, col_ja, col_cdr3a, col_vb, col_jb, col_cdr3b, col_epitope, col_mhc
from utils.config import required_cols

sb.set_style('whitegrid')

## Helpers

In [2]:
# IEDB does not save leading C and ending F or W in the sequence. We will therefore remove all of them to have it uniform.
def trim_tcr(cdr3, is_beta):
    if type(cdr3) != str:
        return cdr3
    idx_start = 0
    if cdr3[0] == 'C':
        idx_start = 1

    idx_end = -1
    if cdr3[-1] == 'F':
        idx_end = -2
    if cdr3[-1] == 'W' and not is_beta:
        idx_end = -2
    return cdr3[idx_start: idx_end]

## Benchmark Data
- Minervina
- Dorigatti
- Francis

In [3]:
path_base = '../tcr_benchmark/data'

In [4]:
df_minervina = pd.read_csv(f'{path_base}/minervina.csv', index_col=0)
df_minervina = df_minervina.drop_duplicates()
df_minervina[col_cdr3b] = df_minervina[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_minervina[col_cdr3a] = df_minervina[col_cdr3a].apply(lambda x: trim_tcr(x, False))
df_minervina.head()

Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,AAPGSARQL,TRAV29/DV5,TRAJ22,ASSFENQPEA,TRBV5-1,TRBJ1-1,DTDFVNEFY,HLA-A*01:01
1,AVDIRAGNML,TRAV39,TRAJ39,ASSLAGYEQ,TRBV5-1,TRBJ2-7,LTDEMIAQY,HLA-A*01:01
2,AMSVGNNNDM,TRAV12-3,TRAJ43,ASMSLYQETQ,TRBV5-6,TRBJ2-5,TTDPSFLGRY,HLA-A*01:01
3,AIPGAGSYQL,TRAV14/DV4,TRAJ28,ASSRHPRGEKL,TRBV27,TRBJ1-4,TTDPSFLGRY,HLA-A*01:01
4,AVSPLGGYNKL,TRAV1-2,TRAJ4,ASSLLSPGYNSPL,TRBV28,TRBJ1-6,VYFLQSINF,HLA-A*24:02


In [5]:
df_dorigatti = pd.read_csv(f'{path_base}/dorigatti.csv', index_col=0)
df_dorigatti = df_dorigatti.drop(columns=['TCR', 'Label', 'Activation Score'])
df_dorigatti = df_dorigatti.drop_duplicates()
df_dorigatti[col_cdr3b] = df_dorigatti[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_dorigatti[col_cdr3a] = df_dorigatti[col_cdr3a].apply(lambda x: trim_tcr(x, False))
df_dorigatti.head()

Unnamed: 0,Epitope,MHC,CDR3_alpha,CDR3_beta,V_alpha,J_alpha,V_beta,J_beta
0,VPSVWRSSL,HLA-B*07:02,ALGTYGNNRL,ASTKGGPSSYEQ,TRAV19*01,TRAJ7*01,TRBV6-6*02,TRBJ2-7*01
1,RPSVWRSSL,HLA-B*07:02,ALGTYGNNRL,ASTKGGPSSYEQ,TRAV19*01,TRAJ7*01,TRBV6-6*02,TRBJ2-7*01
2,APSVWRSSL,HLA-B*07:02,ALGTYGNNRL,ASTKGGPSSYEQ,TRAV19*01,TRAJ7*01,TRBV6-6*02,TRBJ2-7*01
3,SPSVWRSSL,HLA-B*07:02,ALGTYGNNRL,ASTKGGPSSYEQ,TRAV19*01,TRAJ7*01,TRBV6-6*02,TRBJ2-7*01
4,KPSVWRSSL,HLA-B*07:02,ALGTYGNNRL,ASTKGGPSSYEQ,TRAV19*01,TRAJ7*01,TRBV6-6*02,TRBJ2-7*01


In [6]:
df_francis = pd.read_csv(f'{path_base}/francis.csv', index_col=0)
df_francis = df_francis.drop_duplicates()
df_francis = df_francis.rename(columns={'epitope': 'Epitope'})
df_francis[col_cdr3b] = df_francis[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_francis[col_cdr3a] = df_francis[col_cdr3a].apply(lambda x: trim_tcr(x, False))
df_francis.head()

Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,AAASGYGQNF,TRAV1-1,TRAJ26,ASSLGFNEQ,TRBV11-1,TRBJ2-7,SVLYYQNNV,A*02:01
1,AAMTNDYKL,TRAV13-1,TRAJ20,ASSIHAGLTSGNTI,TRBV19,TRBJ1-3,GILGFVFTL,A*02:01
2,AASGGGSQGNL,TRAV13-1,TRAJ42,ASSIFGTSLQ,TRBV19,TRBJ2-1,GILGFVFTL,A*02:01
3,AERIGGGSQGNL,TRAV13-2,TRAJ42,ASSIRADNEQ,TRBV19,TRBJ2-1,GILGFVFTL,A*02:01
4,AFMSGAGGTSYGKL,TRAV38-1,TRAJ52,ASSIGVWGY,TRBV19,TRBJ1-2,GILGFVFTL,A*02:01


## Databases
- IEDB
- VDJdb
- McPas

### IEDB

https://www.iedb.org/

- Eptiope = Any
- Assay = T Cell, Outcome = Positive
- MHC Restriction = Any
- Host = Any
- Disease = Any

=> Search => Receptors => Export Results => Export to CSV file.

In [7]:
rename_iedb = {
    'Description': col_epitope,  

    'Calculated V Gene': col_va,
    'Calculated J Gene': col_ja,
    'CDR3 Curated': col_cdr3a,

    'Calculated V Gene.1' : col_vb,
    'Calculated J Gene.1': col_jb,
    'CDR3 Curated.1': col_cdr3b,
    'MHC Allele Names': col_mhc,
    'Name': col_epitope
}

In [8]:
path_iedb = '../data/raw/receptor_table_export_1698079287.csv'
df_iedb = pd.read_csv(path_iedb, skiprows=1)

df_iedb = df_iedb.rename(columns=rename_iedb)
df_iedb = df_iedb[required_cols]

df_iedb = df_iedb[~df_iedb[col_epitope].isna()]
df_iedb = df_iedb[~(df_iedb[col_cdr3a].isna() & df_iedb[col_cdr3b].isna())]

df_iedb[col_cdr3b] = df_iedb[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_iedb[col_cdr3a] = df_iedb[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_iedb = df_iedb.drop_duplicates([col_epitope, col_cdr3a, col_cdr3b])
df_iedb = df_iedb.reset_index(drop=True)

df_iedb.to_csv('../data/processed/iedb.csv')
df_iedb.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,IVVRSSNTGKL,TRAV26-1*01,TRAJ37*01,ASSQDRDTQ,TRBV14*01,TRBJ2-3*01,VMAPRTLIL,"HLA-E*01:01, HLA-E*01:03"
1,,,,ASSQGGDRGDPGDGY,TRBV5-6,TRBJ1-2,CINGVCWTV,HLA-A*02:01
2,AVTTDSWGKL,TRAV12-2*01,TRAJ24*02,ASRPGLAGGRPEQ,TRBV6-5*01,TRBJ2-7*01,LLFGYPVYV,HLA-A*02:01
3,AVTTDSWGKL,TRAV12-2*01,TRAJ24*02,ASRPGLMSAQPEQ,TRBV6-5*01,TRBJ2-7*01,LLFGYPVYV,HLA-A*02:01
4,AVRPTSGGSYIP,TRAV21*01,TRAJ6*01,ASSYVGNTGEL,TRBV6-5*01,TRBJ2-2*01,SLLMWITQC,HLA-A*02:01


In [9]:
print('Unique Epitopes: ', len(df_iedb[col_epitope].unique()))
print('Unique TCRs: ', len(df_iedb[col_cdr3b].unique()))
print('Total pairs: ', len(df_iedb))

Unique Epitopes:  2623
Unique TCRs:  148878
Total pairs:  191170


#### Overlap

In [10]:
def print_overlap(df_ours, df_db):
    print('- By column:')
    cols = [col_cdr3a, col_cdr3b, col_epitope]
    for col in cols:
        df_ours_tmp = df_ours.drop_duplicates(col).copy()
        df_db_tmp = df_db.drop_duplicates(col)
        df_ours_tmp['in_db'] = df_ours_tmp[col].isin(df_db_tmp[col])
        print(f'-- {col}: {df_ours_tmp["in_db"].sum()} out of {len(df_ours_tmp)}')
    
    print('- Combinations: ')
    for col in [col_cdr3a, col_cdr3b]:
        df_ours_tmp = df_ours.drop_duplicates([col, col_epitope]).copy()
        df_db_tmp = df_db.drop_duplicates([col, col_epitope])
        df_ours_tmp[f'{col}_in_db'] = df_ours_tmp[col].isin(df_db_tmp[col])
        df_ours_tmp['epitope_in_db'] = df_ours_tmp[col_epitope].isin(df_db_tmp[col_epitope])
        print(f'-- Epitope-{col}: {(df_ours_tmp[f"{col}_in_db"] & df_ours_tmp[f"epitope_in_db"]).sum()} out of {len(df_ours_tmp)}')
    
    print('- Total: ')
    df_ours_tmp = df_ours.drop_duplicates(cols).copy()
    df_db_tmp = df_db.drop_duplicates(cols)
    
    for col in cols:
        df_ours_tmp[f'{col}_in_db'] = df_ours_tmp[col].isin(df_db_tmp[col])
    df_ours_tmp['total_in_db'] = df_ours_tmp[f'{cols[0]}_in_db'] & df_ours_tmp[f'{cols[1]}_in_db'] & df_ours_tmp[f'{cols[2]}_in_db']
    print(f'-- Ep+CDR3a+CDR3b: {df_ours_tmp[f"total_in_db"].sum()} out of {len(df_ours_tmp)}')
    return df_ours_tmp

In [11]:
print('Francis-IEDB')
_ = print_overlap(df_francis, df_iedb)
# Added to IEDB

Francis-IEDB
- By column:
-- CDR3_alpha: 1843 out of 1998
-- CDR3_beta: 2431 out of 2623
-- Epitope: 662 out of 664
- Combinations: 
-- Epitope-CDR3_alpha: 2218 out of 2375
-- Epitope-CDR3_beta: 2443 out of 2636
- Total: 
-- Ep+CDR3a+CDR3b: 2724 out of 3017


In [12]:
print('Minervina-IEDB')
_ = overlap_minervina = print_overlap(df_minervina, df_iedb)
# Added to IEDB

Minervina-IEDB
- By column:
-- CDR3_alpha: 1132 out of 3891
-- CDR3_beta: 1130 out of 4133
-- Epitope: 17 out of 17
- Combinations: 
-- Epitope-CDR3_alpha: 1192 out of 4015
-- Epitope-CDR3_beta: 1151 out of 4207
- Total: 
-- Ep+CDR3a+CDR3b: 931 out of 4505


In [13]:
print('Dorigatti-IEDB')
_ = print_overlap(df_dorigatti, df_iedb)

Dorigatti-IEDB
- By column:
-- CDR3_alpha: 1 out of 6
-- CDR3_beta: 0 out of 6
-- Epitope: 0 out of 134
- Combinations: 
-- Epitope-CDR3_alpha: 0 out of 804
-- Epitope-CDR3_beta: 0 out of 804
- Total: 
-- Ep+CDR3a+CDR3b: 0 out of 804


## VDJdb

https://github.com/antigenomics/vdjdb-db/releases/download/2022-03-30/vdjdb-2022-03-30.zip



In [14]:
rename_vdjdb = {
    'antigen.epitope': col_epitope, 

    'v.segm_a': col_va,
    'j.segm_a': col_ja,
    'cdr3_a': col_cdr3a,

    'v.segm_b' : col_vb,
    'j.segm_b': col_jb,
    'cdr3_b': col_cdr3b,
}

In [15]:
def assign_unique_index(x):
    idcs = []
    next_idx = x.max() + 1
    for i in x:
        if i != 0:
            idcs.append(i)
        else:
            idcs.append(next_idx)
            next_idx += 1
    return idcs


In [16]:
path_vdjdb = '../data/raw/vdjdb-2023-06-01/vdjdb.txt'
df_vdjdb = pd.read_csv(path_vdjdb, sep='\t')

df_vdjdb['complex.id'] = assign_unique_index(df_vdjdb['complex.id'])
df_vdjdb = df_vdjdb.set_index('complex.id')

df_vdjdb_a = df_vdjdb[df_vdjdb['gene']=='TRA'][['cdr3', 'v.segm', 'j.segm']]
df_vdjdb_a.columns = [f'{c}_a' if c in ['cdr3', 'v.segm', 'j.segm'] else c for c in df_vdjdb_a.columns]

df_vdjdb_b = df_vdjdb[df_vdjdb['gene']=='TRB'][['cdr3', 'v.segm', 'j.segm']]
df_vdjdb_b.columns = [f'{c}_b' if c in ['cdr3', 'v.segm', 'j.segm'] else c for c in df_vdjdb_b.columns]

df_vdjdb = df_vdjdb[~df_vdjdb.index.duplicated(keep='first')]
df_vdjdb = pd.concat([df_vdjdb, df_vdjdb_a, df_vdjdb_b], axis=1)

df_vdjdb = df_vdjdb.rename(columns=rename_vdjdb)
df_vdjdb['MHC'] = np.nan
df_vdjdb = df_vdjdb[required_cols]

df_vdjdb[col_cdr3b] = df_vdjdb[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_vdjdb[col_cdr3a] = df_vdjdb[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_vdjdb = df_vdjdb.reset_index(drop=True)

df_vdjdb.to_csv('../data/processed/vdjdb.csv')
df_vdjdb.head()

Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,IVRAPGRADM,TRAV26-1*01,TRAJ43*01,ASSYLPGQGDHYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
1,AVPSGAGSYQL,TRAV20*01,TRAJ28*01,ASSFEPGQGFYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
2,AVKASGSRL,TRAV2*01,,ASSYEPGQVSHYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
3,AYRPPGTYKY,TRAV38-2/DV8*01,TRAJ40*01,ASSALASLNEQ,TRBV14*01,TRBJ2-1*01,FLKEKGGL,
4,IVRAPGRADM,TRAV26-1*01,TRAJ43*01,ASSYLPGQGDHYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEQGGL,


In [17]:
print('Unique Epitopes: ', len(df_vdjdb[col_epitope].unique()))
print('Unique TCRs: ', len(df_vdjdb[col_cdr3b].unique()))
print('Total pairs: ', len(df_vdjdb))

Unique Epitopes:  1169
Unique TCRs:  41727
Total pairs:  62177


#### Overlap

In [18]:
print('Dorigatti-VDJdb')
_ = print_overlap(df_dorigatti, df_vdjdb)

Dorigatti-VDJdb
- By column:
-- CDR3_alpha: 1 out of 6
-- CDR3_beta: 0 out of 6
-- Epitope: 0 out of 134
- Combinations: 
-- Epitope-CDR3_alpha: 0 out of 804
-- Epitope-CDR3_beta: 0 out of 804
- Total: 
-- Ep+CDR3a+CDR3b: 0 out of 804


In [19]:
print('francis-VDJdb')
_ = print_overlap(df_francis, df_vdjdb)

francis-VDJdb
- By column:
-- CDR3_alpha: 1820 out of 1998
-- CDR3_beta: 2483 out of 2623
-- Epitope: 654 out of 664
- Combinations: 
-- Epitope-CDR3_alpha: 2179 out of 2375
-- Epitope-CDR3_beta: 2492 out of 2636
- Total: 
-- Ep+CDR3a+CDR3b: 2738 out of 3017


In [20]:
print('Minervina-VDJdb')
_ = print_overlap(df_minervina, df_vdjdb)

Minervina-VDJdb
- By column:
-- CDR3_alpha: 1100 out of 3891
-- CDR3_beta: 836 out of 4133
-- Epitope: 17 out of 17
- Combinations: 
-- Epitope-CDR3_alpha: 1159 out of 4015
-- Epitope-CDR3_beta: 853 out of 4207
- Total: 
-- Ep+CDR3a+CDR3b: 833 out of 4505


## McPAS-TCR
http://friedmanlab.weizmann.ac.il/McPAS-TCR/

"Download the complete database" => downloaded 23.10.2023

In [21]:
rename_mcpas = {
    'Epitope.peptide': col_epitope, 

    'TRAV': col_va,
    'TRAJ': col_ja,
    'CDR3.alpha.aa': col_cdr3a,

    'TRBV' : col_vb,
    'TRBJ': col_jb,
    'CDR3.beta.aa': col_cdr3b,
}

In [22]:
path_mcpas = '../data/raw/McPAS-TCR.csv'
df_mcpas = pd.read_csv(path_mcpas, encoding = "ISO-8859-1")
df_mcpas = df_mcpas[rename_mcpas.keys()]
df_mcpas = df_mcpas.rename(columns=rename_mcpas)

df_mcpas[col_cdr3b] = df_mcpas[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_mcpas[col_cdr3a] = df_mcpas[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_mcpas = df_mcpas.reset_index(drop=True)

df_mcpas.to_csv('../data/processed/mcpas_tcr.csv')
df_mcpas.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Epitope,V_alpha,J_alpha,CDR3_alpha,V_beta,J_beta,CDR3_beta
0,IKAVYNFATCG,,,,TRBV8-1,TRBJ1-1,ASSDAGANTE
1,IKAVYNFATCG,,,,TRBV8-1,TRBJ2-1,ASSDAGAYAE
2,IKAVYNFATCG,,,,TRBV8-3,TRBJ1-1,ASSDAGGAAE
3,IKAVYNFATCG,,,,TRBV8-1,TRBJ1-6,ASSDAGHSPL
4,IKAVYNFATCG,,,,TRBV8-3,TRBJ2-6,ASSDAWGGAEQ


In [23]:
print('Unique Epitopes: ', len(df_mcpas[col_epitope].unique()))
print('Unique TCRs: ', len(df_mcpas[col_cdr3b].unique()))
print('Total pairs: ', len(df_mcpas))

Unique Epitopes:  380
Unique TCRs:  31201
Total pairs:  39985


#### Overlap

In [24]:
print('Dorigatti-McPas')
_ = print_overlap(df_dorigatti, df_mcpas)

Dorigatti-McPas
- By column:
-- CDR3_alpha: 0 out of 6
-- CDR3_beta: 0 out of 6
-- Epitope: 0 out of 134
- Combinations: 
-- Epitope-CDR3_alpha: 0 out of 804
-- Epitope-CDR3_beta: 0 out of 804
- Total: 
-- Ep+CDR3a+CDR3b: 0 out of 804


In [25]:
print('francis-McPas')
_ = print_overlap(df_francis, df_mcpas)

francis-McPas
- By column:
-- CDR3_alpha: 168 out of 1998
-- CDR3_beta: 131 out of 2623
-- Epitope: 10 out of 664
- Combinations: 
-- Epitope-CDR3_alpha: 107 out of 2375
-- Epitope-CDR3_beta: 91 out of 2636
- Total: 
-- Ep+CDR3a+CDR3b: 134 out of 3017


In [26]:
print('Minervina-McPas')
_ = print_overlap(df_minervina, df_mcpas)

Minervina-McPas
- By column:
-- CDR3_alpha: 190 out of 3891
-- CDR3_beta: 75 out of 4133
-- Epitope: 0 out of 17
- Combinations: 
-- Epitope-CDR3_alpha: 0 out of 4015
-- Epitope-CDR3_beta: 0 out of 4207
- Total: 
-- Ep+CDR3a+CDR3b: 0 out of 4505
