# Preprocessing of TCR databases

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np 
import seaborn as sb
import matplotlib.pyplot as plt

import os
import sys
sys.path.append('../tcr_benchmark')

from utils.config import col_va, col_ja, col_cdr3a, col_vb, col_jb, col_cdr3b, col_epitope, col_mhc
from utils.config import required_cols

sb.set_style('whitegrid')

## Helpers

In [2]:
# IEDB does not save leading C and ending F or W in the sequence. We will therefore remove all of them to have it uniform.
def trim_tcr(cdr3, is_beta):
    if type(cdr3) != str:
        return cdr3
    idx_start = 0
    if cdr3[0] == 'C':
        idx_start = 1

    idx_end = -1
    if cdr3[-1] == 'F':
        idx_end = -2
    if cdr3[-1] == 'W' and not is_beta:
        idx_end = -2
    return cdr3[idx_start: idx_end]

In [3]:
path_base = '../tcr_benchmark/data'

## Databases
- IEDB
- VDJdb
- McPas

### IEDB

https://www.iedb.org/

- Eptiope = Any
- Assay = T Cell, Outcome = Positive
- MHC Restriction = Any
- Host = Any
- Disease = Any

=> Search => Receptors => Export Results => Export to CSV file.

In [4]:
rename_iedb = {
    'Description': col_epitope,  

    'Calculated V Gene': col_va,
    'Calculated J Gene': col_ja,
    'CDR3 Curated': col_cdr3a,

    'Calculated V Gene.1' : col_vb,
    'Calculated J Gene.1': col_jb,
    'CDR3 Curated.1': col_cdr3b,
    'MHC Allele Names': col_mhc,
    'Name': col_epitope
}

In [5]:
path_iedb = '../data/raw/receptor_table_export_1698079287.csv'
df_iedb = pd.read_csv(path_iedb, skiprows=1)

df_iedb = df_iedb.rename(columns=rename_iedb)
df_iedb = df_iedb[required_cols]

df_iedb = df_iedb[~df_iedb[col_epitope].isna()]
df_iedb = df_iedb[~(df_iedb[col_cdr3a].isna() & df_iedb[col_cdr3b].isna())]

df_iedb[col_cdr3b] = df_iedb[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_iedb[col_cdr3a] = df_iedb[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_iedb = df_iedb.drop_duplicates([col_epitope, col_cdr3a, col_cdr3b])
df_iedb = df_iedb.reset_index(drop=True)

df_iedb.to_csv('../data/processed/iedb.csv')
df_iedb.head()

  df_iedb = pd.read_csv(path_iedb, skiprows=1)


Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,IVVRSSNTGKL,TRAV26-1*01,TRAJ37*01,ASSQDRDTQ,TRBV14*01,TRBJ2-3*01,VMAPRTLIL,"HLA-E*01:01, HLA-E*01:03"
1,,,,ASSQGGDRGDPGDGY,TRBV5-6,TRBJ1-2,CINGVCWTV,HLA-A*02:01
2,AVTTDSWGKL,TRAV12-2*01,TRAJ24*02,ASRPGLAGGRPEQ,TRBV6-5*01,TRBJ2-7*01,LLFGYPVYV,HLA-A*02:01
3,AVTTDSWGKL,TRAV12-2*01,TRAJ24*02,ASRPGLMSAQPEQ,TRBV6-5*01,TRBJ2-7*01,LLFGYPVYV,HLA-A*02:01
4,AVRPTSGGSYIP,TRAV21*01,TRAJ6*01,ASSYVGNTGEL,TRBV6-5*01,TRBJ2-2*01,SLLMWITQC,HLA-A*02:01


In [6]:
print('Unique Epitopes: ', len(df_iedb[col_epitope].unique()))
print('Unique TCRs: ', len(df_iedb[col_cdr3b].unique()))
print('Total pairs: ', len(df_iedb))

Unique Epitopes:  2623
Unique TCRs:  148878
Total pairs:  191170


## VDJdb

https://github.com/antigenomics/vdjdb-db/releases/download/2022-03-30/vdjdb-2022-03-30.zip



In [7]:
rename_vdjdb = {
    'antigen.epitope': col_epitope, 

    'v.segm_a': col_va,
    'j.segm_a': col_ja,
    'cdr3_a': col_cdr3a,

    'v.segm_b' : col_vb,
    'j.segm_b': col_jb,
    'cdr3_b': col_cdr3b,
}

In [8]:
def assign_unique_index(x):
    idcs = []
    next_idx = x.max() + 1
    for i in x:
        if i != 0:
            idcs.append(i)
        else:
            idcs.append(next_idx)
            next_idx += 1
    return idcs


In [9]:
path_vdjdb = '../data/raw/vdjdb-2023-06-01/vdjdb.txt'
df_vdjdb = pd.read_csv(path_vdjdb, sep='\t')

df_vdjdb['complex.id'] = assign_unique_index(df_vdjdb['complex.id'])
df_vdjdb = df_vdjdb.set_index('complex.id')

df_vdjdb_a = df_vdjdb[df_vdjdb['gene']=='TRA'][['cdr3', 'v.segm', 'j.segm']]
df_vdjdb_a.columns = [f'{c}_a' if c in ['cdr3', 'v.segm', 'j.segm'] else c for c in df_vdjdb_a.columns]

df_vdjdb_b = df_vdjdb[df_vdjdb['gene']=='TRB'][['cdr3', 'v.segm', 'j.segm']]
df_vdjdb_b.columns = [f'{c}_b' if c in ['cdr3', 'v.segm', 'j.segm'] else c for c in df_vdjdb_b.columns]

df_vdjdb = df_vdjdb[~df_vdjdb.index.duplicated(keep='first')]
df_vdjdb = pd.concat([df_vdjdb, df_vdjdb_a, df_vdjdb_b], axis=1)

df_vdjdb = df_vdjdb.rename(columns=rename_vdjdb)
df_vdjdb['MHC'] = np.nan
df_vdjdb = df_vdjdb[required_cols]

df_vdjdb[col_cdr3b] = df_vdjdb[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_vdjdb[col_cdr3a] = df_vdjdb[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_vdjdb = df_vdjdb.reset_index(drop=True)

df_vdjdb.to_csv('../data/processed/vdjdb.csv')
df_vdjdb.head()

Unnamed: 0,CDR3_alpha,V_alpha,J_alpha,CDR3_beta,V_beta,J_beta,Epitope,MHC
0,IVRAPGRADM,TRAV26-1*01,TRAJ43*01,ASSYLPGQGDHYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
1,,,,ASSFEAGQGFFSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
2,AVPSGAGSYQL,TRAV20*01,TRAJ28*01,ASSFEPGQGFYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
3,AVKASGSRL,TRAV2*01,,ASSYEPGQVSHYSNQPQ,TRBV13*01,TRBJ1-5*01,FLKEKGGL,
4,AYRPPGTYKY,TRAV38-2/DV8*01,TRAJ40*01,ASSALASLNEQ,TRBV14*01,TRBJ2-1*01,FLKEKGGL,


In [10]:
print('Unique Epitopes: ', len(df_vdjdb[col_epitope].unique()))
print('Unique TCRs: ', len(df_vdjdb[col_cdr3b].unique()))
print('Total pairs: ', len(df_vdjdb))

Unique Epitopes:  1169
Unique TCRs:  41727
Total pairs:  62177


## McPAS-TCR
http://friedmanlab.weizmann.ac.il/McPAS-TCR/

"Download the complete database" => downloaded 23.10.2023

In [11]:
rename_mcpas = {
    'Epitope.peptide': col_epitope, 

    'TRAV': col_va,
    'TRAJ': col_ja,
    'CDR3.alpha.aa': col_cdr3a,

    'TRBV' : col_vb,
    'TRBJ': col_jb,
    'CDR3.beta.aa': col_cdr3b,
}

In [12]:
path_mcpas = '../data/raw/McPAS-TCR.csv'
df_mcpas = pd.read_csv(path_mcpas, encoding = "ISO-8859-1")
df_mcpas = df_mcpas[rename_mcpas.keys()]
df_mcpas = df_mcpas.rename(columns=rename_mcpas)

df_mcpas[col_cdr3b] = df_mcpas[col_cdr3b].apply(lambda x: trim_tcr(x, True))
df_mcpas[col_cdr3a] = df_mcpas[col_cdr3a].apply(lambda x: trim_tcr(x, False))

df_mcpas = df_mcpas.reset_index(drop=True)

df_mcpas.to_csv('../data/processed/mcpas_tcr.csv')
df_mcpas.head()

  df_mcpas = pd.read_csv(path_mcpas, encoding = "ISO-8859-1")


Unnamed: 0,Epitope,V_alpha,J_alpha,CDR3_alpha,V_beta,J_beta,CDR3_beta
0,IKAVYNFATCG,,,,TRBV8-1,TRBJ1-1,ASSDAGANTE
1,IKAVYNFATCG,,,,TRBV8-1,TRBJ2-1,ASSDAGAYAE
2,IKAVYNFATCG,,,,TRBV8-3,TRBJ1-1,ASSDAGGAAE
3,IKAVYNFATCG,,,,TRBV8-1,TRBJ1-6,ASSDAGHSPL
4,IKAVYNFATCG,,,,TRBV8-3,TRBJ2-6,ASSDAWGGAEQ


In [13]:
print('Unique Epitopes: ', len(df_mcpas[col_epitope].unique()))
print('Unique TCRs: ', len(df_mcpas[col_cdr3b].unique()))
print('Total pairs: ', len(df_mcpas))

Unique Epitopes:  380
Unique TCRs:  31201
Total pairs:  39985
