# DeepTCR Preprocessing
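This notebook encodes the V/D/J genes of each cell's expressed alpha and beta chain as integer labels and stores them in `adata.obsm`; the label-to-id mappings are kept in `adata.uns`.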

It also encodes the alpha and beta TCR (CDR3) sequences separately and pads each of them to a length of 40.

In [1]:
import scanpy as sc
import scirpy as ir
import os

In [2]:
# Directory holding the dataset files, and the input file read by this notebook.
# `path_base` ends with '/', so joining appends the file name directly.
path_base = '../data/10x_CD8TC/'
path_clean_split = os.path.join(path_base, 'v5_train_val_test.h5ad')

In [3]:
# Load the AnnData object from the file configured in `path_clean_split`.
adata = sc.read(path_clean_split)

In [4]:
# Show the summary of the loaded AnnData object (dimensions and annotation keys).
adata

AnnData object with n_obs × n_vars = 128587 × 5000
    obs: 'IR_VJ_1_locus', 'IR_VJ_2_locus', 'IR_VDJ_1_locus', 'IR_VDJ_2_locus', 'IR_VJ_1_cdr3', 'IR_VJ_2_cdr3', 'IR_VDJ_1_cdr3', 'IR_VDJ_2_cdr3', 'IR_VJ_1_cdr3_nt', 'IR_VJ_2_cdr3_nt', 'IR_VDJ_1_cdr3_nt', 'IR_VDJ_2_cdr3_nt', 'IR_VJ_1_expr', 'IR_VJ_2_expr', 'IR_VDJ_1_expr', 'IR_VDJ_2_expr', 'IR_VJ_1_expr_raw', 'IR_VJ_2_expr_raw', 'IR_VDJ_1_expr_raw', 'IR_VDJ_2_expr_raw', 'IR_VJ_1_v_gene', 'IR_VJ_2_v_gene', 'IR_VDJ_1_v_gene', 'IR_VDJ_2_v_gene', 'IR_VJ_1_d_gene', 'IR_VJ_2_d_gene', 'IR_VDJ_1_d_gene', 'IR_VDJ_2_d_gene', 'IR_VJ_1_j_gene', 'IR_VJ_2_j_gene', 'IR_VDJ_1_j_gene', 'IR_VDJ_2_j_gene', 'IR_VJ_1_c_gene', 'IR_VJ_2_c_gene', 'IR_VDJ_1_c_gene', 'IR_VDJ_2_c_gene', 'IR_VJ_1_junction_ins', 'IR_VJ_2_junction_ins', 'IR_VDJ_1_junction_ins', 'IR_VDJ_2_junction_ins', 'has_ir', 'multi_chain', 'barcode', 'donor', 'cell_clono_cdr3_aa', 'cell_clono_cdr3_nt', 'CD3', 'CD19', 'CD45RA', 'CD4', 'CD8a', 'CD14', 'CD45RO', 'CD279_PD-1', 'IgG1', 'IgG2a', 'IgG2b

## Preprocess Genes

In [5]:
# Count how often each combination of the IR_VJ_1 (alpha) V/J genes and the
# IR_VDJ_1 (beta) V/D/J genes occurs across cells.
adata.obs[['IR_VJ_1_v_gene', 'IR_VJ_1_j_gene', 'IR_VDJ_1_v_gene', 'IR_VDJ_1_d_gene', 'IR_VDJ_1_j_gene']].value_counts()

IR_VJ_1_v_gene  IR_VJ_1_j_gene  IR_VDJ_1_v_gene  IR_VDJ_1_d_gene  IR_VDJ_1_j_gene
TRAV13-1        TRAJ50          TRBV12-3         None             TRBJ2-2            5342
TRAV21          TRAJ50          TRBV6-6          TRBD2            TRBJ2-3            3838
TRAV13-2        TRAJ45          TRBV5-6          TRBD2            TRBJ1-1            3834
TRAV12-3        TRAJ39          TRBV29-1         None             TRBJ1-2            2977
TRAV35          TRAJ49          TRBV11-2         TRBD1            TRBJ1-2            2213
                                                                                     ... 
TRAV12-2        TRAJ15          TRBV7-3          TRBD1            TRBJ2-7               1
                                TRBV7-6          None             TRBJ2-2               1
                                TRBV7-7          TRBD1            TRBJ2-7               1
                                TRBV7-8          TRBD1            TRBJ2-2               1
TRAV41          TR

In [6]:
# Number of cells per alpha-chain V gene.
adata.obs['IR_VJ_1_v_gene'].value_counts()

TRAV13-1       13510
TRAV21          7961
TRAV12-3        6972
TRAV13-2        6521
TRAV19          6501
TRAV12-2        6166
TRAV1-2         5979
TRAV27          5203
TRAV14DV4       4952
TRAV5           4377
TRAV29DV5       4233
TRAV12-1        4130
TRAV8-3         3948
TRAV17          3911
TRAV3           3906
TRAV35          3449
TRAV38-2DV8     2788
TRAV26-2        2302
TRAV24          2291
TRAV38-1        2156
TRAV26-1        2115
TRAV8-4         2061
TRAV22          1955
TRAV9-2         1883
TRAV8-1         1773
TRAV8-2         1710
TRAV10          1626
TRAV8-6         1411
TRAV25          1365
TRAV30          1310
TRAV20          1299
TRAV4           1189
TRAV1-1         1183
TRAV41          1153
TRAV23DV6       1018
TRAV39           927
TRAV6            753
TRAV2            711
TRAV36DV7        700
TRAV16           690
TRAV34           421
TRAV40            75
TRAV18             3
Name: IR_VJ_1_v_gene, dtype: int64

In [7]:
# Number of cells per alpha-chain J gene.
adata.obs['IR_VJ_1_j_gene'].value_counts()

TRAJ50    12360
TRAJ45     7202
TRAJ42     6834
TRAJ49     5550
TRAJ39     5524
TRAJ33     4696
TRAJ20     4307
TRAJ6      4165
TRAJ52     4155
TRAJ11     3440
TRAJ37     3366
TRAJ26     3019
TRAJ31     2952
TRAJ22     2797
TRAJ4      2767
TRAJ30     2715
TRAJ43     2631
TRAJ34     2425
TRAJ17     2419
TRAJ40     2400
TRAJ21     2345
TRAJ27     2251
TRAJ53     2241
TRAJ9      2117
TRAJ47     2052
TRAJ28     2039
TRAJ54     1934
TRAJ29     1910
TRAJ48     1838
TRAJ23     1754
TRAJ12     1689
TRAJ36     1678
TRAJ5      1520
TRAJ32     1511
TRAJ24     1438
TRAJ13     1385
TRAJ57     1355
TRAJ15     1281
TRAJ10     1209
TRAJ35     1203
TRAJ44     1100
TRAJ3      1061
TRAJ8      1019
TRAJ41      896
TRAJ16      881
TRAJ7       842
TRAJ58      702
TRAJ18      698
TRAJ56      492
TRAJ38      379
TRAJ46       37
TRAJ25        3
TRAJ14        3
Name: IR_VJ_1_j_gene, dtype: int64

In [8]:
# Number of cells per beta-chain V gene.
adata.obs['IR_VDJ_1_v_gene'].value_counts()

TRBV19      11013
TRBV7-9      8140
TRBV12-3     7395
TRBV20-1     7324
TRBV29-1     6827
TRBV27       6528
TRBV5-6      5803
TRBV11-2     5705
TRBV9        5592
TRBV6-6      5424
TRBV28       4349
TRBV5-4      3598
TRBV12-4     3349
TRBV4-1      3310
TRBV4-3      2993
TRBV6-5      2903
TRBV5-1      2850
TRBV6-3      2832
TRBV3-1      2706
TRBV7-2      2367
TRBV6-1      2290
TRBV2        2274
TRBV13       2117
TRBV7-8      2083
TRBV24-1     1996
TRBV4-2      1942
TRBV30       1729
TRBV15       1333
TRBV5-5      1303
TRBV10-3     1261
TRBV14       1105
TRBV7-3      1099
TRBV18        998
TRBV7-6       961
TRBV6-4       907
TRBV11-3      641
TRBV12-5      625
TRBV10-2      607
TRBV25-1      529
TRBV10-1      493
TRBV21-1      369
TRBV5-8       304
TRBV11-1      275
TRBV7-7       155
TRBV16         96
TRBV6-7        57
TRBV7-4        12
TRBV6-9         7
TRBV5-7         4
TRBV23-1        3
TRBV6-8         3
TRBV5-3         1
Name: IR_VDJ_1_v_gene, dtype: int64

In [9]:
# Number of cells per beta-chain D gene; 'None' appears as its own category.
adata.obs['IR_VDJ_1_d_gene'].value_counts()

TRBD2    45049
TRBD1    44817
None     38721
Name: IR_VDJ_1_d_gene, dtype: int64

In [10]:
# Number of cells per beta-chain J gene.
adata.obs['IR_VDJ_1_j_gene'].value_counts()

TRBJ2-7    23421
TRBJ2-2    17104
TRBJ2-3    16016
TRBJ1-2    15746
TRBJ2-1    15697
TRBJ1-1    14845
TRBJ1-5     7353
TRBJ2-5     6898
TRBJ1-4     3570
TRBJ1-6     2936
TRBJ2-6     2139
TRBJ1-3     1718
TRBJ2-4     1144
Name: IR_VDJ_1_j_gene, dtype: int64

In [11]:
def _first_seen_ids(obs_column):
    """Return a dict assigning 0, 1, 2, ... to the distinct values of ``adata.obs[obs_column]``.

    Ids follow the order in which ``Series.unique()`` reports the values.
    """
    distinct_values = adata.obs[obs_column].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))


# Alpha chain (VJ) gene label maps.
v_alpha_to_id = _first_seen_ids('IR_VJ_1_v_gene')
j_alpha_to_id = _first_seen_ids('IR_VJ_1_j_gene')

# Beta chain (VDJ) gene label maps.
v_beta_to_id = _first_seen_ids('IR_VDJ_1_v_gene')
d_beta_to_id = _first_seen_ids('IR_VDJ_1_d_gene')
j_beta_to_id = _first_seen_ids('IR_VDJ_1_j_gene')

In [12]:
# Keep the label -> id dictionaries with the data so they are saved alongside it.
for uns_key, gene_map in (
    ('v_alpha_to_id', v_alpha_to_id),
    ('j_alpha_to_id', j_alpha_to_id),
    ('v_beta_to_id', v_beta_to_id),
    ('d_beta_to_id', d_beta_to_id),
    ('j_beta_to_id', j_beta_to_id),
):
    adata.uns[uns_key] = gene_map

In [13]:
# Inspect the stored beta D-gene label -> id mapping.
adata.uns['d_beta_to_id']

{'TRBD1': 0, 'TRBD2': 1, 'None': 2}

In [14]:
# Translate each gene column in `adata.obs` into integer ids and store the
# resulting array under the matching `adata.obsm` key.
for obsm_key, obs_column, gene_map in (
    ('v_alpha', 'IR_VJ_1_v_gene', v_alpha_to_id),
    ('j_alpha', 'IR_VJ_1_j_gene', j_alpha_to_id),
    ('v_beta', 'IR_VDJ_1_v_gene', v_beta_to_id),
    ('d_beta', 'IR_VDJ_1_d_gene', d_beta_to_id),
    ('j_beta', 'IR_VDJ_1_j_gene', j_beta_to_id),
):
    adata.obsm[obsm_key] = adata.obs[obs_column].map(gene_map).to_numpy()

In [15]:
# Sanity check: every gene must have received a valid (non-NaN) label.
# One line is printed per label array; each of them should read False.
import numpy as np
for obsm_key in ('v_alpha', 'j_alpha', 'v_beta', 'd_beta', 'j_beta'):
    print(np.isnan(adata.obsm[obsm_key]).any())

False
False
False
False
False


In [16]:
# Inspect the integer-encoded beta V-gene labels.
adata.obsm['v_beta']

array([ 0,  1,  2, ..., 34, 17, 22], dtype=int64)

In [17]:
# Sanity check: decoding the ids with the inverted map must reproduce the
# original gene names. The cell should evaluate to True.
import pandas as pd
inv_map = dict((gene_id, gene_name) for gene_name, gene_id in adata.uns['d_beta_to_id'].items())

encoded_d_genes = pd.Series(adata.obsm['d_beta'])
inv_df = encoded_d_genes.map(inv_map)
original_d_genes = adata.obs['IR_VDJ_1_d_gene'].to_numpy()
(inv_df.to_numpy() == original_d_genes).all()

True

In [18]:
import sys
sys.path.append('../')
import tcr_embedding as tcr

In [19]:
# Amino-acid alphabet used for the encoding; '_' has id 0.
aa_to_id = {
    '_': 0,
    'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
    'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
    '+': 21, '<': 22, '>': 23,
}

# Encode the alpha chain first, then the beta chain, with identical settings
# (padding to 40, no start/end symbols).
for cdr3_column, seq_key, len_key in (
    ('IR_VJ_1_cdr3', 'alpha_seq', 'alpha_len'),
    ('IR_VDJ_1_cdr3', 'beta_seq', 'beta_len'),
):
    tcr.utils.aa_encoding(
        adata,
        read_col=cdr3_column,
        label_col=seq_key,
        length_col=len_key,
        pad=40,
        aa_to_id=aa_to_id,
        start_end_symbol=False,
    )

In [20]:
# Inspect the encoded alpha sequences (presumably written by the aa_encoding
# call with label_col='alpha_seq').
adata.obsm['alpha_seq']

array([[ 2,  1, 11, ...,  0,  0,  0],
       [ 2,  1, 15, ...,  0,  0,  0],
       [ 2,  1,  1, ...,  0,  0,  0],
       ...,
       [ 2,  8, 10, ...,  0,  0,  0],
       [ 2,  1, 18, ...,  0,  0,  0],
       [ 2,  6,  1, ...,  0,  0,  0]])

In [23]:
# Check the shape of the encoded beta sequences; the second dimension is
# expected to match the pad length passed to aa_encoding.
adata.obsm['beta_seq'].shape

(128587, 40)

In [21]:
# Show the AnnData summary again after the preprocessing steps.
adata

AnnData object with n_obs × n_vars = 128587 × 5000
    obs: 'IR_VJ_1_locus', 'IR_VJ_2_locus', 'IR_VDJ_1_locus', 'IR_VDJ_2_locus', 'IR_VJ_1_cdr3', 'IR_VJ_2_cdr3', 'IR_VDJ_1_cdr3', 'IR_VDJ_2_cdr3', 'IR_VJ_1_cdr3_nt', 'IR_VJ_2_cdr3_nt', 'IR_VDJ_1_cdr3_nt', 'IR_VDJ_2_cdr3_nt', 'IR_VJ_1_expr', 'IR_VJ_2_expr', 'IR_VDJ_1_expr', 'IR_VDJ_2_expr', 'IR_VJ_1_expr_raw', 'IR_VJ_2_expr_raw', 'IR_VDJ_1_expr_raw', 'IR_VDJ_2_expr_raw', 'IR_VJ_1_v_gene', 'IR_VJ_2_v_gene', 'IR_VDJ_1_v_gene', 'IR_VDJ_2_v_gene', 'IR_VJ_1_d_gene', 'IR_VJ_2_d_gene', 'IR_VDJ_1_d_gene', 'IR_VDJ_2_d_gene', 'IR_VJ_1_j_gene', 'IR_VJ_2_j_gene', 'IR_VDJ_1_j_gene', 'IR_VDJ_2_j_gene', 'IR_VJ_1_c_gene', 'IR_VJ_2_c_gene', 'IR_VDJ_1_c_gene', 'IR_VDJ_2_c_gene', 'IR_VJ_1_junction_ins', 'IR_VJ_2_junction_ins', 'IR_VDJ_1_junction_ins', 'IR_VDJ_2_junction_ins', 'has_ir', 'multi_chain', 'barcode', 'donor', 'cell_clono_cdr3_aa', 'cell_clono_cdr3_nt', 'CD3', 'CD19', 'CD45RA', 'CD4', 'CD8a', 'CD14', 'CD45RO', 'CD279_PD-1', 'IgG1', 'IgG2a', 'IgG2b

In [22]:
# Save the preprocessed object next to the input file. The output path is
# derived from `path_base`, so input and output stay in the same dataset
# directory if that setting is changed. With the current `path_base` this
# resolves to '../data/10x_CD8TC/v6_deep_tcr.h5ad'.
adata.write_h5ad(path_base + 'v6_deep_tcr.h5ad', compression='gzip')

... storing 'alpha_seq' as categorical
... storing 'beta_seq' as categorical
