In [1]:
import sys
sys.path.append('..')
from tcr_embedding.utils_preprocessing import Preprocessing

import scanpy as sc
import scirpy as ir
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Relative path to the input AnnData file (.h5ad); resolved against the
# kernel's current working directory.
path_data = '../../data/10x_CD8TC/v7_avidity.h5ad'
# Load the file with scanpy. `adata` is the object the later cells in this
# notebook read from and write to.
adata = sc.read(path_data)

In [3]:
# Run the project's preprocessing pipeline on `adata` in a single call.
# NOTE(review): judging by the keyword names and by the individual
# Preprocessing.* calls made later in this notebook with the same values,
# this presumably bundles clonotype encoding, CDR3 alpha/beta encoding,
# conditional-variable encoding and the train/val split -- confirm in
# tcr_embedding.utils_preprocessing.
Preprocessing.preprocessing_pipeline(
    adata,
    clonotype_key_added='clonotype',
    column_cdr3a='IR_VJ_1_junction_aa',
    column_cdr3b='IR_VDJ_1_junction_aa',
    cond_vars=['donor'],
    stratify_col='binding_name',
    group_col='clonotype',
    val_split=0.2,
)



100%|██████████| 51605/51605 [01:25<00:00, 605.25it/s]
100%|██████████| 40/40 [00:01<00:00, 27.93it/s]


In [5]:
# Display the observation annotation table of `adata`. pandas elides the
# middle rows/columns of a large frame, so only the first and last entries
# are rendered.
adata.obs

Unnamed: 0,is_cell,high_confidence,multi_chain,extra_chains,IR_VJ_1_c_call,IR_VJ_2_c_call,IR_VDJ_1_c_call,IR_VDJ_2_c_call,IR_VJ_1_consensus_count,IR_VJ_2_consensus_count,...,donor+binding,set,high_count_binding_name,high_count_binding_label,alpha_len,beta_len,receptor_type,receptor_subtype,chain_pairing,clonotype_size
AAACGGGAGAAGATTC-1-donor_1,True,True,False,"[{""c_call"": ""TRBC2"", ""consensus_count"": 3996, ...",TRAC,,TRBC1,,36437.0,,...,donor_1_A0301_KLGGALQAK_IE-1_CMV_binder,val,A0301_KLGGALQAK_IE-1_CMV_binder,3,12,15,TCR,TRA+TRB,single pair,9
AAACGGGTCGGACAAG-1-donor_1,True,True,False,[],TRAC,,TRBC2,,18565.0,,...,donor_1_no_data,train,no_data,8,14,15,TCR,TRA+TRB,single pair,306
AAAGATGGTACAGACG-1-donor_1,True,True,False,"[{""c_call"": ""TRAC"", ""consensus_count"": 18416, ...",TRAC,,TRBC2,,31549.0,,...,donor_1_no_data,train,no_data,8,12,17,TCR,TRA+TRB,single pair,1
AAAGTAGAGACGCTTT-1-donor_1,True,True,False,[],TRAC,,TRBC2,,34680.0,,...,donor_1_no_data,train,no_data,8,13,17,TCR,TRA+TRB,single pair,1
AAAGTAGAGCGCTTAT-1-donor_1,True,True,False,[],TRAC,TRAC,TRBC2,,30686.0,23335.0,...,donor_1_no_data,train,no_data,8,15,12,TCR,TRA+TRB,extra VJ,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTACCCAAT-8-donor_4,True,True,False,[],TRAC,TRAC,TRBC1,,7066.0,1359.0,...,donor_4_no_data,train,no_data,8,13,13,TCR,TRA+TRB,extra VJ,1
TTTGTCAGTCTAAACC-8-donor_4,True,True,False,[],TRAC,,TRBC2,,5895.0,,...,donor_4_no_data,train,no_data,8,15,15,TCR,TRA+TRB,single pair,1
TTTGTCAGTCTCTCGT-8-donor_4,True,True,False,[],TRAC,,TRBC2,,4090.0,,...,donor_4_no_data,val,no_data,8,17,13,TCR,TRA+TRB,single pair,1
TTTGTCATCCCACTTG-8-donor_4,True,True,False,[],TRAC,,TRBC2,,8036.0,,...,donor_4_no_data,train,no_data,8,16,11,TCR,TRA+TRB,single pair,1


In [4]:
# Run the project's validity check on `adata`.
# NOTE(review): how a failed check is reported (exception vs. printed
# message) is not visible from this notebook -- confirm in
# tcr_embedding.utils_preprocessing.
Preprocessing.check_if_valid_adata(adata)



In [5]:
# Remove the existing 'clonotype' column from the observation table,
# presumably so that the following encode_clonotypes call regenerates it
# from scratch.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time (column already gone) raises KeyError.
adata.obs.drop(columns='clonotype', inplace=True, errors='ignore')

In [None]:
# Compute the clonotype encoding for `adata` using the helper's default
# arguments.
# NOTE(review): presumably (re)creates adata.obs['clonotype'], which the next
# cell reads -- confirm against the helper's implementation.
Preprocessing.encode_clonotypes(adata)

In [9]:
# Tally how many observations carry each clonotype label, most frequent first.
adata.obs['clonotype'].value_counts()

18943    5328
41435    3837
2235     3823
29208    2977
10       2210
         ... 
4448        1
42950       1
17216       1
34758       1
32663       1
Name: clonotype, Length: 51605, dtype: int64

In [29]:
# Encode the TCR sequences of `adata`, reading the CDR3 alpha and beta
# sequences from the two named observation columns.
# NOTE(review): the result is presumably stored in adata.obsm (a later cell
# reads adata.obsm['beta_seq']) -- confirm against the helper.
Preprocessing.encode_tcr(
    adata,
    column_cdr3a='IR_VJ_1_junction_aa',
    column_cdr3b='IR_VDJ_1_junction_aa',
)

In [37]:
# Inspect the array stored under 'beta_seq' in adata.obsm.
# NOTE(review): presumably the integer-encoded CDR3 beta sequences written by
# Preprocessing.encode_tcr; the trailing zeros in the displayed rows look
# like padding -- confirm.
adata.obsm['beta_seq']

array([[ 2,  1, 16, ...,  0,  0,  0],
       [ 2,  1, 16, ...,  0,  0,  0],
       [ 2, 16,  1, ...,  0,  0,  0],
       ...,
       [ 2,  1, 16, ...,  0,  0,  0],
       [ 2, 16,  1, ...,  0,  0,  0],
       [ 2,  1, 17, ...,  0,  0,  0]])

In [33]:
# Encode the 'donor' annotation as a conditional variable.
# NOTE(review): presumably writes a one-hot matrix to adata.obsm['donor']
# (inspected in the next cell) -- confirm against the helper.
Preprocessing.encode_conditional_var(adata, 'donor')

In [35]:
# Inspect the array stored under 'donor' in adata.obsm; the displayed rows
# each contain a single 1.0, i.e. the array looks one-hot encoded.
adata.obsm['donor']

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [38]:
# Discard the existing train/val assignment stored in the 'set' column; the
# split cell below assigns adata.obs['set'] again.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# when the column is already absent raises KeyError.
adata.obs.drop(columns=['set'], inplace=True, errors='ignore')

In [48]:
# Partition the observation table into train and validation subsets with a
# fixed seed. Per the keyword names, the split is stratified on
# 'binding_name' and grouped by 'clonotype' -- presumably so that a clonotype
# never spans both subsets (confirm against the helper).
train, val = Preprocessing.stratified_group_shuffle_split(
    adata.obs,
    stratify_col='binding_name',
    group_col='clonotype',
    val_split=0.2,
    random_seed=42,
)

# Label every observation as 'train' first, then overwrite the rows whose
# index appears in the validation subset.
adata.obs['set'] = 'train'
adata.obs.loc[val.index, 'set'] = 'val'

100%|██████████| 4/4 [00:00<00:00,  5.44it/s]


In [50]:
# Count the observations assigned to each partition ('train' / 'val').
# Bracket indexing is used because the column name 'set' reads like a method
# when accessed as an attribute.
adata.obs['set'].value_counts()

train    95203
val      33384
Name: set, dtype: int64