# BCR data preprocessing

In [1]:
import os
import airr
import pandas as pd
import scanpy as sc

sc.logging.print_header()

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.20.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 pynndescent==0.5.2


In [2]:
import dandelion as ddl
ddl.logging.print_header()

dandelion==0.1.9.dev13 pandas==1.2.4 numpy==1.20.2 matplotlib==3.3.4 networkx==2.5.1 scipy==1.6.2 skbio==0.5.6


In [3]:
os.chdir('/lustre/scratch117/cellgen/team297/kt16/panfetal')

In [4]:
adata = sc.read_h5ad('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding.h5ad')
adata

AnnData object with n_obs × n_vars = 241950 × 33538
    obs: 'n_counts', 'n_genes', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'name', 'uniform_label', 'uniform_label_expanded_merged', 'uniform_label_lvl0', 'organ', 'Sample.lanes', 'Sort_id', 'age', 'method', 'donor', 'sex', 'Processing_method', 'AnnatomicalPart', 'Sample', 'bbk', 'scvi_clusters', 'anno_lvl_2_LYMPHOID', 'anno_lvl_2_MYELOID', 'anno_lvl_2_MEM_PROGENITORS', 'leiden_150'
    var: 'GeneID', 'GeneName', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'anno_lvl_2_LYMPHOID_colors', 'dendrogram_scvi_clusters', 'donor_colors', 'isin_HSC_IMMUNE_colors', 'isin_LYMPHOID_colors', 'isin_MEM_PROGENITORS_colors', 'isin_MYELOID_LYMPHOID_colors', 'isin_MYELOID_colors', 'isin_STROMA_colors', 'leiden_150_pred_label_expanded_colors', 'organ_colors', 'scvi', 'scvi_clusters_colors', 'uniform_label_expanded_merged_propagated_colors'
    obsm: 'X_scvi', 'X_umap'
    obsp: 'scvi_connectivities', 'scvi_distance

In [5]:
# read in bcr files
bcr_meta = pd.read_csv('bcr/raw/BCR_metadata.csv')
bcr_meta

Unnamed: 0,sample,prefix,individual
0,WSSS_F_Imm10014564,FCAImmP7292029,F29
1,WSSS_F_Imm10014565,FCAImmP7292031,F29
2,WSSS_F_Imm10014566,FCAImmP7292030,F29
3,WSSS_F_Imm10014567,FCAImmP7555860,F41
4,WSSS_F_Imm10014568,FCAImmP7555856,F41
5,WSSS_F_Imm10014569,FCAImmP7555862,F41
6,WSSS_F_Imm10014570,FCAImmP7555859,F41
7,WSSS_F_Imm10014571,FCAImmP7579230,F45
8,WSSS_F_Imm10014572,FCAImmP7579226,F45
9,WSSS_F_Imm10014573,FCAImmP7579228,F45


In [6]:
bcr = pd.DataFrame()
for x in bcr_meta['sample']:
    file = 'bcr/raw/'+x+'/dandelion/all_contig_igblast_db-pass_genotyped.tsv'
    tmp = pd.read_csv(file, sep ='\t')
    bcr = bcr.append(tmp)
bcr.shape

(181608, 82)

In [7]:
# trim to only cells found in the adata object
bcr = bcr[bcr['cell_id'].isin(adata.obs_names)].copy()
bcr.shape

(54889, 82)

In [8]:
%%time
vdj1, adata1 = ddl.pp.filter_contigs(bcr, adata, productive_only=True)

Preparing data: 45606it [01:29, 508.29it/s] 
Scanning for poor quality/ambiguous contigs: 100%|██████████| 25706/25706 [02:42<00:00, 158.61it/s]
Annotating in anndata obs slot : 100%|██████████| 241950/241950 [00:10<00:00, 23464.55it/s]


Finishing up filtering
Initializing Dandelion object
CPU times: user 5min 1s, sys: 2.45 s, total: 5min 4s
Wall time: 5min 54s


In [9]:
%%time
vdj2, adata2 = ddl.pp.filter_contigs(bcr, adata, productive_only=False)

Preparing data: 54889it [00:57, 961.12it/s] 
Scanning for poor quality/ambiguous contigs: 100%|██████████| 26725/26725 [03:11<00:00, 139.32it/s]
Annotating in anndata obs slot : 100%|██████████| 241950/241950 [00:09<00:00, 24548.53it/s]


Finishing up filtering
Initializing Dandelion object
CPU times: user 6min 2s, sys: 2.94 s, total: 6min 5s
Wall time: 6min 11s


In [10]:
%%time
vdj3, adata3 = ddl.pp.filter_contigs(bcr, adata, productive_only=False, simple = True) # back-up in case the step above failed

Preparing data: 54889it [01:04, 855.22it/s] 
Scanning for poor quality/ambiguous contigs: 100%|██████████| 26725/26725 [03:07<00:00, 142.67it/s]
Annotating in anndata obs slot : 100%|██████████| 241950/241950 [00:00<00:00, 888230.95it/s]


Finishing up filtering
Initializing Dandelion object
CPU times: user 6min 4s, sys: 3.11 s, total: 6min 7s
Wall time: 6min 24s


### Find clones

In [11]:
%%time
ddl.tl.find_clones(vdj1)

Finding clones based on VDJ chains : 100%|██████████| 312/312 [00:01<00:00, 156.01it/s]
Refining clone assignment based on VJ chain pairing : 100%|██████████| 21898/21898 [00:00<00:00, 519237.42it/s]


CPU times: user 58.1 s, sys: 421 ms, total: 58.5 s
Wall time: 1min


In [12]:
%%time
ddl.tl.find_clones(vdj2, productive_only = False)

Finding clones based on VDJ chains : 100%|██████████| 393/393 [00:02<00:00, 155.95it/s]
Refining clone assignment based on VJ chain pairing : 100%|██████████| 22451/22451 [00:00<00:00, 460419.21it/s]


CPU times: user 1min 8s, sys: 517 ms, total: 1min 9s
Wall time: 1min 11s


In [13]:
%%time
ddl.tl.find_clones(vdj3, productive_only = False) # back-up

Finding clones based on VDJ chains : 100%|██████████| 401/401 [00:03<00:00, 130.36it/s]
Refining clone assignment based on VJ chain pairing : 100%|██████████| 24659/24659 [00:00<00:00, 343564.49it/s]


CPU times: user 1min 21s, sys: 682 ms, total: 1min 21s
Wall time: 1min 24s


In [14]:
ddl.tl.clone_size(vdj1)
ddl.tl.clone_size(vdj1, max_size = 3)
ddl.update_metadata(vdj1, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'split and average')
ddl.update_metadata(vdj1, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'average')

In [15]:
ddl.update_metadata(vdj1, retrieve = ['junction_length', 'junction_aa_length', 'np1_length', 'np2_length', ], retrieve_mode = 'split and average')
ddl.update_metadata(vdj1, retrieve = ['sequence', 'sequence_alignment', 'sequence_alignment_aa', 'junction', 'junction_aa', 'germline_alignment', 'fwr1', 'fwr1_aa', 'fwr2', 'fwr2_aa', 'fwr3', 'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr1', 'cdr1_aa', 'cdr2', 'cdr2_aa', 'cdr3', 'cdr3_aa', 'v_sequence_alignment_aa', 'd_sequence_alignment_aa', 'j_sequence_alignment_aa'], retrieve_mode = 'split and unique only')



In [16]:
ddl.tl.clone_size(vdj2)
ddl.tl.clone_size(vdj2, max_size = 3)
ddl.update_metadata(vdj2, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'split and average')
ddl.update_metadata(vdj2, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'average')

In [17]:
ddl.update_metadata(vdj2, retrieve = ['junction_length', 'junction_aa_length', 'np1_length', 'np2_length', ], retrieve_mode = 'split and average')
ddl.update_metadata(vdj2, retrieve = ['sequence', 'sequence_alignment', 'sequence_alignment_aa', 'junction', 'junction_aa', 'germline_alignment', 'fwr1', 'fwr1_aa', 'fwr2', 'fwr2_aa', 'fwr3', 'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr1', 'cdr1_aa', 'cdr2', 'cdr2_aa', 'cdr3', 'cdr3_aa', 'v_sequence_alignment_aa', 'd_sequence_alignment_aa', 'j_sequence_alignment_aa'], retrieve_mode = 'split and unique only')



In [18]:
ddl.tl.clone_size(vdj3)
ddl.tl.clone_size(vdj3, max_size = 3)
ddl.update_metadata(vdj3, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'split and average')
ddl.update_metadata(vdj3, retrieve = ['mu_count', 'mu_freq'], retrieve_mode = 'average')

In [19]:
ddl.update_metadata(vdj3, retrieve = ['junction_length', 'junction_aa_length', 'np1_length', 'np2_length', ], retrieve_mode = 'split and average')
ddl.update_metadata(vdj3, retrieve = ['sequence', 'sequence_alignment', 'sequence_alignment_aa', 'junction', 'junction_aa', 'germline_alignment', 'fwr1', 'fwr1_aa', 'fwr2', 'fwr2_aa', 'fwr3', 'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr1', 'cdr1_aa', 'cdr2', 'cdr2_aa', 'cdr3', 'cdr3_aa', 'v_sequence_alignment_aa', 'd_sequence_alignment_aa', 'j_sequence_alignment_aa'], retrieve_mode = 'split and unique only')



In [20]:
ddl.tl.transfer(adata1, vdj1)
ddl.tl.transfer(adata2, vdj2)
ddl.tl.transfer(adata3, vdj3)

In [21]:
adata1.write('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive.h5ad', compression = 'gzip')

... storing 'filter_rna' as categorical
... storing 'has_contig' as categorical
... storing 'filter_contig_quality' as categorical
... storing 'filter_contig_VDJ' as categorical
... storing 'filter_contig_VJ' as categorical
... storing 'contig_QC_pass' as categorical
... storing 'clone_id' as categorical
... storing 'sample_id' as categorical
... storing 'locus_VDJ' as categorical
... storing 'locus_VJ' as categorical
... storing 'productive_VDJ' as categorical
... storing 'productive_VJ' as categorical
... storing 'v_call_genotyped_VDJ' as categorical
... storing 'v_call_genotyped_VJ' as categorical
... storing 'd_call_VDJ' as categorical
... storing 'j_call_VDJ' as categorical
... storing 'j_call_VJ' as categorical
... storing 'c_call_VDJ' as categorical
... storing 'c_call_VJ' as categorical
... storing 'duplicate_count_VDJ' as categorical
... storing 'duplicate_count_VJ' as categorical
... storing 'junction_aa_VDJ' as categorical
... storing 'junction_aa_VJ' as categorical
... stor

In [22]:
adata2.write('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive_and_nonproductive.h5ad', compression = 'gzip')

... storing 'filter_rna' as categorical
... storing 'has_contig' as categorical
... storing 'filter_contig_quality' as categorical
... storing 'filter_contig_VDJ' as categorical
... storing 'filter_contig_VJ' as categorical
... storing 'contig_QC_pass' as categorical
... storing 'clone_id' as categorical
... storing 'sample_id' as categorical
... storing 'locus_VDJ' as categorical
... storing 'locus_VJ' as categorical
... storing 'productive_VDJ' as categorical
... storing 'productive_VJ' as categorical
... storing 'v_call_genotyped_VDJ' as categorical
... storing 'v_call_genotyped_VJ' as categorical
... storing 'd_call_VDJ' as categorical
... storing 'j_call_VDJ' as categorical
... storing 'j_call_VJ' as categorical
... storing 'c_call_VDJ' as categorical
... storing 'c_call_VJ' as categorical
... storing 'duplicate_count_VDJ' as categorical
... storing 'duplicate_count_VJ' as categorical
... storing 'junction_aa_VDJ' as categorical
... storing 'junction_aa_VJ' as categorical
... stor

In [23]:
adata3.write('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive_and_nonproductive_simple.h5ad', compression = 'gzip')

... storing 'filter_rna' as categorical
... storing 'has_contig' as categorical
... storing 'filter_contig_quality' as categorical
... storing 'filter_contig_VDJ' as categorical
... storing 'filter_contig_VJ' as categorical
... storing 'contig_QC_pass' as categorical
... storing 'clone_id' as categorical
... storing 'sample_id' as categorical
... storing 'locus_VDJ' as categorical
... storing 'locus_VJ' as categorical
... storing 'productive_VDJ' as categorical
... storing 'productive_VJ' as categorical
... storing 'v_call_genotyped_VDJ' as categorical
... storing 'v_call_genotyped_VJ' as categorical
... storing 'd_call_VDJ' as categorical
... storing 'j_call_VDJ' as categorical
... storing 'j_call_VJ' as categorical
... storing 'c_call_VDJ' as categorical
... storing 'c_call_VJ' as categorical
... storing 'duplicate_count_VDJ' as categorical
... storing 'duplicate_count_VJ' as categorical
... storing 'junction_aa_VDJ' as categorical
... storing 'junction_aa_VJ' as categorical
... stor

In [24]:
%%time
vdj1.write_h5('panfetal_bcr_dandelion_filtered_productive.h5')



CPU times: user 50.8 s, sys: 1.69 s, total: 52.5 s
Wall time: 53.6 s


In [25]:
%%time
vdj2.write_h5('panfetal_bcr_dandelion_filtered_productive_and_nonproductive.h5')



CPU times: user 1min 7s, sys: 2.82 s, total: 1min 10s
Wall time: 1min 11s


In [26]:
%%time
vdj3.write_h5('panfetal_bcr_dandelion_filtered_productive_and_nonproductive_simple.h5')



CPU times: user 1min 33s, sys: 4.33 s, total: 1min 38s
Wall time: 1min 38s


In [27]:
# also save the obs separately
adata1.obs.to_csv('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive.csv')

In [28]:
adata2.obs.to_csv('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive_and_nonproductive.csv')

In [29]:
adata3.obs.to_csv('PAN.A01.v01.entire_data_normalised_log.20210429.LYMPHOID.embedding_dandelion_productive_and_nonproductive_simple.csv')

In [30]:
vdj1.write_airr('panfetal_bcr_dandelion_filtered_productive_airr.tsv')

In [31]:
vdj2.write_airr('panfetal_bcr_dandelion_filtered_productive_and_nonproductive_airr.tsv')

In [32]:
vdj3.write_airr('panfetal_bcr_dandelion_filtered_productive_and_nonproductive_simple_airr.tsv')