# Thymus ageing atlas: B cell clonality and lineage analysis

In [None]:
import sys 
import os
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import hdf5plugin
import dandelion as ddl

import matplotlib.pyplot as plt
import seaborn as sns

# Add repo path to sys path (allows to access scripts and metadata from repo).
# Both directories are inserted near the front of sys.path so the custom modules imported at the
# end of this cell (utils, anno_levels, plotting) can be resolved.
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Add R libs path (set before rpy2 is loaded below)
os.environ['R_LIBS_USER'] = '/nfs/team205/lm25/condaEnvs/thymusAgeing/lib/R/library'

# Enable %R / %%R magics and automatic reloading of edited project modules
%load_ext rpy2.ipython
%reload_ext autoreload
%autoreload 2

# Define paths
# NOTE: plots_path and data_path end with '/', and later cells append '/...' again; the resulting
# '//' in paths is harmless on POSIX file systems.
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

print('Dir for plots: {}'.format(plots_path))
print('Dir for data: {}'.format(data_path))

# Formatting: register the Arial font file and apply the project matplotlib style sheet
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts (resolved through the sys.path entries added above)
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize

In [None]:
%%capture output
%%R

# R packages attached for all later %%R cells
library(tidyverse)
library(patchwork)
library(magrittr)

# Custom plotting theme helpers (presumably provides theme_simple(), which later cells call - confirm)
source('/nfs/team205/lm25/customScripts/visualisation/customTheme.R')

# Limit the number of entries R prints to keep cell output short
options(max.print=150)

## Load and prepare data

In [None]:
# Load adata
# NOTE(review): the file name ends in `.zarr` but is opened with `read_h5ad`; if this is a real zarr
# store, `ad.read_zarr` is the matching reader - confirm the on-disk format.
object_version = 'v4_2024-11-06'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Add cell type annotation.
# Columns already present in adata.obs are replaced by the versions from the annotation table, which
# keeps this cell re-runnable. Dropping an empty list is a no-op, so no emptiness check is needed
# (the previous `any(cols_overlapping)` tested the truthiness of the column names themselves).
ct_anno = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_bSplitxTissue_scvi_v2_2025-02-20_v5.csv', index_col=0)
cols_overlapping = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns=cols_overlapping).join(ct_anno)

# Update metadata from the most recent donor/sample metadata sheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Load the filtered BCR (VDJ) object
vdj_version = 'v3_2025-02-19'
bcr = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl')

In [None]:
# Keep only cells that carry a fine-level (taa_l5) annotation.
# `.copy()` turns the subset into a real AnnData object: later cells modify `adata.obs` and write
# results into `adata.uns`, which would otherwise trigger an implicit copy of the view.
adata = adata[~adata.obs['taa_l5'].isna()].copy()
# Cells labelled '*explore*' are kept; the optional filter below is intentionally disabled.
#adata = adata[~adata.obs['taa_l5'].str.contains('explore')]

adata

In [None]:
# Specify cell type columns
from anno_levels import get_ct_levels, age_group_levels
col_cell_type_broad = 'taa_l3'
col_cell_type_fine = 'taa_l5'
col_cell_type_broad_levels = get_ct_levels(level = col_cell_type_broad, taa_l1 = 'B')
col_cell_type_fine_levels = get_ct_levels(level = col_cell_type_fine, taa_l1 = 'B')
col_age_group = 'age_group'
col_age_group_levels = eval(f'{col_age_group}_levels')

%R -i plots_path,col_cell_type_broad,col_cell_type_fine,col_cell_type_broad_levels,col_cell_type_fine_levels,col_age_group,col_age_group_levels,data_path

### Explore BCR mutation level

In [None]:
# Attach each cell's age group and fine cell type label to its BCR contigs (contigs are matched to
# cells through `cell_id`; contigs without a matching cell keep NaN annotations).
cell_annotations = adata.obs[[col_age_group, col_cell_type_fine]]
bcr_data = bcr.data.merge(
    cell_annotations,
    how = 'left',
    left_on = 'cell_id',
    right_index = True,
)

bcr_data.head()

In [None]:
# Mutation counts per contig by fine cell type and age group.
# Contigs from cell types containing 'dev', and contigs without a cell type, are excluded.
df = bcr_data[['mu_count', 'mu_freq', col_cell_type_fine, 'donor', col_age_group]].copy()
df = df.loc[~df[col_cell_type_fine].str.contains('dev', na=True)]
df[col_age_group] = pd.Categorical(df[col_age_group], categories = ['infant', 'paed', 'adult'], ordered = True)

# Build the membership set once instead of converting the column to a list for every level
cell_types_present = set(df[col_cell_type_fine])
cell_type_order = [c for c in col_cell_type_fine_levels if c in cell_types_present]

plt.figure(figsize = (calc_figsize(width = 70, height = 40)))
# `density_norm` supersedes the deprecated `scale` argument; passing both was redundant
sns.violinplot(data = df, x = col_cell_type_fine, y = 'mu_count', hue = col_age_group, inner = 'quartile', density_norm = 'width',
               order = cell_type_order, palette = age_group_palette)
plt.xticks(rotation = 90)
plt.xlabel('Cell type')
plt.ylabel('N(mutations)')
plt.legend(title = 'Age group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig(f'{plots_path}/vdjAnalysis/clonotypes/thyAgeing_bSplit_bcrMutationCount_violin.pdf')

### Population markers

In [None]:
# Rank the top 100 marker genes of every taa_l4 population against all remaining cells (t-test).
# NOTE(review): normalize_total/log1p are only called further down in this notebook - confirm that
# adata.X already holds the intended values at this point.
sc.tl.rank_genes_groups(
    adata,
    groupby = 'taa_l4',
    reference = 'rest',
    method = 't-test',
    n_genes = 100,
    use_raw = False,
    pts = False,
    key_added = 'rank_genes_taa_l4',
)

In [None]:
# Collect the ranked marker table of every taa_l4 population, skipping the 'explore' population
marker_genes = {}
for population in adata.obs['taa_l4'].unique().tolist():
    if population == 'B_GC-like_explore':
        continue
    marker_genes[population] = sc.get.rank_genes_groups_df(adata, group = population, key = 'rank_genes_taa_l4')
marker_genes

In [None]:
# Top 50 ranked marker genes of the B_GC-like population
marker_genes['B_GC-like'].head(50)

- CCL22/CCL17 expressed in EBV infection and germinal centers (https://pubmed.ncbi.nlm.nih.gov/38574017/,https://pubmed.ncbi.nlm.nih.gov/33597749/)

In [None]:
# Top 30 ranked marker genes of the B_med population
marker_genes['B_med'].head(30)

- MS4A1: B cell proliferation/plasma cell differentiation
- ARPC1B, ARPC2, ARPC3: role in B cell proliferation and control of BCR signalling (deficiency leads to autoimmune disorder), potential role in GCs -> antigen extraction ([Roper,2019](https://elifesciences.org/articles/48093))
- CD74: B cell activation 
- RAC2: regulates cell adhesion in response to BCR stimulation

In [None]:
# Top 30 ranked marker genes of the B_age-associated population
marker_genes['B_age-associated'].head(30)

In [None]:
# Library-size normalise to 10,000 counts per cell and log1p-transform.
# scanpy records a completed log transform in adata.uns['log1p']; the guard makes this cell safe to
# re-run (or to run on an object that was saved already log-transformed) without transforming twice.
if 'log1p' not in adata.uns:
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

Extrafollicular vs GC AID+ cells: Extrafollicular AID+ cells were E47+ and Id2–, and occasionally cMYC+, JunB+, and CCL22+, and largely negative for IRF4 (Figure 5A-E). In the GC, AID+ cells were E47+ and Id2– (or borderline Id2+), and negative for cMYC, CCL22, IRF4, and JunB19,39,51 (and data not shown). Extrafollicular CD30+ cells uniquely shared some but not all of the AID+ cell phenotype, being virtually 100% cMYC+, JunB+, and Pax5+, and partially CD20+ and CCL22+ (in decreasing order of frequency). CD30+ cells were, however, IRF4+, Id2+, occasionally IgD+, and E47–. A transitional phenotype toward GC centroblasts was found in CD30+ GC cells (data not shown). (https://www.sciencedirect.com/science/article/pii/S0006497120638253)

AIRE functions as a checkpoint control in the GC reaction and is induced upon CD40 stimulation -> AIRE interacts with AID (encoded by AICDA) to reduce SHM (AIRE-deficient mice have hypermutated antibodies) (https://pmc.ncbi.nlm.nih.gov/articles/PMC10802573/)

CCL22/CCL17 is induced upon CD40 stimulation in GCs and promotes affinity maturation (https://www.nature.com/articles/s41586-021-03239-2)

Extrathymic AIRE expression is regulated through IRF4, IRF8, TBX21, TCF7 and depends on NFkB signalling (plus RANK/CD40/LTB in mTECs) (https://www.sciencedirect.com/science/article/pii/S1568997222001112)

In [None]:
# Marker panels used for the dot plots: group label -> list of gene symbols.
# The group labels are shown on the plots.
gc_markers = {'General': ['AICDA', 'AIRE', 'TNFRSF8', 'JUNB', 'MYC', 'TCF3', 'ID2', 'ID3', 'CD83', 'CD40', 'IRF8', 'TCF7'],
              'GC': ['PAX5', 'BCL6', 'IRF4'],
              'DZ': ['CXCR4'],
              #'LZ': ['CXCR5'],
              #'Extrafollicular': ['TCF3'],
              'EBV': ['CCL22', 'CCL17', 'EBI3', 'CCL3', 'ICAM1'],
              'Immunoglobulin': ['IGHM', 'IGHD', 'IGHA1', 'IGHA2', 'IGHG1', 'IGHG3', 'IGHE'],
              'PC': ['PRDM1', 'XBP1', 'IRF4'],
              'ABC': ['TBX21', 'RGS1', 'IFI44L', 'ALOX5AP']}

# Extended B cell panel (DZ/LZ = dark/light zone).
# Fix: the plot label 'BCR activtation' was misspelled.
b_markers = {'Other': ['IL6R', 'TNFRSF18', 'TNFSF13B'],
             'DZ': ['AICDA', 'CXCR4', 'MYC', 'MKI67', 'TOP2A', 'PCNA', 'BACH2', 'TCF3', 'PAX5', 'IRF4', 'MEF2B', 'FOXO1'],
             'LZ': ['CXCR5', 'CD83', 'CD86', 'MYBL1', 'SOCS3', 'CD40'],
             'T cell contact': ['CXCL10', 'CCL5', 'CCL3'],
             'GC_misc': ['ID2', 'ID3'],
             'BCR activation': ['CCL22', 'CCL17', 'EBI3', 'CCL3', 'ICAM1'],
             'Recruitment': ['CCR7'],
             'B_naive': ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD'],
             'B_mem': ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
             'B_plasma': ['PRDM1', 'XBP1', 'MZB1'],
             'B_age-assoc': ['TBX21', 'ITGAX', 'IRF4'],
             'B_pan': ['CD19', 'MS4A1'],
             'B_med': ['HLA-DRA', 'HLA-DRB1', 'AIRE', 'IL15', 'LTA', 'LTB', 'PTPRC', 'CD5', 'SPN', 'CD80', 'LY6G6C'],
             'B_dev': ['IGLL1', 'MME', 'RAG1'],
             'B_dev_thy': ['CD34', 'VPREB1', 'TYROBP'],
             'B_mem_CD1C+': ['CD1C', 'IGHM', 'FCRL3', 'CR2']}

In [None]:
# Dot plot of the extended B cell marker panel per fine cell type.
# The set of cell types present is built once; previously the obs column was converted to a list
# for every level in the comprehension.
cell_types_present = set(adata.obs[col_cell_type_fine])
sc.pl.DotPlot(adata, 
                b_markers,
                categories_order=[c for c in col_cell_type_fine_levels if c in cell_types_present],
                groupby = col_cell_type_fine,
                figsize = calc_figsize(width = 250, height = 50),
                mean_only_expressed=True,
                cmap = 'magma').add_totals().style(smallest_dot=1, largest_dot = 40).savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_bMarkersExt_dotplot.png')  

In [None]:
# Marker expression
b_markers = {'DZ' : ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
               'LZ' : ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
               'recruitment' : ['CCR7'],
               'B_naive' : ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD',],
               'B_mem' : ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
               'B_plasma' : ['PRDM1', 'XBP1', 'MZB1'], 
               'B_age-assoc' : ['TBX21', 'ITGAX',], # ITGAX = CD11c
               'B_pan' : ['CD19', 'MS4A1'],
               'B_med' : ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB'],
               'B_dev' : ['IGLL1', 'MME', 'RAG1', 'PAX5', 'EBF1', 'BCL11B'],
               'B_dev_thy' : ['CD34', 'VPREB1', 'TYROBP',],}

In [None]:
# Dot plot of the core B cell marker panel per fine cell type.
# The set of cell types present is built once; previously the obs column was converted to a list
# for every level in the comprehension.
cell_types_present = set(adata.obs[col_cell_type_fine])
sc.pl.DotPlot(adata, 
                b_markers,
                categories_order=[c for c in col_cell_type_fine_levels if c in cell_types_present],
                groupby = col_cell_type_fine,
                figsize = calc_figsize(width = 200, height = 50),
                mean_only_expressed=True,
                cmap = 'magma').add_totals().style(smallest_dot=1, largest_dot = 40).savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_bMarkers_dotplot.pdf')  

In [None]:
# NOTE: this cell held an unfinished comprehension (`[c for c in]`), which is a SyntaxError and
# stopped "Restart & Run All". It is kept as a comment until the intended expression is restored.

Different stages of GC-like B cells: GC-like, GC-like_prolif, GC-like_activated (prev B_med, potentially related to EBV infection) -> Does EBV infection play a role? There are reports of GC-like fates of EBV infected cells (checking with Alex P whether we can investigate EBV transcript expression)

CD40 stimulation induces AIRE: https://www.sciencedirect.com/science/article/pii/S2589004222021253?via%3Dihub, https://www.pnas.org/doi/10.1073/pnas.2220120120, https://www.sciencedirect.com/science/article/pii/S1074761315002113

CD5 is marker of thymic B cells (https://www.frontiersin.org/journals/immunology/articles/10.3389/fimmu.2021.766698/full#B25)

Paper suggesting that recirculating B cells can be licensed: https://www.sciencedirect.com/science/article/pii/S1074761315002113

-> unsure whether AICDA/AIRE-expressing B cells are resident thymic B cells or recirculating

- Are AICDA/AIRE-expressing cells one population? -> likely yes, they co-cluster, share CD5 expression and have a GC-like phenotype
- Are these cells antigen-presenting medullary B cells or recirculating B cells in a GC reaction? Not sure currently. Medullary B cells are considered to present antigens to maturing thymocytes, partly driven through AIRE which is induced upon CD40 stimulation (https://www.sciencedirect.com/science/article/pii/S2589004222021253?via%3Dihub, https://www.pnas.org/doi/10.1073/pnas.2220120120, https://www.sciencedirect.com/science/article/pii/S1074761315002113). It is questionable whether these cells are of thymic origin only or whether recirculating B cells can also be licensed to present antigens (https://www.sciencedirect.com/science/article/pii/S1074761315002113). Medullary B cells also express CCL22 and CCL17, undergo class switch recombination, which requires AICDA expression, and have been reported to have a GC-like phenotype (https://www.sciencedirect.com/science/article/pii/S2589004222021253?via%3Dihub, https://www.frontiersin.org/journals/immunology/articles/10.3389/fimmu.2021.766698/full#B25). Age-associated loss of AIRE expression has also been reported (https://www.frontiersin.org/journals/immunology/articles/10.3389/fimmu.2021.766698/full#B25). On the other hand, GC B cells have been shown to express AIRE to dampen SHM upon strong BCR and CD40 stimulation (https://pmc.ncbi.nlm.nih.gov/articles/PMC10802573/). CCL22 and CCL17 expression has been described as a byproduct of strong BCR stimulation in GCs (https://www.nature.com/articles/s41586-021-03239-2). Furthermore, EBV infection has been reported to induce GC-like phenotypes and expression of CCL22, CCL17 and EBI3 (https://pubmed.ncbi.nlm.nih.gov/14747532/, https://pmc.ncbi.nlm.nih.gov/articles/PMC9554744/). -> AICDA expression causes SHM/CSR, which happens both in GCs and in medullary B cells // AIRE expression is induced upon CD40 stimulation and BCR signalling, triggered by thymocytes and T cells // CCL22 and CCL17 are signs of strong BCR activation
- If this is one population arising from medullary B cells, is there a phenotype switch with age (more towards GC-like)? Need to investigate.

In [None]:
# GC marker dot plot on the spatial object, restricted to genes present in its var_names.
# NOTE(review): `adata_spatial` is not created anywhere in the visible notebook - it must already
# exist in the kernel for this cell to run; confirm where it is loaded.
gc_markers_filtered = {
    group: [gene for gene in genes if gene in adata_spatial.var_names]
    for group, genes in gc_markers.items()
}
spatial_dotplot = sc.pl.DotPlot(
    adata_spatial,
    gc_markers_filtered,
    groupby = 'taa_l4',
    mean_only_expressed = True,
    cmap = 'magma',
)
spatial_dotplot.add_totals().show()

In [None]:
# Molecules of interest: gene symbols for chemokines, interleukins, interferons, TNF superfamily
# members and their receptors, plotted as a dot plot in the next cell.
# NOTE(review): the list looks like it was collected by symbol prefix (CXC*, CC*, IL*, IFN*, TNF*),
# so it also contains symbols such as ILK, ILF2/ILF3, ILKAP, ILVBL and TNFAIP* - confirm these are wanted.
# Symbols starting with CXC
moi = ['CXCR4', 'CXCR2', 'CXCR1', 'CXCR6', 'CXCL8', 'CXCL6', 'CXCL1',
       'CXCL5', 'CXCL3', 'CXCL2', 'CXCL9', 'CXCL10', 'CXCL11', 'CXCL13',
       'CXCL14', 'CXCL12', 'CXCR5', 'CXCL16', 'CXCL17', 'CXCR3',
       # Symbols starting with CCR
       'CCR4', 'CCR8', 'CCR9', 'CCR3', 'CCR1', 'CCR2', 'CCR5AS', 'CCR5',
       'CCRL2', 'CCR6', 'CCR7', 'CCR10',
       # Symbols starting with CCL
       'CCL20', 'CCL28', 'CCL26', 'CCL24', 'CCL27', 'CCL19', 'CCL21',
       'CCL22', 'CCL17', 'CCL2', 'CCL7', 'CCL11', 'CCL8', 'CCL13', 'CCL1',
       'CCL5', 'CCL16', 'CCL14', 'CCL15', 'CCL23', 'CCL18', 'CCL3',
       'CCL4', 'CCL3L1', 'CCL4L2', 'CCL25',
       # Symbols starting with IL
       'IL22RA1', 'IL23R', 'IL12RB2', 'ILF2', 'IL6R-AS1', 'IL6R', 'ILDR2',
       'IL10', 'IL19', 'IL20', 'IL24', 'IL1R2', 'IL1R1', 'IL1R1-AS1',
       'IL1RL2', 'IL1RL1', 'IL18R1', 'IL18RAP', 'IL1A', 'IL1B', 'IL37',
       'IL36G', 'IL36A', 'IL36B', 'IL36RN', 'IL1F10', 'IL1RN', 'ILKAP',
       'IL5RA', 'IL17RE', 'IL17RC', 'IL17RB', 'IL17RD', 'ILDR1', 'IL20RB',
       'IL20RB-AS1', 'IL12A-AS1', 'IL12A', 'IL1RAP', 'IL2', 'IL21',
       'IL21-AS1', 'IL15', 'IL7R', 'IL31RA', 'IL6ST', 'IL3', 'IL5',
       'IL13', 'IL4', 'IL9', 'IL17B', 'IL12B', 'ILRUN', 'IL17A', 'IL17F',
       'IL20RA', 'IL22RA2', 'IL6', 'IL7', 'IL33', 'IL11RA', 'IL15RA',
       'IL2RA', 'ILK', 'IL18BP', 'IL18', 'IL10RA', 'IL23A', 'IL26',
       'IL22', 'IL31', 'IL17D', 'IL25', 'IL16', 'IL32', 'IL4R', 'IL21R',
       'IL21R-AS1', 'IL27', 'IL34', 'IL17C', 'ILF3-DT', 'ILF3', 'IL27RA',
       'ILVBL', 'IL12RB1', 'IL4I1', 'IL11', 'IL10RB-DT', 'IL10RB',
       'IL17RA', 'IL2RB', 'IL17REL', 'IL3RA', 'IL1RAPL1', 'IL2RG',
       'IL1RAPL2', 'IL13RA2', 'IL13RA1', 'IL9R',
       # Symbols starting with IFN
       'IFNLR1', 'IFNGR1', 'IFNB1', 'IFNW1', 'IFNA21', 'IFNA4', 'IFNA7',
       'IFNA10', 'IFNA16', 'IFNA14', 'IFNA5', 'IFNA6', 'IFNA13', 'IFNA2',
       'IFNA8', 'IFNA1', 'IFNE', 'IFNK', 'IFNG-AS1', 'IFNG', 'IFNL3',
       'IFNL2', 'IFNL1', 'IFNAR2', 'IFNAR1', 'IFNGR2',
       # Symbols starting with TNF
       'TNFRSF18', 'TNFRSF4', 'TNFRSF14-AS1', 'TNFRSF14', 'TNFRSF25',
       'TNFRSF9', 'TNFRSF8', 'TNFRSF1B', 'TNFAIP8L2', 'TNFSF18', 'TNFSF4',
       'TNFAIP6', 'TNFSF10', 'TNFAIP8', 'TNF', 'TNFRSF21', 'TNFAIP3',
       'TNFRSF10B', 'TNFRSF10C', 'TNFRSF10D', 'TNFRSF10A-AS1',
       'TNFRSF10A', 'TNFRSF11B', 'TNFSF15', 'TNFSF8', 'TNFRSF1A',
       'TNFRSF19', 'TNFSF11', 'TNFSF13B', 'TNFAIP2', 'TNFAIP8L3',
       'TNFRSF12A', 'TNFRSF17', 'TNFSF12', 'TNFSF13', 'TNFRSF13B',
       'TNFAIP1', 'TNFRSF11A', 'TNFAIP8L1', 'TNFSF9', 'TNFSF14',
       'TNFRSF6B', 'TNFRSF13C'
       ]

In [None]:
# Dot plot of the molecules of interest per fine cell type.
# Genes missing from adata.var_names are dropped first (and reported), mirroring the filtering used
# for `gc_markers_filtered`; an unfiltered list raises a KeyError as soon as one symbol is absent.
moi_present = [gene for gene in moi if gene in adata.var_names]
moi_missing = [gene for gene in moi if gene not in adata.var_names]
if moi_missing:
    print(f'Skipping {len(moi_missing)} genes not found in adata.var_names: {moi_missing}')

sc.pl.DotPlot(adata, 
                moi_present,
                groupby = col_cell_type_fine,
                mean_only_expressed=True,
                cmap = 'magma').add_totals().savefig(f'{plots_path}/phenoAnalysis/thyAgeing_bSplit_cytokines_dotplot.pdf')

- all: CCL5, CXCL10 (induced upon T cell contact and IFN, https://pubmed.ncbi.nlm.nih.gov/25476870/), CCL3 (positive and negative regulation of GCs by Tfr, https://www.sciencedirect.com/science/article/pii/0145212695000984)
- B_med: CCL22, CCL17, IL6R (promotes plasma cell differentiation, https://www.sciencedirect.com/science/article/pii/0145212695000984), ILKAP, IL17RB, IL13RA1, IL4I1, TNFRSF11B, TNFSF11 (GC-like less so)
- TNFRSF18, TNFRSF4 (also more in plasma cells)

In [None]:
# Marker genes of the B_transitional population versus all other cells (t-test, top 100 genes),
# followed by a look at the 50 highest-ranked genes
transitional_key = 'rank_genes_transitional'
sc.tl.rank_genes_groups(
    adata,
    groupby = 'taa_l5',
    groups = ['B_transitional'],
    reference = 'rest',
    method = 't-test',
    n_genes = 100,
    use_raw = False,
    pts = False,
    key_added = transitional_key,
)

sc.get.rank_genes_groups_df(adata, group = 'B_transitional', key = transitional_key).head(50)

- FCRLA: B cell activation, in early-stage and GC B cells
- CD37: mature B cells
- CD1C: presentation of lipid-based antigens, marker of IgM memory cells (https://www.frontiersin.org/journals/immunology/articles/10.3389/fimmu.2021.602539/full)
- VPREB3: early-stage and GC B cells
- PTPRC: on all B cells, increases during differentiation
- SYK: B cell differentiation (also follicular to GC, https://pmc.ncbi.nlm.nih.gov/articles/PMC4416743/), BCR signal amplification (https://elifesciences.org/articles/02069)
- SMIM14: memory differentiation
- RACK1: B cell differentiation through stabilising PAX5 (https://www.nature.com/articles/s41423-024-01213-2)
- CR2: complement activation, B cell maturation

In [None]:
# QC metrics per fine cell type
qc_metrics = ['percent_mito', 'percent_ribo', 'n_counts', 'n_genes']
sc.pl.violin(adata, keys = qc_metrics, groupby = col_cell_type_fine)

### Construct new anno

In [None]:
# Build the curated annotation table: one row per cell, with taa_l5 renamed to the final labels and
# the remaining annotation levels looked up from the curated levels sheet.
ct_anno = adata.obs[['taa_l5']].astype(str).copy()
anno_levels = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v9_2025-03-03_levels.xlsx')

# Rename cell types (labels not listed here are kept unchanged)
taa_l5_renames = {'B_GC-like_explore': 'B_GC-like-locnt',
                  'B_med': 'B_GC-like_AIRE+',
                  'B_transitional': 'B_mem_CR2+'}
ct_anno['taa_l5'] = ct_anno['taa_l5'].map(lambda label: taa_l5_renames.get(label, label))

# Labels without an entry in the levels sheet would silently get NaN at every other level
unmapped_labels = sorted(set(ct_anno['taa_l5']) - set(anno_levels['taa_l5'].astype(str)))
if unmapped_labels:
    print(f'WARNING: taa_l5 labels missing from the levels sheet: {unmapped_labels}')

# validate='many_to_one' raises if a taa_l5 label occurs more than once in the levels sheet, which
# would otherwise duplicate cells in the merged table
ct_anno = (ct_anno
           .reset_index(names='cell_id')
           .merge(anno_levels, on = 'taa_l5', how = 'left', validate = 'many_to_one')
           .set_index('cell_id')[anno_levels.columns])

ct_anno.to_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_curatedAnno_v6.csv')
ct_anno.head()

In [None]:
# Replace the annotation columns in adata.obs with the curated versions (re-runnable: existing
# columns of the same name are dropped first)
adata.obs = adata.obs.drop(columns = [c for c in ct_anno.columns if c in adata.obs.columns]).join(ct_anno)

# UMAP coloured by every annotation level. The returned Figure is saved explicitly instead of
# relying on pyplot's notion of the "current" figure.
with plt.rc_context(rc={'figure.figsize': calc_figsize(width = 'one-half', height = 120)}):
    fig = sc.pl.umap(adata, color = ct_anno.columns.tolist(), wspace = 0.3, return_fig = True, ncols = 2)
    fig.savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_scvi_{object_version}_ctAnnotation_v9.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Marker expression on the final annotation (group label -> gene symbols; DZ/LZ = dark/light zone)
b_markers = {'B_pan' : ['CD19', 'MS4A1'],
             'B_naive' : ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD',],
             'B_mem' : ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
             'B_mem_CD1C+' : ['CD1C', 'IGHM', 'FCRL3', 'CR2'],
             'B_plasma' : ['PRDM1', 'XBP1', 'MZB1'],
             'B_age-assoc' : ['TBX21', 'ITGAX',], # ITGAX = CD11c
             'B_med' : ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB'],
             'DZ' : ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
             'LZ' : ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
             'B_dev' : ['IGLL1', 'MME', 'RAG1', 'PAX5', 'EBF1', 'BCL11B'],
             'B_dev_thy' : ['CD34', 'VPREB1', 'TYROBP',],}

# The set of cell types present is built once; previously the obs column was converted to a list
# for every level in the comprehension.
cell_types_present = set(adata.obs[col_cell_type_fine])
sc.pl.DotPlot(adata, 
                b_markers,
                categories_order=[c for c in col_cell_type_fine_levels if c in cell_types_present],
                groupby = col_cell_type_fine,
                figsize = calc_figsize(width = 200, height = 50),
                mean_only_expressed=True,
                cmap = 'magma').add_totals().style(smallest_dot=1, largest_dot = 40).savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_scvi_{object_version}_finalAnno_bMarkers_dotplot.pdf')  

## Plot isotype usage

In [None]:
# Add BCR metadata
# Columns that already exist in adata.obs are replaced, so the cell can be re-run; a plain join
# raises on overlapping column names the second time it is executed.
bcr_meta = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_bcr_{vdj_version}.csv', index_col=0)
bcr_cols_overlapping = [col for col in bcr_meta.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns=bcr_cols_overlapping).join(bcr_meta)

In [None]:
# Per-cell BCR summary used for the isotype analysis
bcr_columns = ['isotype', 'isotype_status','chain_status', 'locus_status', col_cell_type_broad, 'donor', 'sample', col_age_group]
bcr_df = adata.obs[bcr_columns].copy()

# Exclude samples in which more than 90% of the isotype_status values are 'No_contig'
status_frac = bcr_df.groupby(['sample'])['isotype_status'].value_counts(normalize=True)
mostly_no_contig = (status_frac > 0.9) & (status_frac.index.get_level_values(1) == 'No_contig')
samples_to_exclude = status_frac[mostly_no_contig].index.get_level_values(0).tolist()
bcr_df = bcr_df[~bcr_df['sample'].isin(samples_to_exclude)]

bcr_df[['isotype_status','isotype']].value_counts()

In [None]:
# Isotype proportions: computed within each sample and broad cell type, then averaged per donor
sample_isotype_props = (
    bcr_df
    .groupby(['sample', 'donor', col_age_group, col_cell_type_broad], observed = True)['isotype']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
donor_isotype_props = (
    sample_isotype_props
    .groupby(['donor', col_age_group, col_cell_type_broad, 'isotype'], observed = True)
    .agg(mean_prop = ('prop', 'mean'))
    .reset_index()
)
# Drop IgE and the developing B cell populations
rows_to_keep = (donor_isotype_props['isotype'] != 'IgE') & (~donor_isotype_props[col_cell_type_broad].isin(['B_dev', 'B_dev_thy']))
df = donor_isotype_props.loc[rows_to_keep]
df

In [None]:
from plotting.utils import calc_figsize, plot_faceted_grouped_boxplot

# Mean isotype proportion per donor, by isotype (x) and age group (hue), one facet per broad cell type.
# Axis labels fixed: they were left over from a TRB Kidera-factor plot.
p = plot_faceted_grouped_boxplot(data = df, x = 'isotype', y = 'mean_prop', hue = col_age_group, hue_order = ['infant', 'paed', 'adult'], order = df['isotype'].unique().tolist(), 
                             facet_kwargs = dict(col = col_cell_type_broad, col_wrap = 4, height = calc_figsize(width = 100, height = 30)[1], aspect = 1, col_order = df[col_cell_type_broad].unique().tolist()),
                             add_stats = True,
                             format_percent = True, format_log = False, x_label = 'Isotype', y_label = 'Mean isotype proportion', legend_title='Age group', figsize = calc_figsize(width = 100, height=50),
                             ylim = None, #legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1))
                             )
p.tight_layout()
# NOTE: the figure is currently not written to disk (no savefig call for the isotype plot).

In [None]:
# Broad cell types present in the isotype table; `df` is grouped by the broad level and has no
# fine-level column, so indexing it with col_cell_type_fine raised a KeyError.
df[col_cell_type_broad].unique().tolist()

In [None]:

def _collapse_isotype(isotype):
    """Collapse an isotype call: 'a|b' strings -> 'Multiple', float values (e.g. NaN) -> 'Orphan', else unchanged."""
    if isinstance(isotype, str) and '|' in isotype:
        return 'Multiple'
    if isinstance(isotype, float):
        return 'Orphan'
    return isotype

bcr_df['isotype_short'] = bcr_df['isotype'].apply(_collapse_isotype)
#bcr_df = bcr_df.loc[bcr_df['isotype_short'] != 'No_contig']

# Isotype frequency within each sample, then mean and SD across the samples of a donor
sample_level_freq = (
    bcr_df
    .groupby(['sample', 'donor', col_age_group])['isotype_short']
    .value_counts(normalize=True)
    .reset_index(name = 'prop')
)
isotype_freq = (
    sample_level_freq
    .groupby(['donor', col_age_group, 'isotype_short'])
    .agg(mean_prop = ('prop', 'mean'), sd_prop = ('prop', 'std'))
    .reset_index()
)

isotype_freq.head()

In [None]:
%%R -i isotype_freq -w 300 -h 150 -u mm

# Donor-level isotype frequencies with the age group as an ordered factor
plot_data = isotype_freq %>%
tidyr::drop_na(mean_prop) %>%
dplyr::mutate(!!col_age_group := factor(!!rlang::sym(col_age_group), levels = col_age_group_levels))

# Kruskal-Wallis test across age groups per isotype (only isotypes seen in more than one age group),
# BH-adjusted across isotypes
kruskal_test_res = plot_data %>%
dplyr::group_by(isotype_short) %>%
dplyr::filter(length(unique(!!rlang::sym(col_age_group))) > 1) %>%
rstatix::group_by(isotype_short) %>%
rstatix::kruskal_test(as.formula(paste0('mean_prop ~', col_age_group))) %>%
rstatix::adjust_pvalue(method = 'BH') %>%
rstatix::add_significance() 
kruskal_test_res %>% readr::write_csv(file.path(data_path, 'analysis/vdj', 'thyAgeing_isotypeFreq_byAge_kruskal.csv'))

# Pairwise Dunn tests between age groups per isotype, with bracket positions for plotting
dunn_test_res = plot_data %>%
dplyr::group_by(isotype_short) %>%
dplyr::filter(length(unique(!!rlang::sym(col_age_group))) > 1) %>%
rstatix::group_by(isotype_short) %>%
rstatix::dunn_test(as.formula(paste0('mean_prop ~', col_age_group)), p.adjust.method = 'BH') %>%
rstatix::add_significance() %>%
rstatix::add_xy_position(x = 'isotype_short', step.increase = 0.05)
dunn_test_res %>% readr::write_csv(file.path(data_path, 'analysis/vdj', 'thyAgeing_isotypeFreq_byAge_dunn.csv'))

# Summary statistics per age group and isotype
isotype_freq %>%
rstatix::group_by(!!rlang::sym(col_age_group), isotype_short) %>%
rstatix::get_summary_stats(mean_prop) %>%
readr::write_csv(file.path(data_path, 'analysis/vdj', 'thyAgeing_isotypeFreq_byAge_summary.csv'))

# Box plot of mean isotype frequency by age group. The x axis shows isotypes, so the axis label was
# corrected from 'Cell population'; T/F were spelled out as TRUE/FALSE.
plot_data %>%
ggplot(aes(x = isotype_short, y = mean_prop, color = !!rlang::sym(col_age_group))) +
geom_boxplot(outlier.size = 0.5, position = position_dodge2(preserve = 'single')) +
ggpubr::stat_pvalue_manual(dunn_test_res, hide.ns = TRUE, label = 'p.adj.signif', tip.length = 0.01) +
#ggforce::facet_row(~taa_l0, scales = 'free', space = 'free') +
labs(x = 'Isotype', y = "Mean frequency", color = 'Age group') +
grafify::scale_color_grafify(palette = "fishy") +
scale_y_continuous(labels = scales::percent_format(scale = 100)) +
theme_simple(facet = FALSE) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("thyAgeing_isotypeFreq_byAge_boxplot.png", path = file.path(plots_path, 'vdjAnalysis'), 
       width = 150, height = 100, units = "mm", dpi = 300)

## Clonotypes by donor

In [None]:
# Clone numbers
# `bcr_df` was built from a fixed column list that does not include the clone id, so it is fetched
# from adata.obs by cell barcode when missing.
# NOTE(review): assumes adata.obs carries 'changeo_clone_id' (e.g. from the joined BCR metadata) - confirm.
if 'changeo_clone_id' in bcr_df.columns:
    clone_df = bcr_df
else:
    clone_df = bcr_df.join(adata.obs[['changeo_clone_id']])

# observed=True keeps only donor/age-group combinations that actually occur (categorical columns
# would otherwise produce empty combinations with zero cells and NaN ratios)
df = (clone_df
      .groupby(['donor', col_age_group], observed = True)
      .agg(n_clones = ('changeo_clone_id', 'nunique'), n_cells = ('changeo_clone_id', 'size'))
      .reset_index())
df['n_clones_norm'] = df['n_clones'] / df['n_cells']
df

In [None]:
# Clone size
# `bcr_df` lacks the clone id and donor age columns; fetch whichever is missing from adata.obs.
# NOTE(review): assumes adata.obs carries 'changeo_clone_id' and 'age_months' - confirm.
missing_cols = [c for c in ['changeo_clone_id', 'age_months'] if c not in bcr_df.columns]
clone_df = bcr_df.join(adata.obs[missing_cols]) if missing_cols else bcr_df

df = clone_df.groupby(['donor', col_age_group,'changeo_clone_id'], observed=True).size().reset_index(name = 'clone_size')
# Donors ordered by age for the x axis
donor_order = clone_df[['donor', 'age_months']].drop_duplicates().dropna().sort_values('age_months')['donor'].tolist()

with sns.plotting_context('paper', font_scale = 1.4):
    # `aspect` is a figure-level (catplot) option and not a violinplot argument, so it was removed
    p = sns.violinplot(data = df, x = 'donor', y = 'clone_size', hue = col_age_group, hue_order = ['infant', 'paed', 'adult'], palette = 'colorblind',
                       density_norm = 'width', dodge = False, order = donor_order)
    p.xaxis.set_tick_params(rotation=45)
    p.set_xlabel('Donor')
    p.set_ylabel('Clone size')
    p.legend(title = 'Age group')
    p.set_yscale('log') 
    
plt.savefig(f'{plots_path}/vdjAnalysis/thyAgeing_cloneSize_byAge_violin.png', dpi = 300, bbox_inches = 'tight')

## Lineage analysis using Immcantation

Use Dowser from the Immcantation framework (https://dowser.readthedocs.io/en/latest/vignettes/Building-Trees-Vignette/) to construct lineage trees.

In [None]:
%%R -i data_path,vdj_version

# Load the AIRR-formatted BCR table
bcr_airr_path = file.path(data_path, paste0('/analysis/vdj/hypermutation/thymusAgeing_bcrFiltered_', vdj_version, '_airr.tsv'))
bcr = readr::read_tsv(bcr_airr_path)

# Keep only sequences that have a fine cell type label and no stop codon
bcr = dplyr::filter(bcr,
                    !is.na(.data[[col_cell_type_fine]]),
                    !stop_codon)

In [None]:
%%R

# Drop cells that carry more than one heavy (IGH) chain
igh_per_cell <- bcr %>%
dplyr::filter(locus == "IGH") %>%
dplyr::pull(cell_id) %>%
table()
multi_heavy_cells <- names(igh_per_cell)[igh_per_cell > 1]

print(paste('Removing', length(multi_heavy_cells), 'cells with multiple heavy chains'))

bcr = dplyr::filter(bcr, !cell_id %in% multi_heavy_cells)

nrow(bcr)

In [None]:
%%R

# Remove cells without any heavy chain
heavy_cells <- dplyr::filter(bcr, locus == "IGH")$cell_id
light_cells <- dplyr::filter(bcr, locus == "IGK" | locus == "IGL")$cell_id
# unique(): a cell with several light chains appears once per chain, so without it the message
# below reported light-chain rows rather than cells
no_heavy_cells <- unique(light_cells[which(!light_cells %in% heavy_cells)])

print(paste('Removing', length(no_heavy_cells), 'cells without heavy chain'))

bcr = bcr %>%
dplyr::filter(!cell_id %in% no_heavy_cells)

nrow(bcr)

In [None]:
%%R

# Number and proportion of BCR sequences per donor
rstatix::freq_table(bcr, donor)

In [None]:
%%R 
# Distance of every sequence to its nearest neighbour, computed within each donor (single-cell mode)
dist_nearest <- shazam::distToNearest(
  bcr,
  cellIdColumn = 'cell_id',
  locusColumn = 'locus',
  fields = 'donor',
  onlyHeavy = FALSE,
  nproc = 4
)

# Fit a gamma-normal mixture to the nearest-neighbour distances and take the clonal distance
# threshold at a user-defined specificity of 0.95
threshold_output <- shazam::findThreshold(
  dist_nearest$dist_nearest,
  method = "gmm",
  model = "gamma-norm",
  cutoff = "user",
  spc = 0.95
)
threshold <- slot(threshold_output, "threshold")
threshold

In [None]:
%%R

# Assign clone ids by hierarchical clustering within each donor, using the distance threshold
# determined above (heavy and light chains considered, clones split by light chain)
results <- bcr %>%
  scoper::hierarchicalClones(
    threshold = threshold,
    cell_id = "cell_id",
    clone = 'clone_id',
    fields = "donor",
    only_heavy = FALSE,
    split_light = TRUE,
    summarize_clones = FALSE
  )

In [None]:
%%R

# Columns available after clonal clustering
colnames(results)

In [None]:
%%R
# Calculate and plot the rank-abundance curve per donor (heavy chains only).
# The confidence intervals are bootstrapped, so the seed is fixed to make the curve reproducible.
set.seed(42)
abund <- alakazam::estimateAbundance(dplyr::filter(results, locus == "IGH"),
                           group = "donor", nboot = 100)

abund_plot <- alakazam::plot(abund, silent = TRUE)
abund_plot

In [None]:
%%R -i plots_path -w 200 -h 200 -u mm -o clone_sizes

# Get clone sizes per donor with alakazam::countClones
clone_sizes <- alakazam::countClones(results,
                           groups = "donor")

# Plot the number of clones (log10-scaled y axis) for each clone size, one panel per donor.
# Fix: the y label was passed as a second `x =` argument and therefore never applied.
ggplot(clone_sizes, aes(x = seq_count)) +
  geom_bar() +
  facet_wrap(~donor, scales = 'free_y') +
  labs(x = "Sequences per clone", y = 'Log10(Clones)') +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05)), trans = 'log10') +
  theme_bw()
ggsave('thyAgeing_bSplit_seqPerClone.png',path = file.path(plots_path, 'vdjAnalysis', 'clonotypes'),
       height = 200, width = 200, units = 'mm', dpi = 300)

In [None]:
%%R

# Keep only sequences that were assigned to a clone
results <- results %>%
  dplyr::filter(!is.na(clone_id))

# IMGT reference sequences (human V(D)J germlines)
imgt_dir <- "/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment/data/references/immcantation_imgt/human/vdj"
references <- dowser::readIMGT(dir = imgt_dir)

# Reconstruct germline sequences per clone, handled separately for each donor
results <- dowser::createGermlines(results, references, fields = "donor", nproc = 4)

In [None]:
%%R
# Columns available after germline reconstruction
colnames(results)

In [None]:
%%R

# Format clones for tree building: the fine cell type is carried as a tip trait, the donor is kept as
# a clone-level column, and clones with fewer than 3 sequences are dropped
clones <- results %>%
  dowser::formatClones(
    traits = col_cell_type_fine,
    columns = 'donor',
    cell = 'cell_id',
    #text_fields = c("taa_l5"), 
    minseq = 3,
    nproc = 8
  )

In [None]:
%%R
# Columns of the formatted clone table
colnames(clones)

In [None]:
%%R

# Point IgPhyML to its motif files, then build one lineage tree per clone with IgPhyML
Sys.setenv(IGPHYML_PATH = "/nfs/team205/lm25/condaEnvs/thymusAgeing/share/igphyml/motifs")

igphyml_exec <- "/nfs/team205/lm25/condaEnvs/thymusAgeing/bin/igphyml"
trees <- dowser::getTrees(clones, build = "igphyml", exec = igphyml_exec, nproc = 8)

trees

In [None]:
%%R -o trees_df

# Plain data frame with one row per tree, exported to Python through the cell's -o flag
trees_df = data.frame(
  dplyr::select(trees, clone_id, locus, seqs, donor)
)

In [None]:
# Donor-level metadata (one row per donor with its age group). It is given a name because the next
# %%R cell imports `donor_meta`, which was not defined anywhere in the notebook.
donor_meta = latest_meta[['donor', col_age_group]].drop_duplicates()

# Number of clones with a lineage tree per donor
df = trees_df.merge(donor_meta, on = 'donor')
df = df.groupby(['donor', col_age_group]).agg(n_clones = ('clone_id', 'nunique')).reset_index()

df

In [None]:
%%R -i donor_meta -h 100 -w 200 -u mm

# Number of lineage trees per donor, faceted by age group. Donors with BCR data but no tree are
# shown with a count of 0.
# NOTE: `donor_meta` (columns: donor, age_group) is imported from Python by the cell's -i flag and
# must exist there before this cell is run.
trees %>%
rstatix::freq_table(donor) %>%
dplyr::right_join(donor_meta, by = c('donor' = 'donor')) %>%
dplyr::filter(donor %in% bcr$donor) %>%
tidyr::replace_na(list(n = 0, prop = 0)) %>%
dplyr::mutate(age_group = factor(age_group, levels = c('infant', 'paed', 'adult'))) %>%
ggplot(aes(x = donor, y = n, fill = age_group)) +
  geom_bar(stat = 'identity') +
  ggforce::facet_row(~ age_group, scales = 'free_x', space = 'free') +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  grafify::scale_fill_grafify(palette = "fishy") +
  # A single labs() call; the earlier 'Age group'/'Count' labels were overwritten by this one anyway
  labs(fill = 'Age group', x = 'Donor', y = 'N(trees)') +
  # Custom theme first, tweak afterwards (same order as the isotype box plot), so the tick rotation
  # cannot be reset by the theme
  theme_simple(facet = TRUE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
ggsave('thyAgeing_bSplit_nTreesByDonor_barplot.png',path = file.path(plots_path, 'vdjAnalysis', 'clonotypes'),
       height = 100, width = 200, units = 'mm', dpi = 300)

In [None]:
%%R -i vdj_version

# Persist the lineage trees, tagged with the VDJ object version
trees_rds_path <- file.path(data_path, paste0('objects/vdj/thymusAgeing_bcrFiltered_', vdj_version, '_trees.rds'))
saveRDS(trees, file = trees_rds_path)

In [None]:
%%R

# Plot every lineage tree with tips coloured by the fine cell type. The configured column name is
# used (instead of a hard-coded "taa_l5") so that it stays in sync with the trait passed to formatClones.
tree_plots <- dowser::plotTrees(trees, tips = col_cell_type_fine, tipsize = 2)

dowser::treesToPDF(tree_plots, file.path(plots_path, 'vdjAnalysis', 'clonotypes', 'thyAgeing_bSplit_bLineageTrees.pdf'), nrow = 2, ncol = 2)

In [None]:
%%R

# Calculate trait switches along the trees compared to 100 random permutations.
# This may take a while, and can be parallelized using nproc.
# Fixes:
#  - the trait must be one that was passed to formatClones(traits = ...), i.e. the fine cell type;
#    'donor' is only a clone-level column and is constant within a clone, so it cannot switch
#  - fixtrees = TRUE reuses existing topologies, so the built `trees` object is passed rather than
#    the tree-less `clones` table
switches = dowser::findSwitches(trees, permutations=100, trait=col_cell_type_fine, 
  igphyml="/nfs/team205/lm25/condaEnvs/thymusAgeing/bin/igphyml", fixtrees=TRUE)

# Parsimony score test on the observed vs permuted switch counts
ps = dowser::testPS(switches$switches)

print(ps$means)

In [None]:
%%R
# Show the summary table of the parsimony score test again (same output as printed by the previous cell)
print(ps$means)