# Thymus ageing atlas: B cell compartment - final object

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import hdf5plugin
import scFates as scf

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Add repo path to sys path (allows to access scripts and metadata from repo)
# The repo root is hard-coded; the commented-out line below is an alternative
# that derives it from the current working directory instead.
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
# Insert at positions 1 and 2 so both directories sit directly after
# sys.path[0] and are searched before the remaining entries.
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Autoreload custom scripts
# IPython magics (only valid inside IPython/Jupyter): mode 2 reloads all
# modules before code is executed, so edits to the custom scripts are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Locations used throughout the notebook.
# plots_path and data_path end with a slash; later cells append '/...' to
# them, which yields a harmless double slash.
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')

# Echo the two output locations so they are visible in the notebook output
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

# Formatting
# font_manager is used below to register a font file by explicit path
from matplotlib import font_manager
# Scanpy-wide figure defaults: on-screen and saved resolution, default colour
# map and PDF as the default save format
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')
# Register the Arial font file with matplotlib
# (presumably referenced by the style sheet applied below - TODO confirm)
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
# Apply the project's matplotlib style sheet
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor,sanitise_obs,cellxgene_prep
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize

In [None]:
# Load the full atlas object (version pinned below)
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
# Drop annotation columns already stored in the object before joining:
# DataFrame.join raises on overlapping column names when no suffix is given,
# and the curated table should take precedence. This mirrors the other
# loading cells in this notebook.
overlapping_cols = [c for c in ct_anno.columns if c in adata.obs.columns]
adata.obs = adata.obs.drop(columns=overlapping_cols).join(ct_anno, how = 'left')
# Keep only cells whose curated annotation status is 'include'
adata = adata[adata.obs['anno_status'] == 'include']

# Update metadata
# get_latest_version / update_obs are project helpers imported from utils;
# update_obs is called for its effect on adata (its return value is not used).
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Restrict to cells labelled 'B' at annotation level 1, leaving out the
# level-5 population 'B_dev_thy'
in_b_lineage = adata.obs['taa_l1'] == 'B'
not_b_dev_thy = adata.obs['taa_l5'] != 'B_dev_thy'
adata = adata[in_b_lineage & not_b_dev_thy]

adata.shape

In [None]:
from scvi_wrapper import run_scvi
# Version tag for this run: 'v5_' followed by today's date (set in the first cell)
object_version = f'v5_{today}'

# Run scvi (project wrapper imported from scvi_wrapper)
scvi_run = run_scvi(
    adata,
    layer_raw='X',
    # Excluded genes
    include_genes=[],
    exclude_cc_genes=True,
    exclude_mt_genes=True,
    exclude_vdjgenes=True,
    remove_cite=False,
    # Highly variable gene selection
    batch_hv='study',
    hvg=500,
    hvg_selection='experimental',
    # scVI
    batch_scvi='sample',
    cat_cov_scvi=['donor', 'chemistry_simple', 'sex'],
    cont_cov_scvi=[],
    max_epochs=400,
    batch_size=2000,
    early_stopping=True,
    early_stopping_patience=15,
    early_stopping_min_delta=10.0,
    plan_kwargs={
        'lr': 0.001,
        'reduce_lr_on_plateau': True,
        'lr_patience': 10,
        'lr_threshold': 20,
    },
    n_layers=3,
    n_latent=30,
    dispersion='gene-batch',
    # Leiden clustering
    leiden_clustering=None,
    col_cell_type=['taa_l5', 'taa_l4'],
    # Figure output
    fig_dir=f'{plots_path}/preprocessing/scvi',
    fig_prefix=f'thyAgeing_bSplit_scvi_{object_version}',
)

In [None]:
# Save adata and scvi model
overwrite = True

sanitise_obs(scvi_run['data'])

# Build each output location once so the existence check and the write always
# refer to the same file (the check previously looked in a different
# directory than the one written to, so it could never find an existing file).
# NOTE: despite the '.zarr' suffix the object is written with write_h5ad and
# read back with ad.read_h5ad elsewhere in this notebook.
adata_out_path = f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr'
model_out_path = f'{model_path}/thyAgeing_bSplit_scvi_{object_version}'

# Prediction/probability and 'taa' annotation columns are dropped before
# saving; annotations are re-joined from the curated CSV when the object is loaded.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c]
if overwrite or not os.path.exists(adata_out_path):
    print(f'Saving adata and model: Version {object_version}')
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(model_out_path, save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

## Marker plot

In [None]:
# Load the saved B compartment object (version pinned below)
object_version = 'v5_2025-04-16'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Replace any annotation columns stored in the object with the curated ones
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns=stale_cols).join(ct_anno, how = 'left')

# Keep cells that are both marked 'include' and flagged as passing QC
keep_cells = (adata.obs['anno_status'] == 'include') & (adata.obs['qc_status'] == 'PASS')
adata = adata[keep_cells]

adata.shape

In [None]:
# Define columns
# Annotation columns used for grouping, plus their ordered levels restricted
# to the B lineage (get_ct_levels is a project helper from anno_levels).
col_cell_type_broad = 'taa_l4'
col_cell_type_fine = 'taa_l5'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = ['B'])
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, taa_l1 = ['B'])
col_age_group = 'age_group'
# Look up the '<column>_levels' variable (here age_group_levels, imported from
# anno_levels) by name; a plain namespace lookup avoids eval().
col_age_group_levels = globals()[f'{col_age_group}_levels']

In [None]:
# Scale every cell to the same total (1e4), then log1p-transform.
# Both calls act on adata directly (no return value is captured), so
# re-running this cell applies the transformation a second time.
target_total = 1e4
sc.pp.normalize_total(adata, target_sum=target_total)
sc.pp.log1p(adata)

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors

In [None]:
# IGF1 expression across the fine B cell annotation.
# Rows follow the curated level order, limited to labels present in the data.
labels_present = set(adata.obs[col_cell_type_fine])
row_order = [lvl for lvl in col_cell_type_fine_levels if lvl in labels_present]
# Project colour ramp: blue -> purple -> magenta -> orange -> yellow
expr_cmap = sns.blend_palette(
    [thyAgeing_colors[shade] for shade in ('blue', 'purple', 'magenta', 'orange', 'yellow')],
    as_cmap=True,
)

igf1_dotplot = sc.pl.DotPlot(
    adata,
    var_names=['IGF1'],
    groupby=col_cell_type_fine,
    categories_order=row_order,
    figsize=calc_figsize(width = 200, height = 50),
    mean_only_expressed=True,
    cmap=expr_cmap,
)
igf1_dotplot.style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).show()

In [None]:
# IGF1 expression in cells labelled 'B_plasma', grouped by age group
plasma_cells = adata[adata.obs[col_cell_type_fine] == 'B_plasma']
# Project colour ramp: blue -> purple -> magenta -> orange -> yellow
expr_cmap = sns.blend_palette(
    [thyAgeing_colors[shade] for shade in ('blue', 'purple', 'magenta', 'orange', 'yellow')],
    as_cmap=True,
)

plasma_dotplot = sc.pl.DotPlot(
    plasma_cells,
    var_names=['IGF1'],
    groupby=col_age_group,
    categories_order=col_age_group_levels,
    figsize=calc_figsize(width = 80, height = 20),
    mean_only_expressed=True,
    cmap=expr_cmap,
)
plasma_dotplot.style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).show()

In [None]:
# Curated marker genes per B cell population / state.
# Dictionary order determines the order of the gene groups in the plot.
b_markers = {
    'B_pan': ['CD19', 'MS4A1'],
    'B_naive': ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD'],
    'B_mem': ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
    'B_mem_CR2+': ['CD1C', 'IGHM', 'FCRL3', 'CR2'],
    'B_plasma': ['PRDM1', 'XBP1', 'MZB1'],
    'B_age-assoc': ['TBX21', 'ITGAX'],  # ITGAX = CD11c
    'B_med': ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB', 'CD5'],
    'DZ': ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
    'LZ': ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
    'B_dev': ['IGLL1', 'MME', 'RAG1', 'PAX5', 'EBF1', 'BCL11B'],
    'B_dev_thy': ['CD34', 'VPREB1', 'TYROBP'],
}

# Rows follow the curated level order, limited to labels present in the data
labels_present = set(adata.obs[col_cell_type_fine])
row_order = [lvl for lvl in col_cell_type_fine_levels if lvl in labels_present]
# Project colour ramp: blue -> purple -> magenta -> orange -> yellow
expr_cmap = sns.blend_palette(
    [thyAgeing_colors[shade] for shade in ('blue', 'purple', 'magenta', 'orange', 'yellow')],
    as_cmap=True,
)
marker_plot_file = f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_scvi_{object_version}_taaL5_bMarkers_dotplot.pdf'

marker_dotplot = sc.pl.DotPlot(
    adata,
    var_names=b_markers,
    groupby=col_cell_type_fine,
    categories_order=row_order,
    figsize=calc_figsize(width = 200, height = 50),
    mean_only_expressed=True,
    cmap=expr_cmap,
)
marker_dotplot.style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(marker_plot_file)

In [None]:
# Marker expression
# Curated marker genes per B cell population / state, repeated here so this
# cell can be run on its own.
b_markers = {'B_pan' : ['CD19', 'MS4A1'],
             'B_naive' : ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD',],
               'B_mem' : ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
               'B_mem_CR2+' : ['CD1C', 'IGHM', 'FCRL3', 'CR2'],
               'B_plasma' : ['PRDM1', 'XBP1', 'MZB1'], 
               'B_age-assoc' : ['TBX21', 'ITGAX',], # ITGAX = CD11c
               'B_med' : ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB', 'CD5'],
               'DZ' : ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
               'LZ' : ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
               'B_dev' : ['IGLL1', 'MME', 'RAG1', 'PAX5', 'EBF1', 'BCL11B'],
               'B_dev_thy' : ['CD34', 'VPREB1', 'TYROBP',],}


# Flatten to one (cell_label, gene_name) row per marker; genes listed under
# several labels appear once per label.
b_markers_df = pd.DataFrame([(k, gene) for k, genes in b_markers.items() for gene in genes], columns=['cell_label', 'gene_name'])
# Write next to the other curated data. The location is derived from data_path
# instead of repeating the absolute repo path; with the paths defined at the
# top of the notebook it resolves to the same file as before.
b_markers_csv = os.path.join(data_path, 'curated', 'thyAgeing_bMarkers_short.csv')
b_markers_df.to_csv(b_markers_csv, index=False)

In [None]:
# Genes of interest for germinal-centre (GC) biology, grouped by process.
# b_gc_goi is the long list; only b_gc_goi_small is plotted in this cell.
b_gc_goi = {'DZ_entry': ['CXCR5', 'AICDA'], 
            'LZ_entry': ['CCR7'],
            'APC' : ['AIRE', 'LTB', 'LTA'],
            'BCR_activation' : ['EBI3', 'S1PR2'],
            'T_B_interaction' : ['ICAM1', 'ICAM2', 'ICAM3', 'ICAM4', 'ICAM5', 'CD40', 'ICOSLG'], 
            'GC_formation': ['PLXNB1', 'PLXNB2', 'BASP1', 'P2RY8', 'BATF'],
            'IGs': ['IGHA1', 'IGHA2', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGKC', 'IGLC1', 'IGLC2', 'IGHE', 'IGHD'],}
# Group names are drawn as labels on the figure (typo 'activtation' fixed)
b_gc_goi_small = {'DZ' : ['AICDA', 'CXCR4', 'MYC', 'MKI67', 'TOP2A', 'PCNA', 'BACH2', 'TCF3', 'PAX5', 'IRF4', 'MEF2B', 'FOXO1'],
               'LZ' : ['CXCR5', 'CD83', 'CD86', 'MYBL1', 'SOCS3', 'CD40'],
               'T cell contact': ['CXCL10', 'CCL5', 'CCL3'],
               'BCR activation' : ['CCL22', 'CCL17', 'EBI3', 'CCL3', 'ICAM1'],
               'B_med' : ['HLA-DRA', 'HLA-DRB1','AIRE', 'IL15', 'LTA', 'LTB', 'PTPRC', 'CD5', 'SPN', 'CD80' ,'LY6G6C'],}

# Subset to populations whose fine label contains 'GC'. na=False treats cells
# without a label as non-matching instead of yielding NaN in the mask, which
# cannot be used for boolean indexing.
adata_sub = adata[adata.obs[col_cell_type_fine].str.contains('GC', na=False)]
# Rows follow the curated level order, limited to labels present in the
# subset; the lookup is built once rather than once per level.
labels_present = set(adata_sub.obs[col_cell_type_fine])
sc.pl.DotPlot(adata_sub, 
            categories_order=[c for c in col_cell_type_fine_levels if c in labels_present],
            groupby = col_cell_type_fine,
            var_names=b_gc_goi_small,
            figsize = calc_figsize(width = 200, height = 20),
            mean_only_expressed=True,
            cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True,), #'magma',
            ).style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_gcSplit_scvi_{object_version}_taaL5_gcMarkersShort_dotplot.pdf') 

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors

# Genes of interest for germinal-centre (GC) biology, grouped by process.
# b_gc_goi is the long list; only b_gc_goi_small is plotted in this cell.
b_gc_goi = {'DZ_entry': ['CXCR5', 'AICDA'], 
            'LZ_entry': ['CCR7'],
            'APC' : ['AIRE', 'LTB', 'LTA'],
            'BCR_activation' : ['EBI3', 'S1PR2'],
            'T_B_interaction' : ['ICAM1', 'ICAM2', 'ICAM3', 'ICAM4', 'ICAM5', 'CD40', 'ICOSLG'], 
            'GC_formation': ['PLXNB1', 'PLXNB2', 'BASP1', 'P2RY8', 'BATF'],
            'IGs': ['IGHA1', 'IGHA2', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGKC', 'IGLC1', 'IGLC2', 'IGHE', 'IGHD'],}
# Group names are drawn as labels on the figure (typo 'activtation' fixed)
b_gc_goi_small = {'GC-like' : ['AICDA', 'MKI67', 'TOP2A', 'PCNA', 'BACH2', 'TCF3', 'PAX5', 'IRF4', 'MEF2B', 'CXCR5', 'CD86', 'MYBL1', 'SOCS3'],
               'T cell contact': ['CXCL10', 'CCL5', 'CCL3'],
               'BCR activation' : ['CCL22', 'CCL17', 'EBI3', ],
               'B_med' : ['AIRE', 'IL15', 'LTA', 'CD5', 'SPN', 'CD80'],}

# Compare the three GC-like populations against naive and memory B cells
adata_sub = adata[adata.obs[col_cell_type_fine].isin(['B_GC-like', 'B_GC-like_AIRE+', 'B_GC-like_prolif', 'B_mem', 'B_naive'])]
# NOTE(review): the output file name below is identical to the one used by the
# GC-only dot plot earlier in this notebook, so whichever cell runs last
# overwrites the other's PDF - confirm whether a distinct name is wanted.
sc.pl.DotPlot(adata_sub, 
            categories_order=['B_naive', 'B_mem', 'B_GC-like_prolif', 'B_GC-like', 'B_GC-like_AIRE+'],
            groupby = col_cell_type_fine,
            var_names=b_gc_goi_small,
            figsize = calc_figsize(width = 160, height = 20),
            mean_only_expressed=True,
            cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True,), #'magma',
            ).style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(f'{plots_path}/ctAnnotation/v6/thyAgeing_gcSplit_scvi_{object_version}_taaL5_gcMarkersShort_dotplot.pdf') 

## UMAP

In [None]:
# Load the saved B compartment object (version pinned below)
object_version = 'v5_2025-04-16'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Replace any annotation columns stored in the object with the curated ones
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns=stale_cols).join(ct_anno, how = 'left')

# Keep only cells whose curated annotation status is 'include'
is_included = adata.obs['anno_status'] == 'include'
adata = adata[is_included]

# Refresh the metadata from the most recent metadata sheet
# (get_latest_version / update_obs are project helpers from utils)
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Number of cells per fine (taa_l5) annotation label
adata.obs['taa_l5'].value_counts()

In [None]:
# Rename the GC-like populations to 'B_med' for the final annotation.
# This is a substring replacement, so suffixed labels are renamed as well
# (e.g. 'B_GC-like_AIRE+' becomes 'B_med_AIRE+'). regex=False pins literal
# matching, independent of the pandas version's default for `regex`.
adata.obs['taa_l5'] = adata.obs['taa_l5'].str.replace('B_GC-like', 'B_med', regex=False)

In [None]:
# UMAP of QC-passing cells coloured by the fine annotation, saved as PNG
umap_png = f'{plots_path}/ctAnnotation/v6/thyAgeing_bSplit_scvi_{object_version}_finalAnno_umap.png'
umap_figsize = calc_figsize(width = 60, height = 50)
qc_pass_cells = adata[adata.obs['qc_status'] == 'PASS']

# The figure size is applied only inside this context
with plt.rc_context({'figure.figsize': umap_figsize}):
    sc.pl.umap(qc_pass_cells, color='taa_l5', size=1, frameon=False, show=False, return_fig=True)
    # plt.savefig writes the current figure (presumably the UMAP just drawn)
    plt.savefig(umap_png, bbox_inches='tight', dpi=300)

## Save cellxgene object

In [None]:
# Load the saved B compartment object (version pinned below)
object_version = 'v5_2025-04-16'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Replace any annotation columns stored in the object with the curated ones
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns=stale_cols).join(ct_anno, how = 'left')

# Keep only cells whose curated annotation status is 'include'
is_included = adata.obs['anno_status'] == 'include'
adata = adata[is_included]

# Refresh the metadata from the most recent metadata sheet
# (get_latest_version / update_obs are project helpers from utils)
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Duplicate the 'gene_id' column of adata.var under the name 'gene_ids'
# (presumably the column name expected by cellxgene_prep - TODO confirm)
adata.var['gene_ids'] = adata.var['gene_id'].copy()

In [None]:
# Pass the annotated object to the project helper cellxgene_prep (imported
# from utils) under the name 'ThyAge_B_compartment'. What the helper writes
# and where is not visible in this notebook - see utils.
cellxgene_prep(adata, object_name = 'ThyAge_B_compartment')