# Thymus ageing atlas - B compartment: Integration of mature B cells with cross-tissue B cells

In [None]:
# Set the environment variable
%env SCIPY_ARRAY_API=1

import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import dandelion as ddl
import hdf5plugin
from sklearn.metrics import f1_score

# Add repo path to sys path (allows to access scripts and metadata from repo)
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

%load_ext autoreload
%autoreload 2

# import scvi
# import torch
# torch.cuda.is_available()
# from annotate_ct import get_kNN_predictions
# from scvi_wrapper import run_scvi

from utils import get_latest_version,update_obs,freq_by_donor
from plotting.utils import plot_grouped_boxplot

In [None]:
# Project directories: everything except the shared General_analysis data hangs off repo_path
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'
model_path = os.path.join(repo_path, 'models')
data_path = f'{repo_path}/data'
plots_path = f'{repo_path}/plots'

# Echo the two directories used most often below
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

## Load data

### TAA

In [None]:
# Read the TAA B-cell object for the pinned object version.
# The file carries a .zarr suffix but is opened with the h5ad reader.
object_version = 'v5_2024-11-06'
adata = ad.read_h5ad(
    f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr'
)

In [None]:
# Open the TAA T/NK object in backed read mode, attach the curated annotation
# table, and pull only the cells labelled 'B_dev_thy' into memory.
t_rna_prefix = f'{os.path.split(repo_path)[0]}/T_NK_compartment/data/objects/rna/thyAgeing_tSplit_scvi_v8_2024-11-07'
adata_t = ad.read_h5ad(f'{t_rna_prefix}.zarr', backed='r')
ct_anno = pd.read_csv(f'{t_rna_prefix}_curatedAnno_v6.csv', index_col=0, dtype=str)
adata_t.obs = adata_t.obs.join(ct_anno)

# 'taa_l5' is presumably supplied by the curated annotation CSV joined above — verify
is_b_dev_thy = adata_t.obs['taa_l5'] == 'B_dev_thy'
adata_t = adata_t[is_b_dev_thy].to_memory()

In [None]:
# Combine T and B data
# index_unique=None keeps the original obs names, so the index-based joins below still match.
# NOTE(review): AnnData.concatenate is deprecated in favour of anndata.concat; the two differ in
# defaults (obs/var merging, batch column), so any migration needs checking — confirm before changing.
adata = adata.concatenate(adata_t, index_unique=None)

# Add celltypist labels
# Joined on the obs index; every column of the CSV is read as categorical.
celltypist_pred = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_celltypistImmuneLowAnnot.csv', index_col = 0, dtype = 'category')
adata.obs = adata.obs.join(celltypist_pred)

# Update metadata
# get_latest_version / update_obs are helpers imported from the repo's utils module.
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Display the combined object
adata

### DominguezConde2022 (cross-tissue)

In [None]:
# Load the cross-tissue B-cell reference and harmonise its obs columns with the TAA object
adata_tissue = ad.read_h5ad(f'{data_path}/references/CountAdded_PIP_B_object_for_cellxgene.h5ad')
# Chemistry containing "5'" -> '5GEX', everything else -> '3GEX'
adata_tissue.obs['chemistry_simple'] = adata_tissue.obs['Chemistry'].apply(lambda x : '5GEX' if "5'" in x else '3GEX')
# NOTE(review): any value other than 'Female' (including missing values) is mapped to 'M' — confirm this is intended
adata_tissue.obs['sex'] = adata_tissue.obs['Sex'].apply(lambda x : 'F' if x == 'Female' else 'M')
# NOTE(review): only '50-54' is labelled 'adult'; every other Age_range value, including any
# younger range, becomes 'geriatric' — verify against the Age_range values present in this object
adata_tissue.obs['age_group'] = adata_tissue.obs['Age_range'].apply(lambda x : 'adult' if x in ['50-54'] else 'geriatric')
adata_tissue.obs['study'] = 'DominguezConde2022'
adata_tissue.obs.rename(columns = {'Donor':'donor', 'Manually_curated_celltype' : 'crossTissue_l0', 'Organ' : 'tissue'}, inplace = True)
# Keep only the harmonised columns
adata_tissue.obs = adata_tissue.obs[['donor', 'crossTissue_l0', 'chemistry_simple', 'study', 'sex', 'tissue', 'age_group']]
# One 'sample' per donor, prefixed with the study name
adata_tissue.obs['sample'] = 'DominguezConde2022' + '_' + adata_tissue.obs['donor'].astype(str)

# Rename vars
# Stores the current var_names in a 'gene_name' column (var_names themselves are unchanged)
adata_tissue.var['gene_name'] = adata_tissue.var_names

# Get raw counts
# X is replaced by a copy of the 'counts' layer
adata_tissue.X = adata_tissue.layers['counts'].copy()

# Display the prepared reference object
adata_tissue

### Concatenate

In [None]:
# Concatenate datasets
# Uses the default index_unique of AnnData.concatenate here (unlike the T/B merge above, which passes
# index_unique=None); presumably this is what adds the '-<digit>' suffix stripped from obs names in
# later cells — verify.
adata = adata.concatenate([adata_tissue])

# Display the combined TAA + cross-tissue object
adata

In [None]:
# Check whether X is counts or normalized
# Per-cell sums over the first 100 gene columns; presumably whole-number sums indicate raw counts — inspect by eye
adata.X[:,:100].sum(axis = 1)

In [None]:
# # Remove cells from Bautista et al. 2021 
# adata = adata[adata.obs['study'] != 'Bautista2021'].copy()

In [None]:
# Basic QC filtering, applied in place: first cells (min_genes=500), then genes (min_cells=20)
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=20)

In [None]:
# Number of cells remaining per study after filtering
adata.obs['study'].value_counts()

## Integrate

In [None]:
from scvi_wrapper import run_scvi

In [None]:
# Version tag for everything written by this integration run (date-stamped with today's date)
object_version = f'v2_{today}'

# Run scvi
# run_scvi is the project wrapper imported from scvi_wrapper; the result is indexed below
# with the keys 'data' and 'vae'.
scvi_run = run_scvi(adata, 
                    layer_raw = 'X', 
                    # Excluded genes
                    include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
                    exclude_vdjgenes = True, remove_cite = False,
                    # Highly variable gene selection
                    batch_hv="study", hvg = 3000, #span = 1,
                    hvg_selection = 'experimental',
                    # scVI 
                    batch_scvi="sample",
                    cat_cov_scvi=["donor", "chemistry_simple", "sex"], 
                    cont_cov_scvi=[], 
                    max_epochs=400, batch_size=2000, early_stopping = True, early_stopping_patience = 15, early_stopping_min_delta = 10.0,
                    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20}, 
                    n_layers = 3, n_latent = 30, dispersion = 'gene-batch',
                    # Leiden clustering
                    leiden_clustering = None, col_cell_type = ['celltypist_mv_pred_immune_low', 'crossTissue_l0', 'taa_l5'], 
                    fig_dir = f'{plots_path}/preprocessing/scvi', fig_prefix = f'thyAgeing_bSplitxTissue_scvi_{object_version}')

In [None]:
# Persist the integrated AnnData and the trained scVI model
overwrite = True

integrated = scvi_run['data']
h5ad_file = f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}.zarr'

# Cast every object-dtype obs column to byte strings before writing
object_cols = [col for col in integrated.obs.columns if integrated.obs[col].dtype == 'O']
for col in object_cols:
    integrated.obs[col] = integrated.obs[col].astype('|S')

if os.path.exists(h5ad_file) and not overwrite:
    # Nothing is written when the target exists and overwriting is disabled
    print('File already exists')
else:
    integrated.write_h5ad(
        h5ad_file,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_bSplitxTissue_scvi_{object_version}', save_anndata=False, overwrite=overwrite)

## Leiden clustering

In [None]:
import re

# Load GEX data
# Pins a specific saved integration run instead of the date-stamped version generated above
object_version ='v2_2025-02-20'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}.zarr')

# Display the loaded object
adata

In [None]:
# Leiden clustering at each requested resolution.
# Labels are stored under 'leiden_r<resolution>' and cast to integer categories,
# because later cells compare clusters against integers (e.g. `== 37`, `.isin([...])`).
res_list = [3.0]
for res in res_list:
    leiden_key = f"leiden_r{res}"
    sc.tl.leiden(adata, resolution = res, key_added = leiden_key)
    # Cast inside the loop so every resolution in res_list is converted, not only 3.0
    adata.obs[leiden_key] = adata.obs[leiden_key].astype(int).astype('category')

# Save the cluster labels next to the object so they can be re-joined without re-clustering
adata.obs[[f'leiden_r{str(r)}' for r in res_list]].to_csv(f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}_leidenClusters.csv')

In [None]:
# UMAP coloured by Leiden cluster; the figure is then saved through pyplot (plt.savefig acts on the current figure)
sc.pl.umap(adata, color = ['leiden_r3.0'], wspace=0.5, legend_fontsize=6, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_scvi_{object_version}_leidenClusters.png', bbox_inches = 'tight')

In [None]:
# UMAP coloured by age group (displayed only, not saved)
sc.pl.umap(adata, color = 'age_group')

## Annotations per cluster

In [None]:
import re

# Load GEX data
object_version ='v2_2025-02-20'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}.zarr')

# Re-attach the saved Leiden labels (joined on the obs index) and make them categorical
leiden_res = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}_leidenClusters.csv', index_col = 0)
adata.obs = adata.obs.join(leiden_res)
adata.obs['leiden_r3.0'] = pd.Categorical(adata.obs['leiden_r3.0'])

# Load BCR data
vdj_version = 'v3_2025-02-19'
bcr = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl')

# Strip a trailing '-<single digit>' from obs names; presumably the suffix added when the
# datasets were concatenated, removed so that barcodes match the BCR object — verify.
# Must run after the Leiden join above, which relies on the unstripped names.
adata.obs_names = adata.obs_names.str.replace(r'-\d$', '', regex=True)
ddl.tl.transfer(adata, bcr)

In [None]:
# Display the object after the BCR transfer
adata

In [None]:
# Log-normalise data
# In place: scale each cell to a total of 1e4, then apply log1p. Run once per loaded object —
# re-running this cell on the same object would normalise already-transformed values.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# General markers
import pickle

# Load marker data
# NOTE: pickle.load executes arbitrary code from the file; only use with trusted project files.
with open(f'{general_data_path}/markers/allMarkers_lowGranularity_vk8.pkl', 'rb') as f:
    all_mrkrs = pickle.load(f)

# Restrict every marker list to genes present in this object
all_mrkrs = {
    group: [gene for gene in genes if gene in adata.var_names]
    for group, genes in all_mrkrs.items()
}

# One dot plot per marker group, grouped by Leiden cluster
for group, genes in all_mrkrs.items():
    dotplot = sc.pl.DotPlot(
        adata,
        groupby='leiden_r3.0',
        var_names=genes,
        mean_only_expressed=True,
        cmap='viridis',
    )
    dotplot.add_totals().savefig(
        f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_{object_version}_{group}Markers_dotplot.png',
        dpi=300,
        bbox_inches='tight',
    )

In [None]:
# Marker gene panels used for the dot plots below.
# b_markers: fine-grained B-cell states; gc_markers: germinal-centre oriented panel.
b_markers = {
    'B_cells': ['CD79A', 'TCL1A'],
    'B_IFN': ['MX1', 'IFI44L', 'STAT1'],
    'B_naive': ['FCER2', 'BANK1', 'FCMR', 'IGHM', 'IGHD'],
    'B_transitional': ['CD24', 'MYO1C', 'MS4A1'],
    'B_activated': ['CD69', 'FOS', 'FOSB', 'DUSP1', 'CD83'],
    'B_preGC': ['MIR155HG', 'HIVEP3', 'PARVB'],
    'B_GC': [
        'GMDS', 'LMO2', 'LPP', 'BCL6', 'AICDA',
        'H2AFZ', 'MKI67', 'POU2AF1', 'CD40', 'SUGCT',
    ],
    'B_LZ_plasmablasts': [
        'PAX5', 'CD27', 'TNFSF13', 'CD9', 'PRDM1',
        'XBP1', 'MZB1', 'TNFRSF17', 'FKBP11',
    ],
    'B_ETP': ['CD34', 'VPREB1'],
    'B_pre-pro': ['IL7R', 'ZCCHC7', 'RAG1'],
    'B_pro': ['MME', 'DNTT', 'IGLL1'],
    'B_small-pre': ['MME', 'CD24'],
    'B_large-pre': ['MME', 'CD24', 'MKI67'],
    'B_cycling': ['TOP2A', 'CD19', 'MKI67'],
    'B_follicular': ['CXCR5', 'TNFRSF13B', 'CD22'],
    'B_prePB': ['FRZB', 'BTNL9', 'HOPX'],
    'B_dev': ['SPN', 'VPREB1'],
    'B_plasma': ['XBP1', 'PRDM1', 'FKBP11'],
    'B_mem': [
        'TNFRSF13B', 'FCRL4', 'CLECL1', 'CR2', 'CD27',
        'MS4A1', 'IGHA1', 'IGHG1', 'IGHE',
    ],
    'B_age-associated': ['FCRL2', 'ITGAX', 'TBX21'],
    'B_perivasc': ['CXCR3', 'CR2', 'CD72', 'CD37'],
    'B_med': ['CD80', 'CD83', 'CD86', 'HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB'],
}

gc_markers = {
    'DZ': ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
    'LZ': ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
    'recruitment': ['CCR7'],
    'B_naive': ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD'],
    'B_mem': ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
    'B_plasma': ['PRDM1', 'XBP1', 'MZB1'],
    'B_age-assoc': ['TBX21', 'ITGAX'],
    'B_pan': ['CD19', 'MS4A1'],
    'B_med': ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB'],
}

In [None]:
# Dot plots of the fine B-cell panel and the GC panel across Leiden clusters.
# Both plots share every setting except the marker dictionary and the file tag.
for panel_tag, panel in (('bFine', b_markers), ('gc', gc_markers)):
    panel_plot = sc.pl.DotPlot(
        adata,
        groupby='leiden_r3.0',
        var_names=panel,
        mean_only_expressed=True,
        cmap='viridis',
    )
    panel_plot.add_totals().savefig(
        f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_{object_version}_{panel_tag}Markers_dotplot.png',
        dpi=300,
        bbox_inches='tight',
    )

In [None]:
# Violin plots per Leiden cluster of the mu_count_IGK / IGH / IGL obs columns
# (presumably added by the BCR transfer above — verify)
sc.pl.violin(adata, groupby = 'leiden_r3.0', keys = ['mu_count_IGK', 'mu_count_IGH', 'mu_count_IGL'], rotation = 90, stripplot = False, multi_panel = True, ncols = 1)

Markers:
- 30: TNFRSF11B, CCL17, AIRE, MKI67, TOP2A
- 0,6,40,42: CD1C, CR2

In [None]:
# Age-group composition of Leiden cluster 37
adata[adata.obs['leiden_r3.0'] == 37].obs['age_group'].value_counts()

In [None]:
from collections import Counter

# Manual mapping from annotation label to the leiden_r3.0 clusters it covers.
# Cluster 40 was previously listed under both 'B_mem' and 'B_transitional'. Labels are
# applied in dict order in the next cell, so the later 'B_transitional' entry was the
# one that took effect; the redundant 'B_mem' entry is dropped (resulting labels unchanged).
cluster_assignments = {'B_mem' : [2,4,5,7,10,14,17,18,24,25,28,26,32,33,34,35,41,42,43,44,45,52,36],
                       'B_mem_CR2+' : [0,50,48],
                       'B_dev' : [55],
                       'B_naive' : [3,8,11,12,15,21,22,23,31,39,47,29],
                       'B_GC-like*' : [20,30,51],
                       'B_age-associated' : [19], # 30
                       'B_transitional' : [6,40,49],
                       'B_plasma' : [16,27,37,38,53],
                       'B_plasmablast': [1],
                       'B_plasma_GC' : [],
                       'B_dev_thy' : [13,54],
                       'Remove' : [9,46],
                       } 

# Flatten the mapping once instead of rebuilding the list for every cluster tested
assigned_clusters = [cluster for clusters in cluster_assignments.values() for cluster in clusters]

# A cluster listed under several labels silently ends up with the last label; report it
duplicated_clusters = sorted(cluster for cluster, count in Counter(assigned_clusters).items() if count > 1)
if duplicated_clusters:
    print(f'Clusters assigned to more than one label: {duplicated_clusters}')

# Clusters present in the data that no label covers yet (displayed as the cell output)
assigned_lookup = set(assigned_clusters)
leftover_clusters = [c for c in adata.obs['leiden_r3.0'].unique() if c not in assigned_lookup]
np.array(leftover_clusters)

In [None]:
# Assign temporary annotation
# Start from an all-missing column, then label cells cluster group by cluster group
# (in dict order, so a cluster listed twice keeps the label applied last).
adata.obs['temp_anno'] = pd.NA
for label, clusters in cluster_assignments.items():
    in_label = adata.obs['leiden_r3.0'].isin(clusters)
    adata.obs.loc[in_label, 'temp_anno'] = label

# Plot and save the temporary annotation, then export it as a one-column CSV
anno_stem = f'thyAgeing_bSplitxTissue_scvi_{object_version}'
sc.pl.umap(adata, color='temp_anno', wspace=0.5, return_fig=True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/{anno_stem}_tempAnno.png', dpi=300, bbox_inches='tight')
adata.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/{anno_stem}_v5_tempAnno.csv')

In [None]:
# Plot markers
# Dot plot of the GC marker panel grouped by the temporary annotation.
# NOTE(review): the output file is tagged 'bFineMarkers' although gc_markers is plotted —
# confirm whether b_markers or a 'gcMarkers' file tag was intended.
sc.pl.DotPlot(adata, 
              groupby='temp_anno',
              var_names=gc_markers,
              mean_only_expressed=True,
              cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_{object_version}_tempAnno_bFineMarkers_dotplot.png', dpi=300, bbox_inches='tight')


In [None]:
# Compute weighted clone overlap between temporary annotation groups, using 'changeo_clone_id' as the clone key
ddl.tl.clone_overlap(adata, groupby="temp_anno", clone_key = 'changeo_clone_id', weighted_overlap=True)

In [None]:
# Plot the clone overlap between temporary annotation groups.
# Note: sc.set_figure_params changes the figure size globally, so later plots are affected too.
sc.set_figure_params(figsize=[6, 6])
ddl.pl.clone_overlap(adata, groupby="temp_anno", weighted_overlap=True)
plt.show()

### GC-like cells

In [None]:
# Sub-cluster the cells provisionally labelled 'B_GC-like*' on their own embedding
gc_mask = adata.obs['temp_anno'].isin(['B_GC-like*'])
adata_gc = adata[gc_mask].copy()

# Fresh HVG selection -> PCA -> neighbours -> UMAP on the subset only
sc.pp.highly_variable_genes(adata_gc, n_top_genes=500)
sc.pp.pca(adata_gc)
sc.pp.neighbors(adata_gc)
sc.tl.umap(adata_gc)

# Leiden on the subset; labels are cast to integer categories and exported
res = 0.8
gc_leiden_key = f"leiden_r{res}"
sc.tl.leiden(adata_gc, resolution=res, key_added=gc_leiden_key)
adata_gc.obs[gc_leiden_key] = adata_gc.obs[gc_leiden_key].astype(int).astype('category')
adata_gc.obs[[gc_leiden_key]].to_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.csv')

# Alternative kept for reference: reload previously saved GC cluster labels instead of re-clustering
# leiden_gc = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0).rename(columns = {'leiden_r3.0':'leiden_r2.0'})
# adata_gc.obs = adata_gc.obs.join(leiden_gc)
# adata_gc.obs['leiden_r2.0'] = adata_gc.obs['leiden_r2.0'].astype(int).astype('category')

sc.pl.umap(adata_gc, color=[gc_leiden_key], wspace=0.5, return_fig=True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.png', dpi=300, bbox_inches='tight')

In [None]:
# GC subset UMAP coloured by metadata, selected genes and QC metrics (three panels per row)
sc.pl.umap(adata_gc, color = ['age_group', 'AICDA', 'IGHE', 'AIRE', 'ITGAX', 'study', 'percent_mito', 'percent_ribo', 'n_counts','n_genes'], ncols = 3)

In [None]:
# QC metrics per GC sub-cluster; 'leiden_r0.8' matches res = 0.8 used in the sub-clustering cell
sc.pl.violin(adata_gc, groupby = 'leiden_r0.8', keys = ['percent_mito', 'percent_ribo', 'n_counts','n_genes'], rotation = 90, stripplot = False, multi_panel = True, ncols = 1)

In [None]:
# Plot markers
# Adds a 'T' entry to gc_markers IN PLACE: any cell that uses gc_markers and is (re-)run
# after this one will include the T-cell genes as well.
gc_markers['T'] = ['CD3E', 'CD8A']
sc.pl.DotPlot(adata_gc, 
              groupby='leiden_r0.8',
              var_names=gc_markers,
              mean_only_expressed=True,
              cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_gcSplit_{object_version}_gcMarkers_dotplot.png', dpi=300, bbox_inches='tight')


In [None]:
from collections import Counter

# Manual mapping from annotation label to the leiden_r0.8 sub-clusters of the GC-like subset
gc_cluster_assignments = {'B_GC-like_prolif' : [3],
                       'B_GC-like': [5,7,8,9,10,11,12,13,14,15],
                       'B_med' : [4],
                       'B_plasma' : [16],
                       'B_GC-like_explore' : [0,1,2,6],
                       } 

# Flatten the mapping once instead of rebuilding the list for every cluster tested
assigned_gc_clusters = [cluster for clusters in gc_cluster_assignments.values() for cluster in clusters]

# A cluster listed under several labels would silently end up with the last label; report it
duplicated_gc_clusters = sorted(cluster for cluster, count in Counter(assigned_gc_clusters).items() if count > 1)
if duplicated_gc_clusters:
    print(f'GC clusters assigned to more than one label: {duplicated_gc_clusters}')

# Sub-clusters present in the data that no label covers yet (displayed as the cell output)
assigned_gc_lookup = set(assigned_gc_clusters)
leftover_gc_clusters = [c for c in adata_gc.obs['leiden_r0.8'].unique() if c not in assigned_gc_lookup]
np.array(leftover_gc_clusters)

In [None]:
# Assign temporary annotation
# Same scheme as for the full object: all-missing column first, then one label per cluster group
adata_gc.obs['temp_anno'] = pd.NA
for gc_label, gc_clusters in gc_cluster_assignments.items():
    in_gc_label = adata_gc.obs['leiden_r0.8'].isin(gc_clusters)
    adata_gc.obs.loc[in_gc_label, 'temp_anno'] = gc_label

# Plot, save the figure, and export the GC-level annotation
gc_stem = f'thyAgeing_bSplitxTissue_gcSplit_{object_version}'
sc.pl.umap(adata_gc, color='temp_anno', wspace=0.5, return_fig=True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/{gc_stem}_tempAnno.png', dpi=300, bbox_inches='tight')
adata_gc.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/{gc_stem}_v5_tempAnno.csv')

### Construct new annotations

In [None]:
# Construct new anno
# Start from the global temporary annotation (as strings; missing values become their string form),
# then overwrite the GC-like cells with their refined sub-cluster labels.
ct_annot = adata.obs[['temp_anno']].copy().astype(str)
ct_annot.loc[adata_gc.obs.index, 'temp_anno'] = adata_gc.obs['temp_anno'].astype(str)

# Preview the combined table
ct_annot.head()

In [None]:
# Update anno in anndata
# 'final_anno' may not exist yet on a freshly loaded object; create it instead of failing
# with a KeyError. An existing column is converted to plain strings so that new labels
# can be written into it (a categorical column would reject unknown categories).
if 'final_anno' in adata.obs.columns:
    adata.obs['final_anno'] = adata.obs['final_anno'].astype(str)
else:
    adata.obs['final_anno'] = pd.NA

# Write the combined labels for every cell covered by ct_annot, then store as a tidy categorical
adata.obs.loc[ct_annot.index, 'final_anno'] = ct_annot['temp_anno']
adata.obs['final_anno'] = adata.obs['final_anno'].astype('category').cat.remove_unused_categories()

In [None]:
# UMAP coloured by the final annotation (all labels, including 'Remove' and the explore group)
sc.pl.umap(adata, color = 'final_anno', wspace = 0.5)

In [None]:
# Number of cells per final annotation label
adata.obs['final_anno'].value_counts()

In [None]:
# Study/donor breakdown of the 'B_GC-like_explore' cells (50 largest groups, ascending by size)
adata[adata.obs['final_anno'] == 'B_GC-like_explore'].obs.groupby(['study', 'donor'], observed = True).size().sort_values().to_frame().tail(50)

In [None]:
# Same breakdown restricted to the cross-tissue study, split by donor, tissue and original cross-tissue label
adata[(adata.obs['final_anno'] == 'B_GC-like_explore') & (adata.obs['study'] == 'DominguezConde2022')].obs.groupby(['donor', 'tissue', 'crossTissue_l0'], observed = True).size().sort_values().to_frame().tail(50)

In [None]:
# Save anno
# Writes a single-column CSV (final_anno) indexed by obs name.
# NOTE(review): the last cell of this notebook writes the annotation-level columns to this
# exact same path, replacing this file — confirm that the overwrite is intended.
adata.obs[['final_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_bSplitxTissue_scvi_{object_version}_v5.csv')

### Final UMAP

In [None]:
# Final UMAP without the cells flagged 'Remove' or 'B_GC-like_explore'; saved through pyplot
sc.pl.umap(adata[~adata.obs['final_anno'].isin(['Remove', 'B_GC-like_explore'])], color = 'final_anno', wspace = 0.5, return_fig = True)
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_scvi_{object_version}_finalAnno.png', dpi = 300, bbox_inches = 'tight')

## Marker expression

In [None]:
import re

# Reload the integrated object and strip the trailing '-<digit>' suffix from obs names
object_version = 'v2_2025-02-20'
adata = ad.read_h5ad(
    f'{data_path}/objects/rna/thyAgeing_bSplitxTissue_scvi_{object_version}.zarr'
)
adata.obs_names = adata.obs_names.str.replace(r'-\d$', '', regex=True)

# Attach the saved cell type annotation; columns already present in obs are replaced by the CSV's
ct_anno = pd.read_csv(
    f'{data_path}/preprocessing/ctAnnotation/thyAgeing_bSplitxTissue_scvi_v2_2025-02-20_v5.csv',
    index_col=0,
)
cols_overlapping = [col for col in ct_anno.columns if col in adata.obs.columns]
obs_without_overlap = adata.obs.drop(columns=cols_overlapping) if any(cols_overlapping) else adata.obs
adata.obs = obs_without_overlap.join(ct_anno)

# Refresh donor/sample metadata from the most recent metadata sheet
latest_meta_path = get_latest_version(dir=f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on='index', ignore_warning=True)

# Display the annotated object
adata

In [None]:
# Log-normalise data
# In place: scale each cell to a total of 1e4, then apply log1p. Run once per freshly loaded object.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Plot markers
# Note: in this cell b_markers holds the GC-oriented panel (same content as the gc_markers
# dictionary defined earlier), not the fine B-cell panel of the same name.
b_markers = {'DZ' : ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
               'LZ' : ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
               'recruitment' : ['CCR7'],
               'B_naive' : ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD',],
               'B_mem' : ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
               'B_plasma' : ['PRDM1', 'XBP1', 'MZB1'], 
               'B_age-assoc' : ['TBX21', 'ITGAX'],
               'B_pan' : ['CD19', 'MS4A1'],
               'B_med' : ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB']}

# Final panel: the dictionary above without 'recruitment', plus two developmental groups.
# .copy() is shallow, which is sufficient here because only keys are removed/added.
final_markers = b_markers.copy()
final_markers.pop('recruitment')
final_markers['B_dev'] = ['IGLL1', 'MME', 'RAG1', 'PAX5', 'EBF1', 'BCL11B']
final_markers['B_dev_thy'] = ['CD34', 'VPREB1', 'TYROBP',]
# NOTE(review): the plot groups and filters on 'taa_l5', while the excluded labels
# ('Remove', 'B_GC-like_explore') are values assigned to 'final_anno' in this notebook and the
# file is tagged 'finalAnno' — confirm 'taa_l5' holds the final labels when this cell is run.
sc.pl.DotPlot(adata[~adata.obs['taa_l5'].isin(['Remove', 'B_GC-like_explore'])],
              groupby='taa_l5',
              var_names=final_markers,
              mean_only_expressed=True,
              cmap = 'viridis',).add_totals().savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_bSplitxTissue_{object_version}_finalAnno_bMarkers_dotplot.png', dpi=300, bbox_inches='tight')

In [None]:
# Distinct final annotation labels present in the object
np.array(adata.obs['final_anno'].unique())

### Add annotation levels

In [None]:
# Load the curated annotation-level table (used below via its 'taa_l5' column) and preview it
anno_levels = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v7_2025-02-05_levels.xlsx')
anno_levels.head()

In [None]:
# Attach the annotation levels by matching obs 'final_anno' to anno_levels 'taa_l5'.
# The obs index is moved to a 'names' column for the merge and restored afterwards;
# how='left' keeps every cell, leaving the level columns empty where no match exists.
# NOTE(review): if obs already contains columns named like those in anno_levels (e.g. 'taa_l5'),
# pandas suffixes them (_x/_y) and the column selection below would fail — confirm obs is clean here.
adata.obs = adata.obs.reset_index(names = 'names').merge(anno_levels, left_on = 'final_anno', right_on = 'taa_l5', how = 'left').set_index('names')
# Unique combinations of annotation levels actually present
adata.obs[anno_levels.columns].drop_duplicates()

In [None]:
# Check whether all annotations are present
# Lists final_anno labels that have no entry in anno_levels['taa_l5']; an empty result means full coverage
np.setdiff1d(adata.obs['final_anno'], anno_levels['taa_l5'])

In [None]:
# Export the annotation-level columns per cell.
# NOTE(review): this path is identical to the one the 'Save anno' cell writes the single
# 'final_anno' column to, so that file is replaced by a table without 'final_anno' —
# confirm the overwrite is intended, since the 'Marker expression' section reloads this CSV.
adata.obs[anno_levels.columns].to_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_bSplitxTissue_scvi_{object_version}_v5.csv')