# Thymus ageing atlas - B compartment : marker-based annotation of Leiden clusters

In [None]:
import os
import sys
import session_info

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
from sklearn.metrics import f1_score

# Add repo path to sys path (allows to access scripts and metadata from repo).
# Both directories are inserted near the front of sys.path (positions 1 and 2),
# so modules found there are picked up before later path entries.
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
sys.path.insert(1, repo_path) 
# Shared analysis scripts; presumably where `annotate_ct` (imported below) lives - TODO confirm
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

%load_ext autoreload
%autoreload 2

from annotate_ct import get_kNN_predictions

In [None]:
# Directory layout: everything except the shared data lives under the repo root.
# plots_path and data_path keep their trailing slash.
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the plot and data directories so the notebook output records them
for template, directory in (('Dir for plots: {}', plots_path),
                            ('Dir for data: {}', data_path)):
    print(template.format(directory))

## Load data

In [None]:
# Version tag embedded in the file names of the object and its companion tables
object_version = 'v4_2024-11-06'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Leiden clusterings are kept in a separate CSV: drop any columns of the same name
# already present in .obs, join the fresh ones and store them as categoricals
leiden_clus = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_leidenClusters.csv', index_col=0)
adata.obs = adata.obs.drop(columns=leiden_clus.columns, errors='ignore').join(leiden_clus)
adata.obs[leiden_clus.columns] = adata.obs[leiden_clus.columns].astype('category')

# Add celltypist labels (every column is read as categorical)
celltypist_pred = pd.read_csv(
    f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_celltypistImmuneLowAnnot.csv',
    index_col=0,
    dtype='category',
)
adata.obs = adata.obs.join(celltypist_pred)

adata

In [None]:
# BCR table; note the file name carries its own version tag (v1_2024-05-24),
# not `object_version`
bcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_v1_2024-05-24_bcr.csv', index_col=0)

# Columns carried over into adata.obs
bcr_cols = [
    'v_call_VDJ_main', 'v_call_VJ_main',
    'd_call_VDJ_main', 'j_call_VDJ_main', 'j_call_VJ_main',
    'c_call_VDJ_main', 'c_call_VJ_main', 'v_call_B_VDJ_main',
    'd_call_B_VDJ_main', 'j_call_B_VDJ_main', 'v_call_B_VJ_main',
    'j_call_B_VJ_main', 'isotype', 'isotype_status', 'locus_status',
    'chain_status',
]
bcr_sub = bcr.loc[:, bcr_cols].copy()

# Index-aligned join onto the cell metadata
adata.obs = adata.obs.join(bcr_sub)

adata.obs.columns

In [None]:
# Scale every cell to a total of 10,000 counts, then apply log1p; both modify adata in place.
# NOTE(review): assumes adata.X still holds raw counts at this point - confirm.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

## Full object

In [None]:
# General markers
import pickle

# Load marker data (a mapping whose values are passed to DotPlot as gene lists)
with open(f'{general_data_path}/markers/allMarkers_lowGranularity_vk8.pkl', 'rb') as f:
    all_mrkrs = pickle.load(f)

# One dot plot per entry of the marker mapping, grouped by the leiden_r2.5 clusters
for k, genes in all_mrkrs.items():
    marker_dotplot = sc.pl.DotPlot(
        adata,
        groupby='leiden_r2.5',
        var_names=genes,
        mean_only_expressed=True,
        cmap='viridis',
    )
    marker_dotplot.add_totals().savefig(f'{plots_path}/preprocessing/scvi/thyAgeing_bSplit_{object_version}_{k}Markers_dotplot.png', dpi=300, bbox_inches='tight')

In [None]:
# B cell markers
# Marker gene sets used by the dot plots / UMAPs in this notebook: label -> list of gene symbols.
b_markers = {'B_cells': ['CD79A', 'TCL1A'],
'B_IFN': ['MX1','IFI44L', 'STAT1'], 
'B_naive': ["FCER2", "BANK1", "FCMR", 'IGHM', 'IGHD'], 
'B_transitional' : ["CD24", 'MYO1C', 'MS4A1'],
'B_activated': ['CD69','FOS','FOSB','DUSP1','CD83'], 
'B_preGC': ["MIR155HG", "HIVEP3", "PARVB"],
'B_GC': ["GMDS", "LMO2", "LPP", "BCL6", "AICDA", "H2AFZ", "MKI67", 'POU2AF1', 'CD40', 'SUGCT'], 
'B_LZ_plasmablasts' : ['PAX5', 'CD27', 'TNFSF13','CD9', 'PRDM1', 'XBP1', 'MZB1', 'TNFRSF17', 'FKBP11'], 
'B_pre-pro': ['IL7R', 'ZCCHC7', 'RAG1'],
'B_pro': ['MME', 'DNTT', 'IGLL1'],
'B_small-pre': ['MME', "CD24",],
'B_large-pre': ['MME', 'CD24','MKI67'],
'B_cycling': ['TOP2A', 'CD19', 'MKI67'], 
'B_follicular' : ['CXCR5', 'TNFRSF13B', 'CD22'],
'B_prePB': ["FRZB", "BTNL9", "HOPX"], 
'B_dev' : ['SPN', 'VPREB1'],
'B_plasma': ["XBP1", "PRDM1", "FKBP11"], 
'B_mem': ["TNFRSF13B", "FCRL4", "CLECL1", 'CR2', 'CD27', 'MS4A1', 'IGHA1', 'IGHG1', 'IGHE'],
'B_age-associated' : ['FCRL2', 'ITGAX', 'TBX21'],
'B_perivasc': ['CXCR3', 'CR2', 'CD72' , 'CD37'],
'B_med': ['CD80', 'CD83' , 'CD86', 'HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB']
}

# Germinal-centre marker sets (detailed panel)
gc_markers = {'DZ' : ['CXCR4', 'BACH2','PCNA', 'MKI67', 'CDK1', 'CDC20', 'FOXP1', 'AICDA', 'MYC', 'EZH2', 'E2F1', 'FOXO1', 'BCL6'], 
    'LZ': ['CD83', 'SERPINA9', 'CAMK1', 'MYC', 'RGS13', 'CD44', 'CD38', 'LMO2', 'EBI3', 'HLA-DQB2', 'TRAF4', 'PLEK', 'IER2', 'NFKBIA', 'BCAR3', 'DUSP2', 'SNX11', 'PLPP5', 'PHACTR1', 'TAP1', 
    'RAB3GAP2', 'DHRS9', 'FCRL5'],
    're-entry' : ['SLA', 'FCRL2', 'CFLAR', 'FOXP1'],                    
    'bcr_activation' : ['BTK', 'BLK','BLNK'],
    'TFh_INF_help' : ['CD40', 'TRAF1', 'ICAM1', 'NFKB1', 'NFKB2', 'REL', 'RELB'],             
    'pre-mem': ['BANK1', 'CCR6','CELF2', 'IFITM1', 'IFITM2', 'IFNGR1','GPR183','CD69', 'TNFRSF13B', 'SELL', 'MYC', 'FXYD5', 'STAT1'], 
    'exit' : ['MEF2B', 'RGS13', 'S1PR2'],   
    'LZ_plasmablasts' : ['PAX5', 'CD27', 'TNFSF13','CD9', 'PRDM1', 'XBP1', 'MZB1', 'TNFRSF17', 'FKBP11'], 
    'DZ/LZ' : ['NFKBIA', 'BCAR3', 'DUSP2', 'SNX11', 'PLPP5', 'PHACTR1', 'TAP1',
    'PCNA', 'MKI67', 'CDK1', 'CDC20', 'CD72', 'PTPN6', 'IFNGR1', 'CAMK1', 'CD22'],
    'DN' : ['RAB3GAP2', 'DHRS9','FCRL5', 'SLAMF7', 'CD22', 'PDCD1', 'TBX21', 'ZEB2', 'CD19',  'IL12A'], 
    'misc' : ['IGHM', 'IGHD', 'IGHE', 'IGHA1','CCR2', 'RAG1', 'RAG2']}

# Compact GC / naive / memory panel. It is defined here because the plotting cells
# that follow already use `gc_markers2`; without it a top-to-bottom run stops with a
# NameError. The same dictionary is declared again further down, next to the marker
# notes - keep the two in sync.
gc_markers2 = {'DZ' : ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
               'LZ' : ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
               'recruitment' : ['CCR7'],
               'B_naive' : ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD',],
               'B_mem' : ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
               'B_plasma' : ['PRDM1', 'XBP1', 'MZB1'], 
               'B_pan' : ['CD19', 'MS4A1'],
               'B_med' : ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB']}

In [None]:
# Settings shared by the three marker dot plots of the leiden_r2.5 clusters
dotplot_kwargs = dict(groupby='leiden_r2.5', mean_only_expressed=True, cmap='viridis')
savefig_kwargs = dict(dpi=300, bbox_inches='tight')

# Fine-grained B cell markers
sc.pl.DotPlot(adata, var_names=b_markers, **dotplot_kwargs).add_totals().savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_{object_version}_bFineMarkers_dotplot.png',
    **savefig_kwargs,
)

# Detailed germinal-centre panel
sc.pl.DotPlot(adata, var_names=gc_markers, **dotplot_kwargs).add_totals().savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_{object_version}_gcMarkers_dotplot.png',
    **savefig_kwargs,
)

# Compact GC / naive / memory panel
sc.pl.DotPlot(adata, var_names=gc_markers2, **dotplot_kwargs).add_totals().savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_{object_version}_gcMarkers2_dotplot.png',
    **savefig_kwargs,
)

In [None]:
# UMAP coloured by BCR isotype status; the figure just drawn is then written to disk
isotype_umap_file = f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_{object_version}_isotypeStatus_umap.png'
sc.pl.umap(adata, color=['isotype_status'], ncols=1, return_fig=True)
plt.savefig(isotype_umap_file, dpi=300, bbox_inches='tight')

In [None]:
# UMAP coloured by the `leiden_r2.5` clustering (the clustering annotated below)
sc.pl.umap(adata, color = ['leiden_r2.5'], ncols = 1)

In [None]:
# UMAP panels for AIRE and AICDA, drawn with the 'Reds' colour map
sc.pl.umap(adata, color = ['AIRE', 'AICDA'], cmap = 'Reds')

In [None]:
# Manual grouping of the leiden_r2.5 clusters: label -> cluster ids
cluster_assignments = {
    'B_mem*': [9, 15, 16, 20, 21, 23, 25, 26, 28, 2, 32, 40, 43, 8, 35],
    'B_naive*': [4, 6, 10, 11, 13, 14, 22, 29, 30, 31, 42],
    'B_GC_med*': [17, 7, 36, 37, 27, 5, 18],
    'B_transitional': [1, 24, 39, 33],
    'B_plasma': [3, 12],
    'B_plasma_GC': [34, 41],
    'Remove': [0, 38, 19],
}

# Every cluster id that appears under some label
assigned_clusters = [item for sublist in cluster_assignments.values() for item in sublist]

# Clusters present in the data that have no label yet (empty once the mapping is complete)
leftover_clusters = [c for c in adata.obs['leiden_r2.5'].unique() if c not in assigned_clusters]
np.array(leftover_clusters)

In [None]:
# Show only the cells whose cluster is grouped under 'B_naive*', coloured by cluster.
# The cells are selected straight from `cluster_assignments`, so this cell no longer
# depends on the `temp_anno` column, which is only created by the annotation cell
# further down (filtering on it here raised a KeyError on a top-to-bottom run).
# The selection equals `temp_anno == 'B_naive*'` because none of these clusters is
# listed under another label.
sc.pl.umap(adata[adata.obs['leiden_r2.5'].isin(cluster_assignments['B_naive*'])], color = ['leiden_r2.5'], ncols = 1)

In [None]:
# Assign temporary annotation: start with every cell unlabelled, then label cluster
# group by cluster group (a cluster listed twice would keep the label applied last)
adata.obs['temp_anno'] = pd.NA
for anno, clusters in cluster_assignments.items():
    in_group = adata.obs['leiden_r2.5'].isin(clusters)
    adata.obs.loc[in_group, 'temp_anno'] = anno

# Plot the labels, save the figure and export the label column as CSV
sc.pl.umap(adata, color='temp_anno', wspace=0.5, return_fig=True)
plt.savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_scvi_{object_version}_tempAnno.png',
    dpi=300,
    bbox_inches='tight',
)
adata.obs[['temp_anno']].to_csv(
    f'{data_path}/preprocessing/ctAnnotation/thyAgeing_bSplit_scvi_{object_version}_tempAnno.csv'
)

## GC_med

In [None]:
# Work on an independent copy of the cells provisionally labelled 'B_GC_med*'
# and recompute the UMAP for that subset
is_gc_med = adata.obs['temp_anno'] == 'B_GC_med*'
adata_gc = adata[is_gc_med].copy()
sc.tl.umap(adata_gc)

# Re-cluster the subset at resolution 2.0. The labels are exported first and only
# then cast to integer categories.
sc.tl.leiden(adata_gc, resolution=2.0, key_added='leiden_r2.0')
adata_gc.obs[['leiden_r2.0']].to_csv(
    f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.csv'
)
adata_gc.obs['leiden_r2.0'] = adata_gc.obs['leiden_r2.0'].astype(int).astype('category')

# Alternative: reload the exported clustering instead of recomputing it
# leiden_gc = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.csv', index_col = 0)
# adata_gc.obs.drop(columns = 'leiden_r2.0', inplace = True)
# adata_gc.obs = adata_gc.obs.join(leiden_gc)
# adata_gc.obs['leiden_r2.0'] = adata_gc.obs['leiden_r2.0'].astype(int).astype('category')

# Plot and save the new clustering
sc.pl.umap(adata_gc, color=['leiden_r2.0'], wspace=0.5, return_fig=True)
plt.savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters.png',
    dpi=300,
    bbox_inches='tight',
)

In [None]:
# GC/med subset UMAP coloured by `study`, `AIRE`, `age_group` and `isotype_status`, two panels per row
sc.pl.umap(adata_gc, color = ['study', 'AIRE', 'age_group', 'isotype_status'], wspace = 0.5, ncols = 2)

In [None]:
# Settings shared by both dot plots of the GC/med sub-clusters
gc_dotplot_kwargs = dict(groupby='leiden_r2.0', mean_only_expressed=True, cmap='viridis')

# Fine-grained B cell markers per sub-cluster
b_fine_gc_dotplot = sc.pl.DotPlot(adata_gc, var_names=b_markers, **gc_dotplot_kwargs)
b_fine_gc_dotplot.add_totals().savefig(f'{plots_path}/ctAnnotation/v3/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters_bFineMarkers.png')

# Compact GC panel per sub-cluster (plots `gc_markers2`; the file name says "gcMarkers")
gc_panel_dotplot = sc.pl.DotPlot(adata_gc, var_names=gc_markers2, **gc_dotplot_kwargs)
gc_panel_dotplot.add_totals().savefig(f'{plots_path}/ctAnnotation/v3/thyAgeing_gcSplit_scvi_{object_version}_leidenClusters_gcMarkers.png')


In [None]:
# One multi-panel UMAP of the full object per marker group in `gc_markers2`,
# each saved under the group's name
for k, group_genes in gc_markers2.items():
    sc.pl.umap(adata, color=group_genes, cmap='Reds', return_fig=True)
    plt.savefig(
        f'{plots_path}/ctAnnotation/v3/thyAgeing_bSplit_scvi_{object_version}_{k}Genes_umap.png',
        dpi=300,
        bbox_inches='tight',
    )

#### Dark Zone (DZ) B Cells
- CXCR4: Highly expressed; critical for DZ localization.
- MYC: Regulates proliferation and metabolic activity.
- MKI67: A marker of proliferation, indicating active cell division.
- TOP2A: Associated with DNA replication and cell cycle progression.
- AICDA (AID): Enzyme driving somatic hypermutation.
- PCNA: Proliferation marker involved in DNA repair and replication.
- BACH2: A transcription factor supporting proliferation and somatic hypermutation.
#### Light Zone (LZ) B Cells
- CXCR5: Maintains a high expression level, though present in both zones.
- CD83: Marker for GC selection and interaction with T follicular helper (Tfh) cells.
- IRF4: Involved in differentiation and interaction with Tfh cells.
- CD86: Costimulatory molecule important for antigen presentation.
- MYBL1: Transcription factor linked to differentiation and reduced proliferation.
- SOCS3: Negative regulator of cytokine signaling, enriched in LZ cells.
#### Naive B Cells
- CD19: Pan-B-cell marker.
- CD20 (MS4A1): Pan-B-cell marker.
- CD22: Receptor regulating B-cell receptor (BCR) signaling.
- SELL (CD62L): Lymph node homing receptor.
- IL4R: Cytokine receptor.
- TCL1A: Pro-survival molecule, enriched in naive B cells.
- CR2 (CD21): Complement receptor, high in naive B cells.
- FOXO1: Key transcription factor maintaining the naive state.
#### Memory B Cells
- CD27: Classical marker for human memory B cells (less universal in mice).
- CD38: Low expression compared to germinal center B cells.
- FCRL4: Associated with tissue-resident memory B cells.
- FCRL5: Marker for memory and marginal zone B cells.
- CD44: Adhesion molecule enriched in memory B cells.
- PRDM1 (BLIMP1): Marker for differentiation and long-term survival.
- MKI67: Low expression, reflecting quiescence.

In [None]:
# Compact marker panel: label -> gene symbols. Also referenced by plotting cells
# earlier in the notebook.
gc_markers2 = {
    # Germinal-centre zones
    'DZ': ['CXCR4', 'MYC', 'MKI67', 'TOP2A', 'AICDA', 'PCNA', 'BACH2'],
    'LZ': ['CXCR5', 'CD83', 'IRF4', 'CD86', 'MYBL1', 'SOCS3'],
    'recruitment': ['CCR7'],
    # B cell states
    'B_naive': ['CD22', 'SELL', 'IL4R', 'TCL1A', 'CR2', 'FOXO1', 'IGHM', 'IGHD'],
    'B_mem': ['CD27', 'CD38', 'FCRL4', 'FCRL5', 'CD44', 'PRDM1', 'IGHA1', 'IGHG1', 'IGHE'],
    'B_plasma': ['PRDM1', 'XBP1', 'MZB1'],
    'B_pan': ['CD19', 'MS4A1'],
    'B_med': ['HLA-DRA', 'AIRE', 'IL15', 'LTA', 'LTB'],
}

In [None]:
import warnings

# Manual grouping of the GC/med sub-clusters (`leiden_r2.0` on adata_gc): label -> cluster ids.
# NOTE(review): cluster 14 is listed under both 'B_med' and 'B_IgE'. The assignment loop
# that follows applies labels in dictionary order, so 'B_IgE' overwrites 'B_med' and
# 'B_med' ends up with no cells. Decide which label cluster 14 should keep.
gc_cluster_assignments = {
   'B_med' : [14],
   'B_DZ/LZ' : [18,22],
   'B_DZ' : [16,8,13], 
   'B_LZ-late' : [],
   'B_LZ' : [],
   'B_naive': [6,7,12,15,24],
   'B_mem_AID+' : [0,1,2,3,4,5,9,10,17,20,25,26],
   'B_IgE' : [11,14],
   'Remove' : [21],
   'B_transitional': [23,19],
}

# Every cluster id that appears under some label (built once, reused below)
gc_assigned_clusters = [item for sublist in gc_cluster_assignments.values() for item in sublist]

# Report clusters listed under more than one label instead of letting the later
# label win silently
gc_duplicated_clusters = sorted({c for c in gc_assigned_clusters if gc_assigned_clusters.count(c) > 1})
if gc_duplicated_clusters:
    warnings.warn(f'Clusters assigned to more than one label: {gc_duplicated_clusters}')

# Clusters present in the data that have no label yet (empty once the mapping is complete)
gc_leftover_clusters = [c for c in adata_gc.obs['leiden_r2.0'].unique() if c not in gc_assigned_clusters]
np.array(gc_leftover_clusters)

In [None]:
# Assign temporary annotation on the GC/med subset: start unlabelled, then label
# cluster group by cluster group (a cluster listed twice keeps the label applied last)
adata_gc.obs['temp_anno'] = pd.NA
for anno, clusters in gc_cluster_assignments.items():
    in_group = adata_gc.obs['leiden_r2.0'].isin(clusters)
    adata_gc.obs.loc[in_group, 'temp_anno'] = anno

# Plot the labels, save the figure and export the label column as CSV
sc.pl.umap(adata_gc, color='temp_anno', wspace=0.5, return_fig=True)
plt.savefig(
    f'{plots_path}/ctAnnotation/v3/thyAgeing_gcSplit_scvi_{object_version}_tempAnno.png',
    dpi=300,
    bbox_inches='tight',
)
adata_gc.obs[['temp_anno']].to_csv(
    f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}_tempAnno.csv'
)

In [None]:
# Number of cells for each (age_group, temp_anno) combination in the GC/med subset
adata_gc.obs.groupby(['age_group', 'temp_anno']).size()

In [None]:
# Check the temporary labels against both GC marker panels (shown inline, not saved)
anno_dotplot_kwargs = dict(groupby='temp_anno', mean_only_expressed=True, cmap='viridis')

# Compact panel
sc.pl.DotPlot(adata_gc, var_names=gc_markers2, **anno_dotplot_kwargs).add_totals().show()

# Detailed panel
sc.pl.DotPlot(adata_gc, var_names=gc_markers, **anno_dotplot_kwargs).add_totals().show()

In [None]:
# Save the annotated GC/med subset.
# NOTE: the file is written with write_h5ad (HDF5 format) even though its name ends in
# `.zarr`; datasets are compressed with the zstd filter from hdf5plugin at level 5.
adata_gc.write_h5ad(f'{data_path}/preprocessing/ctAnnotation/thyAgeing_gcSplit_scvi_{object_version}.zarr',
                        compression=hdf5plugin.FILTERS["zstd"],compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,)

In [None]:
# GC/med subset UMAP coloured by per-cell QC columns (counts, genes, mito/ribo fraction, scrublet score)
sc.pl.umap(adata_gc, color = ['n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'scrublet_score'], wspace = 0.5)

In [None]:
# Start from the coarse labels of the full object, as plain strings
ct_annot = adata.obs[['temp_anno']].copy().astype(str)

# Overwrite the GC/med cells with their refined labels (matched on cell index)
gc_labels = adata_gc.obs['temp_anno'].astype(str)
ct_annot.loc[gc_labels.index, 'temp_anno'] = gc_labels
ct_annot.head()

In [None]:
# Store the merged (coarse + refined GC/med) labels on the full object as `temp_anno2`
adata.obs['temp_anno2'] = ct_annot['temp_anno']

In [None]:
# UMAP of the full object coloured by the merged annotation; the figure is returned, not saved
sc.pl.umap(adata, color = 'temp_anno2', wspace = 0.5, return_fig = True)