# Update T/NK annotations

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin

# Make the repo's own modules and the shared General_analysis scripts importable
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
general_scripts_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts'
# Insert at positions 1 and 2 so whatever sits at sys.path[0] keeps priority
for position, extra_path in enumerate((repo_path, general_scripts_path), start=1):
    sys.path.insert(position, extra_path)

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Output/input locations used throughout the notebook
# (plots_path and data_path keep their trailing slash)
plots_path, data_path = (f'{repo_path}/{sub}/' for sub in ('plots', 'data'))
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

for label, location in (('plots', plots_path), ('data', data_path)):
    print(f'Dir for {label}: {location}')

# Formatting
# from matplotlib import font_manager
# font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
# plt.style.use('/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize

## B_dev_thy

In [None]:
# Previous curated T annotations (v6); first CSV column becomes the index
t_anno_csv = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v8_2024-11-07_curatedAnno_v6.csv'
t_anno = pd.read_csv(t_anno_csv, index_col=0)

# B annotations; b_path is provided by the repo's anno_levels module
from anno_levels import b_path
b_anno_csv = f'{b_path}/data/preprocessing/ctAnnotation/thyAgeing_bSplitxTissue_scvi_v2_2025-02-20_v5.csv'
b_anno = pd.read_csv(b_anno_csv, index_col=0)
# Index entries present in both tables (np.intersect1d returns them sorted and unique)
b_barcodes = np.intersect1d(b_anno.index, t_anno.index)

# Annotation level lookup sheet
anno_levels_xlsx = f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v8_2025-03-03_levels.xlsx'
anno_level_sheet = pd.read_excel(anno_levels_xlsx)

In [None]:
# Display the annotation level sheet loaded in the previous cell
anno_level_sheet

In [None]:
# Construct new annotations:
# start from the old taa_l5 labels, blank out the previous B_dev_thy calls,
# then label the barcodes shared with the B annotation table as B_dev_thy
new_anno = t_anno[['taa_l5']].copy()
was_b_dev = new_anno['taa_l5'] == 'B_dev_thy'
new_anno.loc[was_b_dev, 'taa_l5'] = pd.NA
new_anno.loc[b_barcodes, 'taa_l5'] = 'B_dev_thy'

# Left-join the level sheet on taa_l5 while carrying the original index through as 'names'
new_anno = new_anno.reset_index(names='names')
new_anno = new_anno.merge(anno_level_sheet, on='taa_l5', how='left')
new_anno = new_anno.set_index('names')
new_anno.head()

In [None]:
# Number of cells per taa_l5 label after the update (NA values are not counted)
new_anno['taa_l5'].value_counts()

In [None]:
# Check number of NAs
# These are rows whose taa_l5 was blanked above (old B_dev_thy) and not re-labelled via b_barcodes,
# plus any rows that were already NA in the old table.
new_anno['taa_l5'].isna().sum() # Doublet B_dev_thy

In [None]:
# Write the updated annotation table as curated annotation v7
new_anno_csv = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v8_2024-11-07_curatedAnno_v7.csv'
new_anno.to_csv(new_anno_csv)

## DN compartment

In [None]:
# Open the full atlas object in backed (read-only) mode
object_version = 'v4_2025-02-04'
atlas_prefix = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}'
adata = ad.read_h5ad(f'{atlas_prefix}.zarr', backed = 'r')

# Replace any annotation columns already present in obs with the curated v8 versions
ct_anno = pd.read_csv(f'{atlas_prefix}_curatedAnno_v8.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns = stale_cols).join(ct_anno)

# Keep only the DN stages and B_dev_thy, and pull that subset into memory
dn_and_b_labels = ['T_DN(early)' ,'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'B_dev_thy']
keep_cells = adata.obs['taa_l4'].isin(dn_and_b_labels)
adata = adata[keep_cells].to_memory()

adata.obs['taa_l5'].value_counts()

In [None]:
# Load Leiden cluster 9 (resolution 2.5) of the v7 T-split object
c9_prefix = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v7_2024-11-06'
adata_c9 = ad.read_h5ad(f'{c9_prefix}.zarr', backed= 'r')

# Cluster labels are stored in a separate CSV and joined onto obs by index
leiden_c9 = pd.read_csv(f'{c9_prefix}_leidenClusters.csv', index_col = 0)
adata_c9.obs = adata_c9.obs.join(leiden_c9)

in_cluster_9 = adata_c9.obs['leiden_r2.5'] == 9
adata_c9 = adata_c9[in_cluster_9].to_memory()

In [None]:
# Marker dot plot for the Leiden-9 subset, grouped by chemistry.
# NOTE(review): `ct_markers_filtered` is first assigned in a later cell of this
# notebook, so on a fresh top-to-bottom run this cell raises NameError.
sc.pl.DotPlot(adata_c9, 
              var_names = ct_markers_filtered,
              groupby = 'chemistry_simple',
                mean_only_expressed=True,
                cmap = 'magma').add_totals().show()

In [None]:
# Label every cell of the Leiden-9 subset as T_DN(Q) and append it to the DN/B_dev object.
# index_unique = None leaves the observation names unchanged; duplicates are counted in a later cell.
adata_c9.obs['taa_l5'] = 'T_DN(Q)'
adata_concat = adata.concatenate(adata_c9, index_unique = None).copy()

# Display the combined object
adata_concat

In [None]:
# adata_concat = adata_concat[adata_concat.obs['scrublet_score'] <= 0.4]
# adata_concat = adata_concat[adata_concat.obs['percent_mito'] < .08]
# adata_concat = adata_concat[adata_concat.obs['percent_ribo'] < .7]

# adata_concat.shape

In [None]:
# Check for duplicates
# Counts observation names that occur more than once after the concatenation (0 means all unique)
adata_concat.obs_names.duplicated().sum() # Check for duplicates

In [None]:
# Check whether X is raw counts
# Heuristic: the sum over the top-left 100x100 corner of X is a whole number only if no fractional values are present there
adata_concat.X[:100, :100].sum() % 1 == 0

In [None]:
# Load curated markers and collect them per fine cell label
marker_xlsx = f'{data_path}/curated/curatedCellMarkers_LMM.xlsx'
ct_markers_df = pd.read_excel(marker_xlsx, sheet_name = 'Final')
ct_markers = ct_markers_df.groupby('cell_label_fine')['marker_gene'].agg(list).to_dict()

# Restrict to the cell types of interest and to genes present in adata
ctoi = ['B_dev','T_DN(early)', 'T_DN(P)', 'T_DN(Q)','T_DP(P)']
ct_markers_filtered = {}
for label in ctoi:
    if label not in ct_markers:
        continue
    ct_markers_filtered[label] = [gene for gene in ct_markers[label] if gene in adata.var_names]

# Hand-picked additions on top of the curated sheet
extra_markers = {'B_dev': ['IFITM3'], 'T_DN(early)': ['SPINK2', 'NOTCH1']}
for label, genes in extra_markers.items():
    ct_markers_filtered[label].extend(genes)

import pprint
pprint.pprint(ct_markers_filtered, compact=True)

In [None]:
# Keep a copy of the current X in the 'counts' layer, then normalise each cell to a
# total of 1e4 and log1p-transform X in place.
adata_concat.layers['counts'] = adata_concat.X.copy()
sc.pp.normalize_total(adata_concat, target_sum=1e4)
sc.pp.log1p(adata_concat)

In [None]:
# Marker dot plot per taa_l5 label, rows in the order listed below
dn_label_order = ['B_dev_thy', 'T_ETP', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)']
dn_dotplot = sc.pl.DotPlot(
    adata_concat,
    var_names = ct_markers_filtered,
    categories_order = dn_label_order,
    groupby = 'taa_l5',
    mean_only_expressed = True,
    cmap = 'magma',
)
dn_dotplot.add_totals().show()

In [None]:
# Save the taa_l5 labels of the combined object as a temporary annotation table
adata_concat.obs[['taa_l5']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_dnSplit_all_scvi_{object_version}_tempAnno.csv')

In [None]:
# Score cell cycle
# Gene lists come from the first two columns of the sheet (6 leading rows skipped)
cell_cycle_xlsx = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment/data/curated/regev_cell_cycle_genes.xlsx'
cell_cycle_genes_df = pd.read_excel(cell_cycle_xlsx, skiprows = 6, usecols=[0,1])

cell_cycle_genes = {}
for col in cell_cycle_genes_df.columns:
    # Strip whitespace once, store it back, and keep only genes present in adata
    stripped = cell_cycle_genes_df[col].str.strip()
    cell_cycle_genes_df[col] = stripped
    cell_cycle_genes[col] = [gene for gene in stripped.dropna().tolist() if gene in adata.var_names]

import pprint
pprint.pprint(cell_cycle_genes, compact=True)

sc.tl.score_genes_cell_cycle(
    adata_concat,
    s_genes = cell_cycle_genes['G1/S'],
    g2m_genes = cell_cycle_genes['G2/M'],
)

In [None]:
# Per (age group, label, donor, study): fraction of cells in each cell cycle phase
phase_group_cols = ['age_group', 'taa_l5', 'donor', 'study']
cell_cycle_score_df = adata_concat.obs[['S_score', 'G2M_score', 'donor', 'age_group', 'taa_l5', 'phase', 'study']].copy()
phase_props = cell_cycle_score_df.groupby(phase_group_cols, observed = True)['phase'].value_counts(normalize = True)
df = phase_props.unstack().fillna(0).reset_index()
df = df.melt(id_vars = phase_group_cols, value_name = 'prop', var_name = 'phase')

# Fix the x-axis order; labels outside this list become NaN, unused ones are dropped
ct_order = ['T_ETP','T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'B_dev_thy']
df['taa_l5'] = pd.Categorical(df['taa_l5'], categories = ct_order, ordered=True)
df['taa_l5'] = df['taa_l5'].cat.remove_unused_categories()

sns.catplot(
    data=df, x='taa_l5', y='prop',
    hue = 'age_group', hue_order = ['infant', 'paed', 'adult', 'geriatric'],
    kind="box", palette = 'colorblind',
    col='phase', col_order=['G1', 'S','G2M'],
    height=2, aspect=2, col_wrap=1,
)

In [None]:
from scvi_wrapper import run_scvi
object_version = f'v3_{today}'

# All curated marker genes, flattened into one list, are passed as include_genes
forced_genes = [gene for markers in ct_markers_filtered.values() for gene in markers]

# Run scvi
# NOTE(review): adata_concat.X was normalised and log1p-transformed in an earlier cell
# (raw values were copied to layers['counts']) while layer_raw is 'X' — confirm that
# run_scvi receives raw counts.
scvi_run = run_scvi(
    adata_concat,
    layer_raw = 'X',
    # Gene inclusion / exclusion
    include_genes = forced_genes,
    exclude_cc_genes = False,
    exclude_mt_genes = True,
    exclude_vdjgenes = True,
    remove_cite = False,
    # Highly variable gene selection
    batch_hv = "study",
    hvg = 200,
    hvg_selection = 'experimental',
    # scVI
    batch_scvi = "sample",
    cat_cov_scvi = ["donor", "chemistry_simple", "sex"],
    cont_cov_scvi = [],
    max_epochs = 400,
    batch_size = 2000,
    early_stopping = True,
    early_stopping_patience = 15,
    early_stopping_min_delta = 10.0,
    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20},
    n_layers = 3,
    n_latent = 30,
    dispersion = 'gene-batch',
    # Leiden clustering
    leiden_clustering = None,
    col_cell_type = ['taa_l5'],
    fig_dir = f'{plots_path}/preprocessing/scvi',
    fig_prefix = f'thyAgeing_dnSplit_scvi_{object_version}',
)

In [None]:
# Save adata and scvi model
overwrite = True

# Single source of truth for the output file: the existence check below must look at
# the same path that is written (previously it checked a path without 'objects/rna/').
adata_out_path = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}.zarr'

# Annotation-like obs columns (predictions, probabilities, taa/anno levels, leiden clusters)
# are dropped before saving.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c or 'anno_' in c or 'leiden' in c]
if overwrite or not os.path.exists(adata_out_path):
    print('Saving new adata version: {}'.format(object_version))
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_dnSplit_scvi_{object_version}', save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

### Leiden clustering

In [None]:
# Reload the saved DN-split object (the file written by the save cell above for this object_version)
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}.zarr')

In [None]:
# Leiden clustering at each resolution in res_list; results land in obs['leiden_r<res>']
res_list = [1.5,2.0]
leiden_cols = [f'leiden_r{r}' for r in res_list]
for res, key in zip(res_list, leiden_cols):
    sc.tl.leiden(adata, resolution = res, key_added = key)
adata.obs[leiden_cols] = adata.obs[leiden_cols].astype('category')

# Cluster labels are stored separately from the object
leiden_csv = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.csv'
adata.obs[leiden_cols].to_csv(leiden_csv)

sc.pl.umap(adata, color = ['leiden_r1.5', 'leiden_r2.0'], show = False, return_fig = True)
leiden_png = f'{plots_path}/preprocessing/scvi/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.png'
plt.savefig(leiden_png, dpi = 300, bbox_inches = 'tight')

### Marker expression and ct assignment

In [None]:
# Reload the DN-split object and attach the saved Leiden labels as integer categories
dn_prefix = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}'
adata = ad.read_h5ad(f'{dn_prefix}.zarr')

leiden_clustering = pd.read_csv(f'{dn_prefix}_leidenClusters.csv', index_col = 0)
adata.obs = adata.obs.join(leiden_clustering)
leiden_cols = leiden_clustering.columns
adata.obs[leiden_cols] = adata.obs[leiden_cols].astype(int).astype('category')

In [None]:
# Normalise each cell to a total of 1e4 and log1p-transform X in place
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Marker dot plot per Leiden 2.0 cluster, written straight to file
leiden_dotplot_png = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_dnMarkers_leiden2.0_dotplot.png'
leiden_dotplot = sc.pl.DotPlot(
    adata,
    ct_markers_filtered,
    groupby = 'leiden_r2.0',
    mean_only_expressed = True,
    cmap = 'magma',
)
leiden_dotplot.add_totals().savefig(leiden_dotplot_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# QC metrics per Leiden 2.0 cluster
qc_keys = ['percent_mito', 'percent_ribo', 'n_counts', 'n_genes', 'scrublet_score']
sc.pl.violin(adata, keys = qc_keys, groupby='leiden_r2.0', show = False)
qc_png = f'{plots_path}/preprocessing/scvi/thyAgeing_dnSplit_scvi_{object_version}_leiden2.0_qc.png'
plt.savefig(qc_png, dpi = 300, bbox_inches = 'tight')

Remove clusters:
- high mito: 16
- high scrublet score: 40

In [None]:
# Label -> Leiden 2.0 clusters.
# NOTE(review): clusters 39, 2, 16 and 40 are listed under two labels each; when this
# mapping is applied label by label in dict order, the later label wins.
dn_cluster_assignments = {
    'B_dev_thy' : [28,25,37],
    'T_DN(early)' : [10,39,20,11],
    'T_DN(P)' : [3,6,21,23,33,39,],
    'T_DN(Q)' : [0,1,2,4,5,7,8,12,13,14,15,16,17,18,19,22,24,27,29,30,31,32,36,38],
    'T_DN(late)' : [2,9,26,34,35,40],
    'Remove' : [16,40],
}

# Clusters present in the data that no label claims (an empty array means full coverage)
assigned_clusters = {cluster for clusters in dn_cluster_assignments.values() for cluster in clusters}
np.array([cluster for cluster in adata.obs['leiden_r2.0'].unique().tolist() if cluster not in assigned_clusters])

In [None]:
# UMAP coloured by expression of the listed genes
sc.pl.umap(adata, color = ['CD3D', 'VPREB1', 'CD34', 'RAG1', 'CD8A'])

In [None]:
# Translate Leiden 2.0 clusters into temporary labels; cells in unlisted clusters stay NA.
# Labels are applied in dict order, so a cluster listed twice ends up with the later label.
adata.obs['temp_anno'] = pd.NA
for label, clusters in dn_cluster_assignments.items():
    in_label = adata.obs['leiden_r2.0'].isin(clusters)
    adata.obs.loc[in_label, 'temp_anno'] = label

sc.pl.umap(adata, color = 'temp_anno', return_fig = True)
temp_anno_pdf = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_tempAnno_umap.pdf'
plt.savefig(temp_anno_pdf, dpi=300, bbox_inches='tight')

In [None]:
# Marker dot plot per temporary label, leaving out cells flagged 'Remove'
kept_cells = adata.obs['temp_anno'] != 'Remove'
temp_anno_dotplot_png = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_dnMarkers_tempAnno_dotplot.png'
temp_anno_dotplot = sc.pl.DotPlot(
    adata[kept_cells],
    ct_markers_filtered,
    groupby = 'temp_anno',
    categories_order = ['B_dev_thy', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)'],
    mean_only_expressed = True,
    cmap = 'magma',
)
temp_anno_dotplot.add_totals().savefig(temp_anno_dotplot_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# Ten donors contributing the most cells labelled B_dev_thy
adata.obs[adata.obs['temp_anno'] == 'B_dev_thy']['donor'].value_counts().head(10)

In [None]:
# Save the temporary labels for this object version
adata.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_tempAnno.csv')

In [None]:
# Score cell cycle
cell_cycle_genes_df = pd.read_excel('/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment/data/curated/regev_cell_cycle_genes.xlsx', skiprows = 6, usecols=[0,1])

cell_cycle_genes = {}
for col in cell_cycle_genes_df.columns:
    cell_cycle_genes_df[col] = cell_cycle_genes_df[col].str.strip()
    cell_cycle_genes[col] = [g.strip() for g in cell_cycle_genes_df[col].dropna().tolist() if g.strip() in adata.var_names]
    
import pprint
pprint.pprint(cell_cycle_genes, compact=True)

sc.tl.score_genes_cell_cycle(adata_dn, s_genes=cell_cycle_genes['G1/S'], g2m_genes=cell_cycle_genes['G2/M'])

In [None]:
cell_cycle_score_df = adata_dn.obs[['S_score', 'G2M_score', 'donor', 'age_group', 'temp_anno', 'phase', 'study']].copy()
#cell_cycle_score_df = cell_cycle_score_df.join(knn_anno)
df = cell_cycle_score_df.groupby(['age_group', 'temp_anno', 'donor', 'study'])['phase'].value_counts(normalize = True).unstack().fillna(0).reset_index()
df = df.melt(id_vars = ['age_group', 'temp_anno', 'donor', 'study'], value_name = 'prop', var_name = 'phase')
df['temp_anno'] = pd.Categorical(df['temp_anno'], categories = ['T_ETP','T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'T_B_dev'], ordered=True)
df['temp_anno'] = df['temp_anno'].cat.remove_unused_categories()

sns.catplot(data=df, x='temp_anno', y='prop', hue = 'age_group', hue_order = ['infant', 'paed', 'adult', 'geriatric'], kind="box", palette = 'colorblind',
            col='phase', col_order=['G1', 'S','G2M'], 
            facet_kws={'sharey': False, 'sharex': True}, 
            height=2, aspect=2, col_wrap=1)

### New

In [None]:
from scvi_wrapper import run_scvi
object_version = f'v2_{today}'

# Run scvi (no genes are force-included in this version)
# NOTE(review): adata_concat.X was normalised and log1p-transformed in an earlier cell
# (raw values were copied to layers['counts']) while layer_raw is 'X' — confirm that
# run_scvi receives raw counts.
scvi_run = run_scvi(
    adata_concat,
    layer_raw = 'X',
    # Gene inclusion / exclusion
    include_genes = [],
    exclude_cc_genes = False,
    exclude_mt_genes = True,
    exclude_vdjgenes = True,
    remove_cite = False,
    # Highly variable gene selection
    batch_hv = "study",
    hvg = 200,
    hvg_selection = 'experimental',
    # scVI
    batch_scvi = "sample",
    cat_cov_scvi = ["donor", "chemistry_simple", "sex"],
    cont_cov_scvi = [],
    max_epochs = 400,
    batch_size = 2000,
    early_stopping = True,
    early_stopping_patience = 15,
    early_stopping_min_delta = 10.0,
    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20},
    n_layers = 3,
    n_latent = 30,
    dispersion = 'gene-batch',
    # Leiden clustering
    leiden_clustering = None,
    col_cell_type = ['taa_l5'],
    fig_dir = f'{plots_path}/preprocessing/scvi',
    fig_prefix = f'thyAgeing_dnSplit_scvi_{object_version}',
)

In [None]:
# Save adata and scvi model
overwrite = True

# Single source of truth for the output file: the existence check below must look at
# the same path that is written (previously it checked a path without 'objects/rna/').
adata_out_path = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}.zarr'

# Annotation-like obs columns (predictions, probabilities, taa/anno levels, leiden clusters)
# are dropped before saving.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c or 'anno_' in c or 'leiden' in c]
if overwrite or not os.path.exists(adata_out_path):
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_dnSplit_scvi_{object_version}', save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

In [None]:
# Pin the object version to a fixed, dated run (instead of today's date) and load that object
object_version = 'v2_2025-04-04'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}.zarr')

In [None]:
# Leiden clustering at each resolution in res_list; results land in obs['leiden_r<res>']
res_list = [2.0,2.5]
leiden_cols = [f'leiden_r{r}' for r in res_list]
for res, key in zip(res_list, leiden_cols):
    sc.tl.leiden(adata, resolution = res, key_added = key)
adata.obs[leiden_cols] = adata.obs[leiden_cols].astype('category')

# Cluster labels are stored separately from the object
leiden_csv = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.csv'
adata.obs[leiden_cols].to_csv(leiden_csv)

sc.pl.umap(adata, color = ['leiden_r2.5', 'leiden_r2.0'], show = False, return_fig = True)
leiden_png = f'{plots_path}/preprocessing/scvi/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.png'
plt.savefig(leiden_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# Same UMAPs as the previous cell, with ncols = 1.
# The figure is written to the same file name as in the previous cell and replaces it.
sc.pl.umap(adata, color = ['leiden_r2.5', 'leiden_r2.0'], show = False, return_fig = True, ncols = 1)
plt.savefig(f'{plots_path}/preprocessing/scvi/thyAgeing_dnSplit_scvi_{object_version}_leidenClusters.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# Reload the DN-split object and attach the saved Leiden labels as integer categories
dn_prefix = f'{data_path}/objects/rna/thyAgeing_dnSplit_scvi_{object_version}'
adata = ad.read_h5ad(f'{dn_prefix}.zarr')

leiden_clustering = pd.read_csv(f'{dn_prefix}_leidenClusters.csv', index_col = 0)
adata.obs = adata.obs.join(leiden_clustering)
leiden_cols = leiden_clustering.columns
adata.obs[leiden_cols] = adata.obs[leiden_cols].astype(int).astype('category')

In [None]:
# Normalise each cell to a total of 1e4 and log1p-transform X in place
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Load curated markers and collect them per fine cell label
marker_xlsx = f'{data_path}/curated/curatedCellMarkers_LMM.xlsx'
ct_markers_df = pd.read_excel(marker_xlsx, sheet_name = 'Final')
genes_by_label = ct_markers_df.groupby('cell_label_fine')['marker_gene']
ct_markers = genes_by_label.agg(list).to_dict()

import pprint
pprint.pprint(ct_markers, compact=True)

In [None]:
# Restrict to the cell types of interest and to genes present in adata
ctoi = ['B_dev','T_DN(early)', 'T_DN(P)', 'T_DN(Q)','T_DP(P)']
ct_markers_filtered = {}
for label in ctoi:
    if label not in ct_markers:
        continue
    ct_markers_filtered[label] = [gene for gene in ct_markers[label] if gene in adata.var_names]

# Hand-picked additions on top of the curated sheet
extra_markers = {'B_dev': ['IFITM3'], 'T_DN(early)': ['SPINK2', 'NOTCH1']}
for label, genes in extra_markers.items():
    ct_markers_filtered[label].extend(genes)

# Marker dot plot per Leiden 2.5 cluster, written straight to file
leiden_dotplot_png = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_dnMarkers_leiden2.5_dotplot.png'
leiden_dotplot = sc.pl.DotPlot(
    adata,
    ct_markers_filtered,
    groupby = 'leiden_r2.5',
    mean_only_expressed = True,
    cmap = 'magma',
)
leiden_dotplot.add_totals().savefig(leiden_dotplot_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# QC metrics per Leiden 2.5 cluster
qc_keys = ['percent_mito', 'percent_ribo', 'n_counts', 'n_genes', 'scrublet_score']
sc.pl.violin(adata, keys = qc_keys, groupby='leiden_r2.5', show = False)
qc_png = f'{plots_path}/preprocessing/scvi/thyAgeing_dnSplit_scvi_{object_version}_leiden2.5_qc.png'
plt.savefig(qc_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# Leiden 2.5: label -> clusters.
# NOTE(review): 38 and 39 are listed twice under T_DN(Q), and 21 appears under both
# T_DN(Q) and Remove; when applied label by label in dict order the later label wins.
dn_cluster_assignments = {
    'B_dev_thy' : [7,32,44,36],
    'T_DN(early)' : [15,28,34],
    'T_DN(P)' : [0,10,12,29,27,35,37,41],
    'T_DN(Q)' : [1,2,4,5,6,8,9,11,13,14,16,17,18,38,39,20,21,22,23,24,25,26,38,39,42,43,46,47,31,33,30],
    'T_DN(late)' : [3,19],
    'Remove' : [45,21,40],  # LowQC and doublets
}

# Clusters present in the data that no label claims (an empty array means full coverage)
assigned_clusters = {cluster for clusters in dn_cluster_assignments.values() for cluster in clusters}
np.array([cluster for cluster in adata.obs['leiden_r2.5'].unique().tolist() if cluster not in assigned_clusters])

In [None]:
# Leiden 2.0: alternative label -> cluster mapping.
# Stored under its own name: the temp_anno cell that follows applies
# `dn_cluster_assignments` to the 'leiden_r2.5' column, so rebinding that name here
# made the Leiden 2.0 cluster ids get applied to Leiden 2.5 clusters.
# NOTE(review): assumes the Leiden 2.5 mapping is the one meant for temp_anno — confirm.
# NOTE(review): cluster 35 appears under both T_DN(P) and Remove.
dn_cluster_assignments_r2_0 = {'B_dev_thy' : [11,17],
                               'T_DN(early)' : [5],
                               'T_DN(P)' : [1,6,35,33,34,28,8,24],
                               'T_DN(Q)' : [2,3,4,
                                            12, 20, 31, 10, 29,  9, 18, 26, 23, 32, 21, 16, 15,  7, 13, 27, 14, 30, 22, 19],
                               'T_DN(late)' : [0],
                               'Remove' : [25,35]} # LowQC and doublets

# Leiden 2.0 clusters that no label claims (an empty array means full coverage)
assigned_r2_0 = {cluster for clusters in dn_cluster_assignments_r2_0.values() for cluster in clusters}
np.array([cluster for cluster in adata.obs['leiden_r2.0'].unique().tolist() if cluster not in assigned_r2_0])

In [None]:
# Translate Leiden 2.5 clusters into temporary labels; cells in unlisted clusters stay NA.
# Labels are applied in dict order, so a cluster listed twice ends up with the later label.
adata.obs['temp_anno'] = pd.NA
for label, clusters in dn_cluster_assignments.items():
    in_label = adata.obs['leiden_r2.5'].isin(clusters)
    adata.obs.loc[in_label, 'temp_anno'] = label

sc.pl.umap(adata, color = 'temp_anno', return_fig = True)
temp_anno_pdf = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_tempAnno_umap.pdf'
plt.savefig(temp_anno_pdf, dpi=300, bbox_inches='tight')

In [None]:
# UMAP coloured by expression of the listed genes
sc.pl.umap(adata, color = ['CD34','CD3D', 'VPREB1', 'DNTT'])

In [None]:
# Ten donors contributing the most cells labelled B_dev_thy
adata.obs[adata.obs['temp_anno'] == 'B_dev_thy']['donor'].value_counts().head(10)

In [None]:
# Marker dot plot per temporary label (including 'Remove'), written straight to file
temp_anno_order = ['B_dev_thy', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'Remove']
temp_anno_dotplot_png = f'{plots_path}/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_dnMarkers_tempAnno_dotplot.png'
temp_anno_dotplot = sc.pl.DotPlot(
    adata,
    ct_markers_filtered,
    categories_order = temp_anno_order,
    groupby = 'temp_anno',
    mean_only_expressed = True,
    cmap = 'magma',
)
temp_anno_dotplot.add_totals().savefig(temp_anno_dotplot_png, dpi = 300, bbox_inches = 'tight')

In [None]:
# Number of cells per temporary label (NA values are not counted)
adata.obs['temp_anno'].value_counts()

In [None]:
# Save the temporary labels for this object version
adata.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_dnSplit_scvi_{object_version}_tempAnno.csv')

Check cell cycle activity:

In [None]:
# Score cell cycle
# Gene lists come from the first two columns of the sheet (6 leading rows skipped)
cell_cycle_xlsx = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment/data/curated/regev_cell_cycle_genes.xlsx'
cell_cycle_genes_df = pd.read_excel(cell_cycle_xlsx, skiprows = 6, usecols=[0,1])

cell_cycle_genes = {}
for col in cell_cycle_genes_df.columns:
    # Strip whitespace once, store it back, and keep only genes present in adata
    stripped = cell_cycle_genes_df[col].str.strip()
    cell_cycle_genes_df[col] = stripped
    cell_cycle_genes[col] = [gene for gene in stripped.dropna().tolist() if gene in adata.var_names]

import pprint
pprint.pprint(cell_cycle_genes, compact=True)

sc.tl.score_genes_cell_cycle(
    adata,
    s_genes = cell_cycle_genes['G1/S'],
    g2m_genes = cell_cycle_genes['G2/M'],
)

In [None]:
# Per (age group, label, donor, study): fraction of cells in each cell cycle phase
phase_group_cols = ['age_group', 'temp_anno', 'donor', 'study']
cell_cycle_score_df = adata.obs[['S_score', 'G2M_score', 'donor', 'age_group', 'temp_anno', 'phase', 'study']].copy()
phase_props = cell_cycle_score_df.groupby(phase_group_cols, observed = True)['phase'].value_counts(normalize = True)
df = phase_props.unstack().fillna(0).reset_index()
df = df.melt(id_vars = phase_group_cols, value_name = 'prop', var_name = 'phase')

# Fix the x-axis order; labels outside this list become NaN, unused ones are dropped
ct_order = ['T_ETP','T_DN(early)', 'T_DN(P)', 'T_DN(Q)', 'T_DN(late)', 'B_dev_thy']
df['temp_anno'] = pd.Categorical(df['temp_anno'], categories = ct_order, ordered=True)
df['temp_anno'] = df['temp_anno'].cat.remove_unused_categories()

sns.catplot(
    data=df, x='temp_anno', y='prop',
    hue = 'age_group', hue_order = ['infant', 'paed', 'adult', 'geriatric'],
    kind="box", palette = 'colorblind',
    col='phase', col_order=['G1', 'S','G2M'],
    height=2, aspect=2, col_wrap=1,
)

## CD4 recirc T cells

In [None]:
# Open the full atlas object in backed (read-only) mode
object_version = 'v4_2025-02-04'
atlas_prefix = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}'
adata_cd4_recirc = ad.read_h5ad(f'{atlas_prefix}.zarr', backed = 'r')

# Replace any annotation columns already present in obs with the curated v8 versions
ct_anno = pd.read_csv(f'{atlas_prefix}_curatedAnno_v8.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata_cd4_recirc.obs.columns]
adata_cd4_recirc.obs = adata_cd4_recirc.obs.drop(columns = stale_cols).join(ct_anno)

# Keep the recirculating CD4 and Treg cells and pull that subset into memory
recirc_cells = adata_cd4_recirc.obs['taa_l3'].isin(['T_CD4_recirc', 'T_Treg_recirc'])
adata_cd4_recirc = adata_cd4_recirc[recirc_cells].to_memory()

In [None]:
from scvi_wrapper import run_scvi
object_version = f'v1_{today}'

# Run scvi on the recirculating CD4/Treg subset (cell cycle genes excluded here)
scvi_run = run_scvi(
    adata_cd4_recirc,
    layer_raw = 'X',
    # Gene inclusion / exclusion
    include_genes = [],
    exclude_cc_genes = True,
    exclude_mt_genes = True,
    exclude_vdjgenes = True,
    remove_cite = False,
    # Highly variable gene selection
    batch_hv = "study",
    hvg = 200,
    hvg_selection = 'experimental',
    # scVI
    batch_scvi = "sample",
    cat_cov_scvi = ["donor", "chemistry_simple", "sex"],
    cont_cov_scvi = [],
    max_epochs = 400,
    batch_size = 2000,
    early_stopping = True,
    early_stopping_patience = 15,
    early_stopping_min_delta = 10.0,
    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20},
    n_layers = 3,
    n_latent = 30,
    dispersion = 'gene-batch',
    # Leiden clustering
    leiden_clustering = None,
    col_cell_type = ['taa_l5'],
    fig_dir = f'{plots_path}/ctAnnotation/v9',
    fig_prefix = f'thyAgeing_cd4RecircSplit_scvi_{object_version}',
)

In [None]:
# Save adata and scvi model
overwrite = True

# Single source of truth for the output file: the existence check below must look at
# the same '_v2' file that is written and later re-read (previously the check omitted
# the '_v2' suffix).
adata_out_path = f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_v2.zarr'

# Annotation-like obs columns (predictions, probabilities, taa/anno levels, leiden clusters)
# are dropped before saving.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c or 'anno_' in c or 'leiden' in c]
if overwrite or not os.path.exists(adata_out_path):
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_cd4RecircSplit_scvi_{object_version}', save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

### Leiden clustering

In [None]:
# Leiden clustering of the saved CD4-recirc object
adata_cd4_recirc = ad.read_h5ad(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_v2.zarr')

res_list = [1.0, 1.5]
leiden_cols = [f'leiden_r{r}' for r in res_list]
for res, key in zip(res_list, leiden_cols):
    sc.tl.leiden(adata_cd4_recirc, resolution = res, key_added = key)
# Stored as integer categories
adata_cd4_recirc.obs[leiden_cols] = adata_cd4_recirc.obs[leiden_cols].astype(int).astype('category')

leiden_csv = f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters.csv'
adata_cd4_recirc.obs[leiden_cols].to_csv(leiden_csv)

sc.pl.umap(adata_cd4_recirc, color = leiden_cols, show = False, return_fig = True)
leiden_png = f'{plots_path}/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_leidenClusters.png'
plt.savefig(leiden_png, dpi = 300, bbox_inches = 'tight')

### Marker expression and ct annotation

In [None]:
# Pin the object version to a fixed, dated run and attach its saved Leiden labels
object_version = 'v1_2025-03-28'
cd4_prefix = f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}'
adata_cd4_recirc = ad.read_h5ad(f'{cd4_prefix}_v2.zarr')
leiden_clustering = pd.read_csv(f'{cd4_prefix}_leidenClusters.csv', index_col = 0)
adata_cd4_recirc.obs = adata_cd4_recirc.obs.join(leiden_clustering)
leiden_cols = leiden_clustering.columns
adata_cd4_recirc.obs[leiden_cols] = adata_cd4_recirc.obs[leiden_cols].astype(int).astype('category')

In [None]:
# Normalise each cell to a total of 1e4 and log1p-transform X in place
sc.pp.normalize_total(adata_cd4_recirc, target_sum=1e4)
sc.pp.log1p(adata_cd4_recirc)

In [None]:
# Load curated markers and make dictionary
ct_markers_df = pd.read_excel(f'{data_path}/curated/curatedCellMarkers_LMM.xlsx', sheet_name = 'Final')
ct_markers = ct_markers_df.groupby('cell_label_fine')['marker_gene'].agg(list).to_dict()

# Hand-written reference marker sets.
# NOTE(review): `cd4_markers` is not referenced again in the visible part of this notebook.
cd4_markers = {'T_CD4_recirc' : ['PRDM1', 'PTPRC', 'LAG3', 'HAVCR2', 'CTLA4', 'PDCD1', 'CCR7', 'S1PR1'],
               'T_CD4_r1_recirc':['LAG3', 'IL10', 'TGFB1', 'PRDM1', 'GZMA', 'IFNG','PDCD1'],
               'T_T(agonist)':['NR4A1', 'NR4A2', 'NR4A3', 'BCL2L11'],
               'T_CD4_FH' : ['CD40LG', 'PDCD1', 'BCL6', 'IL21', 'CXCL13', 'SLAMF6', 'CD200', 'MAF', 'CXCR5'],
               'T_CD4_h1' : ['IFNG', 'TNF', 'TBX21', 'STAT1', 'STAT4', 'CXCR3', 'IL12RB2', 'LY6E', 'GZMK'],
               'T_CD4_h17' : ['IL17A', 'IL17F', 'IL22', 'IL21', 'RORC', 'STAT3', 'AHR', 'CCR6', 'IL23R', 'CCL20'],
}

# Filter markers of interest
ctoi = ['T_CD4','T_CD4_fh','T_CD4_h1','T_CD4_h17','T_CD4_r1','T_CD4_recirc', 'T_Treg']
# De-duplicate with dict.fromkeys instead of set(): it keeps the order of the curated
# sheet, so the gene order in the dot plot is the same in every kernel session
# (set ordering of strings varies between Python processes).
ct_markers_filtered = {
    c: list(dict.fromkeys(g for g in ct_markers[c] if g in adata_cd4_recirc.var_names))
    for c in ctoi if c in ct_markers
}
# Additional hand-picked gene groups (not filtered against var_names)
ct_markers_filtered['TCR_act'] = ['NR4A1', 'NR4A2', 'NR4A3']
ct_markers_filtered['T_CD4_fr'] =  ['FOXP3', 'IL2RA', 'CTLA4', 'IL10', 'TGFB1']
ct_markers_filtered['T_CD4_mem'] = ['FOSB', 'KLF6', 'BCL2', 'SELL', 'IFNG', 'FOXP1', 'LEF1', 'IL7R']
ct_markers_filtered['T_CD4_h2'] = ['GATA3', 'STAT6', 'BATF', 'IRF4', 'IL4', 'IL5', 'IL13', 'CCR4', 'CCR8', 'PTGDR2']
ct_markers_filtered['Homing'] = ['S1PR1', 'CCR7', 'CXCR6', 'CXCR3', 'CCR4', 'CCR5', 'CCR6', 'CCR8']
ct_markers_filtered['TNF signalling'] = ['TNFRSF18', 'TNFRSF4', 'TNFRSF1B']
ct_markers_filtered['Activation'] = ['IL2RA', 'TIGIT', 'ICOS', 'LMNA']
ct_markers_filtered['Proliferation'] = ['TOP2A', 'MKI67']
# setdefault avoids a KeyError when the curated sheet has no 'T_CD4_fh' entry
ct_markers_filtered.setdefault('T_CD4_fh', []).extend(['SH2D1A', 'BATF'])

# Reset matplotlib to its default rcParams before plotting
plt.rcParams.update(plt.rcParamsDefault)
sc.pl.DotPlot(adata_cd4_recirc, 
                ct_markers_filtered,
                groupby = 'leiden_r1.5',
                mean_only_expressed=True,
                cmap = 'magma').add_totals().savefig(f'{plots_path}/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_cd4Markers_leiden1.5_dotplot.png', dpi = 300, bbox_inches = 'tight')

In [None]:
# T_CD4_h2 marker genes that are absent from the object's var_names
[gene for gene in ct_markers_filtered['T_CD4_h2'] if gene not in adata_cd4_recirc.var_names]

In [None]:
# First pass: label -> Leiden 1.5 clusters (a later cell redefines this mapping)
cd4_cluster_assignments = {
    'T_CD4_naive_recirc' : [4,10,7],
    'T_CD4_fh' : [3],
    'T_CD4_unstim' : [1,5,8,15,21,23,9,24],
    'T_CD4_h' : [11,13,19,20],  # CM CCR7+, EM CCR6+/CXCR3+
    'T_CD4_act' : [6],  # CCR4
    'T_Treg_fr' : [],
    'T_Treg_recirc' : [2,12,17,14,18],
    'T_Treg_tr' : [0,16,22],
    'Remove' : [25],
}

# Clusters present in the data that no label claims (an empty array means full coverage)
assigned_clusters = {cluster for clusters in cd4_cluster_assignments.values() for cluster in clusters}
np.array([cluster for cluster in adata_cd4_recirc.obs['leiden_r1.5'].unique().tolist() if cluster not in assigned_clusters])

Single-cell RNA sequencing (scRNA-seq) allows for the identification of specific gene expression profiles that can serve as markers for different cell types, including CD4 memory T cells. Here are some key markers typically associated with CD4 memory T cells:

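1. **CD45RO**: A classic marker distinguishing memory T cells from naive T cells (which express CD45RA). Both are splice isoforms of the same gene (*PTPRC*), so they cannot be told apart from gene-level scRNA-seq counts.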
2. **CCR7**: Central memory T cells (T_CM) express CCR7, which is involved in homing to lymphoid tissues.
3. **CD27**: Often expressed on memory T cells, particularly central memory T cells.
4. **CD62L (L-selectin; gene *SELL*)**: Another marker for central memory T cells, involved in lymph node homing.
5. **IL7R (CD127)**: Expressed on memory T cells, indicating their responsiveness to IL-7, which is important for their survival.
6. **CXCR3**: Often expressed on effector memory T cells (T_EM), involved in migration to inflamed tissues.
7. **CCR6**: Expressed on a subset of memory T cells, particularly Th17 cells.
8. **CD69**: An early activation marker that can be transiently expressed on memory T cells.
9. **CD95 (Fas)**: Expressed on memory T cells, involved in apoptosis regulation.
10. **BCL2**: Anti-apoptotic protein, often upregulated in memory T cells to promote their survival.
11. **KLRG1**: Expressed on a subset of effector memory T cells, often associated with terminal differentiation.
12. **HLA-DR**: MHC class II molecule, sometimes expressed on activated memory T cells.
13. **IFNG (Interferon-gamma)**: Cytokine often produced by memory T cells, particularly Th1 cells.
14. **GZMB (Granzyme B)**: Expressed by cytotoxic memory T cells, involved in target cell killing.
15. **PRF1 (Perforin)**: Another marker of cytotoxic memory T cells, involved in target cell lysis.

These markers can be used to identify and characterize CD4 memory T cells in single-cell RNA sequencing data, providing insights into their functional states and roles in immune responses.

In [None]:
# Revised label -> Leiden 1.5 clusters (replaces the mapping from the earlier cell).
# NOTE(review): 20 is listed twice under T_CD4_h; cluster 18, which the earlier mapping
# put under T_Treg_recirc, is not listed under any label here.
cd4_cluster_assignments = {
    'T_CD4_naive_recirc' : [24,  1,  9,  8,  4, 10, 23, 21,  7,  5, 15],
    'T_CD4_fh' : [3],
    'T_CD4_h' : [20,11,13,19,20],
    'T_CD4_act' : [6],
    'T_Treg_fr' : [],
    'T_Treg_HSPhi' : [2,12,17,14],
    'T_Treg_recirc' : [0,16,22],
    'Remove' : [25],
}

# Clusters present in the data that no label claims (an empty array means full coverage)
assigned_clusters = {cluster for clusters in cd4_cluster_assignments.values() for cluster in clusters}
np.array([cluster for cluster in adata_cd4_recirc.obs['leiden_r1.5'].unique().tolist() if cluster not in assigned_clusters])

In [None]:
# Translate Leiden 1.5 clusters into temporary labels; cells in unlisted clusters stay NA
adata_cd4_recirc.obs['temp_anno'] = pd.NA
for label, clusters in cd4_cluster_assignments.items():
    in_label = adata_cd4_recirc.obs['leiden_r1.5'].isin(clusters)
    adata_cd4_recirc.obs.loc[in_label, 'temp_anno'] = label

sc.pl.umap(adata_cd4_recirc, color = 'temp_anno', return_fig = True)
temp_anno_pdf = f'{plots_path}/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_tempAnno_umap.pdf'
plt.savefig(temp_anno_pdf, dpi=300, bbox_inches='tight')

In [None]:
# Rank marker genes for each temp_anno label against all remaining cells (Wilcoxon test, 30 genes per group)
sc.tl.rank_genes_groups(adata_cd4_recirc, groupby = 'temp_anno', n_genes = 30, method = 'wilcoxon', reference='rest')

In [None]:
# Display the ranked-gene table for the 'T_CD4_act' group
sc.get.rank_genes_groups_df(adata_cd4_recirc, group = ['T_CD4_act'])

In [None]:
# Dot plot of the filtered marker set per temporary label, with per-group cell totals, saved as PNG
cd4_dotplot = sc.pl.DotPlot(
    adata_cd4_recirc,
    ct_markers_filtered,
    groupby = 'temp_anno',
    mean_only_expressed = True,
    cmap = 'magma',
)
cd4_dotplot.add_totals().savefig(
    f'{plots_path}/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_cd4Markers_tempAnno_dotplot.png',
    dpi = 300,
    bbox_inches = 'tight',
)

HSPhi T cell: https://www.sciencedirect.com/science/article/pii/S2589004223016656

In [None]:
# Export the temporary CD4 labels (one row per cell, indexed by obs name) for reuse in the whole T/NK object
adata_cd4_recirc.obs[['temp_anno']].to_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_{object_version}_tempAnno.csv')

## Whole T/NK object 

### Integration

In [None]:
# Load the full atlas object (read as HDF5 via read_h5ad despite the .zarr suffix)
object_version = 'v4_2025-02-04'
adata_full = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Attach the latest curated annotations, replacing any obs columns of the same name
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}_curatedAnno_v8.csv', index_col = 0)
adata_full.obs.drop(columns = ct_anno.columns, inplace = True, errors = 'ignore')
adata_full.obs = adata_full.obs.join(ct_anno)

In [None]:
# Barcodes annotated as T or NK at level 1 ...
is_t_or_nk = ct_anno['taa_l1'].isin(['T', 'NK'])
barcodes = ct_anno.index[is_t_or_nk].to_list()
print(len(barcodes))
# ... plus every cell already present in `adata`
barcodes = set(barcodes).union(adata.obs_names.tolist())

len(barcodes)

In [None]:
# Append adata_c9 to the full object, then keep only the selected barcodes and load the result into memory.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in favour of
# anndata.concat - the two differ in defaults, so migrate deliberately rather than by search/replace.
adata_full = adata_full.concatenate(adata_c9, index_unique = None).copy()
adata_full = adata_full[adata_full.obs_names.isin(barcodes)].to_memory()

# Display (n_cells, n_genes) after subsetting
adata_full.shape

In [None]:
# From here on object_version is a date-stamped v9 tag; it is used in the figure prefix below
# and in the output file names of the following save cell.
object_version = f'v9_{today}'

# Run scvi
# NOTE(review): run_scvi is a project helper not defined in this section; the following save cell
# reads 'data' and 'vae' from its return value.
scvi_run = run_scvi(adata_full, 
                    layer_raw = 'X', 
                    # Excluded genes
                    include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
                    exclude_vdjgenes = True, remove_cite = False,
                    # Highly variable gene selection
                    batch_hv="study", hvg = 3000, 
                    hvg_selection = 'experimental',
                    # scVI 
                    batch_scvi="sample",
                    cat_cov_scvi=["donor", "chemistry_simple", "sex"], 
                    #cont_cov_scvi=["percent_mito", 'percent_ribo', 'n_genes'], 
                    max_epochs=400, batch_size=2000, early_stopping = True, early_stopping_patience = 15, early_stopping_min_delta = 10.0,
                    plan_kwargs = {'lr': 0.001, 'reduce_lr_on_plateau' : True, 'lr_patience' : 10, 'lr_threshold' : 20}, 
                    n_layers = 3, n_latent = 30, dispersion = 'gene-batch',
                    # Leiden clustering
                    leiden_clustering = None, col_cell_type = ['taa_l5'], 
                    fig_dir = f'{plots_path}/preprocessing', fig_prefix = f'thyAgeing_tSplit_scvi_{object_version}')

In [None]:
# Save adata and scvi model
overwrite = True

# Single definition of the output location, so the existence check below tests the same
# file that is written (the check previously looked in data_path instead of objects/rna)
adata_out_path = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'

# Prediction / probability / annotation / clustering columns are dropped before saving;
# annotations are re-joined from the curated CSVs after loading
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c or 'anno_' in c or 'leiden' in c]
if overwrite or not os.path.exists(adata_out_path):
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out_path,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(f'{model_path}/thyAgeing_tSplit_scvi_{object_version}', save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

### Annotations

In [None]:
# Reload the saved T/NK object for this fixed version
object_version = 'v9_2025-03-28'
adata_full = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Add latest annotations to adata, replacing any obs columns of the same name
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v8.csv', index_col = 0)
adata_full.obs.drop(columns = ct_anno.columns, inplace = True, errors = 'ignore')
adata_full.obs = adata_full.obs.join(ct_anno)

# Update metadata from the most recent metadata spreadsheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata_full, latest_meta, on = 'index', ignore_warning = True)

In [None]:
# Start from the existing fine annotation (as strings, so missing values become 'nan')
adata_full.obs['temp_anno'] = adata_full.obs['taa_l5'].astype(str)

# Load the updated labels from the DN and recirculating CD4 sub-analyses
dn_anno = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_dnSplit_all_scvi_v4_2025-02-04_tempAnno.csv', index_col = 0)
cd4_recirc_anno = pd.read_csv(f'{data_path}/preprocessing/ctAnnotation/v9/thyAgeing_cd4RecircSplit_scvi_v1_2025-03-28_tempAnno.csv', index_col = 0)

# Overwrite labels for the cells covered by the sub-analyses
# (.loc raises KeyError if a barcode from either CSV is missing from adata_full)
adata_full.obs.loc[dn_anno.index,'temp_anno'] = dn_anno['taa_l5'].values
adata_full.obs.loc[cd4_recirc_anno.index,'temp_anno'] = cd4_recirc_anno['temp_anno'].values
adata_full.obs.loc[adata_full.obs['temp_anno'] == 'T_CD4_prolif','temp_anno'] = 'T_CD4_act'

# Cells flagged for removal or without a label become missing values
adata_full.obs['temp_anno'] = adata_full.obs['temp_anno'].astype('category')
adata_full.obs.loc[adata_full.obs['temp_anno'].isin(['Remove', 'nan']),'temp_anno'] = pd.NA
# remove_unused_categories returns a new Series; assign it back so that 'Remove' / 'nan'
# no longer appear as empty categories in counts and plot legends
adata_full.obs['temp_anno'] = adata_full.obs['temp_anno'].cat.remove_unused_categories()

adata_full.obs['temp_anno'].value_counts()

In [None]:
# Sanity check: number of duplicated obs names (expected to be 0)
adata_full.obs_names.duplicated().sum()

In [None]:
# UMAP of the labelled cells only (cells with a missing temp_anno are left out)
sc.pl.umap(adata_full[~adata_full.obs['temp_anno'].isna()], color = 'temp_anno')

In [None]:
# Export the merged temporary labels for the whole T/NK object
adata_full.obs[['temp_anno']].to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_tempAnno.csv')

In [None]:
# Expand each temporary label to the annotation-level columns of the curated reference table
anno_levels = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v9_2025-03-03_levels.xlsx')

ct_anno = adata_full.obs[['temp_anno']].copy()
ct_anno = ct_anno.reset_index(names='barcodes')
# Inner merge: only cells whose temp_anno matches a 'taa_l5' entry of the reference are kept
ct_anno = ct_anno.merge(anno_levels, left_on = 'temp_anno', right_on = 'taa_l5', how = 'inner')
ct_anno = ct_anno.drop(columns = ['temp_anno']).set_index('barcodes')
ct_anno.head()

In [None]:
# Number of cells per fine cell type in the new annotation table
ct_anno['taa_l5'].value_counts()

In [None]:
# Check number of NAs (removed cells) per annotation column
# NOTE(review): ct_anno comes from an inner merge on the label, so unlabelled cells are
# probably absent from the table rather than NA - confirm this check reports what is intended.
ct_anno.isna().sum()

In [None]:
# Check whether all cell types are included in reference:
# lists the temp_anno labels (as strings) that have no entry in ct_anno['taa_l5']
np.setdiff1d(adata_full.obs['temp_anno'].unique().astype(str), ct_anno['taa_l5'].unique().astype(str))

In [None]:
# Save the curated annotation table (all level columns, indexed by barcode)
ct_anno.to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v9.csv')

In [None]:
from plotting.utils import plot_grouped_boxplot, calc_figsize
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette

# Column names and category orders used by the frequency analyses below
col_cell_type_fine = 'taa_l5'
col_age_group = 'age_group'
col_age_group_levels = age_group_levels

# Keep only the reference cell-type levels that actually occur in the object, in reference order.
# NOTE(review): if this runs before the new ct_anno is joined into adata_full.obs, 'taa_l5'
# still holds the previously joined annotation - check the cell execution order.
_present_cell_types = set(adata_full.obs[col_cell_type_fine].unique().tolist())
col_cell_type_fine_levels = [level for level in get_ct_levels('taa_l5') if level in _present_cell_types]

In [None]:
# Replace any existing annotation-level columns in obs with the newly curated ones
adata_full.obs = adata_full.obs.drop(columns = ct_anno.columns, errors='ignore').join(ct_anno)

In [None]:
# Number of cells per fine cell type, age group and donor
# (anno_df is created in a cell further down; run that cell first)
anno_df.groupby(['taa_l5', col_age_group, 'donor']).size()

In [None]:
# Display the current `df` (it is assigned in cells further down; run one of those first)
df

In [None]:
# Create anno df: per-cell metadata plus the fine cell type, restricted to annotated cells
freq_meta_cols = ['sample', 'donor', 'sex', 'sort', 'study', col_age_group, 'age_months', col_cell_type_fine]
anno_df = adata_full.obs[freq_meta_cols].copy()
anno_df = anno_df.dropna(subset = col_cell_type_fine)

# Frequencies are computed on the 'TOT' and 'CD3P' sorts only
in_selected_sorts = anno_df['sort'].isin(['TOT', 'CD3P'])
anno_df_sub = anno_df[in_selected_sorts].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_fine,
    add_meta = [col_age_group],
)

# Grouped boxplot of the per-donor frequencies by cell population and age group
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    order = col_cell_type_fine_levels,
    hue_order = col_age_group_levels,
    palette = age_group_palette,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 'half', height_ratio = 0.75),
    #save_stats = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v7/thyAgeing_tSplit_relTfreqFine'
)
plt.savefig(f'{plots_path}/ctAnnotation/v9/testFreq.pdf')

In [None]:
# Calculate transition rates
# Each transition is a (source, target) pair of fine cell types. For every sample the rate is
# count(target) / count(source) + 0.001 (NaN when the sample has no source cells); the
# per-sample rates are then averaged per donor.
transitions = [('T_DN(early)', 'T_DN(P)'), ('T_DN(P)', 'T_DN(Q)'), ('T_DN(Q)', 'T_DP(P)'), ('T_DP(P)', 'T_DP(Q)'), ('T_DP(Q)', 'T_αβT(entry)'), 
               ('T_αβT(entry)', 'T_CD4_naive'), ('T_αβT(entry)', 'T_CD8_naive'), ('T_αβT(entry)', 'T_Treg')]
transitions_order = [f'{t0} -> {t1}' for t0, t1 in transitions]

trans_rates = []
for trans in transitions:
    # Cells of the two populations involved, restricted to the 'TOT' and 'CD3P' sorts
    df = anno_df[(anno_df[col_cell_type_fine].isin(trans)) & (anno_df['sort'].isin(['TOT', 'CD3P']))]

    # Every sample x cell type combination, so that populations absent from a sample count as 0
    pairwise_combinations = pd.MultiIndex.from_product([df['sample'].unique(), df[col_cell_type_fine].unique()], names=['sample', col_cell_type_fine]).to_frame(index=False)
    # One metadata row per sample/donor/age group. Without de-duplication the per-cell rows
    # multiply every combination (and the summed counts below) by the number of cells.
    sample_meta = df[['sample', 'donor', col_age_group]].drop_duplicates()
    pairwise_combinations = pairwise_combinations.merge(sample_meta, on = 'sample', how = 'left')

    # Cell counts per sample and population, completed with the missing combinations
    df = df.groupby([col_age_group, col_cell_type_fine, 'sample', 'donor'], observed = True).size().reset_index(name='counts')
    df = df.merge(pairwise_combinations, on = [col_cell_type_fine, col_age_group, 'sample', 'donor'], how = 'outer').fillna(0)
    df = df.pivot_table(index=[col_age_group, 'sample', 'donor'], columns=col_cell_type_fine, values='counts', aggfunc='sum').fillna(0).reset_index()

    # NOTE: raises KeyError if one of the two populations has no cells at all in the selection
    df['ratio'] = np.where(df[trans[0]] == 0, np.nan, (df[trans[1]] / df[trans[0]]) + 0.001)
    df['transition'] = f'{trans[0]} -> {trans[1]}'

    # Average the per-sample ratios within each donor
    df = df.groupby([col_age_group, 'transition', 'donor'], observed=True).agg(ratio = ('ratio', 'mean')).reset_index()

    trans_rates.append(df)

trans_rates = pd.concat(trans_rates)

trans_rates.head()

In [None]:
# Formatting: register the Arial font file with matplotlib and apply the project style sheet
# (both are absolute cluster paths and must be readable from the current machine)
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

In [None]:
# Grouped boxplot of the per-donor transition rates by age group, with a reference line at 1
plot_grouped_boxplot(
    data = trans_rates.reset_index(),
    x = 'transition',
    y = 'ratio',
    hue = col_age_group,
    order = transitions_order,
    hue_order = col_age_group_levels,
    x_label = 'Transition',
    y_label = 'Transition rate',
    legend_title = 'Age group',
    add_stats = True,
    format_log = True,
    figsize = calc_figsize(height = 70, width = 73),
    y_intercept = 1.0,
    ylim = (0.1, 5000),
    #save_stats = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v7/thyAgeing_tSplit_fateTransitions'
)
plt.savefig(f'{plots_path}/ctAnnotation/v9/testTrans.pdf')

In [None]:
# Load the per-cell neighbourhood enrichment labels; missing values become the string 'none'
nhood_enrichment = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplitxTissue_scvi_v1_2024-11-29_nhoodEnrichment.csv', index_col=0)
nhood_enrichment.replace(pd.NA, 'none', inplace = True)

# Keep only barcodes carrying the '-0' suffix and strip exactly that suffix.
# Slicing is used instead of str.replace('-0', ''), which would also remove any '-0'
# occurring inside the barcode.
nhood_enrichment = nhood_enrichment.loc[nhood_enrichment.index.str.endswith('-0')]
nhood_enrichment.index = nhood_enrichment.index.str[:-2]

# Drop columns left over from a previous run of this cell so that the join can be repeated
anno_df = anno_df.drop(columns = nhood_enrichment.columns, errors = 'ignore').join(nhood_enrichment)

In [None]:
# Number of cells per enrichment label
nhood_enrichment['nhood_enrichment'].value_counts() 

In [None]:
# Per donor and cell population: proportion of cells carrying each enrichment label
# ('TOT' / 'CD3P' sorts and known cell-type levels only)
df_location = anno_df[(anno_df['sort'].isin(['TOT', 'CD3P'])) & (anno_df[col_cell_type_fine].isin(col_cell_type_fine_levels))].groupby(['donor', col_cell_type_fine])['nhood_enrichment'].value_counts(normalize=True).to_frame('freq').reset_index()

# 'freq' stays a proportion in [0, 1]; the y-axis formatter below renders it as a percentage

# Plot grouped barplot
plt.figure(figsize=calc_figsize(width_ratio = 1, height_ratio = 0.5))
ax = plt.axes()
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of errorbar='sd' -
# switch once the seaborn version in use is confirmed
sns.barplot(data=df_location, x=col_cell_type_fine, y='freq', hue='nhood_enrichment', order = [l for l in col_cell_type_fine_levels if l in df_location[col_cell_type_fine].unique()], hue_order = ['blood', 'tissue', 'none'], ci='sd')
plt.xlabel('Cell population')
plt.ylabel('Proportion')
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0%}'))
plt.xticks(rotation=90)
plt.legend(title='Location enrichment')
plt.tight_layout()
#plt.savefig(f'{plots_path}/freqAnalysis/all_curatedAnno_v7/thyAgeing_tSplit_cellType_by_nhood.pdf')
plt.show()

In [None]:
def _celltype_freq_by_enrichment(enrichment):
    """Return per donor / age group cell-type proportions among cells with the given enrichment label."""
    selected = anno_df[(anno_df['sort'].isin(['TOT', 'CD3P'])) & (anno_df['nhood_enrichment'] == enrichment)]
    proportions = selected.groupby(['donor', col_age_group])[col_cell_type_fine].value_counts(normalize=True)
    return proportions.to_frame('freq').reset_index().dropna()


blood_enriched = _celltype_freq_by_enrichment('blood')
tissue_enriched = _celltype_freq_by_enrichment('tissue')

# Ratio of the tissue to the blood proportion per donor, age group and cell type
df = tissue_enriched.merge(blood_enriched, on = ['donor', col_age_group, col_cell_type_fine], suffixes = ('_tissue', '_blood'))
df['ratio'] = df['freq_tissue'] / df['freq_blood']
df.dropna(inplace = True)

# Exclude innate
innate_populations = [
    'T_CD8αα(entry)',
    'T_CD8αα(I)',
    'T_CD8αα(II)',
    'T_MAIT',
    'ILC',
    'T_γδT',
    'NK_tr',
]
df = df.loc[~df[col_cell_type_fine].isin(innate_populations)]

In [None]:
# Grouped boxplot of the tissue/blood ratio per cell population and age group (reference line at 1)
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'ratio',
    hue = col_age_group,
    # only populations that are still present in df, in reference order
    order = [c for c in col_cell_type_fine_levels if c in df[col_cell_type_fine].unique()],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Ratio [tissue/blood]',
    legend_title = 'Age group',
    add_stats = True,
    format_log = True,
    figsize = calc_figsize(width = 300, height = 300),
    ylim = (0.01,100),
    y_intercept = 1,
    #save_stats = f'{data_path}/analyses/freqAnalysis/all_curatedAnno_v7/thyAgeing_tSplit_ratioTissueBlood'
)
plt.show()