# Thymus ageing atlas: T/NK compartment marker plots

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin

# Make project code importable: the T/NK compartment repo and the shared
# General_analysis scripts are inserted at sys.path positions 1 and 2, in that order
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
general_scripts_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts'
for position, import_dir in enumerate((repo_path, general_scripts_path), start=1):
    sys.path.insert(position, import_dir)

# Autoreload custom scripts (IPython magics; only valid when run inside IPython/Jupyter)
%load_ext autoreload
%autoreload 2

# Locations for outputs (plots), inputs (data, models) and the shared General_analysis data.
# Note that plots_path and data_path carry a trailing slash.
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the main directories so the notebook output records where files go
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

# Plot formatting: register the Arial font file with matplotlib and apply the project style sheet
from matplotlib import font_manager

arial_font_file = "/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf"
style_sheet = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle'
font_manager.fontManager.addfont(arial_font_file)
plt.style.use(style_sheet)

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor,cellxgene_prep
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize, thyAgeing_colors

In [None]:
# Load adata
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — confirm the
# on-disk format; a real zarr store would need `ad.read_zarr` instead.
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata: columns already present in `obs` are dropped first so
# the join does not fail on overlapping column names
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.drop(columns = ct_anno.columns, errors = 'ignore').join(ct_anno)

# Filter data (only include annotated cells): cells whose `taa_l5` label is missing
# (na = True) or matches one of the patterns are removed.
# `.copy()` materialises the subset so the metadata update below acts on a real
# AnnData object rather than on a view of the unfiltered one.
adata = adata[~adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)].copy()

# Update metadata from the most recent metadata spreadsheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

## Developing T cells

In [None]:
# Select cells by coarse label (`taa_l2`: T_predev, T_dev, T_mature, T_innate, NK, pDC)
# plus the `B_dev_thy` population identified at the `taa_l4` level
barcodes = adata.obs_names[adata.obs['taa_l2'].isin(['T_predev', 'T_dev', 'T_mature', 'T_innate', 'NK', 'pDC'])].tolist() + adata.obs_names[adata.obs['taa_l4'] == 'B_dev_thy'].tolist()
# Copy the subset: it is normalised in place afterwards, which should happen on an
# independent object and not on a view of `adata`
adata_sub = adata[barcodes, :].copy()

adata_sub.obs['taa_l4'].value_counts()

In [None]:
# Log-normalise data: scale every cell to a total of 1e4 counts, then apply log1p.
# Order matters (normalise first, then log-transform).
# NOTE(review): assumes `adata_sub.X` holds raw counts at this point — confirm.
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

In [None]:
# `taa_l5` levels returned by `get_ct_levels` for the T, NK, DC and B lineages, kept in
# the order returned and restricted to labels that occur in the current subset.
# The present labels are collected once instead of calling `.unique()` per candidate level.
present_labels = set(adata_sub.obs['taa_l5'].unique())
anno_levels = [c for c in get_ct_levels('taa_l5', taa_l1 = ['T', 'NK', 'DC', 'B']) if c in present_labels]

anno_levels

In [None]:
# Curated marker genes, excluding rows of the 'T_recirc' compartment,
# collected as {population: [genes]}
marker_table = pd.read_excel(f'{data_path}/curated/matureT_markers.xlsx')
non_recirc_markers = marker_table.loc[marker_table['compartment'] != 'T_recirc']
t_markers = non_recirc_markers.groupby('population')['gene'].agg(list).to_dict()

# Continuous colour map blended from the project palette (previously 'magma')
dotplot_cmap = sns.blend_palette(
    [thyAgeing_colors[name] for name in ('blue', 'purple', 'magenta', 'orange', 'yellow')],
    as_cmap=True,
)

# Dot plot of the markers per `taa_l5` annotation, with group totals, saved as PDF
marker_dotplot = sc.pl.DotPlot(
    adata_sub,
    groupby='taa_l5',
    categories_order=anno_levels,
    var_names=t_markers,
    figsize=calc_figsize(width=200, height=80),
    mean_only_expressed=True,
    cmap=dotplot_cmap,
)
marker_dotplot = marker_dotplot.style(smallest_dot=0, largest_dot=40, dot_edge_lw=0.05)
marker_dotplot = marker_dotplot.add_totals(size=0.5)
marker_dotplot.savefig(f'{plots_path}/ctAnnotation/v7/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v8_nonRecircMarkers_dotplot.pdf')

In [None]:
# Subset to the four `taa_l4` populations used for the lineage-marker comparison.
# `.copy()` makes the subset independent of `adata`, so the in-place normalisation
# below does not operate on a view.
adata_sub = adata[adata.obs['taa_l4'].isin(['T_DN(early)', 'B_dev_thy', 'pDC', 'T_DN(P)'])].copy()

# Log-normalise data: scale every cell to a total of 1e4 counts, then apply log1p
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

In [None]:
# Marker genes per group; insertion order is the order the groups are listed in
lin_markers = {
    'ETP': ['CD34'],
    'T lin': [
        'CD3D', 'CD3E', 'NOTCH1', 'TCF7',
        'PTCRA', 'RAG1', 'RAG2', 'DNTT',
    ],
    'B lin': ['VPREB1', 'IFITM3', 'IGLL1', 'CD79A'],
    'pDC lin': ['LILRA4', 'CLEC4C', 'RUNX2', 'TYROBP'],
}

In [None]:
from itertools import product

# Category labels in plotting order: each population paired with each age group,
# written as '<population>_<age group>' with the age group varying fastest
population_order = ['T_DN(early)', 'T_DN(P)', 'B_dev_thy', 'pDC']
age_order = ['infant', 'paed', 'adult', 'aged']
cat_order = [f'{population}_{age}' for population, age in product(population_order, age_order)]

# Continuous colour map blended from the project palette (previously 'magma')
lineage_cmap = sns.blend_palette(
    [thyAgeing_colors[name] for name in ('blue', 'purple', 'magenta', 'orange', 'yellow')],
    as_cmap=True,
)

# Dot plot of lineage markers grouped by population and age group, axes swapped, saved as PDF
lineage_dotplot = sc.pl.DotPlot(
    adata_sub,
    groupby=['taa_l4', 'age_group'],
    categories_order=cat_order,
    var_names=lin_markers,
    figsize=calc_figsize(width=150, height=50),
    mean_only_expressed=True,
    cmap=lineage_cmap,
)
lineage_dotplot = lineage_dotplot.style(smallest_dot=0, largest_dot=40, dot_edge_lw=0.05)
lineage_dotplot = lineage_dotplot.add_totals(size=0.5).swap_axes()
lineage_dotplot.savefig(f'{plots_path}/ctAnnotation/v7/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v8_devLinMarkers_dotplot.pdf')

## Recirculating T cells

In [None]:
# Select cells labelled T_mature, T_recirc, T_innate or NK at the `taa_l2` level,
# leaving out `taa_l4` labels that contain 'αα' or 'agonist'
barcodes = adata.obs_names[(adata.obs['taa_l2'].isin(['T_mature','T_recirc','T_innate', 'NK'])) & (~adata.obs['taa_l4'].str.contains('αα|agonist'))].tolist()
# Copy the subset so the in-place normalisation below acts on an independent object
# rather than on a view of `adata`
adata_sub = adata[barcodes, :].copy()

# Log-normalise data: scale every cell to a total of 1e4 counts, then apply log1p
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

adata_sub.obs['taa_l4'].value_counts()

In [None]:
# `taa_l5` levels returned by `get_ct_levels` for the T, NK, DC and B lineages, kept in
# the order returned and restricted to labels that occur in the current subset.
# The present labels are collected once instead of calling `.unique()` per candidate level.
present_labels = set(adata_sub.obs['taa_l5'].unique())
anno_levels = [c for c in get_ct_levels('taa_l5', taa_l1 = ['T', 'NK', 'DC', 'B']) if c in present_labels]

anno_levels

In [None]:
# Curated marker genes for the recirculating plot: drop rows of the 'T_dev' compartment
# and populations whose name contains '(I'. The pattern is a raw string so that `\(`
# reaches the regex engine as an escaped parenthesis without an invalid-escape warning.
t_markers = pd.read_excel(f'{data_path}/curated/matureT_markers.xlsx')
t_markers = t_markers[(t_markers['compartment'] != 'T_dev') & (~t_markers['population'].str.contains(r'\(I'))]
t_markers = t_markers.groupby('population')['gene'].agg(list).to_dict() 
# Remove the 'T_ɣδ' marker group from the plot (raises KeyError if the group is absent)
t_markers.pop('T_ɣδ')
t_markers['T_age-assoc'] = ['MT1X','MT1E'] # Add metallothionein markers
#t_markers['RTE'] = ['TOX2', 'CD38']

# Dot plot of the markers per `taa_l5` annotation, with group totals, saved as PDF
sc.pl.DotPlot(adata_sub, 
            groupby='taa_l5',
            categories_order=anno_levels,
            var_names=t_markers,
            figsize = calc_figsize(width = 200, height = 80),
            mean_only_expressed=True,
            cmap = sns.blend_palette([thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']], as_cmap=True,), #'magma',
            ).style(smallest_dot=0, largest_dot = 40, dot_edge_lw=0.05).add_totals(size = 0.5).savefig(f'{plots_path}/ctAnnotation/v9/thyAgeing_tSplit_scvi_{object_version}_curatedAnno_v9_recircMarkers_dotplot.pdf') 

In [None]:
# Show the marker group names currently held in `t_markers`
t_markers.keys()

## T/NK final UMAP

In [None]:
# Load the T/NK split object
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — confirm the
# on-disk format; a real zarr store would need `ad.read_zarr` instead.
object_version = 'v9_2025-03-28'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Add new annotations to adata: columns already present in `obs` are dropped first so
# the join does not fail on overlapping column names
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.drop(columns = ct_anno.columns, errors = 'ignore').join(ct_anno)

# Filter data (only include annotated cells): cells whose `taa_l5` label is missing
# (na = True) or matches one of the patterns are removed.
# `.copy()` materialises the subset so the metadata update below acts on a real
# AnnData object rather than on a view of the unfiltered one.
adata = adata[~adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)].copy()

# Update metadata from the most recent metadata spreadsheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# UMAP coloured by the `taa_l4` annotation, written to PNG at 300 dpi
umap_figsize = calc_figsize(width = 150, height = 90)
umap_file = f'{plots_path}/ctAnnotation/v9/thyAgeing_tSplit_scvi_{object_version}_finalAnno_umap.png'
with plt.rc_context({'figure.figsize' : umap_figsize}):
    sc.pl.umap(
        adata,
        color='taa_l4',
        return_fig=True,
        show=False,
        size=1,
        frameon=False,
    )
    # Saves the current matplotlib figure
    plt.savefig(umap_file, bbox_inches='tight', dpi=300)

In [None]:
# Add vdj data
meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrab_v6.csv', index_col = 0)
# Drop columns of the same name first (e.g. left over from an earlier run of this cell),
# so the join does not raise on overlapping columns and the cell can be re-run
adata.obs = adata.obs.drop(columns = meta_tcr.columns, errors = 'ignore').join(meta_tcr)

In [None]:
# List all `obs` column names as an array (inspection only)
adata.obs.columns.to_numpy()

In [None]:
# Flag cells as 'yes'/'no' depending on whether the productivity column contains a 'T';
# missing values count as 'no' (na=False)
productivity_sources = {'has_prod_TRB': 'productive_VDJ', 'has_prod_TRA': 'productive_VJ'}
for flag_column, source_column in productivity_sources.items():
    contains_productive = adata.obs[source_column].str.contains('T', na=False)
    adata.obs[flag_column] = np.where(contains_productive, 'yes', 'no')

adata.obs['has_prod_TRB'].value_counts()

In [None]:
# Keep every cell whose `productive_VDJ` is not the literal 'No_contig'
# (missing values also pass this comparison)
# NOTE(review): confirm that cells without any TCR entry are meant to be plotted
not_no_contig = adata.obs['productive_VDJ'] != 'No_contig'
tcr_umap_file = f'{plots_path}/preprocessing/thyAgeing_tSplit_scvi_{object_version}_prodTCR_umap.png'
tcr_palette = [thyAgeing_colors['teal'], thyAgeing_colors['orange']]

# Side-by-side UMAPs of the productive TRB / TRA flags, written to PNG at 300 dpi
with plt.rc_context({'figure.figsize' : calc_figsize(width = 80, height = 45)}):
    sc.pl.umap(
        adata[not_no_contig],
        color=['has_prod_TRB', 'has_prod_TRA'],
        return_fig=True,
        show=False,
        size=1,
        frameon=False,
        ncols=2,
        palette=tcr_palette,
    )
    plt.savefig(tcr_umap_file, bbox_inches='tight', dpi=300)

## Save cellxgene object

In [None]:
# Reload the T/NK split object for export
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — confirm the
# on-disk format; a real zarr store would need `ad.read_zarr` instead.
object_version = 'v9_2025-03-28'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# Add new annotations to adata: columns already present in `obs` are dropped first so
# the join does not fail on overlapping column names
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
adata.obs = adata.obs.drop(columns = ct_anno.columns, errors = 'ignore').join(ct_anno)

# Filter data (only include annotated cells): cells whose `taa_l5` label is missing
# (na = True) or matches one of the patterns are removed.
# `.copy()` materialises the subset so the metadata update and the export act on a real
# AnnData object rather than on a view of the unfiltered one.
adata = adata[~adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)].copy()

# Update metadata from the most recent metadata spreadsheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Hand the annotated object to the project helper that prepares the cellxgene export
# NOTE(review): what `cellxgene_prep` writes, and where, is defined in `utils` — confirm before re-running
cellxgene_prep(adata, object_name = 'ThyAge_T_NK_compartment')