# Thymus ageing atlas: Assemble final annotations

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import hdf5plugin

import warnings
warnings.filterwarnings('ignore', category=ad.ImplicitModificationWarning)

# Add repo path to sys path (allows to access scripts and metadata from repo).
# The repository root is pinned to the shared checkout below. A value derived
# from the current working directory used to be computed first, but it was
# overwritten by this literal on the very next line, so only the literal is kept.
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis'
sys.path.insert(1, repo_path)
# The scripts directory is derived from repo_path so both entries stay in sync.
sys.path.insert(2, f'{repo_path}/scripts')

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths
# Output/input directories inside the repository (note the trailing slash on
# plots_path and data_path), plus the shared data directory used by the cells below.
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the resolved directories so they are visible in the notebook output
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

# Formatting
from matplotlib import font_manager

# Register the Arial font file with matplotlib, then apply the project style sheet
arial_ttf = "/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf"
thyageing_mplstyle = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle'
font_manager.fontManager.addfont(arial_ttf)
plt.style.use(thyageing_mplstyle)

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize

## v9

In [None]:
# Load adata
# The object is opened in backed mode ('r').
# NOTE(review): the path ends in `.zarr` but is opened with `ad.read_h5ad` —
# presumably the file is HDF5 despite its extension; confirm.
object_version = 'v4_2025-02-04'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr', backed = 'r')

In [None]:
# Load annotation sheet
anno_sheet = pd.read_excel(f'{general_data_path}/objects/thyAgeing_objects_v2.xlsx')

# Drop the rows for the 'All' and 'TEC-Fb-Vasc' compartments
is_excluded = anno_sheet['compartment'].isin(['All', 'TEC-Fb-Vasc'])
anno_sheet = anno_sheet[~is_excluded]

anno_sheet

In [None]:
# Construct anno
# One row per barcode of the integrated object; 'taa_l5' starts empty and is
# filled compartment by compartment from the files listed in the annotation sheet.
ct_anno = pd.DataFrame(index = adata.obs.index.astype(str), columns = ['taa_l5'])

for sheet_idx, sheet_row in anno_sheet.iterrows():
    compartment_anno = pd.read_csv(sheet_row['latest_anno'], index_col = 0)
    compartment_anno.index = compartment_anno.index.astype(str)

    # Restrict to barcodes that are also present in the integrated object
    shared_barcodes = np.intersect1d(compartment_anno.index, adata.obs.index)
    print(f"{sheet_row['compartment']} compartment: Overlapping barcodes: {len(shared_barcodes)}")

    # Copy the compartment's labels for the shared barcodes into the combined table
    compartment_labels = compartment_anno.loc[shared_barcodes, sheet_row['annotation_column']]
    ct_anno.loc[shared_barcodes, 'taa_l5'] = compartment_labels

# Manually rename Fb clusters
fb_renames = {'Fb-mixed': 'Fb-interm', 'Fb-interlo_P16': 'Fb-interlo_PI16'}
for old_label, new_label in fb_renames.items():
    ct_anno.loc[ct_anno['taa_l5'] == old_label, 'taa_l5'] = new_label

In [None]:
# Count barcodes that have no 'taa_l5' label
ct_anno['taa_l5'].isna().sum() # Reasonable number of missing annotations

In [None]:
# Add annotation levels
anno_level_sheet = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v9_2025-03-03_levels.xlsx')

# Left-join the level sheet on 'taa_l5'; the barcode index is moved into a
# 'names' column for the merge and restored as the index afterwards.
ct_anno = (
    ct_anno
    .reset_index(names = 'names')
    .merge(anno_level_sheet, on = 'taa_l5', how = 'left')
    .set_index('names')
)

ct_anno.head()

In [None]:
# Labels present in the assembled annotations but absent from the level sheet
# (both sides are cast to str first, so NaN is compared as the string 'nan')
np.setdiff1d(ct_anno['taa_l5'].astype(str),anno_level_sheet['taa_l5'].astype(str))

In [None]:
# Labels defined in the level sheet that do not occur in the assembled annotations
np.setdiff1d(anno_level_sheet['taa_l5'].astype(str),ct_anno['taa_l5'].astype(str))

In [None]:
# Save the assembled annotations as CSV.
# NOTE(review): this section is headed "v9" and uses the v9 level sheet, but the
# output file name says "curatedAnno_v8" — confirm the intended version before
# re-running, since writing here replaces any existing file at that path.
ct_anno.to_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}_curatedAnno_v8.csv')

Plot new annotations on UMAP:

In [None]:
# Add new annotations to adata
# Columns that already exist in obs under the same name are dropped first, so
# the join below brings in the freshly assembled versions without a name clash.
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns = stale_cols).join(ct_anno, how = 'left')

In [None]:
# One UMAP panel per annotation column, two panels per row.
# NOTE(review): the annotation columns look categorical; in scanpy `cmap` normally
# applies to continuous values while categories are coloured via `palette` —
# confirm that 'tab20' has the intended effect here.
sc.pl.umap(adata, color = ct_anno.columns, ncols = 2, cmap = 'tab20', wspace = 0.5)

## v10

In [None]:
object_version = 'v4_2025-02-04'

# Load annotation sheet
anno_sheet = pd.read_csv(f'{general_data_path}/objects/thyAgeing_objects_v2_2025-03-28.csv')

# Drop the rows for the 'All' and 'TEC-Fb-Vasc' compartments
is_excluded = anno_sheet['compartment'].isin(['All', 'TEC-Fb-Vasc'])
anno_sheet = anno_sheet[~is_excluded]

anno_sheet

In [None]:
# Construct anno
# Build one long table per compartment (barcode in 'names', label in 'taa_l5'),
# keyed by compartment name, then stack them into a single frame.
anno_by_compartment = {}
for sheet_idx, sheet_row in anno_sheet.iterrows():
    compartment = sheet_row['compartment']
    label_col = sheet_row['annotation_column']

    # Keep only the annotation column and expose it under the common name 'taa_l5'
    anno = pd.read_csv(sheet_row['latest_anno'], index_col = 0)[[label_col]]
    anno = anno.rename(columns = {label_col: 'taa_l5'}).reset_index(names = 'names')

    # Only keep B_dev_thy annotations from T compartment
    # (i.e. rows labelled 'B_dev_thy' are removed from the B compartment table)
    if compartment == 'B':
        anno = anno[anno['taa_l5'] != 'B_dev_thy']

    n_duplicated = anno['names'].duplicated().sum()
    print(f'Number of duplicated barcodes: {n_duplicated}')
    anno_by_compartment[compartment] = anno

ct_anno = pd.concat(anno_by_compartment, ignore_index = True)

# Manually rename Fb clusters
fb_renames = {'Fb-mixed': 'Fb-interm', 'Fb-interlo_P16': 'Fb-interlo_PI16'}
for old_label, new_label in fb_renames.items():
    ct_anno.loc[ct_anno['taa_l5'] == old_label, 'taa_l5'] = new_label

ct_anno

In [None]:
# Row counts for the labels 'B_dev_thy', 'T_DN(early)' and 'T_DN(P)' in the stacked annotations
ct_anno.loc[ct_anno['taa_l5'].isin(['B_dev_thy', 'T_DN(early)', 'T_DN(P)']), 'taa_l5'].value_counts()

In [None]:
# Check for duplicates
ct_anno['names'].duplicated().sum() # Number of repeated barcode entries (occurrences after the first) in the stacked table

In [None]:
# Add annotation levels
anno_level_sheet = pd.read_excel(f'{general_data_path}/curated/thyAgeing_full_curatedAnno_v9_2025-03-03_levels.xlsx')

# Left-join the level sheet on 'taa_l5', then index the result by barcode ('names')
ct_anno = pd.merge(ct_anno, anno_level_sheet, how = 'left', on = 'taa_l5')
ct_anno = ct_anno.set_index('names')

ct_anno.head()

In [None]:
# Display the output path for the v10 curated annotation CSV
f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}_curatedAnno_v10.csv'

In [None]:
# Check which annotation labels are missing from the annotation level sheet
# (both sides are cast to str first, so NaN is compared as the string 'nan')
np.setdiff1d(ct_anno['taa_l5'].astype(str),anno_level_sheet['taa_l5'].astype(str))

In [None]:
# Save the assembled v10 annotations as CSV
ct_anno.to_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}_curatedAnno_v10.csv')

In [None]:
# Read the v10 annotation CSV back in (first column as index) and preview the first rows
test = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}_curatedAnno_v10.csv', index_col = 0)
test.head()