# Thymus ageing atlas: Cell frequencies

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import mudata as mu
import hdf5plugin

# import pertpy
# milo = pertpy.tl.Milo()

# Put the analysis repo and its scripts directory on sys.path so that
# project modules and metadata can be imported from this notebook.
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis'
sys.path.insert(1, repo_path)
# Derived from repo_path rather than repeating the absolute path
sys.path.insert(2, os.path.join(repo_path, 'scripts'))

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths (all derived from repo_path so the project root is set in one place)
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = f'{repo_path}/data'

print('Dir for plots: {}'.format(plots_path))
print('Dir for data: {}'.format(data_path))

# Formatting: register the Arial font file and apply the project matplotlib style sheet
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use(f'{repo_path}/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette
from plotting.utils import plot_grouped_boxplot, calc_figsize

In [None]:
# Read the integrated RNA object
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — confirm the on-disk format.
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Replace annotations: obs columns that the curated table also provides are
# removed first, then the table is joined on the cell index.
# NOTE(review): the annotation file name refers to object v4 while the object loaded is v5 — confirm they match.
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs = adata.obs.drop(columns = stale_cols).join(ct_anno)

# Keep only cells whose annotation status is 'include'
keep_cells = adata.obs['anno_status'] == 'include'
adata = adata[keep_cells, :]

# Update obs with the most recent metadata sheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

adata

In [None]:
# Define columns
# Annotation columns used for the broad / fine summaries, with their level order
col_cell_type_broad = 'taa_l2'
col_cell_type_fine = 'taa_l3'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad)
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine)
col_age_group = 'age_group'
# Look up the imported `<column>_levels` list by name; a plain namespace lookup
# does the same job as eval() without evaluating a string as code.
col_age_group_levels = globals()[f'{col_age_group}_levels']

In [None]:
# Per-cell annotation table: sample-level columns plus the annotation levels used below
anno_cols = ['sample', 'donor', 'sex', 'sort', 'study', col_age_group, 'age_months',
             col_cell_type_broad, col_cell_type_fine, 'taa_l5', 'taa_l1', 'taa_l4']
# Cells without a taa_l5 label are discarded
anno_df = adata.obs[anno_cols].dropna(subset = ['taa_l5'])

anno_df.head()

## Cell frequencies

In [None]:
# Frequencies of the fine annotation level, computed from cells with sort == 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_fine,
    add_meta = [col_age_group],
)

# Export the frequency table
df.to_csv(f'{data_path}/analyses/freqAnalysis/thyAgeing_all_{col_cell_type_fine}_byDonor_freq.csv')

In [None]:
# Frequencies of the fine annotation level, computed from cells with sort == 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_fine,
    add_meta = [col_age_group],
)

# Plot only populations present in the table, in the predefined level order
present_ct = df[col_cell_type_fine].unique().tolist()
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    order = [ct for ct in col_cell_type_fine_levels if ct in present_ct],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 'half', height_ratio = 0.75),
    save_stats = f'{data_path}/analyses/freqAnalysis/thyAgeing_all_{col_cell_type_fine}_freq',
)
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_{col_cell_type_fine}_freq_boxplot.pdf')

In [None]:
# Frequencies of the broad annotation level, computed from cells with sort == 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_broad,
    add_meta = [col_age_group],
)

# Export the frequency table
df.to_csv(f'{data_path}/analyses/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_byDonor_freq.csv')

In [None]:
# Frequencies of the broad annotation level, computed from cells with sort == 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = col_cell_type_broad,
    add_meta = [col_age_group],
)

# Plot only populations present in the table, in the predefined level order
present_ct = df[col_cell_type_broad].unique().tolist()
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_broad,
    y = 'mean_prop',
    hue = col_age_group,
    order = [ct for ct in col_cell_type_broad_levels if ct in present_ct],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Frequency',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
    figsize = calc_figsize(height = 'half', height_ratio = 0.75),
    save_stats = f'{data_path}/analyses/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_freq',
)
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_freq_boxplot.pdf')

In [None]:
from sklearn.decomposition import PCA

# Compute the taa_l3 frequency table explicitly instead of reusing whatever `df`
# the previous cell left behind: when the notebook runs top to bottom that `df`
# is summarised by col_cell_type_broad, so pivoting it on 'taa_l3' fails.
pca_freq = freq_by_donor(anno_df[anno_df['sort'].isin(['TOT'])].copy(), sample_col = 'sample', donor_col = 'donor',
                         summary_col = 'taa_l3', add_meta = ['age_group'])

# Donor x population matrix of mean proportions; a population missing for a donor is set to 0
ct_prop_matrix = pca_freq.pivot_table(index = 'donor', columns = 'taa_l3', values = 'mean_prop')
ct_prop_matrix_clean = ct_prop_matrix.fillna(0)

# Perform PCA
pca = PCA(n_components=4)
pcs = pca.fit_transform(ct_prop_matrix_clean)

# Colour each donor by its age group
# NOTE(review): donors without an age group entry silently fall back to the 'adult' colour.
donor_age_group = pca_freq.drop_duplicates('donor').set_index('donor')['age_group']
colors = [age_group_palette[donor_age_group.get(donor, 'adult')] for donor in ct_prop_matrix_clean.index]

# One scatter panel per pair of the four components, with donor labels
fig, axs = plt.subplots(2, 3, figsize=(18, 10))
pc_pairs = [(0,1), (0,2), (0,3), (1,2), (1,3), (2,3)]
for ax, (i, j) in zip(axs.flat, pc_pairs):
    ax.scatter(pcs[:, i], pcs[:, j], c=colors, label=None)
    for idx, donor in enumerate(ct_prop_matrix_clean.index):
        ax.text(pcs[idx, i], pcs[idx, j], donor, fontsize=7)
    ax.set_xlabel(f'PC{i+1}')
    ax.set_ylabel(f'PC{j+1}')
    ax.set_title(f'PC{i+1} vs PC{j+1}')
plt.tight_layout()

# Manual legend: one marker per age group, placed outside the current axes
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=ag, markerfacecolor=age_group_palette[ag], markersize=8)
    for ag in age_group_levels
]
plt.legend(handles=legend_handles, title='Age group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Loadings of each population on the first principal component, largest first
pc1_loadings = pd.Series(data = pca.components_[0], index = ct_prop_matrix_clean.columns)
pc1_loadings.sort_values(ascending = False)

In [None]:
# Loadings of each population on the second principal component, largest first
# (indexed like pc1_loadings)
pc2_loadings = pd.Series(data = pca.components_[1], index = pc1_loadings.index)
pc2_loadings.sort_values(ascending = False)

## Cell ratios

In [None]:
# Restrict to cells with sort == 'TOT'
anno_df_sub = anno_df.loc[anno_df['sort'] == 'TOT'].copy()

# Calculate proportions of each population within every sample
df = (
    anno_df_sub
    .groupby(['sample', 'donor', col_age_group])[col_cell_type_broad]
    .value_counts(normalize=True)
    .to_frame('prop')
    .reset_index()
)

# Calculate all pairwise ratios per sample
per_sample_ratios = {}
for sample in anno_df_sub['sample'].unique():
    sample_props = df.loc[df['sample'] == sample].copy()
    populations = sample_props[col_cell_type_broad].unique()
    # Column vector of 1/prop times row vector of prop:
    # entry [i, j] = prop[j] * (1 / prop[i]), i.e. rows are denominators, columns are numerators
    inverse_col = np.array(1/sample_props['prop']).reshape(1, -1).transpose()
    prop_row = np.array(sample_props['prop']).reshape(1, -1)
    ratio_matrix = pd.DataFrame(np.matmul(inverse_col, prop_row), index = populations, columns = populations)
    # Long format: one row per (denominator, numerator) pair
    long_ratios = ratio_matrix.reset_index().melt(id_vars='index', var_name='numerator', value_name='ratio')
    per_sample_ratios[sample] = long_ratios.rename(columns={'index': 'denominator'})

# Stack the samples, attach donor / age group and collapse to one value per combination
ratios_df = pd.concat(per_sample_ratios).reset_index(names = ['sample', 'x']).drop('x', axis = 1)
sample_meta = anno_df_sub[['sample', 'donor', col_age_group]].drop_duplicates()
ratios_df = ratios_df.merge(sample_meta, on = 'sample')
ratios_df = ratios_df.groupby(['sample', 'donor', col_age_group, 'denominator', 'numerator'])['ratio'].mean().reset_index()

ratios_df.head()

In [None]:
# List the populations in the proportions table. The table is grouped by
# col_cell_type_broad, so index it with that column rather than a hard-coded
# 'taa_l3', which is only present when col_cell_type_broad happens to be 'taa_l3'.
df[col_cell_type_broad].unique()

In [None]:
# Keep ratios whose denominator is one of the listed T populations and whose numerator name contains 'TEC'
# NOTE(review): these labels must exist at the annotation level ratios_df was built on — confirm col_cell_type_broad.
t_denominators = ['T_DN', 'T_DP', 'T_αβT(entry)', 'T_CD4', 'T_CD8', 'T_Treg']
has_t_denominator = ratios_df['denominator'].isin(t_denominators)
has_tec_numerator = ratios_df['numerator'].str.contains('TEC')
df = ratios_df.loc[has_t_denominator & has_tec_numerator].copy()

In [None]:
# Ratio boxplot per numerator population, split by age group, on a log axis with a reference line at 1
# NOTE(review): the output file name mentions T_DN, but `df` is plotted as given — confirm which denominators it holds.
present_numerators = df['numerator'].unique().tolist()
plot_grouped_boxplot(
    data = df,
    x = 'numerator',
    y = 'ratio',
    hue = col_age_group,
    order = [ct for ct in col_cell_type_broad_levels if ct in present_numerators],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Ratio',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = False,
    format_log = True,
    figsize = calc_figsize(height = 60, width = 60),
    ylim = [0.001, 10000],
    y_intercept = 1,
    #save_stats = f'{data_path}/analyses/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_ratio'
    legend_kwargs = {'bbox_to_anchor':(0.5, 1.4), 'loc':'upper center', 'ncol': 4},
)
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_T_DN_ratio_boxplot.pdf')

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot

# Same ratios as a faceted plot: one panel per denominator population
facet_figsize = calc_figsize(width = 'double', height_ratio = 0.5)
facet_settings = dict(
    col = 'denominator',
    col_wrap = 6,
    height = facet_figsize[0]/5,
    aspect = 1,
    col_order = ['T_DN', 'T_DP', 'T_αβT(entry)', 'T_CD4', 'T_CD8', 'T_Treg'],
)
present_numerators = df['numerator'].unique().tolist()
plot_faceted_grouped_boxplot(
    data = df,
    x = 'numerator',
    y = 'ratio',
    hue = col_age_group,
    hue_order = col_age_group_levels,
    order = [ct for ct in col_cell_type_broad_levels if ct in present_numerators],
    facet_kwargs = facet_settings,
    add_stats = True,
    #save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kidera_prod_',
    format_percent = False,
    format_log = True,
    x_label = 'Cell population',
    y_label = 'Ratio',
    legend_title = 'Age group',
    figsize = facet_figsize,
    ylim = [0.001, 10000],
    y_intercept = 1,
    #legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1))
)
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_{col_cell_type_broad}_T_ratio_boxplot.pdf')

## Infiltrating immune cells

In [None]:
# Display the ordered taa_l4 levels as an array
np.asarray(get_ct_levels('taa_l4'))

In [None]:
# taa_l2 lineages of interest for the infiltrating immune cell analysis
ct_taaL2_oi = ['T_recirc', 'T_innate', 'NK','B_cell', 'B_plasma','Neutrophil', 'Mono', 'Mac', 'DC1', 'DC2', 'aDC', 'DC', 'pDC', 'Mast']
col_cell_type_broad = 'taa_l3'
col_cell_type_fine = 'taa_l4'
col_age_group = 'age_group'

# Use a dedicated table instead of overwriting `anno_df`: later cells still read
# the 'sort' and 'sex' columns of the full annotation table, and this reduced
# column set does not contain them.
immune_anno_df = adata[(adata.obs['sort'] == 'TOT') & (adata.obs['donor'] != 'A66'), :].obs[['sample', 'donor', col_cell_type_broad, col_cell_type_fine, col_age_group, 'taa_l2']].copy()

# Mixed-resolution label: the broad (taa_l3) label for the lineages listed here,
# the fine (taa_l4) label for everything else. Vectorised replacement for a
# row-wise apply; the result is an object column with the same values.
l3_resolution_lineages = ['Neutrophil', 'Mono', 'Mac', 'DC1', 'DC2', 'aDC', 'DC', 'pDC', 'Mast', 'NK']
use_l3 = immune_anno_df['taa_l2'].isin(l3_resolution_lineages)
immune_anno_df['taa_l3_l4'] = immune_anno_df[col_cell_type_fine].astype(object).where(~use_l3, immune_anno_df[col_cell_type_broad].astype(object))

# Frequencies of the mixed label, annotated with the taa_l2 lineage of each label
df = freq_by_donor(immune_anno_df, sample_col = 'sample', donor_col = 'donor', summary_col='taa_l3_l4', add_meta = [col_age_group])
df = df.merge(immune_anno_df[['taa_l3_l4', 'taa_l2']].drop_duplicates(), on = 'taa_l3_l4', how = 'left')

# Keep the lineages of interest and drop the excluded populations
df = df.loc[(~df['taa_l3_l4'].isin(['T_Treg_tr', 'T_CD8αα(entry)', 'T_CD8αα(I)', 'T_CD8αα(II)', 'T_γδT', 'B_dev', 'B_dev_thy'])) &
            (df['taa_l2'].isin(ct_taaL2_oi))]
immune_anno_df = immune_anno_df.loc[~immune_anno_df['taa_l3_l4'].isin(['B_plasmablast', 'Neutrophil'])].copy()
df = df.loc[df['taa_l3_l4'].isin(immune_anno_df['taa_l3_l4'].unique().tolist())].copy()

# Level order for plotting: fine levels first, then broad levels, de-duplicated and limited to labels present in df
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, include_ct=immune_anno_df[col_cell_type_broad].unique().tolist())
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, include_ct=immune_anno_df[col_cell_type_fine].unique().tolist())
present_labels = df['taa_l3_l4'].unique().tolist()
taa_l3_l4_levels = list(dict.fromkeys([c for c in col_cell_type_fine_levels+col_cell_type_broad_levels if c in present_labels]))
# Namespace lookup of the imported `<column>_levels` list (no eval needed)
col_age_group_levels = globals()[f'{col_age_group}_levels']

df.head()

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors,thyAgeing_greys,get_tint_palette

In [None]:
# Mean proportion of each taa_l3_l4 population by age group, on a log axis
plot_grouped_boxplot(
    data = df,
    x = 'taa_l3_l4',
    y = 'mean_prop',
    order = taa_l3_l4_levels,
    hue = col_age_group,
    hue_order = col_age_group_levels,
    palette = get_tint_palette(thyAgeing_colors['magenta']),
    x_label = 'Cell population',
    y_label = 'Mean proportion',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = False,
    #save_stats = f'{data_path}/analyses/freqAnalysis/clonotypes/thyAgeing_recircSplit_meanProp',
    legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
    figsize = calc_figsize(width = 180, height = 50),
    format_log = True,
    ylim = [0.001, 1],
    annotator_args = dict(line_offset = 10, text_offset = -3, line_offset_to_group = 20),
)

# Percent tick labels are set by hand on the log axis
ax = plt.gca()
ax.set_yticks([0.01, 0.1, 1])
ax.set_yticklabels(['1%', '10%', '100%'])
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_recircSplit_taa_l3_l4_freq_boxplot.pdf')

## By sex

In [None]:
# Cells with sort == 'TOT' (requires an anno_df that carries the 'sort' column)
anno_df_sub = anno_df.loc[anno_df['sort'].isin(['TOT'])].copy()

# Create new cell type annotations: taa_l3 label for the T_predev / T_dev lineages, taa_l2 label otherwise
t_dev_lineages = ['T_predev', 'T_dev']
anno_df_sub['taa_l3_l2'] = anno_df_sub.apply(
    lambda row: row['taa_l2'] if row['taa_l2'] not in t_dev_lineages else row['taa_l3'],
    axis = 1,
)

# Level order: the four listed taa_l3 stages first, followed by the remaining labels ordered as taa_l2 levels
t_dev_stages = ['T_DN(early)', 'T_DN', 'T_DP', 'T_αβT(entry)']
remaining_labels = np.setdiff1d(anno_df_sub['taa_l3_l2'].unique(), np.array(t_dev_stages))
taa_l3_l2_levels = get_ct_levels('taa_l3', include_ct = t_dev_stages)
taa_l3_l2_levels.extend(get_ct_levels('taa_l2', include_ct = remaining_labels))

In [None]:
# Frequencies of the combined taa_l3_l2 label, keeping age group and sex as metadata
df = freq_by_donor(
    anno_df_sub,
    sample_col = 'sample',
    donor_col = 'donor',
    summary_col = 'taa_l3_l2',
    add_meta = [col_age_group, 'sex'],
)
df

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors,get_tint_palette

# Adult donors only: taa_l3_l2 frequencies split by sex on a log axis
df_sub = df.loc[df['age_group'] == 'adult'].copy()
plot_grouped_boxplot(data = df_sub, x = 'taa_l3_l2', y = 'mean_prop', order = taa_l3_l2_levels, 
                     hue = 'sex', hue_order = ['M', 'F'], palette = get_tint_palette(thyAgeing_colors['purple'], n = 2),
                     # The hue is sex, so the legend is titled accordingly (it previously read 'Age group')
                     x_label = 'Cell population', y_label = 'Mean proportion', legend_title = 'Sex', format_percent = True,  add_stats = False,
                     #save_stats = f'{data_path}/analyses/freqAnalysis/clonotypes/thyAgeing_recircSplit_meanProp', 
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
                     figsize = calc_figsize(width = 180, height = 50),
                     format_log=True, ylim=[0.001, 1],
                     annotator_args = dict(line_offset = 10, text_offset = -3, line_offset_to_group = 20))
# Percent tick labels are set by hand on the log axis
plt.gca().set_yticks([0.01, 0.1, 1])
plt.gca().set_yticklabels(['1%', '10%', '100%'])
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_taa_l3_l2_gender_adult_freq_boxplot.pdf')

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors,get_tint_palette

# Infant donors only: taa_l3_l2 frequencies split by sex on a log axis
df_sub = df.loc[df['age_group'] == 'infant'].copy()
plot_grouped_boxplot(data = df_sub, x = 'taa_l3_l2', y = 'mean_prop', order = taa_l3_l2_levels, 
                     hue = 'sex', hue_order = ['M', 'F'], palette = get_tint_palette(thyAgeing_colors['purple'], n = 2),
                     # The hue is sex, so the legend is titled accordingly (it previously read 'Age group')
                     x_label = 'Cell population', y_label = 'Mean proportion', legend_title = 'Sex', format_percent = True,  add_stats = False,
                     #save_stats = f'{data_path}/analyses/freqAnalysis/clonotypes/thyAgeing_recircSplit_meanProp', 
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
                     figsize = calc_figsize(width = 180, height = 50),
                     format_log=True, ylim=[0.001, 1],
                     annotator_args = dict(line_offset = 10, text_offset = -3, line_offset_to_group = 20))
# Percent tick labels are set by hand on the log axis
plt.gca().set_yticks([0.01, 0.1, 1])
plt.gca().set_yticklabels(['1%', '10%', '100%'])
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_taa_l3_l2_gender_infant_freq_boxplot.pdf')

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors,get_tint_palette

# taa_l4 frequencies with age group and sex as metadata; adult donors only, donor A66 excluded
df = freq_by_donor(anno_df_sub, sample_col = 'sample', donor_col = 'donor', summary_col='taa_l4', add_meta = [col_age_group, 'sex'])
df_sub = df.loc[(df['age_group'] == 'adult') & (df['donor'] != 'A66')].copy()
plot_grouped_boxplot(data = df_sub, x = 'taa_l4', y = 'mean_prop', order = df_sub['taa_l4'].unique().tolist(), 
                     hue = 'sex', hue_order = ['M', 'F'], palette = get_tint_palette(thyAgeing_colors['purple'], n = 2),
                     # The hue is sex, so the legend is titled accordingly (it previously read 'Age group')
                     x_label = 'Cell population', y_label = 'Mean proportion', legend_title = 'Sex', format_percent = True,  add_stats = False,
                     #save_stats = f'{data_path}/analyses/freqAnalysis/clonotypes/thyAgeing_recircSplit_meanProp', 
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
                     figsize = calc_figsize(width = 180, height = 50),
                     format_log=True, ylim=[0.0001, 1], remove_legend = False,
                     annotator_args = dict(line_offset = 10, text_offset = -3, line_offset_to_group = 20))
# Percent tick labels are set by hand on the log axis
plt.gca().set_yticks([0.0001, 0.001, 0.01, 0.1, 1])
plt.gca().set_yticklabels(['0.01%', '0.1%', '1%', '10%', '100%'])
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_taa_l4_gender_adult_freq_boxplot.pdf')

In [None]:
from plotting.utils import plot_grouped_boxplot,thyAgeing_colors,get_tint_palette

# taa_l4 frequencies with age group and sex as metadata; infant donors only, donor A66 excluded
df = freq_by_donor(anno_df_sub, sample_col = 'sample', donor_col = 'donor', summary_col='taa_l4', add_meta = [col_age_group, 'sex'])
df_sub = df.loc[(df['age_group'] == 'infant') & (df['donor'] != 'A66')].copy()
plot_grouped_boxplot(data = df_sub, x = 'taa_l4', y = 'mean_prop', order = df_sub['taa_l4'].unique().tolist(), 
                     hue = 'sex', hue_order = ['M', 'F'], palette = get_tint_palette(thyAgeing_colors['purple'], n = 2),
                     # The hue is sex, so the legend is titled accordingly (it previously read 'Age group')
                     x_label = 'Cell population', y_label = 'Mean proportion', legend_title = 'Sex', format_percent = True,  add_stats = False,
                     #save_stats = f'{data_path}/analyses/freqAnalysis/clonotypes/thyAgeing_recircSplit_meanProp', 
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
                     figsize = calc_figsize(width = 180, height = 50),
                     format_log=True, ylim=[0.0001, 1], remove_legend = False,
                     annotator_args = dict(line_offset = 10, text_offset = -3, line_offset_to_group = 20))
# Percent tick labels are set by hand on the log axis
plt.gca().set_yticks([0.0001, 0.001, 0.01, 0.1, 1])
plt.gca().set_yticklabels(['0.01%', '0.1%', '1%', '10%', '100%'])
plt.savefig(f'{plots_path}/freqAnalysis/thyAgeing_all_taa_l4_gender_infant_freq_boxplot.pdf')

## Stacked bar plots

### TECs

In [None]:
%load_ext autoreload
%autoreload 2

from plotting.utils import population_stacked_plot

In [None]:
# Define columns: taa_l3 levels belonging to the TEC compartment
col_cell_type_broad = 'taa_l3'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = 'TEC')
col_age_group = 'age_group'
# Namespace lookup of the imported `<column>_levels` list (no eval needed)
col_age_group_levels = globals()[f'{col_age_group}_levels']

# Only samples with more than 100 TEC cells are plotted
sample_cnts = adata[adata.obs['taa_l1'] == 'TEC'].obs['sample'].value_counts()
filtered_samples = sample_cnts[sample_cnts>100].index.tolist()

population_stacked_plot(adata, col_cell_type='taa_l3', plt_path = f'{plots_path}/freqAnalysis/thyAgeing_TECSplit_relTfreqFine_stackedbar.pdf',
                        ctypes = col_cell_type_broad_levels, sorts = None, samples=filtered_samples)

In [None]:
# Define columns: taa_l3 levels belonging to the Fb compartment
col_cell_type_broad = 'taa_l3'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = 'Fb')
col_age_group = 'age_group'
# Namespace lookup of the imported `<column>_levels` list (no eval needed)
col_age_group_levels = globals()[f'{col_age_group}_levels']

# Only samples with more than 100 Fb cells are plotted
sample_cnts = adata[adata.obs['taa_l1'] == 'Fb'].obs['sample'].value_counts()
filtered_samples = sample_cnts[sample_cnts>100].index.tolist()

population_stacked_plot(adata, col_cell_type='taa_l3', plt_path = f'{plots_path}/freqAnalysis/thyAgeing_FbSplit_relTfreqFine_stackedbar.pdf',
                        ctypes = col_cell_type_broad_levels, sorts = None, samples=filtered_samples)

In [None]:
# Define columns: taa_l3 levels belonging to the EC and Mural compartments
col_cell_type_broad = 'taa_l3'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = ['EC', 'Mural'])
col_age_group = 'age_group'
# Namespace lookup of the imported `<column>_levels` list (no eval needed)
col_age_group_levels = globals()[f'{col_age_group}_levels']

# Only samples with more than 100 EC/Mural cells are plotted
sample_cnts = adata[adata.obs['taa_l1'].isin(['EC', 'Mural'])].obs['sample'].value_counts()
filtered_samples = sample_cnts[sample_cnts>100].index.tolist()

population_stacked_plot(adata, col_cell_type='taa_l3', plt_path = f'{plots_path}/freqAnalysis/thyAgeing_vascSplit_relTfreqFine_stackedbar.pdf',
                        ctypes = col_cell_type_broad_levels, sorts = None, samples=filtered_samples)