# Thymus ageing atlas: VDJ usage

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
import dandelion as ddl

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Make the custom scripts and metadata importable: the compartment repo and the
# shared General_analysis scripts directory are placed at positions 1 and 2 of
# sys.path.
# (Alternative, relative to the working directory:
#  repo_path, _ = os.path.split(os.path.split(os.getcwd())[0]))
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path[1:1] = [
    repo_path,
    '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts',
]

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths
# Directory paths carry no trailing slash: every read/write in this notebook
# appends '/<subpath>' to them, so a trailing slash here would yield '//' in the
# resulting paths. This also matches the redefinition in the following cell.
plots_path = f'{repo_path}/plots'
data_path = f'{repo_path}/data'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the resolved locations so a wrong repo_path is spotted immediately
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

# Formatting
# Register the Arial font file with matplotlib's font manager, then activate the
# project's thyAgeing style sheet for the figures created in this session.
# NOTE(review): presumably the style sheet refers to Arial, which is why the
# font is registered first — confirm against thyAgeing.mplstyle.
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize, thyAgeing_colors, get_tint_palette, get_chroma_palette, create_blend_palette

In [None]:
# Define paths
# Input/output locations used throughout the notebook; all are rooted at
# repo_path except the shared General_analysis data directory.
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'
model_path = os.path.join(repo_path, 'models')
data_path = f'{repo_path}/data'
plots_path = f'{repo_path}/plots'

# Echo the resolved plot and data directories
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

## Load data

To analyse:
- differences in usage across ages
- differences in usage between different lineages
- check expression in NK cluster

In [None]:
# Load adata
# NOTE(review): the file name ends in .zarr but is opened with read_h5ad —
# presumably an HDF5 file despite the extension; confirm.
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata: obs columns that also exist in the curated
# annotation table are dropped first, so the join below replaces them
ct_anno = pd.read_csv(
    f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv',
    index_col = 0,
)
adata.obs.drop(columns = [c for c in ct_anno.columns if c in adata.obs.columns], inplace = True)
adata.obs = adata.obs.join(ct_anno)

# Filter data (only include annotated cells): labels matching
# locnt / -sp / explore are removed, and so are cells without a taa_l5 label
# (na = True makes missing labels count as a match)
unannotated = adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)
adata = adata[~unannotated]

# Update metadata from the most recent metadata sheet
latest_meta_path = get_latest_version(
    dir = f'{general_data_path}/metadata',
    file_prefix = 'Thymus_ageing_metadata',
)
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Add vdj data (per-cell TCR metadata joined onto obs by index)
meta_tcr = pd.read_csv(
    f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrab_v6.csv',
    index_col = 0,
)
adata.obs = adata.obs.join(meta_tcr)

# Exclude non-T cells, i.e. cells without a VDJ rearrangement status
has_vdj_status = adata.obs['rearrangement_status_VDJ'].notna()
adata = adata[has_vdj_status]

# Load TCRab data
vdj_version = 'v6_2025-04-03'
tcrab_info = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_tcrabFiltered_{vdj_version}.h5ddl')

In [None]:
# Define columns
# Annotation columns used for grouping, plus the level lists returned by
# get_ct_levels for the T and NK lineages.
col_cell_type_broad = 'taa_l3'
col_cell_type_fine = 'taa_l4'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = ['T', 'NK'])
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, taa_l1 = ['T', 'NK'])
col_age_group = 'age_group'
# Resolve the imported '<column>_levels' object by name. A plain namespace
# lookup replaces eval() on a constructed string, so no arbitrary expression
# can ever be executed and a missing name fails with a clear KeyError.
col_age_group_levels = globals()[f'{col_age_group}_levels']

In [None]:
# Remove any samples which were not TCR sequenced
# Proportion of each chain_status value within every sample
sample_freq = (
    adata.obs
    .groupby('sample')['chain_status']
    .apply(lambda x: x.value_counts(normalize = True))
    .reset_index(name = 'prop')
    .rename(columns = {'level_1' : 'chain_status'})
)

# A sample is dropped when all of its cells have chain_status 'No_contig'
no_contig_only = (sample_freq['chain_status'] == 'No_contig') & (sample_freq['prop'] == 1)
exclude_samples = sample_freq.loc[no_contig_only]['sample'].unique()
print(f'Excluding {len(exclude_samples)} samples with no TCR data')

keep_cells = ~adata.obs['sample'].isin(exclude_samples)
adata = adata[keep_cells]

sample_freq

In [None]:
# Prepare TCR df
# Contig-level table with the columns needed downstream
tcr_columns = ['sequence_id','locus', 'productive','umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa','cdr3']
tcr_df = tcrab_info.data[tcr_columns].copy()

# Derive identifiers from sequence_id:
#   barcode   - third-from-last token after splitting on '_' or '-'
#   contig_id - last '_'-separated token
#   index     - everything before the first '-'
# NOTE(review): this relies on a fixed sequence_id layout — confirm against the
# dandelion output.
sequence_ids = tcr_df['sequence_id']
tcr_df['barcode'] = sequence_ids.str.split('_|-').str[-3]
tcr_df['contig_id'] = sequence_ids.str.split('_').str[-1]
tcr_df['index'] = sequence_ids.str.split('-').str[0]

# Attach the donor of each sample from the metadata sheet
donor_lookup = latest_meta[['index', 'donor']]
tcr_df = tcr_df.merge(donor_lookup, on = 'index', how = 'left')
tcr_df.head()

In [None]:
# TRB contigs only, with columns renamed to the beta-chain (*_b_*) naming scheme
trb_columns = ['barcode','index','donor', 'umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3', 'productive']
trb_renames = {
    'donor' : 'subject',
    'umi_count' : 'count',
    'v_call' : 'v_b_gene',
    'd_call' : 'd_b_gene',
    'j_call' : 'j_b_gene',
    'cdr3_aa' : 'cdr3_b_aa',
    'cdr3' : 'cdr3_b_nucseq',
}
is_trb = tcr_df['locus'] == 'TRB'
trb_df = tcr_df.loc[is_trb, trb_columns].rename(columns = trb_renames)

# The D gene call is not used downstream
trb_df = trb_df.drop(columns = ['d_b_gene'])

# Row labels are '<index>-<barcode>'; presumably these match adata.obs_names,
# since later cells join this table onto adata.obs — confirm
trb_df.index = trb_df['index'] + '-' + trb_df['barcode']

trb_df.head()

## Developing thymocytes and newly matured T cells

### Clonal diversity (Gini & Shannon entropy)

In [None]:
from pygini import gini
from scipy.stats import entropy

# Gini coefficient over the relative frequencies of the distinct values
def gini_coefficient(x):
    """Return the Gini coefficient of the value frequencies in ``x``.

    ``x`` is a pandas Series (here: CDR3 sequences). Its distinct values are
    counted, the counts are normalised to proportions, and the proportions
    are passed to ``pygini.gini``.
    """
    relative_freqs = x.value_counts(normalize = True)
    return gini(relative_freqs.values)

def shannon_entropy(x):
    """Return the Shannon entropy of the value frequencies in ``x``.

    ``x`` is a pandas Series (here: CDR3 sequences). Its distinct values are
    counted, the counts are normalised to proportions, and the proportions
    are passed to ``scipy.stats.entropy`` with its default (natural) log base.
    """
    relative_freqs = x.value_counts(normalize = True)
    return entropy(relative_freqs.values)

# Productive TRB CDR3 sequences, annotated with cell-level metadata from adata.obs
df = trb_df.loc[trb_df['productive'] == 'T'][['index','cdr3_b_aa']].copy().join(adata.obs[[col_cell_type_fine, col_age_group, 'donor', 'sex']])

# Diversity per sample ('index') and cell type.
# observed = True restricts the result to key combinations that actually occur,
# as the clone-count aggregation further down already does. Without it,
# categorical keys are expanded to every category combination (all-NaN rows) and
# the output depends on the pandas default for `observed`.
df = df.groupby(['index', 'donor',col_cell_type_fine, col_age_group], observed = True).agg(gini_coefficient = ('cdr3_b_aa', gini_coefficient),
                                                                          entropy = ('cdr3_b_aa', shannon_entropy)).reset_index()

# Average the per-sample values within each donor
df = df.groupby([col_cell_type_fine, col_age_group, 'donor'], observed = True).agg(gini_coefficient = ('gini_coefficient', 'mean'),
                                                                  entropy = ('entropy', 'mean')).reset_index()

# Keep developing thymocytes and newly matured T cell populations
df = df.loc[df[col_cell_type_fine].isin(['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'])]

# Kept separately because `df` is overwritten by the clone-count cell
df_diversity = df.copy()
df

In [None]:
# Grouped boxplot of the 'entropy' column of df per cell population, coloured
# by age group; the figure is written to the plots directory as PDF.
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'entropy',
    hue = col_age_group,
    order = ['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Mean entropy',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = False,
    figsize = calc_figsize(width = 70, height = 40),
    save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_devSplit_TRB_meanEntropy_',
    palette = get_tint_palette(thyAgeing_colors['magenta']),
    legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
)
plt.savefig(
    f'{plots_path}/vdj/clonotypes/thyAgeing_devSplit_TRB_meanEntropy_boxplot.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)

In [None]:
# Grouped boxplot of the 'gini_coefficient' column of df per cell population,
# coloured by age group.
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'gini_coefficient', hue = col_age_group, order = ['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'], hue_order = col_age_group_levels, 
                     x_label = 'Cell population', y_label = 'Mean Gini', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 70, height = 40),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_devSplit_TRB_meanGini_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'})
plt.xticks(ha='center')
# Ticks every 0.05 from 0 up to (excluding) the largest observed value
plt.yticks(np.arange(0, df['gini_coefficient'].max(), step=0.05))
# File name follows the '<metric>_boxplot.pdf' pattern of the other figures
# (previously 'meanGini__boxplot.pdf' with a doubled underscore)
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_devSplit_TRB_meanGini_boxplot.pdf', dpi = 300, bbox_inches = 'tight')

### Number of clones

In [None]:
# Productive TRB CDR3 sequences, annotated with cell-level metadata from adata.obs
productive_trb = trb_df.loc[trb_df['productive'] == 'T'][['index','cdr3_b_aa']].copy()
df = productive_trb.join(adata.obs[[col_cell_type_fine, col_age_group, 'donor', 'sex']])

# Size of each clone (rows sharing a CDR3 amino-acid sequence) per sample and cell type
clone_keys = ['donor', col_cell_type_fine, col_age_group, 'cdr3_b_aa', 'index']
df = df.groupby(clone_keys, observed = True).agg(clone_size = ('cdr3_b_aa', 'size')).reset_index()

# Per sample and cell type: number of distinct clones, total clone size
# (stored as n_cells) and mean clone size
sample_keys = ['index', 'donor', col_cell_type_fine, col_age_group]
df = (
    df.groupby(sample_keys, observed = True)
    .agg(
        n_clones = ('cdr3_b_aa', 'nunique'),
        n_cells = ('clone_size', 'sum'),
        clone_size = ('clone_size', 'mean'),
    )
    .reset_index()
)

# Distinct clones per 1000 counted sequences
df['n_clones_norm'] = df['n_clones'].div(df['n_cells']).mul(1000)

# Average the per-sample values within each donor
donor_keys = [col_cell_type_fine, col_age_group, 'donor']
df = (
    df.groupby(donor_keys, observed = True)
    .agg(
        n_clones = ('n_clones', 'mean'),
        n_clones_norm = ('n_clones_norm', 'mean'),
        n_cells = ('n_cells', 'mean'),
        clone_size = ('clone_size', 'mean'),
    )
    .reset_index()
)

# Keep developing thymocytes and newly matured T cell populations
in_dev_populations = df[col_cell_type_fine].isin(['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'])
df = df.loc[in_dev_populations]
df.head()

In [None]:
# Grouped boxplot of the 'clone_size' column of df per cell population,
# coloured by age group.
plot_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'clone_size',
    hue = col_age_group,
    order = ['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'],
    hue_order = col_age_group_levels,
    x_label = 'Cell population',
    y_label = 'Clone size',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = False,
    figsize = calc_figsize(width = 70, height = 40),
    save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_devSplit_TRB_cloneSize_',
    palette = get_tint_palette(thyAgeing_colors['magenta']),
    legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
)
plt.xticks(rotation=45, ha='right', y=0.03)

# Ticks every 0.1 between the smallest and (excluding) the largest observed value
clone_size_ticks = np.arange(df['clone_size'].min(), df['clone_size'].max(), step=0.1)
plt.yticks(clone_size_ticks)
plt.savefig(
    f'{plots_path}/vdj/clonotypes/thyAgeing_devSplit_TRB_cloneSize_boxplot.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)

In [None]:
# Long-format table stacking the clone-count metrics (df) and the diversity
# metrics (df_diversity): one row per donor, cell type, age group and metric
id_columns = ['donor', col_cell_type_fine, col_age_group]
clone_metrics_long = df.melt(id_vars = id_columns, value_vars = ['n_clones','n_clones_norm','n_cells','clone_size'])
diversity_long = df_diversity.melt(id_vars = id_columns, value_vars = ['gini_coefficient','entropy'])
df_all = pd.concat([clone_metrics_long, diversity_long])

# Only these four metrics are shown in the faceted figure
shown_metrics = ['n_clones','n_clones_norm','clone_size','entropy']
df_all = df_all.loc[df_all['variable'].isin(shown_metrics)]

df_all.head()

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot

# One facet per metric (two facets per row, independent y axes), grouped by
# cell population and coloured by age group.
# NOTE(review): the returned object is used like a seaborn FacetGrid
# (axes.flat, set_xlabels, set_ylabels) — confirm its type.
p = plot_faceted_grouped_boxplot(
    data = df_all.reset_index(),
    x = col_cell_type_fine,
    y = 'value',
    hue = col_age_group,
    hue_order = col_age_group_levels,
    order = ['T_DN(Q)', 'T_DP(P)','T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive','T_CD4_naive', 'T_Treg'],
    facet_kwargs = {'col' : 'variable', 'col_wrap' : 2},
    plot_kwargs = {'sharey' : False, 'sharex' : True},
    add_stats = True,
    # disabled: save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kidera_prod_'
    format_percent = False,
    format_log = False,
    x_label = 'Cell population',
    y_label = 'N(prod. rearr.)',
    legend_title = 'Age group',
    figsize = calc_figsize(width = 70, height = 60),
    ylim = None,
    palette = get_tint_palette(thyAgeing_colors['magenta']),
    # disabled: legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1))
)

# Light grey panel background for every facet
for facet_ax in p.axes.flat:
    facet_ax.set_facecolor("#efefef")
p.tight_layout()

# Axis labels are set on the grid here, replacing the labels passed above
p.set_xlabels('Cell population')
p.set_ylabels('Mean')
plt.savefig(
    f'{plots_path}/vdj/clonotypes/thyAgeing_devSplit_TRB_allMetrics_faceted_boxplot.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)

## Recirculating T cells

In [None]:
# Define recirculating T cells
# Fine cell types present among cells labelled 'T_recirc' at taa_l2, listed in
# the order of col_cell_type_fine_levels.
# The set of present types is computed once from adata.obs. Placing the AnnData
# subset and unique() call inside the comprehension condition would repeat that
# work for every candidate level.
recirc_present = set(adata.obs.loc[adata.obs['taa_l2'] == 'T_recirc', col_cell_type_fine].unique())
recirc_cells = [c for c in col_cell_type_fine_levels if c in recirc_present]
recirc_cells

In [None]:
from pygini import gini
from scipy.stats import entropy

# Gini coefficient over the relative frequencies of the distinct values
def gini_coefficient(x):
    """Return the Gini coefficient of the value frequencies in ``x``.

    ``x`` is a pandas Series (here: CDR3 sequences). Its distinct values are
    counted, the counts are normalised to proportions, and the proportions
    are passed to ``pygini.gini``.
    """
    relative_freqs = x.value_counts(normalize = True)
    return gini(relative_freqs.values)

def shannon_entropy(x):
    """Return the Shannon entropy of the value frequencies in ``x``.

    ``x`` is a pandas Series (here: CDR3 sequences). Its distinct values are
    counted, the counts are normalised to proportions, and the proportions
    are passed to ``scipy.stats.entropy`` with its default (natural) log base.
    """
    relative_freqs = x.value_counts(normalize = True)
    return entropy(relative_freqs.values)

# Productive TRB CDR3 sequences, annotated with cell-level metadata from adata.obs
df = trb_df.loc[trb_df['productive'] == 'T'][['index','cdr3_b_aa']].copy().join(adata.obs[[col_cell_type_fine, col_age_group, 'donor', 'sex']])

# Diversity per sample ('index') and cell type.
# observed = True restricts the result to key combinations that actually occur,
# as the clone-count aggregation further down already does. Without it,
# categorical keys are expanded to every category combination (all-NaN rows) and
# the output depends on the pandas default for `observed`.
df = df.groupby(['index', 'donor',col_cell_type_fine, col_age_group], observed = True).agg(gini_coefficient = ('cdr3_b_aa', gini_coefficient),
                                                                          entropy = ('cdr3_b_aa', shannon_entropy)).reset_index()

# Average the per-sample values within each donor
df = df.groupby([col_cell_type_fine, col_age_group, 'donor'], observed = True).agg(gini_coefficient = ('gini_coefficient', 'mean'),
                                                                  entropy = ('entropy', 'mean')).reset_index()

# Keep recirculating T cell populations only
df = df.loc[df[col_cell_type_fine].isin(recirc_cells)]

# Kept separately because `df` is overwritten by the clone-count cell
df_diversity = df.copy()
df

In [None]:
# Grouped boxplot of the 'entropy' column of df for recirculating populations,
# coloured by age group, on a fixed 0-8 y axis.
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'entropy', hue = col_age_group, order = recirc_cells, hue_order = col_age_group_levels, 
                     x_label = 'Cell population', y_label = 'Mean entropy', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 70, height = 40),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_recircSplit_TRB_meanEntropy_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'}, ylim = (0,8))
# Ticks start at the lower axis limit (0), not at the data minimum, so they land
# on 0, 2, 4, 6, 8 whatever the smallest observed entropy is
plt.yticks(np.arange(0, 8.1, step=2))
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_meanEntropy_boxplot.pdf', dpi = 300, bbox_inches = 'tight')

# Same layout for the 'gini_coefficient' column
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'gini_coefficient', hue = col_age_group, order = recirc_cells, hue_order = col_age_group_levels, 
                     x_label = 'Cell population', y_label = 'Mean Gini', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 70, height = 40),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_recircSplit_TRB_meanGini_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},)
plt.xticks(ha='center')
plt.yticks(np.arange(0, 0.51, step=0.1))
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_meanGini_boxplot.pdf', dpi = 300, bbox_inches = 'tight')

### Number of clones

In [None]:
# Productive TRB CDR3 sequences, annotated with cell-level metadata from adata.obs
productive_trb = trb_df.loc[trb_df['productive'] == 'T'][['index','cdr3_b_aa']].copy()
df = productive_trb.join(adata.obs[[col_cell_type_fine, col_age_group, 'donor', 'sex']])

# Size of each clone (rows sharing a CDR3 amino-acid sequence) per sample and cell type
clone_keys = ['donor', col_cell_type_fine, col_age_group, 'cdr3_b_aa', 'index']
df = df.groupby(clone_keys, observed = True).agg(clone_size = ('cdr3_b_aa', 'size')).reset_index()

# Per sample and cell type: number of distinct clones, total clone size
# (stored as n_cells) and mean clone size
sample_keys = ['index', 'donor', col_cell_type_fine, col_age_group]
df = (
    df.groupby(sample_keys, observed = True)
    .agg(
        n_clones = ('cdr3_b_aa', 'nunique'),
        n_cells = ('clone_size', 'sum'),
        clone_size = ('clone_size', 'mean'),
    )
    .reset_index()
)

# Distinct clones per 1000 counted sequences
df['n_clones_norm'] = df['n_clones'].div(df['n_cells']).mul(1000)

# Average the per-sample values within each donor
donor_keys = [col_cell_type_fine, col_age_group, 'donor']
df = (
    df.groupby(donor_keys, observed = True)
    .agg(
        n_clones = ('n_clones', 'mean'),
        n_clones_norm = ('n_clones_norm', 'mean'),
        n_cells = ('n_cells', 'mean'),
        clone_size = ('clone_size', 'mean'),
    )
    .reset_index()
)

# Keep recirculating T cell populations only
in_recirc_populations = df[col_cell_type_fine].isin(recirc_cells)
df = df.loc[in_recirc_populations]
df.head()

In [None]:
# Grouped boxplot of the 'clone_size' column of df for recirculating
# populations, coloured by age group, on a fixed 1-2.1 y axis.
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'clone_size', hue = col_age_group, order = recirc_cells, hue_order = col_age_group_levels, 
                     x_label = 'Cell population', y_label = 'Clone size', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 70, height = 40),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_recircSplit_TRB_cloneSize_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'}, ylim = (1, 2.1))
plt.xticks(rotation=45, ha='right', y=0.03)
# Ticks start at the lower axis limit (1), not at the data minimum, so they land
# on 1, 1.5, 2 whatever the smallest observed clone size is
plt.yticks(np.arange(1, 2.1, step=0.5))
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_cloneSize_boxplot.pdf', dpi = 300, bbox_inches = 'tight')

In [None]:
# Long-format table stacking the clone-count metrics (df) and the diversity
# metrics (df_diversity): one row per donor, cell type, age group and metric
id_columns = ['donor', col_cell_type_fine, col_age_group]
clone_metrics_long = df.melt(id_vars = id_columns, value_vars = ['n_clones','n_clones_norm','n_cells','clone_size'])
diversity_long = df_diversity.melt(id_vars = id_columns, value_vars = ['gini_coefficient','entropy'])
df_all = pd.concat([clone_metrics_long, diversity_long])

# Only these four metrics are shown in the faceted figure
shown_metrics = ['n_clones','n_clones_norm','clone_size','entropy']
df_all = df_all.loc[df_all['variable'].isin(shown_metrics)]

df_all.head()

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot,thyAgeing_greys

# One facet per metric (two facets per row, independent y axes), grouped by
# recirculating cell population and coloured by age group.
# NOTE(review): the returned object is used like a seaborn FacetGrid
# (axes.flat, set_xlabels, set_ylabels) — confirm its type.
p = plot_faceted_grouped_boxplot(
    data = df_all.reset_index(),
    x = col_cell_type_fine,
    y = 'value',
    hue = col_age_group,
    hue_order = col_age_group_levels,
    order = recirc_cells,
    facet_kwargs = {'col' : 'variable', 'col_wrap' : 2},
    plot_kwargs = {'sharey' : False, 'sharex' : True},
    add_stats = True,
    # disabled: save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kidera_prod_'
    format_percent = False,
    format_log = False,
    x_label = 'Cell population',
    y_label = 'N(prod. rearr.)',
    legend_title = 'Age group',
    figsize = calc_figsize(width = 100, height = 100),
    ylim = None,
    palette = get_tint_palette(thyAgeing_colors['magenta']),
    # disabled: legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1))
    # disabled: annotator_args = dict(line_offset = 20, text_offset = -2, line_offset_to_group = 10)
)

# Project grey as panel background for every facet
for facet_ax in p.axes.flat:
    facet_ax.set_facecolor(thyAgeing_greys['grey1'])
p.tight_layout()

# Axis labels are set on the grid here, replacing the labels passed above
p.set_xlabels('Cell population')
p.set_ylabels('Mean')
plt.savefig(
    f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_allMetrics_faceted_boxplot.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)