# Thymus ageing atlas: CDR3 features

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
import dandelion as ddl

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Make the compartment repo and the shared analysis scripts importable, so that
# custom modules and metadata can be loaded from this notebook.
# Alternative (kept for reference): derive the repo root from the working directory
#   repo_path, _ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
general_scripts_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts'
# Insert right after sys.path[0]: repo first (position 1), shared scripts second (position 2)
for position, extra_path in enumerate((repo_path, general_scripts_path), start = 1):
    sys.path.insert(position, extra_path)

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths
# plots_path and data_path deliberately keep their trailing slash (later cells append to them)
plots_path = repo_path + '/plots/'
data_path = repo_path + '/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

# Echo the output locations so they are visible in the notebook log
for label, directory in (('plots', plots_path), ('data', data_path)):
    print(f'Dir for {label}: {directory}')

# Formatting: register the Arial font file with matplotlib and apply the project style sheet
from matplotlib import font_manager
arial_font_file = "/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf"
thy_ageing_style = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle'
font_manager.fontManager.addfont(arial_font_file)
plt.style.use(thy_ageing_style)

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize

## Load and prepare data


AnnData

In [None]:
# Load adata
object_version = 'v5_2025-04-03'
adata_file = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr'
# NOTE(review): the path ends in `.zarr` but is opened with `read_h5ad` — confirm the on-disk format
adata = ad.read_h5ad(adata_file)

# Add new annotations to adata: columns already present in obs are replaced by the curated ones
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_anno_cols = [c for c in ct_anno.columns if c in adata.obs.columns]
adata.obs.drop(columns = stale_anno_cols, inplace = True)
adata.obs = adata.obs.join(ct_anno)

# Filter data (only include annotated cells): drop cells whose taa_l5 label is missing
# or matches one of the placeholder patterns
is_unannotated = adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)
adata = adata[~is_unannotated]

# Update metadata from the most recent metadata sheet
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Add vdj data (per-cell TCR summary columns, joined on the obs index)
meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrab_v6.csv', index_col = 0)
adata.obs = adata.obs.join(meta_tcr)

# Exclude non-T cells: cells absent from the TCR table have no rearrangement status after the join
has_vdj_status = adata.obs['rearrangement_status_VDJ'].notna()
adata = adata[has_vdj_status]

In [None]:
# Define columns
# Broad / fine annotation columns and their ordered levels, restricted to the T and NK lineages
col_cell_type_broad = 'taa_l3'
col_cell_type_fine = 'taa_l4'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = ['T', 'NK'])
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, taa_l1 = ['T', 'NK'])
col_age_group = 'age_group'
# Explicit lookup instead of eval() on a constructed variable name: an unsupported
# column now fails with a clear KeyError rather than evaluating arbitrary text.
_levels_by_column = {'age_group': age_group_levels}
col_age_group_levels = _levels_by_column[col_age_group]

In [None]:
# Load TCRab data (filtered dandelion object)
vdj_version = 'v6_2025-04-03'
tcrab_file = f'{data_path}/objects/vdj/thymusAgeing_tcrabFiltered_{vdj_version}.h5ddl'
tcrab_info = ddl.read_h5ddl(tcrab_file)

# Display the object summary
tcrab_info

Data overview:

In [None]:
# Per age group: number of donors, samples and cells with a TCR contig, plus the donor ids
has_tcr_contig = adata.obs['rearrangement_status_VDJ'] != 'No_contig'
df = adata[has_tcr_contig].obs[['sample', 'donor', 'age_group']].copy()
overview_aggs = dict(
    n_donors = ('donor', 'nunique'),
    n_sample = ('sample', 'nunique'),
    n_tcr = ('sample', 'count'),
    donors = ('donor', lambda x: ', '.join(x.unique())),
)
df.groupby(['age_group']).agg(**overview_aggs)

In [None]:
# Remove any samples which were not TCR sequenced
# Step 1: per-sample proportion of each chain_status value
chain_status_by_sample = adata.obs.groupby('sample')['chain_status']
sample_freq = chain_status_by_sample.apply(lambda status: status.value_counts(normalize = True))
sample_freq = sample_freq.reset_index(name = 'prop').rename(columns = {'level_1' : 'chain_status'})

# Step 2: a sample is excluded when all of its cells are 'No_contig'
is_no_contig = sample_freq['chain_status'] == 'No_contig'
is_every_cell = sample_freq['prop'] == 1
exclude_samples = sample_freq.loc[is_no_contig & is_every_cell]['sample'].unique()
print(f'Excluding {len(exclude_samples)} samples with no TCR data')

in_kept_sample = ~adata.obs['sample'].isin(exclude_samples)
adata = adata[in_kept_sample]

sample_freq

Prepare CDR3 data

In [None]:
# One row per contig with its CDR3 sequences and productivity
cdr3_cols = ['cdr3_aa' ,'cdr3', 'productive', 'junction_length', 'locus']
cdr3_df = tcrab_info.data[cdr3_cols].copy()

# Strip the '_contig_<n>' suffix from the contig index to obtain the key used for the merge with obs
cdr3_df['names'] = cdr3_df.index.str.replace(r'_contig_\d+', '', regex=True)
cell_meta = adata.obs[[col_age_group, col_cell_type_broad, col_cell_type_fine, 'donor', 'sample']].reset_index(names='names')
cdr3_df = cdr3_df.merge(cell_meta, on = 'names', how = 'left')

# Calculate cdr3 length (amino-acid and nucleotide sequence)
for seq_col in ('cdr3_aa', 'cdr3'):
    cdr3_df[f'{seq_col}_length'] = cdr3_df[seq_col].str.len()

# Factorise age_group with the project-wide ordered levels
cdr3_df[col_age_group] = cdr3_df[col_age_group].astype('category').cat.set_categories(col_age_group_levels, ordered = True)

cdr3_df.head()

In [None]:
# Per age group: donors, samples and number of contigs with a CDR3 sequence
cdr3_by_age = cdr3_df.groupby(col_age_group)
cdr3_by_age.agg(n_donor = ('donor', 'nunique'), n_sample = ('sample', 'nunique'), n_cell = ('cdr3', 'count'))

## Proportion of productive rearrangements

In [None]:
# Inspect the per-cell contig / productivity columns used in this section
vdj_status_cols = ['has_contig', 'chain_status', 'productive_VDJ', 'locus_VDJ']
adata.obs[vdj_status_cols]

In [None]:
# Proportion of cells with a productive TRA / TRB rearrangement, per donor and cell population.
# .copy() so that the new columns are written to an independent frame rather than to a
# slice of adata.obs (same pattern as the rearrangement-count cell further down).
df = adata.obs[['donor', 'age_group','sample', col_cell_type_fine, 'productive_VDJ', 'productive_VJ']].copy()
# A cell counts as productive for a locus when its productive_* string contains a 'T'
df['TRA'] = df['productive_VJ'].str.contains('T', na = False)
df['TRB'] = df['productive_VDJ'].str.contains('T', na = False)
df = df.melt(id_vars = ['donor', 'age_group','sample', col_cell_type_fine], value_vars=['TRA', 'TRB'], var_name = 'locus', value_name = 'productive')
df = df.loc[df[col_cell_type_fine].isin(t_nk_groupings['dev'])]

# Flag sample x population combinations with too few observations.
# NOTE(review): counts are taken after the melt, so every cell contributes two rows
# (TRA and TRB); `n_cells < 20` therefore corresponds to fewer than 10 cells — confirm
# the intended threshold.
exlude_samples_cells = df.groupby(['donor', 'age_group','sample'], observed = True)[col_cell_type_fine].value_counts().reset_index(name = 'n_cells')
exlude_samples_cells = exlude_samples_cells.loc[exlude_samples_cells['n_cells'] < 20][['sample', col_cell_type_fine]]
exlude_samples_cells['exclude'] = True
df = df.merge(exlude_samples_cells, on = ['sample', col_cell_type_fine], how = 'left')
# Rows without a match in the exclusion table have exclude == NaN and are kept
df = df.loc[df['exclude'] != True]

# Proportion productive / non-productive per sample, then averaged over the samples of a donor
df = df.groupby(['donor', 'age_group','sample', col_cell_type_fine, 'locus'], observed = True)['productive'].value_counts(normalize=True).reset_index(name = 'prop')
df = df.groupby(['donor', 'age_group', col_cell_type_fine, 'locus','productive'], observed = True).agg(mean_prop = ('prop', 'mean')).reset_index()
df = df.loc[~df[col_cell_type_fine].isin(['T_DN(early)', 'T_DN(late)', 'T_Treg(agonist)'])]

df

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot, get_tint_palette, calc_figsize, thyAgeing_colors

# Keep only the rows describing the productive fraction (productive == True)
df = df.loc[df['productive']]

# Developmental populations on the x axis, minus the three populations dropped from df above
populations_not_shown = ['T_DN(early)', 'T_DN(late)', 'T_Treg(agonist)']
population_order = [c for c in t_nk_groupings['dev'] if c not in populations_not_shown]

# One facet per locus (TRB on top, TRA below), boxes grouped by age group
p = plot_faceted_grouped_boxplot(
    data = df,
    x = col_cell_type_fine,
    y = 'mean_prop',
    hue = col_age_group,
    hue_order = col_age_group_levels,
    order = population_order,
    facet_kwargs = dict(col = 'locus', col_wrap = 1, col_order = ['TRB', 'TRA']),
    figsize = calc_figsize(width = 40, height = 60),
    add_stats = True,
    format_percent = True,
    format_log = False,
    x_label = 'Cell population',
    y_label = 'Prop. prod. rearr.',
    legend_title = 'Age group',
    ylim = None,
    palette = get_tint_palette(thyAgeing_colors['magenta']),
)
p.tight_layout()
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_propProd.pdf', bbox_inches='tight', dpi=300)

In [None]:
# One grouped boxplot per locus (TRA, then TRB): proportion of productive rearrangements
# per cell population, split by age group. Statistics and figure are written per locus.
populations_present = set(df[col_cell_type_fine].tolist())
population_order = [c for c in col_cell_type_fine_levels if c in populations_present]

for locus in ('TRA', 'TRB'):
    locus_df = df.loc[(df['productive']) & (df['locus'] == locus)]
    plot_grouped_boxplot(
        data = locus_df,
        x = col_cell_type_fine,
        y = 'mean_prop',
        hue = col_age_group,
        order = population_order,
        hue_order = col_age_group_levels,
        x_label = 'Cell population',
        y_label = f'Proportion of prod {locus}',
        legend_title = 'Age group',
        add_stats = True,
        format_percent = True,
        figsize = calc_figsize(width = 100, height = 50),
        save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_propProp_',
    )
    plt.xticks(rotation=45, ha='right', y=0.03)
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_propProp.pdf', bbox_inches='tight', dpi=300)

### Number of rearrangements per cell

In [None]:
from plotting.utils import thyAgeing_colors, get_tint_palette, get_chroma_palette, plot_faceted_grouped_boxplot

In [None]:
# Mean number of TRA / TRB rearrangements per cell (all, productive, non-productive),
# averaged per sample and then per donor.
id_cols = ['donor', col_age_group,'sample', col_cell_type_fine]
df = adata.obs[id_cols + ['productive_VDJ', 'productive_VJ']].copy()

# Column suffix -> regex counted in the productive_* strings
# ('T' and 'F' entries are counted; 'T|F' counts both)
count_patterns = {'': 'T|F', '_prod': 'T', '_nonprod': 'F'}
for suffix, pattern in count_patterns.items():
    df[f'TRB{suffix}'] = df['productive_VDJ'].str.count(pattern)
    df[f'TRA{suffix}'] = df['productive_VJ'].str.count(pattern)

count_cols = ['TRA', 'TRB', 'TRA_prod', 'TRB_prod', 'TRA_nonprod', 'TRB_nonprod']
df = df.melt(id_vars = id_cols, value_vars = count_cols, var_name = 'locus', value_name = 'n_rearr')

# Restrict to the four populations analysed in this section
populations_of_interest = ['T_DN(Q)', 'T_DP(P)', 'T_DP(Q)', 'T_αβT(entry)']
df = df.loc[df[col_cell_type_fine].isin(populations_of_interest)]

# Mean per sample first, then mean of the sample means per donor
per_sample = df.groupby(id_cols + ['locus'], observed = True)['n_rearr'].mean().reset_index(name = 'n_rearr')
df = per_sample.groupby(['donor', col_age_group, col_cell_type_fine, 'locus'], observed = True).agg(mean_n = ('n_rearr', 'mean')).reset_index()
df

All rearrangements:

In [None]:
# All rearrangements: one grouped boxplot per locus (TRA, then TRB)
populations_present = set(df[col_cell_type_fine].tolist())
population_order = [c for c in col_cell_type_fine_levels if c in populations_present]

for locus in ('TRA', 'TRB'):
    # `p` ends up holding the result of the last (TRB) plot
    p = plot_grouped_boxplot(
        data = df.loc[(df['locus'] == locus)],
        x = col_cell_type_fine,
        y = 'mean_n',
        order = population_order,
        hue = col_age_group,
        hue_order = col_age_group_levels,
        palette = get_tint_palette(thyAgeing_colors['magenta']),
        x_label = 'Cell population',
        y_label = f'N(all) {locus}',
        legend_title = 'Age group',
        add_stats = True,
        format_percent = False,
        figsize = calc_figsize(width = 50, height = 40),
        save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nRearr_',
        legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
    )
    plt.xticks(rotation=45, ha='right', y=0.03)
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nRearr.pdf', bbox_inches='tight', dpi=300)

Non-productive rearrangements:

In [None]:
# Non-productive rearrangements: one grouped boxplot per locus (TRA, then TRB).
# The legend is removed and the y axis fixed to (0, 1) for both plots.
populations_present = set(df[col_cell_type_fine].tolist())
population_order = [c for c in col_cell_type_fine_levels if c in populations_present]

for locus in ('TRA', 'TRB'):
    plot_grouped_boxplot(
        data = df.loc[(df['locus'] == f'{locus}_nonprod')],
        x = col_cell_type_fine,
        y = 'mean_n',
        hue = col_age_group,
        order = population_order,
        hue_order = col_age_group_levels,
        x_label = 'Cell population',
        y_label = f'N(non-prod) {locus}',
        legend_title = 'Age group',
        add_stats = True,
        format_percent = False,
        figsize = calc_figsize(width = 50, height = 40),
        save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nNonProdRearr_',
        palette = get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
        remove_legend = True,
        ylim = (0,1),
    )
    plt.xticks(rotation=45, ha='right', y=0.03)
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nNonProdRearr.pdf', bbox_inches='tight', dpi=300)

Productive rearrangements:

In [None]:
# Productive rearrangements: one grouped boxplot per locus (TRA, then TRB)
populations_present = set(df[col_cell_type_fine].tolist())
population_order = [c for c in col_cell_type_fine_levels if c in populations_present]

for locus in ('TRA', 'TRB'):
    # `p` ends up holding the result of the last (TRB) plot
    p = plot_grouped_boxplot(
        data = df.loc[(df['locus'] == f'{locus}_prod')],
        x = col_cell_type_fine,
        y = 'mean_n',
        order = population_order,
        hue = col_age_group,
        hue_order = col_age_group_levels,
        palette = get_tint_palette(thyAgeing_colors['magenta']),
        x_label = 'Cell population',
        y_label = f'N(prod) {locus}',
        legend_title = 'Age group',
        add_stats = True,
        format_percent = False,
        figsize = calc_figsize(width = 50, height = 40),
        save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nProdRearr_',
        legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'},
    )
    plt.xticks(rotation=45, ha='right', y=0.03)
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{locus}_nProdRearr.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Faceted overview (TRB on top, TRA below) of productive and non-productive rearrangement counts.
#
# The plots need a `locus_status` column ('prod' / 'nonprod') and a `locus` column reduced
# to 'TRA' / 'TRB'. They are derived here on a copy, so that `df` keeps its combined
# 'TRA_prod'-style labels and the cell can be re-run without breaking the cells above.
if 'locus_status' in df.columns:
    # df was already split in this session; use it as is
    status_df = df
else:
    status_df = df.copy()
    # 'TRA_prod' -> ('TRA', 'prod'); the totals 'TRA' / 'TRB' get a missing status
    status_df[['locus', 'locus_status']] = status_df['locus'].str.split('_', expand = True)
    status_df = status_df.loc[status_df['locus_status'].notna()]

# Productive rearrangements
p = plot_faceted_grouped_boxplot(data = status_df.loc[status_df['locus_status'] == 'prod'], x = col_cell_type_fine, y = 'mean_n', order = ['T_DN(Q)', 'T_DP(P)', 'T_DP(Q)', 'T_αβT(entry)'],
                             facet_kwargs = dict(col = 'locus', col_wrap = 1, col_order = ['TRB', 'TRA']),
                             plot_kwargs = dict(sharex = True),
                             hue = col_age_group, hue_order = col_age_group_levels, palette = get_tint_palette(thyAgeing_colors['magenta']),
                             add_stats = True,
                             format_percent = False, format_log = False, ylim = None, figsize = calc_figsize(width = 40, height = 65),
                             x_label = 'Cell population', y_label = 'N(prod)', legend_title='Age group',
                             )
p.tight_layout()
# Set the y label on every facet explicitly
for ax in p.axes.flat:
    ax.set_ylabel('N(prod)')
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_nProdRearr.pdf', bbox_inches='tight', dpi=300)

# Non-productive rearrangements
p = plot_faceted_grouped_boxplot(data = status_df.loc[status_df['locus_status'] == 'nonprod'], x = col_cell_type_fine, y = 'mean_n', order = ['T_DN(Q)', 'T_DP(P)', 'T_DP(Q)', 'T_αβT(entry)'],
                             facet_kwargs = dict(col = 'locus', col_wrap = 1, col_order = ['TRB', 'TRA']),
                             hue = col_age_group, hue_order = col_age_group_levels, palette  = get_tint_palette(thyAgeing_colors['magenta']),
                             add_stats = True,
                             format_percent = False, format_log = False, figsize = calc_figsize(width = 40, height = 65),
                             x_label = 'Cell population', y_label = 'N(non-prod)', legend_title='Age group',
                             ylim = None,
                             )
p.tight_layout()
for ax in p.axes.flat:
    ax.set_ylabel('N(non-prod)')
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_nNonProdRearr.pdf', bbox_inches='tight', dpi=300)

In [None]:
import matplotlib.pyplot as plt

# Plot all colors: one unit-wide swatch per palette entry with its name printed underneath
fig, ax = plt.subplots(figsize=(10, 2))
n_swatches = len(thyAgeing_colors)
for position, color_name in enumerate(thyAgeing_colors):
    swatch = plt.Rectangle((position, 0), 1, 1, color=thyAgeing_colors[color_name])
    ax.add_patch(swatch)
    ax.text(position + 0.5, -0.5, color_name, ha='center', va='center', fontsize=10)

# Swatches occupy y in [0, 1]; the band below zero holds the labels
ax.set_xlim(0, n_swatches)
ax.set_ylim(-1, 1)
ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
from plotting.utils import create_diverging_palette
# Preview a teal -> orange diverging colormap built from the project colors
low_color, high_color = thyAgeing_colors['teal'], thyAgeing_colors['orange']
create_diverging_palette(low_color, high_color, s = 90, l = 60, as_cmap = True)

In [None]:
# Preview: teal -> yellow -> orange -> magenta blend
blend_colors = [thyAgeing_colors[name] for name in ('teal', 'yellow', 'orange', 'magenta')]
sns.blend_palette(blend_colors, as_cmap=True, n_colors=10)

In [None]:
# Preview: yellow -> orange -> magenta blend
blend_colors = [thyAgeing_colors[name] for name in ('yellow', 'orange', 'magenta')]
sns.blend_palette(blend_colors, as_cmap=True, n_colors=10)

In [None]:
# Preview: blue -> purple -> magenta -> orange -> yellow blend
blend_colors = [thyAgeing_colors[name] for name in ('blue', 'purple', 'magenta', 'orange', 'yellow')]
sns.blend_palette(blend_colors, as_cmap=True, n_colors=10)

In [None]:
# Preview: teal -> white -> orange blend (white midpoint)
blend_colors = [thyAgeing_colors['teal'], 'white', thyAgeing_colors['orange']]
sns.blend_palette(blend_colors, as_cmap=True, n_colors=10)

## VDJ usage

### Recently matured T cells

Type-agnostic:

In [None]:
# Gene usage in recently matured T cells, pooled over the three populations (type-agnostic)
gene_call_cols = ['v_call_abT_VDJ_main', 'd_call_abT_VDJ_main','j_call_abT_VDJ_main', 'v_call_abT_VJ_main', 'j_call_abT_VJ_main']
sample_cols = ['donor', col_age_group,'sample']
recent_t_populations = ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']

df = adata.obs[sample_cols + [col_cell_type_fine] + gene_call_cols].copy()
df = df.loc[df[col_cell_type_fine].isin(recent_t_populations)]

# Long format: one row per cell and gene segment column
df = df.melt(id_vars = sample_cols, value_vars = gene_call_cols, var_name = 'locus', value_name = 'gene')

# Gene frequency within each sample and segment, then averaged over the samples of a donor
per_sample_freq = df.groupby(sample_cols + ['locus'], observed = True)['gene'].value_counts(normalize=True).reset_index(name = 'prop')
df = per_sample_freq.groupby(['donor', col_age_group, 'locus','gene'], observed = True).agg(mean_prop = ('prop', 'mean')).reset_index()

# Keep genes above 1 % mean frequency and drop the placeholder calls
above_min_freq = df['mean_prop'] > 0.01
is_placeholder = df['gene'].isin(['None', 'No_contig'])
df = df.loc[above_min_freq & ~is_placeholder]
df.head()

In [None]:
# Create VDJ gene order
vdj_order = df[['locus', 'gene']].drop_duplicates()
vdj_order['chain'] = vdj_order['locus'].apply(lambda x: 'TRA' if 'VJ' in x else 'TRB' if 'VDJ' in x else pd.NA)
vdj_order.dropna(inplace=True)
vdj_order['keep'] = vdj_order.apply(lambda x: True if 'No' in x['gene'] else True if x['chain'] in x['gene'] else False, axis=1)
vdj_order = vdj_order.loc[vdj_order['keep']]
vdj_order['gene_n1'] = vdj_order['gene'].str.extract(r'(\d+)-?')
vdj_order['gene_n2'] = vdj_order['gene'].str.extract(r'-(\d+)')
vdj_order.sort_values(['locus', 'gene_n1', 'gene_n2'], inplace=True)
vdj_order = vdj_order.groupby('locus')['gene'].apply(lambda x: x.tolist()).to_dict()

In [None]:
# One usage boxplot per gene segment column: gene frequency by age group.
for locus in df['locus'].unique():

    # A segment whose genes were all filtered out has no entry in vdj_order; skip it
    # instead of failing with a KeyError.
    if locus not in vdj_order:
        print(f'Skipping {locus}: no genes to plot')
        continue
    gene_order = vdj_order[locus]

    # Short segment name for file names, e.g. 'v_call_abT_VDJ_main' -> 'TRBV'
    gene_short = 'TRA' if 'VJ' in locus else 'TRB'
    gene_short += locus.split('_')[0].upper()
    # The x axis shows genes, so it is labelled 'Gene' (was 'Cell population');
    # the figure width scales with the number of genes shown.
    plot_grouped_boxplot(data = df.loc[(df['locus'] == locus)], x = 'gene', y = 'mean_prop', order = gene_order, hue = col_age_group, hue_order = col_age_group_levels,
                         palette = get_tint_palette(thyAgeing_colors['magenta']), add_jitter = False,
                         x_label = 'Gene', y_label = 'Frequency', legend_title = 'Age group', add_stats = True, format_percent = True, figsize = calc_figsize(width = len(gene_order)*5, height = 40),
                         save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_{gene_short}_freq_'
                         )
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_{gene_short}_freq.pdf', bbox_inches='tight', dpi=300)

#### PCA

In [None]:
# Build the donor x cell-type gene-usage matrix used as PCA input.
min_n_cells = 10
recent_t_populations = ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']

# Cells per donor and population. observed = True (as in every other groupby of this
# notebook) restricts the counts to combinations that actually occur; without it,
# categorical groupers add zero-count rows for unobserved donor / age group / population
# combinations, and those rows would flag donors for removal.
n_cells_df = adata.obs[['donor', col_age_group, col_cell_type_fine]].groupby(['donor', col_age_group, col_cell_type_fine], observed = True).size().reset_index(name='n_cells')
n_cells_df = n_cells_df.loc[n_cells_df[col_cell_type_fine].isin(recent_t_populations)]
# A donor is removed entirely when any of its observed populations has too few cells
donors_removed = n_cells_df.loc[n_cells_df['n_cells'] < min_n_cells, 'donor'].unique()
print(f"Removing cells from donors: {donors_removed}")

# Extract VDJ usage
gene_call_cols = ['v_call_abT_VDJ_main', 'd_call_abT_VDJ_main','j_call_abT_VDJ_main', 'v_call_abT_VJ_main', 'j_call_abT_VJ_main']
df = adata.obs[['donor', col_age_group,'sample', col_cell_type_fine] + gene_call_cols].copy()
df = df.loc[(df[col_cell_type_fine].isin(recent_t_populations)) & (~df['donor'].isin(donors_removed))]
df = df.melt(id_vars = ['donor', col_age_group,'sample', col_cell_type_fine], value_vars = gene_call_cols, var_name = 'locus', value_name = 'gene')
# Gene frequency per sample / population / segment, then averaged over the samples of a donor
df = df.groupby(['donor', col_age_group,'sample',col_cell_type_fine,'locus'], observed = True)['gene'].value_counts(normalize=True).reset_index(name = 'prop')
df = df.groupby(['donor', col_age_group, col_cell_type_fine, 'locus','gene'], observed = True).agg(mean_prop = ('prop', 'mean')).reset_index()
# Keep genes above 1 % mean frequency and drop the placeholder calls
df = df.loc[(df['mean_prop'] > 0.01) & (~df['gene'].isin(['None', 'No_contig']))]

# Wide format: one row per donor / age group / population, one column per gene (0 where unused)
vdj_usage_df = df.pivot_table(index=['donor', col_age_group, col_cell_type_fine], columns='gene', values='mean_prop').fillna(0)
vdj_usage_df

In [None]:
# Number of PC pairs plotted in the next cell. Computed directly here because `pairs`
# is only defined in that later cell, so `len(pairs)` fails when the notebook is run
# top to bottom.
import itertools
len(list(itertools.combinations(range(5), 2)))

In [None]:
from sklearn.decomposition import PCA
import itertools
from matplotlib.backends.backend_pdf import PdfPages


def _plot_pc_pairs(plot_df, pairs, hue, palette, legend_title):
    """Scatter every PC pair on a 3 x 4 grid, coloured by `hue`, with one shared legend.

    Args:
        plot_df: frame with one 'PC<k>' column per component plus the `hue` column.
        pairs: zero-based (i, j) component index pairs, one panel each.
        hue: column of `plot_df` used for colouring.
        palette: palette name passed to seaborn.
        legend_title: title of the shared figure legend.

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(3, 4, figsize=calc_figsize(width=180, height=120))
    axes = axes.flatten()
    handles, labels = None, None
    for ax, (i, j) in zip(axes, pairs):
        # Named `panel` (not `sc`) so the `scanpy as sc` import is not overwritten
        panel = sns.scatterplot(data=plot_df, x=f'PC{i+1}', y=f'PC{j+1}', hue=hue, palette=palette, ax=ax, legend=True, s = 10)
        ax.set_title(f'PC{i+1} vs PC{j+1}')
        # Take the legend entries from the first panel, then drop the per-panel legends
        if handles is None or labels is None:
            handles, labels = panel.get_legend_handles_labels()
        panel.get_legend().remove()
    # Hide grid cells that have no PC pair (12 panels for 10 pairs)
    for unused_ax in axes[len(pairs):]:
        unused_ax.set_visible(False)
    # Leave the right 15 % of the figure free for the shared legend
    fig.tight_layout(rect=[0, 0, 0.85, 1], pad=0.5)
    fig.legend(handles, labels, title=legend_title, loc='center left', bbox_to_anchor=(0.88, 0.5))
    return fig


# Perform PCA
n_pcs = 5
pca = PCA(n_components=n_pcs)
pca_result = pca.fit_transform(vdj_usage_df.values)

# Plot all pairwise combinations of the first five PCs (10 pairs)
pairs = list(itertools.combinations(range(n_pcs), 2))

# Create a DataFrame for plotting; age group and cell type are read from the index levels
# of vdj_usage_df, addressed through the column-name variables used to build that index
plot_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(n_pcs)], index=vdj_usage_df.index)
plot_df['age_group'] = plot_df.index.get_level_values(col_age_group)
plot_df['age_group'] = pd.Categorical(plot_df['age_group'], categories=col_age_group_levels, ordered=True)
plot_df['cell_type'] = plot_df.index.get_level_values(col_cell_type_fine)
plot_df['cell_type'] = pd.Categorical(plot_df['cell_type'], categories=col_cell_type_fine_levels, ordered=True)
plot_df['cell_type'] = plot_df['cell_type'].cat.remove_unused_categories()

# Two pages: PCs coloured by age group, then by cell type
with PdfPages(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_vdjUsage_pca.pdf') as pdf:
    # Age group plot
    fig = _plot_pc_pairs(plot_df, pairs, hue='age_group', palette='Set2', legend_title='Age group')
    pdf.savefig(fig)

    # Cell type plot
    fig = _plot_pc_pairs(plot_df, pairs, hue='cell_type', palette='tab10', legend_title='Cell type')
    pdf.savefig(fig)

In [None]:
from plotting.utils import thyAgeing_colors
# Plot proportion of variance explained per principal component
explained_var = pca.explained_variance_ratio_
pc_labels = [f'PC{rank}' for rank in range(1, len(explained_var) + 1)]

plt.figure(figsize=(6, 4))
ax = sns.barplot(x=pc_labels, y=explained_var, color=thyAgeing_colors['magenta'])
ax.set_ylabel('Proportion of variance explained')
ax.set_xlabel('Principal Component')
ax.set_title('Variance Explained by Principal Components')
# Fixed 0-1 axis so the bars read as proportions of the total variance
ax.set_ylim(0, 1)
plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import f_oneway, kruskal

# Kruskal-Wallis test of the first five principal components against the two
# categorical covariates stored in plot_df (age_group and cell_type).
results = []
for pc in [f'PC{k}' for k in range(1, 6)]:
    record = {'PC': pc}
    for factor in ('age_group', 'cell_type'):
        # One array of PC scores per level of the covariate
        samples = [plot_df.loc[plot_df[factor] == level, pc].values for level in plot_df[factor].unique()]
        # Only run the test when every level has more than one observation
        if all(len(s) > 1 for s in samples):
            kw_stat, kw_p = kruskal(*samples)
        else:
            kw_stat, kw_p = None, None
        record[f'{factor}_KW_stat'] = kw_stat
        record[f'{factor}_KW_p'] = kw_p
    results.append(record)

results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# One PDF page per principal component: the ten most negative loadings (left panel)
# next to the ten most positive loadings (right panel).
with PdfPages(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_vdjUsage_pca_loadings.pdf') as pdf:

    for pc_num, pc_oi in enumerate(['PC1', 'PC2', 'PC3', 'PC4', 'PC5']):
        pc_loadings = pd.Series(pca.components_[pc_num], index=vdj_usage_df.columns)
        pc_loadings_sorted = pc_loadings.sort_values(ascending=False)

        # row_number is the 0-based rank within each panel and doubles as the bar position
        pos_loadings = pc_loadings_sorted.head(10).to_frame(name='loading').reset_index().reset_index(names='row_number')
        neg_loadings = (
            pc_loadings_sorted.tail(10)
            .to_frame(name='loading')
            .sort_values(by='loading', ascending=True)
            .reset_index()
            .reset_index(names='row_number')
        )

        fig, axes = plt.subplots(1, 2, figsize=calc_figsize(width = 80, height = 60), gridspec_kw={'wspace': 0})

        # (axes, table, title word, x limits, label offset from bar end, label alignment)
        panels = [
            (axes[0], neg_loadings, 'Negative', (-0.5, 0), -0.01, 'right'),
            (axes[1], pos_loadings, 'Positive', (0, 0.5), 0.01, 'left'),
        ]
        for ax, loadings, sign, xlim, offset, align in panels:
            sns.barplot(data=loadings, x='loading', y='row_number', color=thyAgeing_colors['magenta'], orient='h', ax=ax)
            ax.set_title(f'Top 10 {sign} {pc_oi} Loadings')
            # Tick labels are replaced by gene names written next to each bar
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_xlim(*xlim)  # Adjust x-axis limit for better visibility
            ax.set_ylabel('')
            # NOTE(review): assumes the feature index of vdj_usage_df.columns is named 'gene' - confirm
            for i, row in loadings.iterrows():
                ax.text(row['loading'] + offset, i, row['gene'], va='center', ha=align, fontsize=6)

        plt.tight_layout()
        pdf.savefig(fig)

### Pre-selection (abT entry)

In [None]:
# V(D)J gene usage in T_αβT(entry) cells: gene proportions are computed per sample and locus,
# averaged over the samples of each donor, and then plotted per gene split by age group.
df = adata.obs[['donor', col_age_group,'sample', col_cell_type_fine, 'v_call_abT_VDJ_main', 'd_call_abT_VDJ_main','j_call_abT_VDJ_main', 'v_call_abT_VJ_main', 'j_call_abT_VJ_main']].copy()
df = df.loc[df[col_cell_type_fine].isin(['T_αβT(entry)'])]
df = df.melt(id_vars = ['donor', col_age_group,'sample'], value_vars=['v_call_abT_VDJ_main', 'd_call_abT_VDJ_main', 'j_call_abT_VDJ_main', 'v_call_abT_VJ_main', 'j_call_abT_VJ_main'], var_name = 'locus', value_name = 'gene')
df = df.groupby(['donor', col_age_group,'sample','locus'], observed = True)['gene'].value_counts(normalize=True).reset_index(name = 'prop')
df = df.groupby(['donor', col_age_group, 'locus','gene'], observed = True).agg(mean_prop = ('prop', 'mean')).reset_index()
# Keep genes with a mean usage above 1% and drop the placeholder calls
df = df.loc[(df['mean_prop'] > 0.01) & (~df['gene'].isin(['None', 'No_contig']))]

# Create VDJ gene order
vdj_order = df[['locus', 'gene']].drop_duplicates()
vdj_order['chain'] = vdj_order['locus'].apply(lambda x: 'TRA' if 'VJ' in x else 'TRB' if 'VDJ' in x else pd.NA)
vdj_order.dropna(inplace=True)
# Keep genes that belong to the chain of their call column (or contain 'No')
vdj_order['keep'] = vdj_order.apply(lambda x: ('No' in x['gene']) or (x['chain'] in x['gene']), axis=1)
vdj_order = vdj_order.loc[vdj_order['keep']].copy()
# Sort numerically: str.extract returns strings, which would otherwise sort
# lexicographically (e.g. '10' before '2'). Genes without a '-N' suffix get 0.
vdj_order['gene_n1'] = pd.to_numeric(vdj_order['gene'].str.extract(r'(\d+)-?', expand=False))
vdj_order['gene_n2'] = pd.to_numeric(vdj_order['gene'].str.extract(r'-(\d+)', expand=False)).fillna(0)
vdj_order.sort_values(['locus', 'gene_n1', 'gene_n2'], inplace=True)
vdj_order = vdj_order.groupby('locus')['gene'].apply(lambda x: x.tolist()).to_dict()

for locus in df['locus'].unique():

    # Short label used in the output file names, e.g. 'v_call_abT_VDJ_main' -> 'TRBV'
    gene_short = 'TRA' if 'VJ' in locus else 'TRB' 
    gene_short += locus.split('_')[0].upper()
    # The x axis shows genes, so label it accordingly
    plot_grouped_boxplot(data = df.loc[(df['locus'] == locus)], x = 'gene', y = 'mean_prop', order = vdj_order[locus], hue = col_age_group, hue_order = col_age_group_levels, 
                         palette = get_tint_palette(thyAgeing_colors['magenta']), add_jitter = False,
                        x_label = 'Gene', y_label = 'Frequency', legend_title = 'Age group', add_stats = True, format_percent = True, figsize = calc_figsize(width = len(vdj_order[locus])*5, height = 40),
                        save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_abTentrySplit_{gene_short}_freq_'
                        )
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_abTentrySplit_{gene_short}_freq.pdf', bbox_inches='tight', dpi=300)

### All stages (heatmap)

In [None]:
# Mean V(D)J gene usage per age group and developmental stage (input for the heatmaps)
dev_stages = ['T_DN(Q)','T_DP(P)','T_DP(Q)','T_αβT(entry)','T_CD8_naive','T_CD4_naive','T_Treg']
locus_map = {'v_call_abT_VDJ_main': 'TRBV', 'd_call_abT_VDJ_main': 'TRBD', 'j_call_abT_VDJ_main': 'TRBJ', 'v_call_abT_VJ_main': 'TRAV', 'j_call_abT_VJ_main': 'TRAJ'}
call_cols = list(locus_map)
meta_cols = ['donor', col_age_group, 'sample', col_cell_type_fine]

df = adata.obs[meta_cols + call_cols].copy()
df = df.loc[df[col_cell_type_fine].isin(dev_stages)]
df = df.melt(id_vars=meta_cols, value_vars=call_cols, var_name='locus', value_name='gene')

# Replace call-column names by locus labels, then drop every row with a missing value
df['locus'] = df['locus'].map(locus_map)
df.dropna(inplace=True)

# Gene proportions per sample/stage/locus -> mean over samples per donor -> mean over donors per age group
df = df.groupby(meta_cols + ['locus'], observed=True)['gene'].value_counts(normalize=True).reset_index(name='prop')
df = df.groupby(['donor', col_age_group, col_cell_type_fine, 'locus', 'gene'], observed=True).agg(mean_prop=('prop', 'mean')).reset_index()
df = df.groupby([col_age_group, col_cell_type_fine, 'locus', 'gene'], observed=True).agg(mean_prop=('mean_prop', 'mean')).reset_index()

# Keep genes whose name starts with their locus label and exclude the 'aged' group
gene_matches_locus = df.apply(lambda row: row['gene'].startswith(row['locus']), axis=1)
df = df.loc[gene_matches_locus]
df = df.loc[df[col_age_group] != 'aged']

# Fix the display order of loci and stages
df['locus'] = pd.Categorical(df['locus'], categories=['TRBV', 'TRBD', 'TRBJ', 'TRAV', 'TRAJ'], ordered=True)
df[col_cell_type_fine] = pd.Categorical(df[col_cell_type_fine], categories=dev_stages, ordered=True)

df.head()

In [None]:
# Create VDJ gene order
vdj_order = df[['locus', 'gene']].drop_duplicates()
vdj_order['chain'] = vdj_order['locus'].apply(lambda x: 'TRA' if 'TRA' in x else 'TRB' if 'TRB' in x else pd.NA)
vdj_order.dropna(inplace=True)
vdj_order['keep'] = vdj_order.apply(lambda x: True if 'No' in x['gene'] else True if x['chain'] in x['gene'] else False, axis=1)
vdj_order = vdj_order.loc[vdj_order['keep']]
vdj_order['gene_n1'] = vdj_order['gene'].str.extract(r'(\d+)-?').astype(int)
vdj_order['gene_n2'] = vdj_order['gene'].str.extract(r'-(\d+)').replace(pd.NA, 0).astype(int)
vdj_order.sort_values(['locus', 'gene_n1', 'gene_n2'], inplace=True)
vdj_order = vdj_order.groupby('locus')['gene'].apply(lambda x: x.tolist()).to_dict()

# Reverse order for TRA J gene
vdj_order['TRAJ'] = vdj_order['TRAJ'][::-1]

In [None]:
# Scratch check: 95th percentile of mean_prop over the loci starting with `loc` and the cell
# types present in `df_sub` (the same expression sets the colour-scale maximum in the heatmap cell).
# NOTE(review): `loc` and `df_sub` are assigned in the heatmap cell below - run that cell first.
df.loc[(df['locus'].str.startswith(loc)) & (df[col_cell_type_fine].isin(df_sub[col_cell_type_fine].tolist()))]['mean_prop'].quantile(0.95)


In [None]:
# One figure per age group: a row of heatmaps (one panel per locus of the chain selected
# by `loc`) showing the mean gene usage per cell population.
loc = 'TRB'
for age in ['adult', 'paed', 'infant']:

    # Usage values of the selected chain for this age group, without the 'No_contig' placeholder
    df_sub = df.loc[(df['locus'].str.startswith(loc)) & (df['gene'] != 'No_contig') & (df[col_age_group] == age)].copy()

    if loc == 'TRA':
        df_sub = df_sub.loc[~df_sub[col_cell_type_fine].isin(['T_DN(Q)', 'T_DP(P)'])]

    # Nothing to draw for this age group: skip instead of failing on an empty panel layout
    if df_sub.empty:
        print(f'No {loc} gene usage found for age group {age}, skipping heatmap')
        continue

    # Panel widths proportional to the number of genes per locus; the last panel gets extra width.
    # .iloc is used because the Series is indexed by locus label, not by position.
    width_ratios = df_sub.groupby('locus', observed=True)['gene'].nunique()
    width_ratios.iloc[-1] += 10
    fig, axes = plt.subplots(1, width_ratios.shape[0], figsize=calc_figsize(width='double', height=50), 
                            gridspec_kw={'width_ratios': width_ratios.tolist(), 'wspace': 0.05})
    # plt.subplots returns a bare Axes for a single panel; always work with an array
    axes = np.atleast_1d(axes)

    # Define loci
    loci = width_ratios.index.tolist()

    # Create a colormap normalized between 0 and the 95th percentile of the mean proportion for the selected locus
    # (computed over all age groups in df so that the figures share one colour scale)
    cmap = sns.blend_palette([thyAgeing_colors['yellow'], thyAgeing_colors['orange'], thyAgeing_colors['magenta']], as_cmap=True)
    norm = plt.Normalize(vmin=0, vmax=df.loc[(df['locus'].str.startswith(loc)) & (df[col_cell_type_fine].isin(df_sub[col_cell_type_fine].tolist()))]['mean_prop'].quantile(0.95))

    # Plot each locus
    for i, locus in enumerate(loci):
        df_hm = df_sub[df_sub['locus'] == locus].pivot_table(index=col_cell_type_fine, columns='gene', values='mean_prop', fill_value=0)
        # Fixed gene order; genes missing in this subset are shown as 0
        df_hm = df_hm.reindex(vdj_order[locus], axis=1, fill_value=0)
        sns.heatmap(df_hm, ax=axes[i], cmap=cmap, norm=norm, cbar=False)

        axes[i].set_title(f'{locus}', fontweight='bold')
        axes[i].set_xlabel('Gene')
        axes[i].set_ylabel('Cell population')
        axes[i].tick_params(axis='x', rotation=90)
        axes[i].tick_params(axis='y', rotation=0)
        # Only the first panel keeps the cell-population labels
        if i > 0:
            axes[i].set_ylabel('')
            axes[i].set_yticklabels([])
            axes[i].set_yticks([])

    # Set outer panel borders as black lines
    for ax in axes:
        for _, spine in ax.spines.items():
            spine.set_visible(True)
            spine.set_color('black')

    # Add a colorbar to the right of the last heatmap
    cbar = fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), ax=axes, orientation='vertical', fraction=0.05, pad=0.0)
    cbar.set_label('Mean Proportion', rotation=270, labelpad=5)
    # Pin the tick positions before relabelling them as percentages, otherwise the fixed
    # labels can drift away from the ticks chosen at draw time
    cbar_ticks = [t for t in cbar.get_ticks() if norm.vmin <= t <= norm.vmax]
    cbar.set_ticks(cbar_ticks)
    cbar.set_ticklabels([f'{round(c * 100)}%' for c in cbar_ticks])
    fig.subplots_adjust(right=0.85)

    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_devStages_{loc}_{age}_comparison.pdf', bbox_inches='tight', dpi=300)
    plt.show()

### All stages (test statistic)

In [None]:
# Per-donor mean V(D)J gene usage per developmental stage (input for the rank-sum tests)
locus_map = {'v_call_abT_VDJ_main': 'TRBV', 'd_call_abT_VDJ_main': 'TRBD', 'j_call_abT_VDJ_main': 'TRBJ', 'v_call_abT_VJ_main': 'TRAV', 'j_call_abT_VJ_main': 'TRAJ'}
stage_levels = ['T_DN(Q)','T_DP(P)','T_DP(Q)','T_αβT(entry)','T_CD8_naive','T_CD4_naive','T_Treg']
obs_id_cols = ['donor', col_age_group, 'sample', col_cell_type_fine]

df = adata.obs[obs_id_cols + list(locus_map)].copy()
in_stage = df[col_cell_type_fine].isin(stage_levels)
df = df.loc[in_stage].melt(id_vars=obs_id_cols, value_vars=list(locus_map), var_name='locus', value_name='gene')

# Call-column names -> locus labels; rows with any missing value are removed
df['locus'] = df['locus'].map(locus_map)
df.dropna(inplace=True)

# Gene proportions per sample/stage/locus, then averaged over the samples of each donor
df = (
    df.groupby(obs_id_cols + ['locus'], observed=True)['gene']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
df = (
    df.groupby(['donor', col_age_group, col_cell_type_fine, 'locus', 'gene'], observed=True)
    .agg(mean_prop=('prop', 'mean'))
    .reset_index()
)

# Keep genes whose name starts with their locus label and exclude the 'aged' group
df = df.loc[df.apply(lambda row: row['gene'].startswith(row['locus']), axis=1)]
df = df.loc[df[col_age_group] != 'aged']

df['locus'] = pd.Categorical(df['locus'], categories=['TRBV', 'TRBD', 'TRBJ', 'TRAV', 'TRAJ'], ordered=True)
df[col_cell_type_fine] = pd.Categorical(df[col_cell_type_fine], categories=stage_levels, ordered=True)

df.head()

In [None]:
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu,ranksums

# Rank-sum test of adult vs infant donor-level gene usage for every gene / cell-population pair,
# followed by Benjamini-Hochberg adjustment of the p-values.
per_group = (
    df.loc[df[col_age_group].isin(['adult', 'infant'])]
    .copy()
    .groupby(['gene', col_cell_type_fine, col_age_group], observed=True)['mean_prop']
    .agg(list)
    .reset_index(name='mean_prop')
)
# Each row now holds the list of donor values of one age group; require more than one donor
per_group['n_donors'] = per_group['mean_prop'].apply(len)
per_group = per_group.loc[per_group['n_donors'] > 1]

# One column of value lists per age group
df_test = per_group.pivot_table(index=['gene', col_cell_type_fine], columns=col_age_group, values='mean_prop', aggfunc='first').reset_index()
# Only pairs with a value list in both age groups can be tested
df_test['n_groups'] = df_test.apply(lambda row: sum(isinstance(v, list) for v in (row['adult'], row['infant'])), axis=1)
df_test = df_test.loc[df_test['n_groups'] == 2]

df_test[['stat', 'pval']] = df_test.apply(lambda row: ranksums(row['adult'], row['infant']), axis=1, result_type='expand')
df_test['pval_adj'] = multipletests(df_test['pval'], method='fdr_bh')[1]

# Recover the locus label from the gene name and order it as in the plots
df_test['locus'] = df_test['gene'].str.extract(r'(TR[ABVJD]+)')
df_test['locus'] = pd.Categorical(df_test['locus'], categories=['TRBV', 'TRBD', 'TRBJ', 'TRAV', 'TRAJ'], ordered=True)
df_test

In [None]:
# Summary statistics of the Benjamini-Hochberg adjusted p-values from the rank-sum tests
df_test['pval_adj'].describe() # No significant differences found

In [None]:
# Scratch check of the colour-scale limits.
# NOTE(review): these names are assigned in the rank-sum heatmap cell below - run that cell first.
vmin, vmax, vcenter

In [None]:
from matplotlib import colors as mcolors

# Heatmaps of the adult-vs-infant rank-sum statistic (genes x cell populations), one panel
# per locus of the chain selected by `loc`.
loc = 'TRB'

df_sub = df_test.loc[(df_test['locus'].str.startswith(loc))].copy()
if loc == 'TRA':
    df_sub = df_sub.loc[~df_sub[col_cell_type_fine].isin(['T_DN(Q)', 'T_DP(P)'])] 

# Panel widths proportional to the number of genes per locus
width_ratios = df_sub.groupby('locus', observed=True)['gene'].nunique()
if loc == 'TRA':
    # Extra width for the last TRA panel; .iloc because the Series is indexed by locus label
    width_ratios.iloc[-1] += 40 # TRA

fig, axes = plt.subplots(1, width_ratios.shape[0], figsize=calc_figsize(width=247, height=40), 
                        gridspec_kw={'width_ratios': width_ratios.tolist(), 'wspace': 0.05})
# plt.subplots returns a bare Axes for a single panel; always work with an array
axes = np.atleast_1d(axes)

# Define loci
loci = width_ratios.index.tolist()

# Create a colormap
# Symmetric limits from the largest absolute statistic. The previous
# np.array(|min|, |max|) call passed |max| as the dtype argument, so |max| was never used.
# Rounded up to one decimal so that no value falls outside the colour range.
max_abs = float(np.ceil(df_sub['stat'].abs().max() * 10) / 10)
vmin, vmax, vcenter = -max_abs, max_abs, 0
if df_sub['stat'].min() < 0:
    # Diverging scale centred on 0
    normalize = mcolors.TwoSlopeNorm(vcenter=vcenter, vmin=vmin, vmax=vmax)
    cmap = sns.blend_palette([thyAgeing_colors['teal'],'white',thyAgeing_colors['orange']], as_cmap=True, n_colors=10)
else:   
    normalize = plt.Normalize(vmin=vmin, vmax=vmax)
    cmap = sns.color_palette("Spectral_r", as_cmap=True)

# Plot each locus
for i, locus in enumerate(loci):
    # Gene / cell-population pairs that were not tested are shown as 0
    df_hm = df_sub[df_sub['locus'] == locus].pivot_table(index=col_cell_type_fine, columns='gene', values='stat', fill_value=0)
    df_hm = df_hm.reindex(vdj_order[locus], axis=1, fill_value=0)
    sns.heatmap(df_hm, ax=axes[i], cmap=cmap, norm=normalize, cbar=False)

    axes[i].set_title(f'{locus}', fontweight='bold')
    axes[i].set_xlabel('Gene')
    axes[i].set_ylabel('Cell population')
    axes[i].tick_params(axis='x', rotation=90)
    axes[i].tick_params(axis='y', rotation=0)
    # Only the first panel keeps the cell-population labels
    if i > 0:
        axes[i].set_ylabel('')
        axes[i].set_yticklabels([])
        axes[i].set_yticks([])

# Set outer panel borders as black lines
for ax in axes:
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_color('black')

# Add a colorbar to the right of the last heatmap
cbar = fig.colorbar(plt.cm.ScalarMappable(norm=normalize, cmap=cmap), ax=axes, orientation='vertical', fraction=0.05, pad=0.0)
# Five evenly spaced ticks from vmin to vmax, labelled with one decimal
cbar_ticks = np.linspace(vmin, vmax, 5)
cbar.set_ticks(cbar_ticks)
cbar.set_ticklabels([f'{vmin:.1f}', f'{vmin + (vmax - vmin) / 4:.1f}', f'{vcenter:.1f}', f'{vmax - (vmax - vmin) / 4:.1f}', f'{vmax:.1f}'])
cbar.set_label('Test statistic', rotation=270, labelpad=5)
fig.subplots_adjust(right=0.85)

plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_devStages_{loc}_ranksums.pdf', bbox_inches='tight', dpi=300)

### All stages (variational diversity)

In [None]:
# Cell-level table (metadata + V(D)J calls) restricted to the αβ T cell developmental stages
vdj_call_cols = ['v_call_abT_VDJ_main', 'd_call_abT_VDJ_main', 'j_call_abT_VDJ_main', 'v_call_abT_VJ_main', 'j_call_abT_VJ_main']
stages_oi = ['T_DN(Q)', 'T_DP(P)', 'T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive', 'T_CD4_naive', 'T_Treg']

df = adata.obs[['donor', col_age_group, 'sample', 'sex', col_cell_type_fine] + vdj_call_cols].copy()
stage_mask = df[col_cell_type_fine].isin(stages_oi)
df = df.loc[stage_mask]
df.head()

In [None]:
# Call column in the input table -> locus label
_VDJ_LOCUS_MAP = {'v_call_abT_VDJ_main': 'TRBV', 'd_call_abT_VDJ_main': 'TRBD', 'j_call_abT_VDJ_main': 'TRBJ', 'v_call_abT_VJ_main': 'TRAV', 'j_call_abT_VJ_main': 'TRAJ'}


def _vdj_gene_props(cells, group_cols):
    """Return gene usage proportions per group and locus in long format (column 'prop')."""
    long_df = cells.melt(id_vars=group_cols, value_vars=list(_VDJ_LOCUS_MAP), var_name='locus', value_name='gene')
    long_df['locus'] = long_df['locus'].map(_VDJ_LOCUS_MAP)
    long_df = long_df.loc[long_df['gene'] != 'No_contig']
    return (
        long_df.groupby(group_cols + ['locus'], observed=True)['gene']
        .value_counts(normalize=True)
        .reset_index(name='prop')
    )


def get_bootstrap_rep_distance(data : pd.DataFrame, col_condition : str = 'age_group', ctrl_group : str = 'infant',
                               n_bootstrap : int = 100, n_cells : int = 100, random_state = None):
    """Bootstrap the distance between each donor's V(D)J gene usage and a control profile.

    The control profile is the mean usage proportion of every gene over the donors of
    ``ctrl_group``. In each bootstrap round ``n_cells`` cells are drawn with replacement
    from every donor, the gene usage proportions are recomputed, and the distance is the
    sum of absolute differences to the control profile over the control genes. Genes
    that occur only outside the control group do not contribute.

    Parameters
    ----------
    data
        One row per cell with the columns 'donor', ``col_condition`` and the five
        ``*_call_abT_*_main`` columns. 'No_contig' calls are ignored.
    col_condition
        Column holding the condition that defines the control group.
    ctrl_group
        Value of ``col_condition`` used as control.
    n_bootstrap
        Number of bootstrap rounds.
    n_cells
        Number of cells drawn per donor and round.
    random_state
        Seed for the resampling. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        One row per donor and condition with the columns 'mean_dist' and 'sd_dist'
        (mean and standard deviation of the distance over the bootstrap rounds).
    """
    group_cols = ['donor', col_condition]

    # Nothing to resample: return an empty result with the expected columns
    if data.empty:
        return pd.DataFrame(columns=group_cols + ['mean_dist', 'sd_dist'])

    # Control profile: mean proportion per gene over the control donors (sorted by gene)
    ctrl_props = _vdj_gene_props(data.loc[data[col_condition] == ctrl_group], group_cols)
    ctrl_profile = ctrl_props.groupby('gene', observed=True)['prop'].mean()
    if ctrl_profile.empty:
        warnings.warn(f"No gene calls found for control group '{ctrl_group}'; all distances will be 0.")

    # A seeded generator is advanced across rounds, so rounds differ but the run is reproducible
    rng = None if random_state is None else np.random.default_rng(random_state)

    # Get bootstrap estimates
    distances = []
    for _ in range(n_bootstrap):
        resampled = data.groupby('donor', observed=True).sample(n=n_cells, replace=True, random_state=rng)

        # observed=True keeps only donor/condition combinations that actually occur
        usage = _vdj_gene_props(resampled, group_cols).pivot_table(
            index=group_cols, columns='gene', values='prop', aggfunc='mean', fill_value=0, observed=True
        )

        # Align the columns to the control genes; genes absent from the resample count as 0
        usage = usage.reindex(columns=ctrl_profile.index, fill_value=0)

        # Calculate distance (broadcast the control profile over all donors)
        abs_diff = np.absolute(usage.to_numpy() - ctrl_profile.to_numpy())
        distances.append(pd.DataFrame({'dist': abs_diff.sum(axis=1)}, index=usage.index))

    bootstrap_df = pd.concat(distances).reset_index()
    bootstrap_df = bootstrap_df.groupby(group_cols, observed=True).agg(mean_dist = ('dist', 'mean'), sd_dist = ('dist', 'std')).reset_index()

    return bootstrap_df

In [None]:
# Bootstrap the repertoire distance separately for every cell population (default condition and control group)
rep_dist = {}
for cell_type in df[col_cell_type_fine].unique():
    cells_of_type = df.loc[df[col_cell_type_fine] == cell_type]
    rep_dist[cell_type] = get_bootstrap_rep_distance(data=cells_of_type)

# Stack the per-population tables; the dict key becomes the 'cell_type' column
rep_dist_df = pd.concat(rep_dist).reset_index(names=['cell_type', 'x']).drop(columns='x')
rep_dist_df.head()

In [None]:
# Display the cell-level table that feeds the bootstrap distance calculation
df

In [None]:
# Bootstrapped repertoire distance per cell population, split by age group
populations_present = rep_dist_df['cell_type'].tolist()
population_order = [c for c in col_cell_type_fine_levels if c in populations_present]

plot_grouped_boxplot(
    data=rep_dist_df,
    x='cell_type',
    y='mean_dist',
    order=population_order,
    hue=col_age_group,
    hue_order=col_age_group_levels,
    palette=get_tint_palette(thyAgeing_colors['magenta']),
    x_label='Cell population',
    y_label='TCR distance',
    legend_title='Age group',
    add_stats=True,
    format_percent=False,
    figsize=calc_figsize(width=75, height=45),
    save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devStages_TCR_vdjDistance_',
    legend_kwargs={'bbox_to_anchor': (1.05, 1.1), 'loc': 'upper right', 'ncol': 2},
)
plt.xticks(rotation=45, ha='right', y=0.03)
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_devStages_TCR_vdjDistance.pdf', bbox_inches='tight', dpi=300)

In [None]:
from plotting.utils import thyAgeing_colors, get_tint_palette, plot_grouped_boxplot, calc_figsize

# Same distance plot restricted to the three mature populations
mature_types = ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']
plot_grouped_boxplot(data = rep_dist_df.loc[rep_dist_df['cell_type'].isin(mature_types)], x = 'cell_type', y = 'mean_dist', order = [c for c in col_cell_type_fine_levels if c in mature_types], 
                     hue = col_age_group, hue_order = col_age_group_levels, palette = get_tint_palette(thyAgeing_colors['magenta']),
                     x_label = 'Cell population', y_label = 'TCR distance', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 40, height = 45),
                     # Own prefix, matching the figure name below, so that the statistics of the
                     # all-stages plot (prefix 'thyAgeing_devStages_TCR_vdjDistance_') are not overwritten
                     save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_matureStages_TCR_vdjDistance_',
                     legend_kwargs = {'bbox_to_anchor':(1.05, 1.1), 'loc':'upper right', 'ncol' : 2},
                     )
plt.xticks(rotation=45, ha='right', y=0.03)
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_matureStages_TCR_vdjDistance.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Repertoire distance between male and female (all age groups); male donors form the control profile
rep_dist = {}
for cell_type in df[col_cell_type_fine].unique():
    rep_dist[cell_type] = get_bootstrap_rep_distance(
        data=df.loc[df[col_cell_type_fine] == cell_type],
        col_condition='sex',
        ctrl_group='M',
    )
rep_dist_df = pd.concat(rep_dist).reset_index(names=['cell_type', 'x']).drop(columns='x')
rep_dist_df.head()

populations_present = rep_dist_df['cell_type'].tolist()
plot_grouped_boxplot(
    data=rep_dist_df,
    x='cell_type',
    y='mean_dist',
    order=[c for c in col_cell_type_fine_levels if c in populations_present],
    hue='sex',
    hue_order=['M', 'F'],
    palette=get_tint_palette(thyAgeing_colors['purple'], n=2),
    x_label='Cell population',
    y_label='TCR distance',
    legend_title='Gender',
    add_stats=True,
    format_percent=False,
    figsize=calc_figsize(width=50, height=45),
    legend_kwargs={'bbox_to_anchor': (1.05, 1), 'loc': 'upper left'},
)
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_devStages_TCR_genderEffect_vdjDistance.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Repertoire distance between male and female (only adult); male donors form the control profile
adult_cells = df[col_age_group] == 'adult'
rep_dist = {}
for cell_type in df[col_cell_type_fine].unique():
    rep_dist[cell_type] = get_bootstrap_rep_distance(
        data=df.loc[(df[col_cell_type_fine] == cell_type) & adult_cells],
        col_condition='sex',
        ctrl_group='M',
    )
rep_dist_df = pd.concat(rep_dist).reset_index(names=['cell_type', 'x']).drop(columns='x')

populations_present = rep_dist_df['cell_type'].tolist()
plot_grouped_boxplot(
    data=rep_dist_df,
    x='cell_type',
    y='mean_dist',
    order=[c for c in col_cell_type_fine_levels if c in populations_present],
    hue='sex',
    hue_order=['M', 'F'],
    palette=get_tint_palette(thyAgeing_colors['purple'], n=2),
    x_label='Cell population',
    y_label='TCR distance',
    legend_title='Gender',
    add_stats=True,
    format_percent=False,
    figsize=calc_figsize(width=50, height=45),
    legend_kwargs={'bbox_to_anchor': (1.05, 1), 'loc': 'upper left'},
)
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/vdjUsage/thyAgeing_devStages_TCR_genderEffectAdult_vdjDistance.pdf', bbox_inches='tight', dpi=300)

## CDR3 length

### Across all cell types

In [None]:
# Mean CDR3 / junction lengths per donor, locus and productivity status:
# averaged within each sample first, then over the samples of each donor.
length_sources = {
    'mean_cdr3_length': 'cdr3_length',
    'mean_cdr3_aa_length': 'cdr3_aa_length',
    'mean_junction_length': 'junction_length',
}

per_sample = (
    cdr3_df.copy()
    .groupby(['donor', col_age_group, 'sample', 'locus', 'productive'], observed=True)
    .agg(**{out_col: (src_col, 'mean') for out_col, src_col in length_sources.items()})
    .reset_index()
)
df = (
    per_sample
    .groupby(['donor', col_age_group, 'locus', 'productive'], observed=True)
    .agg(**{out_col: (out_col, 'mean') for out_col in length_sources})
    .reset_index()
)
df

In [None]:
# Productive rearrangements
# One boxplot per length metric: (column to plot, y-axis label, tag used in the output file names)
length_metrics = [
    ('mean_cdr3_length', 'CDR3 length (nt)', 'cdr3ntLength'),
    ('mean_cdr3_aa_length', 'CDR3 length (AA)', 'cdr3aaLength'),
    ('mean_junction_length', 'Junction length (nt)', 'junctionLength'),
]
for y_col, y_label, tag in length_metrics:
    # The x axis shows the locus (TRA/TRB), so label it accordingly
    plot_grouped_boxplot(data = df.loc[df['productive'] == 'T'], x = 'locus', y = y_col, hue = col_age_group, order = ['TRA','TRB'], hue_order = col_age_group_levels, 
                         x_label = 'Locus', y_label = y_label, legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width =30, height = 40),
                         save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_tSplit_{tag}_prod_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                         legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'})
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_tSplit_{tag}_prod.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Non-productive rearrangements
# One boxplot per length metric: (column to plot, y-axis label, tag used in the output file names)
length_metrics = [
    ('mean_cdr3_length', 'CDR3 length (nt)', 'cdr3ntLength'),
    ('mean_cdr3_aa_length', 'CDR3 length (AA)', 'cdr3aaLength'),
    ('mean_junction_length', 'Junction length (nt)', 'junctionLength'),
]
for y_col, y_label, tag in length_metrics:
    # The x axis shows the locus (TRA/TRB), so label it accordingly
    plot_grouped_boxplot(data = df.loc[df['productive'] == 'F'], x = 'locus', y = y_col, hue = col_age_group, order = ['TRA','TRB'], hue_order = col_age_group_levels, 
                         x_label = 'Locus', y_label = y_label, legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width =30, height = 40),
                         save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_tSplit_{tag}_nonprod_', palette = get_tint_palette(thyAgeing_colors['magenta']),
                         legend_kwargs = {'bbox_to_anchor':(1.05, 1), 'loc':'upper left'})
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_tSplit_{tag}_nonprod.pdf', bbox_inches='tight', dpi=300)

### In developing thymocytes

In [None]:
# Mean CDR3 / junction lengths per donor, locus, productivity status and cell population
# (sample means first, then the mean over the samples of each donor); developing populations only.
length_sources = {
    'mean_cdr3_length': 'cdr3_length',
    'mean_cdr3_aa_length': 'cdr3_aa_length',
    'mean_junction_length': 'junction_length',
}

per_sample = (
    cdr3_df.copy()
    .groupby(['donor', col_age_group, 'sample', 'locus', 'productive', col_cell_type_fine], observed=True)
    .agg(**{out_col: (src_col, 'mean') for out_col, src_col in length_sources.items()})
    .reset_index()
)
df = (
    per_sample
    .groupby(['donor', col_age_group, 'locus', 'productive', col_cell_type_fine], observed=True)
    .agg(**{out_col: (out_col, 'mean') for out_col in length_sources})
    .reset_index()
)

is_dev_population = df[col_cell_type_fine].isin(t_nk_groupings['dev'])
df = df.loc[is_dev_population]
df

In [None]:
# CDR3 nucleotide length of productive rearrangements per developing population, one figure per chain
for chain in ['TRB', 'TRA']:
    chain_rows = (df['productive'] == 'T') & (df['locus'] == chain)
    plot_grouped_boxplot(
        data=df.loc[chain_rows],
        x=col_cell_type_fine,
        y='mean_cdr3_length',
        hue=col_age_group,
        order=t_nk_groupings['dev'],
        hue_order=col_age_group_levels,
        x_label='Cell population',
        y_label=f'{chain} CDR3 length (nt)',
        legend_title='Age group',
        add_stats=True,
        format_percent=False,
        figsize=calc_figsize(width=80, height=40),
        save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{chain}_cdr3ntLength_prod_',
        palette=get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs={'bbox_to_anchor': (1.05, 1), 'loc': 'upper left'},
    )
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{chain}_cdr3ntLength_prod.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Junction length of productive rearrangements per developing population, one figure per chain
for chain in ['TRB', 'TRA']:
    chain_rows = (df['productive'] == 'T') & (df['locus'] == chain)
    plot_grouped_boxplot(
        data=df.loc[chain_rows],
        x=col_cell_type_fine,
        y='mean_junction_length',
        hue=col_age_group,
        order=t_nk_groupings['dev'],
        hue_order=col_age_group_levels,
        x_label='Cell population',
        y_label=f'{chain} junction length (nt)',
        legend_title='Age group',
        add_stats=True,
        format_percent=False,
        figsize=calc_figsize(width=80, height=40),
        save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{chain}_junctionLength_prod_',
        palette=get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs={'bbox_to_anchor': (1.05, 1), 'loc': 'upper left'},
    )
    plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{chain}_junctionLength_prod.pdf', bbox_inches='tight', dpi=300)

Productive and non-productive:

In [None]:
# Mean CDR3 / junction lengths per donor, locus and cell population, productive and
# non-productive rearrangements pooled (sample means first, then the mean over samples per donor).
length_sources = {
    'mean_cdr3_length': 'cdr3_length',
    'mean_cdr3_aa_length': 'cdr3_aa_length',
    'mean_junction_length': 'junction_length',
}

per_sample = (
    cdr3_df.copy()
    .groupby(['donor', col_age_group, 'sample', 'locus', col_cell_type_fine], observed=True)
    .agg(**{out_col: (src_col, 'mean') for out_col, src_col in length_sources.items()})
    .reset_index()
)
df = (
    per_sample
    .groupby(['donor', col_age_group, 'locus', col_cell_type_fine], observed=True)
    .agg(**{out_col: (out_col, 'mean') for out_col in length_sources})
    .reset_index()
)

is_dev_population = df[col_cell_type_fine].isin(t_nk_groupings['dev'])
df = df.loc[is_dev_population]
df

In [None]:
from plotting.utils import plot_grouped_boxplot, calc_figsize, get_tint_palette

# One grouped boxplot per TCR locus (TRB first, then TRA): mean CDR3 nucleotide
# length per population in t_nk_groupings['dev'], grouped by age group.
for tcr_locus in ['TRB', 'TRA']:
    plot_grouped_boxplot(
        data=df.loc[df['locus'] == tcr_locus],
        x=col_cell_type_fine,
        y='mean_cdr3_length',
        hue=col_age_group,
        order=t_nk_groupings['dev'],
        hue_order=col_age_group_levels,
        x_label='Cell population',
        y_label=f'{tcr_locus} CDR3 length (nt)',
        legend_title='Age group',
        add_stats=True,
        format_percent=False,
        figsize=calc_figsize(width=80, height=40),
        save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_{tcr_locus}_cdr3ntLength_all_',
        palette=get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs=dict(bbox_to_anchor=(1.05, 1), loc='upper left'),
    )
    plt.savefig(
        f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_{tcr_locus}_cdr3ntLength_all.pdf',
        bbox_inches='tight',
        dpi=300,
    )

### In newly matured T cells

In [None]:
# Mean CDR3/junction lengths kept separate by productivity: average within each
# sample first, then average the sample means per donor.
length_cols = ['cdr3_length', 'cdr3_aa_length', 'junction_length']
sample_keys = ['donor', col_age_group, 'sample', 'locus', 'productive', col_cell_type_fine]
donor_keys = ['donor', col_age_group, 'locus', 'productive', col_cell_type_fine]

df = (
    cdr3_df.copy()
    .groupby(sample_keys, observed=True)
    .agg(**{f'mean_{c}': (c, 'mean') for c in length_cols})
    .reset_index()
    .groupby(donor_keys, observed=True)
    .agg(**{f'mean_{c}': (f'mean_{c}', 'mean') for c in length_cols})
    .reset_index()
)

# Keep only the populations listed in t_nk_groupings['newT']
df = df[df[col_cell_type_fine].isin(t_nk_groupings['newT'])]
df

In [None]:
# Productive chains only, one grouped boxplot per TCR locus (TRB first, then TRA):
# mean CDR3 nucleotide length per population in t_nk_groupings['newT'].
for tcr_locus in ['TRB', 'TRA']:
    plot_grouped_boxplot(
        data=df.loc[(df['productive'] == 'T') & (df['locus'] == tcr_locus)],
        x=col_cell_type_fine,
        y='mean_cdr3_length',
        hue=col_age_group,
        order=t_nk_groupings['newT'],
        hue_order=col_age_group_levels,
        x_label='Cell population',
        y_label=f'{tcr_locus} CDR3 length (nt)',
        legend_title='Age group',
        add_stats=True,
        format_percent=False,
        figsize=calc_figsize(width=40, height=40),
        save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_{tcr_locus}_cdr3ntLength_prod_',
        palette=get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs=dict(bbox_to_anchor=(1.05, 1), loc='upper left'),
    )
    plt.savefig(
        f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_{tcr_locus}_cdr3ntLength_prod.pdf',
        bbox_inches='tight',
        dpi=300,
    )

In [None]:
# Productive chains only, one grouped boxplot per TCR locus (TRB first, then TRA):
# mean junction nucleotide length per population in t_nk_groupings['newT'].
for tcr_locus in ['TRB', 'TRA']:
    plot_grouped_boxplot(
        data=df.loc[(df['productive'] == 'T') & (df['locus'] == tcr_locus)],
        x=col_cell_type_fine,
        y='mean_junction_length',
        hue=col_age_group,
        order=t_nk_groupings['newT'],
        hue_order=col_age_group_levels,
        x_label='Cell population',
        y_label=f'{tcr_locus} junction length (nt)',
        legend_title='Age group',
        add_stats=True,
        format_percent=False,
        figsize=calc_figsize(width=40, height=40),
        save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_{tcr_locus}_junctionLength_prod_',
        palette=get_tint_palette(thyAgeing_colors['magenta']),
        legend_kwargs=dict(bbox_to_anchor=(1.05, 1), loc='upper left'),
    )
    plt.savefig(
        f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_{tcr_locus}_junctionLength_prod.pdf',
        bbox_inches='tight',
        dpi=300,
    )

## Physicochemical properties of CDR3

Kidera factors [(Kidera et al., 1985)](https://link.springer.com/article/10.1007/BF01025492):
- KF1: Helix/bend preference,
- KF2: Side-chain size,
- KF3: Extended structure preference,
- KF4: Hydrophobicity,
- KF5: Double-bend preference,
- KF6: Partial specific volume,
- KF7: Flat extended preference,
- KF8: Occurrence in alpha region,
- KF9: pK-C,
- KF10: Surrounding hydrophobicity

In [None]:
import peptides

# Compute the ten Kidera factors (kf1..kf10) of every CDR3 amino-acid sequence
# and attach them to cdr3_df as new columns.
kf_columns = [f'kf{i}' for i in range(1, 11)]
kf_values = cdr3_df['cdr3_aa'].apply(lambda x: peptides.Peptide(x).kidera_factors())

# Build the factor table on cdr3_df's own index: pd.concat(axis=1) aligns rows by
# index label, so a default RangeIndex would misalign (or introduce NaN rows)
# whenever cdr3_df is not indexed 0..n-1.
kf_df = pd.DataFrame(kf_values.tolist(), columns=kf_columns, index=cdr3_df.index)

# Drop factor columns left over from a previous run of this cell so that
# re-running it does not create duplicated kf1..kf10 columns.
cdr3_df = pd.concat([cdr3_df.drop(columns=kf_columns, errors='ignore'), kf_df], axis=1)

In [None]:
# Long-format mean Kidera factor values for the populations in
# t_nk_groupings['newT']: one row per donor x population x locus x productivity x
# factor, averaged within each sample first and then across samples.
kf_id_cols = ['donor', 'age_group', 'sample', col_cell_type_fine, 'locus', 'productive']
kf_donor_cols = ['donor', 'age_group', col_cell_type_fine, 'locus', 'productive']

df = cdr3_df[cdr3_df[col_cell_type_fine].isin(t_nk_groupings['newT'])]
df = df.melt(id_vars=kf_id_cols, value_vars=kf_columns, var_name='kf', value_name='kf_value')
df = (
    df.groupby(kf_id_cols + ['kf'], observed=True)
    .agg(mean_kf_value=('kf_value', 'mean'))
    .reset_index()
    .groupby(kf_donor_cols + ['kf'], observed=True)
    .agg(mean_kf_value=('mean_kf_value', 'mean'))
    .reset_index()
)
df

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot,thyAgeing_greys

# Productive TRB chains only
data = df.loc[(df['locus'] == 'TRB') & (df['productive'] == 'T')]

# One facet per Kidera factor (kf1..kf10, five facets per row).
# NOTE(review): the statistics prefix says 'devSplit' although the data and the
# saved figure are the 'newT' split - confirm whether this file name is intended.
p = plot_faceted_grouped_boxplot(
    data=data,
    x=col_cell_type_fine,
    y='mean_kf_value',
    hue=col_age_group,
    hue_order=col_age_group_levels,
    order=t_nk_groupings['newT'],
    facet_kwargs=dict(col='kf', col_wrap=5, col_order=[f'kf{i}' for i in range(1, 11)]),
    palette=get_tint_palette(thyAgeing_colors['magenta']),
    add_stats=True,
    save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kidera_prod_',
    format_percent=False,
    format_log=False,
    x_label='Cell population',
    y_label='Mean TRB Kidera Factor',
    legend_title='Age group',
    figsize=calc_figsize(width=100, height=65),
    ylim=None,
    # legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1))
)

# Light grey background on every facet
for ax in p.axes.flat:
    ax.set_facecolor("#efefef")
p.tight_layout()
plt.savefig(
    f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_prod.pdf',
    bbox_inches='tight',
    dpi=300,
)

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot

# Productive TRA chains only
data = df.loc[(df['locus'] == 'TRA') & (df['productive'] == 'T')]

# One facet per Kidera factor (kf1..kf10, five facets per row).
# NOTE(review): the statistics prefix says 'devSplit' although the data and the
# saved figure are the 'newT' split - confirm whether this file name is intended.
p = plot_faceted_grouped_boxplot(
    data=data,
    x=col_cell_type_fine,
    y='mean_kf_value',
    hue=col_age_group,
    hue_order=col_age_group_levels,
    order=t_nk_groupings['newT'],
    facet_kwargs=dict(col='kf', col_wrap=5, col_order=[f'kf{i}' for i in range(1, 11)]),
    add_stats=True,
    save_stats=f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRA_kidera_prod_',
    format_percent=False,
    format_log=False,
    x_label='Cell population',
    y_label='Mean TRA Kidera Factor',
    legend_title='Age group',
    figsize=calc_figsize(width=100, height=65),
    ylim=None,
    # legend_kwargs = dict(loc = "upper left", bbox_to_anchor=(1, 1)),
    palette=get_tint_palette(thyAgeing_colors['magenta']),
)

# Light grey background on every facet
for ax in p.axes.flat:
    ax.set_facecolor("#efefef")
p.tight_layout()
plt.savefig(
    f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_prod.pdf',
    bbox_inches='tight',
    dpi=300,
)

### PCA of Kidera factors (newly matured cells) - TRA+TRB

In [None]:
from sklearn.decomposition import PCA
from typing import Optional, List

def run_pca(data, n_components=50):
    """Fit a PCA on `data` and return its scores, explained variance and loadings.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, features in columns. Its (named) index levels are
        carried over as columns of the returned score table.
    n_components : int, default 50
        Passed unchanged to `sklearn.decomposition.PCA`.

    Returns
    -------
    principal_components : pandas.DataFrame
        Index columns of `data` (cast to categoricals without unused categories)
        followed by the score columns 'PC1', 'PC2', ...
    explained_variance : dict
        Maps 'PCk' to the explained-variance ratio formatted as a percentage
        string with one decimal, e.g. '12.3%'.
    loadings : dict
        Maps 'PCk' to the corresponding row of `PCA.components_`.
    """
    model = PCA(n_components=n_components)
    scores = model.fit_transform(data)

    # Scores, variance ratios and component rows all have one entry per fitted PC
    pc_names = [f'PC{k}' for k in range(1, scores.shape[1] + 1)]
    explained_variance = {
        name: f"{ratio*100:.1f}%"
        for name, ratio in zip(pc_names, model.explained_variance_ratio_)
    }
    loadings = dict(zip(pc_names, model.components_))

    principal_components = pd.DataFrame(scores, index=data.index, columns=pc_names).reset_index()

    # Index levels become categorical columns restricted to the categories present
    for level_name in data.index.names:
        as_category = principal_components[level_name].astype('category')
        principal_components[level_name] = as_category.cat.remove_unused_categories()

    return principal_components, explained_variance, loadings

def _plot_pc_pair_page(pdf, plot_df, pairs, hue, palette, legend_title):
    """Draw one 3x4 page of pairwise PC scatter plots coloured by `hue` and append it to `pdf`."""
    fig, axes = plt.subplots(3, 4, figsize=calc_figsize(width=180, height=120))
    handles, labels = None, None
    # zip() stops at the shorter input, so at most 12 pairs are drawn per page
    for ax, (i, j) in zip(axes.flatten(), pairs):
        panel = sns.scatterplot(data=plot_df, x=f'PC{i+1}', y=f'PC{j+1}', hue=hue, palette=palette, ax=ax, legend=True, s=10)
        ax.set_title(f'PC{i+1} vs PC{j+1}')
        # Keep the legend entries of the first panel for the shared figure legend
        if handles is None or labels is None:
            handles, labels = panel.get_legend_handles_labels()
        panel_legend = panel.get_legend()
        if panel_legend is not None:
            panel_legend.remove()
    # Leave the right 15% of the figure free for the shared legend
    plt.tight_layout(rect=[0, 0, 0.85, 1], pad=0.5)
    fig.legend(handles, labels, title=legend_title, loc='center left', bbox_to_anchor=(0.88, 0.5))
    pdf.savefig(fig)


def plot_pairwise_pcs(plot_df, pairs, hue = col_cell_type_fine, path = None):
    """Save pairwise principal-component scatter plots to a two-page PDF.

    Page 1 colours the points by the 'age_group' column, page 2 by `hue`.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Table with 'PC1', 'PC2', ... columns, an 'age_group' column and the `hue` column.
    pairs : iterable of (int, int)
        Zero-based PC index pairs; pair (i, j) plots PC{i+1} against PC{j+1}.
    hue : str
        Column used to colour the second page (defaults to the fine cell-type column).
    path : str
        Output PDF path; must be provided.
    """
    # Imported here so the function does not depend on a later cell having run
    from matplotlib.backends.backend_pdf import PdfPages

    # Title the second legend after what is actually shown on that page
    second_title = 'Age group' if hue == 'age_group' else 'Cell type'

    # The context manager finalises the PDF; no explicit close() is needed afterwards
    with PdfPages(path) as pdf:
        _plot_pc_pair_page(pdf, plot_df, pairs, hue='age_group', palette='Set2', legend_title='Age group')
        _plot_pc_pair_page(pdf, plot_df, pairs, hue=hue, palette='tab10', legend_title=second_title)

Prep data

In [None]:
# Productive contigs only, averaged to one row per 'names' entry and locus
cell_keys = ['donor', col_age_group, col_cell_type_fine, col_cell_type_broad, 'names']
productive_df = cdr3_df.copy()
productive_df = productive_df[productive_df['productive'] == 'T']
df = productive_df.groupby(by=cell_keys + ['locus'], observed=True)[kf_columns].mean().reset_index()

# Keep 'names' entries that have exactly two rows
# (presumably cells with both a TRA and a TRB chain - TODO confirm),
# then add the Kidera factors of the two rows together.
rows_per_name = df['names'].value_counts()
df = df[df['names'].map(rows_per_name) == 2]
df = df.groupby(by=cell_keys, observed=True)[kf_columns].sum().reset_index()

# Flag donors with fewer than min_n_cells rows in any of the three populations below
min_n_cells = 10
count_keys = ['donor', col_age_group, col_cell_type_fine]
n_cells_df = df.groupby(count_keys, observed=True).size().reset_index(name='n_cells')
n_cells_df = n_cells_df[n_cells_df[col_cell_type_fine].isin(['T_CD8_naive', 'T_CD4_naive', 'T_Treg'])]
too_few = n_cells_df['n_cells'] < min_n_cells
donors_removed = n_cells_df.loc[too_few, 'donor'].unique()
print(f"Removing cells from donors: {donors_removed}")

In [None]:
# Restrict to the populations in t_nk_groupings['newT'] and drop the donors
# listed in donors_removed, then average each Kidera factor per donor x population.
in_new_t = df[col_cell_type_fine].isin(t_nk_groupings['newT'])
from_removed_donor = df['donor'].isin(donors_removed)
kf_index_cols = ['donor', 'age_group', col_cell_type_fine]

df = (
    df.loc[in_new_t & ~from_removed_donor]
    .melt(id_vars=kf_index_cols, value_vars=kf_columns, var_name='kf', value_name='kf_value')
    .groupby(kf_index_cols + ['kf'], observed=True)
    .agg(mean_kf_value=('kf_value', 'mean'))
    .reset_index()
)

# Wide table: one row per donor x age group x population, one column per Kidera factor
kidera_df = df.pivot_table(index=kf_index_cols, columns='kf', values='mean_kf_value')

# Make the age-group and cell-type index levels ordered categoricals
donor_level, age_level, cell_type_level = kidera_df.index.levels
kidera_df.index = kidera_df.index.set_levels([
    donor_level,
    pd.CategoricalIndex(age_level, categories=col_age_group_levels, ordered=True),
    pd.CategoricalIndex(cell_type_level, categories=col_cell_type_fine_levels, ordered=True),
])

kidera_df

In [None]:
from sklearn.decomposition import PCA
import itertools
from matplotlib.backends.backend_pdf import PdfPages

# PCA on the donor x population Kidera table (5 components), then scatter
# plots of all 10 pairwise combinations of those components.
pc_pairs = list(itertools.combinations(range(5), 2))
plot_df, explained_variance, loadings = run_pca(kidera_df, n_components=5)
plot_pairwise_pcs(
    plot_df,
    pairs=pc_pairs,
    path=f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_kidera_pca.pdf',
)

In [None]:
from scipy.stats import f_oneway, kruskal


def _kruskal_by_group(frame, group_col, value_col):
    """Kruskal-Wallis test of `value_col` across the levels of `group_col`.

    Returns (None, None) unless every level has more than one observation.
    """
    samples = [frame.loc[frame[group_col] == level, value_col].values for level in frame[group_col].unique()]
    if not all(len(sample) > 1 for sample in samples):
        return None, None
    return kruskal(*samples)


# Test the association of each PC with age group and with fine cell type
results = []
for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    stat_age_kw, pval_age_kw = _kruskal_by_group(plot_df, 'age_group', pc)
    stat_ct_kw, pval_ct_kw = _kruskal_by_group(plot_df, col_cell_type_fine, pc)
    results.append({
        'PC': pc,
        'age_group_KW_stat': stat_age_kw,
        'age_group_KW_p': pval_age_kw,
        'cell_type_KW_stat': stat_ct_kw,
        'cell_type_KW_p': pval_ct_kw
    })

results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Per-cell-type PCA of the Kidera table (rows with age group 'aged' excluded),
# followed by a Kruskal-Wallis test of every PC against age group.
kidera_pca_ct = {}           # cell type -> PC score table
kidera_pca_ct_var = {}       # cell type -> {'PCk': 'xx.x%'} explained variance
kidera_pca_ct_loadings = {}  # cell type -> {'PCk': loading vector}
kidera_pca_ct_assoc = {}     # cell type -> Kruskal-Wallis results per PC

pc_pairs = list(itertools.combinations(range(5), 2))

for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
    is_ct = kidera_df.index.get_level_values(col_cell_type_fine) == ct
    not_aged = kidera_df.index.get_level_values(col_age_group) != 'aged'
    plot_df, var, loadings = run_pca(kidera_df.loc[is_ct & not_aged], n_components=5)
    kidera_pca_ct[ct] = plot_df
    kidera_pca_ct_var[ct] = var
    kidera_pca_ct_loadings[ct] = loadings

    # Plot this cell type's scores only. Each cell type has its own PCA fit, so
    # pooling the tables collected so far would mix incomparable coordinates
    # into a file that is named after a single cell type.
    plot_pairwise_pcs(plot_df, pairs = pc_pairs, hue = 'age_group', path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_kidera_pca_{ct}.pdf')

    results = []
    for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
        # One array of PC scores per age group
        groups_age = [plot_df.loc[plot_df['age_group'] == ag, pc].values for ag in plot_df['age_group'].unique()]
        if all(len(g) > 1 for g in groups_age):
            stat_age_kw, pval_age_kw = kruskal(*groups_age)
        else:
            # NaN (not None) keeps the result columns numeric, so the
            # significance comparison below cannot fail on an object column.
            stat_age_kw, pval_age_kw = np.nan, np.nan

        results.append({
            'PC': pc,
            'age_group_KW_stat': stat_age_kw,
            'age_group_KW_p': pval_age_kw,
        })

    kidera_pca_ct_assoc[ct] = pd.DataFrame(results)
    kidera_pca_ct_assoc[ct]['age_group_KW_significant'] = kidera_pca_ct_assoc[ct]['age_group_KW_p'] < 0.05

In [None]:
# Stack the per-cell-type tables (cell type becomes the outer index level),
# write them to CSV and display the association table.
kidera_pca_assoc_all = pd.concat(kidera_pca_ct_assoc)
kidera_pca_scores_all = pd.concat(kidera_pca_ct)

kidera_pca_assoc_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_kidera_pca_assoc_perCellType.csv')
kidera_pca_scores_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_kidera_pca_perCellType.csv')
kidera_pca_assoc_all

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
from plotting.utils import calc_figsize, get_tint_palette, thyAgeing_colors

# One PDF page per cell type: scatter of the two PCs with the largest
# Kruskal-Wallis statistic against age group, with marginal boxplots of each PC
# by age group and the Kidera-factor loadings overlaid as arrows.
# The context manager finalises the PDF, so no explicit close() follows it.
with PdfPages(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_kidera_pca_assocPCs_boxplot.pdf') as pdf:
    for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
        plot_df = kidera_pca_ct[ct].copy()
        # Two PCs with the largest age-group Kruskal-Wallis statistic
        pcoi = kidera_pca_ct_assoc[ct].sort_values('age_group_KW_stat', ascending = False)['PC'].head(2).tolist()

        # 2x2 grid: top-left = marginal boxplot of pcoi[0], bottom-left = scatter,
        # bottom-right = marginal boxplot of pcoi[1], top-right = cell-type label
        fig, axes = plt.subplots(2, 2, figsize=calc_figsize(width=60, height=65), sharex=False, sharey=False,
                                width_ratios=[1, 0.15], height_ratios=[0.15, 1], gridspec_kw={'wspace': 0, 'hspace': 0.1})
        plt.subplots_adjust(wspace=0, hspace=0)

        # Top marginal: first selected PC by age group (horizontal boxes)
        sns.boxplot(
            data=plot_df,
            x=pcoi[0],
            y='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[0, 0],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        # Same x-range as the scatter below so the marginal lines up with it
        axes[0, 0].set_xlim(-0.25, 0.25)
        axes[0, 0].set_ylabel('Age Group')
        # Hide every spine except the left one
        for spine in ['top', 'right', 'bottom']:
            axes[0, 0].spines[spine].set_visible(False)
        axes[0, 0].set_xticks([])
        axes[0, 0].set_xticklabels([])
        axes[0, 0].set_xlabel('')
        if axes[0, 0].legend_:
            axes[0, 0].legend_.remove()
        sns.despine(ax=axes[0, 0], offset=2, trim=True)

        # Right marginal: second selected PC by age group (vertical boxes)
        sns.boxplot(
            data=plot_df,
            y=pcoi[1],
            x='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[1, 1],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        axes[1, 1].set_xlabel('Age group')
        # Hide every spine except the bottom one
        for spine in ['top', 'right', 'left']:
            axes[1, 1].spines[spine].set_visible(False)
        axes[1, 1].set_yticks([])
        axes[1, 1].set_yticklabels([])
        axes[1, 1].set_ylabel('')
        # Same y-range as the scatter so the marginal lines up with it
        axes[1, 1].set_ylim(-0.2, 0.2)
        axes[1, 1].tick_params(axis='x', rotation=90, labeltop=False, labelbottom=True)
        axes[1, 1].set_xticklabels(axes[1, 1].get_xticklabels(), rotation=90, ha='center', va='top')
        sns.despine(ax=axes[1, 1], offset=2, trim=True)

        # Scatter of the two selected PCs, coloured by age group.
        # The Axes is not named `sc`: that would overwrite the notebook's
        # `import scanpy as sc` alias for every cell executed afterwards.
        scatter_ax = sns.scatterplot(
            data=plot_df,
            x=pcoi[0],
            y=pcoi[1],
            hue='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            legend=False,
            alpha=1,
            s=10,
            zorder=2,
            ax=axes[1, 0]
        )
        # Axis labels carry the explained-variance percentage of each PC
        scatter_ax.set_xlabel(f'{pcoi[0]} - {kidera_pca_ct_var[ct][pcoi[0]]}')
        scatter_ax.set_ylabel(f'{pcoi[1]} - {kidera_pca_ct_var[ct][pcoi[1]]}')
        scatter_ax.set_xlim(-0.25, 0.25)
        scatter_ax.set_ylim(-0.2, 0.2)
        scatter_ax.spines['top'].set_visible(False)
        scatter_ax.spines['right'].set_visible(False)

        # Overlay the PCA loadings as arrows from the origin (scaled by 0.25),
        # labelled with the upper-cased factor name slightly beyond the arrow tip
        x_loadings = pd.Series(kidera_pca_ct_loadings[ct][pcoi[0]], index=kidera_df.columns)
        y_loadings = pd.Series(kidera_pca_ct_loadings[ct][pcoi[1]], index=kidera_df.columns)
        loadings_df = pd.DataFrame({pcoi[0]: x_loadings, pcoi[1]: y_loadings})
        loadings_df['kf'] = loadings_df.index.str.upper()
        for _, row in loadings_df.iterrows():
            scatter_ax.arrow(0, 0, row[pcoi[0]] * 0.25, row[pcoi[1]] * 0.25, color='black', width=0.001, head_width=0.005, length_includes_head=True, alpha=0.5, zorder=1)
            scatter_ax.text(row[pcoi[0]] * 0.28, row[pcoi[1]] * 0.28, row['kf'], fontsize=6, color='black', ha='center', va='center', alpha=0.7, zorder=2)
        sns.despine(ax=scatter_ax, offset=2, trim=True)

        # Top-right panel carries no data: use it to show the cell-type name
        axes[0, 1].text(0.5, 0.5, ct, ha='center', va='center', fontsize=8, fontweight='bold', transform=axes[0, 1].transAxes)
        axes[0,1].set_xticks([])
        axes[0,1].set_yticks([])
        sns.despine(ax=axes[0, 1], left =True, bottom=True)

        plt.tight_layout()
        pdf.savefig(fig)

### PCA of Kidera factors (newly matured cells) - By chain

In [None]:
# Flag donors with fewer than min_n_cells rows in any of the three populations below.
# NOTE(review): size() counts rows of cdr3_df; if that table holds one row per
# contig rather than per cell, 'n_cells' is a contig count - confirm.
min_n_cells = 10
count_keys = ['donor', col_age_group, col_cell_type_fine]
n_cells_df = cdr3_df.groupby(count_keys, observed=True).size().reset_index(name='n_cells')
n_cells_df = n_cells_df[n_cells_df[col_cell_type_fine].isin(['T_CD8_naive', 'T_CD4_naive', 'T_Treg'])]
too_few = n_cells_df['n_cells'] < min_n_cells
donors_removed = n_cells_df.loc[too_few, 'donor'].unique()
print(f"Removing cells from donors: {donors_removed}")

# Long-format mean Kidera factors per donor x population x locus x productivity,
# averaged within each sample first and then across samples.
kf_id_cols = ['donor', 'age_group', 'sample', col_cell_type_fine, 'locus', 'productive']
kf_donor_cols = ['donor', 'age_group', col_cell_type_fine, 'locus', 'productive']
in_new_t = cdr3_df[col_cell_type_fine].isin(t_nk_groupings['newT'])
from_removed_donor = cdr3_df['donor'].isin(donors_removed)

df = cdr3_df[in_new_t & ~from_removed_donor]
df = df.melt(id_vars=kf_id_cols, value_vars=kf_columns, var_name='kf', value_name='kf_value')
df = (
    df.groupby(kf_id_cols + ['kf'], observed=True)
    .agg(mean_kf_value=('kf_value', 'mean'))
    .reset_index()
    .groupby(kf_donor_cols + ['kf'], observed=True)
    .agg(mean_kf_value=('mean_kf_value', 'mean'))
    .reset_index()
)
df

In [None]:
# Extracting Kidera factors for TRA and TRB
# Wide Kidera tables (rows: donor x age group x population, columns: factors)
# built separately for productive TRA and productive TRB chains
pivot_kwargs = dict(index=['donor', 'age_group', col_cell_type_fine], columns='kf', values='mean_kf_value')
productive_df = df.loc[df['productive'] == 'T']
kidera_df_tra = productive_df.loc[productive_df['locus'] == 'TRA'].pivot_table(**pivot_kwargs)
kidera_df_trb = productive_df.loc[productive_df['locus'] == 'TRB'].pivot_table(**pivot_kwargs)

# Make the age-group and cell-type index levels ordered categoricals.
# The index is reassigned on each table in place; note that the loop variable
# leaves `kidera_df` bound to the TRB table afterwards.
for kidera_df in [kidera_df_tra, kidera_df_trb]:
    donor_level, age_level, cell_type_level = kidera_df.index.levels
    kidera_df.index = kidera_df.index.set_levels([
        donor_level,
        pd.CategoricalIndex(age_level, categories=col_age_group_levels, ordered=True),
        pd.CategoricalIndex(cell_type_level, categories=col_cell_type_fine_levels, ordered=True),
    ])

kidera_df_trb

#### TRA

Overall:

In [None]:
from sklearn.decomposition import PCA
from typing import Optional, List

def run_pca(data, n_components=50):
    """Fit a PCA on `data` and return its scores, explained variance and loadings.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, features in columns. Its (named) index levels are
        carried over as columns of the returned score table.
    n_components : int, default 50
        Passed unchanged to `sklearn.decomposition.PCA`.

    Returns
    -------
    principal_components : pandas.DataFrame
        Index columns of `data` (cast to categoricals without unused categories)
        followed by the score columns 'PC1', 'PC2', ...
    explained_variance : dict
        Maps 'PCk' to the explained-variance ratio formatted as a percentage
        string with one decimal, e.g. '12.3%'.
    loadings : dict
        Maps 'PCk' to the corresponding row of `PCA.components_`.
    """
    model = PCA(n_components=n_components)
    scores = model.fit_transform(data)

    # Scores, variance ratios and component rows all have one entry per fitted PC
    pc_names = [f'PC{k}' for k in range(1, scores.shape[1] + 1)]
    explained_variance = {
        name: f"{ratio*100:.1f}%"
        for name, ratio in zip(pc_names, model.explained_variance_ratio_)
    }
    loadings = dict(zip(pc_names, model.components_))

    principal_components = pd.DataFrame(scores, index=data.index, columns=pc_names).reset_index()

    # Index levels become categorical columns restricted to the categories present
    for level_name in data.index.names:
        as_category = principal_components[level_name].astype('category')
        principal_components[level_name] = as_category.cat.remove_unused_categories()

    return principal_components, explained_variance, loadings

def _plot_pc_pair_page(pdf, plot_df, pairs, hue, palette, legend_title):
    """Draw one 3x4 page of pairwise PC scatter plots coloured by `hue` and append it to `pdf`."""
    fig, axes = plt.subplots(3, 4, figsize=calc_figsize(width=180, height=120))
    handles, labels = None, None
    # zip() stops at the shorter input, so at most 12 pairs are drawn per page
    for ax, (i, j) in zip(axes.flatten(), pairs):
        panel = sns.scatterplot(data=plot_df, x=f'PC{i+1}', y=f'PC{j+1}', hue=hue, palette=palette, ax=ax, legend=True, s=10)
        ax.set_title(f'PC{i+1} vs PC{j+1}')
        # Keep the legend entries of the first panel for the shared figure legend
        if handles is None or labels is None:
            handles, labels = panel.get_legend_handles_labels()
        panel_legend = panel.get_legend()
        if panel_legend is not None:
            panel_legend.remove()
    # Leave the right 15% of the figure free for the shared legend
    plt.tight_layout(rect=[0, 0, 0.85, 1], pad=0.5)
    fig.legend(handles, labels, title=legend_title, loc='center left', bbox_to_anchor=(0.88, 0.5))
    pdf.savefig(fig)


def plot_pairwise_pcs(plot_df, pairs, hue = col_cell_type_fine, path = None):
    """Save pairwise principal-component scatter plots to a two-page PDF.

    Page 1 colours the points by the 'age_group' column, page 2 by `hue`.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Table with 'PC1', 'PC2', ... columns, an 'age_group' column and the `hue` column.
    pairs : iterable of (int, int)
        Zero-based PC index pairs; pair (i, j) plots PC{i+1} against PC{j+1}.
    hue : str
        Column used to colour the second page (defaults to the fine cell-type column).
    path : str
        Output PDF path; must be provided.
    """
    # Imported here so the function does not depend on a later cell having run
    from matplotlib.backends.backend_pdf import PdfPages

    # Title the second legend after what is actually shown on that page
    second_title = 'Age group' if hue == 'age_group' else 'Cell type'

    # The context manager finalises the PDF; no explicit close() is needed afterwards
    with PdfPages(path) as pdf:
        _plot_pc_pair_page(pdf, plot_df, pairs, hue='age_group', palette='Set2', legend_title='Age group')
        _plot_pc_pair_page(pdf, plot_df, pairs, hue=hue, palette='tab10', legend_title=second_title)

In [None]:
# This cell sits in the TRA section, so fit the PCA on the TRA Kidera table
# (it previously fitted the TRB table, leaving TRB scores in plot_df).
plot_df, explained_variance, loadings = run_pca(kidera_df_tra, n_components=5)


In [None]:
from sklearn.decomposition import PCA
import itertools
from matplotlib.backends.backend_pdf import PdfPages

# Perform PCA on the TRA Kidera table and plot all pairwise PC combinations.
# run_pca() returns three values (scores, explained variance, loadings);
# unpacking only two raised "ValueError: too many values to unpack".
plot_df, explained_variance, loadings = run_pca(kidera_df_tra, n_components=5)
plot_pairwise_pcs(plot_df, pairs = list(itertools.combinations(range(5), 2)), path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_pca.pdf')

In [None]:
from sklearn.decomposition import PCA
from plotting.utils import thyAgeing_colors

# Plot proportion of variance explained per principal component.
# run_pca() keeps its fitted model local and only returns formatted percentage
# strings, so no `pca` object exists at notebook level; refit the same
# 5-component PCA on the TRA Kidera table to obtain the numeric ratios.
pca = PCA(n_components=5).fit(kidera_df_tra)
explained_var = pca.explained_variance_ratio_
plt.figure(figsize=(6, 4))
sns.barplot(x=[f'PC{i+1}' for i in range(len(explained_var))], y=explained_var, color=thyAgeing_colors['magenta'])
plt.ylabel('Proportion of variance explained')
plt.xlabel('Principal Component')
plt.title('Variance Explained by Principal Components')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import f_oneway, kruskal


def _kruskal_by_group(frame, group_col, value_col):
    """Kruskal-Wallis test of `value_col` across the levels of `group_col`.

    Returns (None, None) unless every level has more than one observation.
    """
    samples = [frame.loc[frame[group_col] == level, value_col].values for level in frame[group_col].unique()]
    if not all(len(sample) > 1 for sample in samples):
        return None, None
    return kruskal(*samples)


# Test the association of each PC with age group and with fine cell type
results = []
for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    stat_age_kw, pval_age_kw = _kruskal_by_group(plot_df, 'age_group', pc)
    stat_ct_kw, pval_ct_kw = _kruskal_by_group(plot_df, col_cell_type_fine, pc)
    results.append({
        'PC': pc,
        'age_group_KW_stat': stat_age_kw,
        'age_group_KW_p': pval_age_kw,
        'cell_type_KW_stat': stat_ct_kw,
        'cell_type_KW_p': pval_ct_kw
    })

results_df = pd.DataFrame(results)
print(results_df)

Split by cell type:

In [None]:
# Per-cell-type PCA of the TRA Kidera table (rows with age group 'aged' excluded),
# followed by a Kruskal-Wallis test of every PC against age group.
kidera_pca_tra_ct = {}           # cell type -> PC score table
kidera_pca_tra_ct_var = {}       # cell type -> {'PCk': 'xx.x%'} explained variance
kidera_pca_tra_ct_loadings = {}  # cell type -> {'PCk': loading vector}
kidera_pca_tra_ct_assoc = {}     # cell type -> Kruskal-Wallis results per PC

pc_pairs = list(itertools.combinations(range(5), 2))

for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
    is_ct = kidera_df_tra.index.get_level_values(col_cell_type_fine) == ct
    not_aged = kidera_df_tra.index.get_level_values(col_age_group) != 'aged'
    plot_df, var, loadings = run_pca(kidera_df_tra.loc[is_ct & not_aged], n_components=5)
    kidera_pca_tra_ct[ct] = plot_df
    kidera_pca_tra_ct_var[ct] = var
    kidera_pca_tra_ct_loadings[ct] = loadings

    # Plot this cell type's scores only. Each cell type has its own PCA fit, so
    # pooling the tables collected so far would mix incomparable coordinates
    # into a file that is named after a single cell type.
    plot_pairwise_pcs(plot_df, pairs = pc_pairs, hue = 'age_group', path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_pca_{ct}.pdf')

    results = []
    for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
        # One array of PC scores per age group
        groups_age = [plot_df.loc[plot_df['age_group'] == ag, pc].values for ag in plot_df['age_group'].unique()]
        if all(len(g) > 1 for g in groups_age):
            stat_age_kw, pval_age_kw = kruskal(*groups_age)
        else:
            # NaN (not None) keeps the result columns numeric, so the
            # significance comparison below cannot fail on an object column.
            stat_age_kw, pval_age_kw = np.nan, np.nan

        results.append({
            'PC': pc,
            'age_group_KW_stat': stat_age_kw,
            'age_group_KW_p': pval_age_kw,
        })

    kidera_pca_tra_ct_assoc[ct] = pd.DataFrame(results)
    kidera_pca_tra_ct_assoc[ct]['age_group_KW_significant'] = kidera_pca_tra_ct_assoc[ct]['age_group_KW_p'] < 0.05

In [None]:
# Stack the per-cell-type TRA tables (cell type becomes the outer index level),
# write them to CSV and display the association table.
kidera_pca_tra_assoc_all = pd.concat(kidera_pca_tra_ct_assoc)
kidera_pca_tra_scores_all = pd.concat(kidera_pca_tra_ct)

kidera_pca_tra_assoc_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_pca_assoc_perCellType.csv')
kidera_pca_tra_scores_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_pca_perCellType.csv')
kidera_pca_tra_assoc_all

In [None]:
from plotting.utils import calc_figsize, get_tint_palette, thyAgeing_colors


# One PDF page per cell type (TRA): scatter of the two PCs with the largest age-group
# Kruskal-Wallis statistic, a marginal boxplot of each of those PCs by age group, and the
# Kidera-factor loadings drawn as arrows on the scatter.
# The `with` block finalises the PDF on exit, so no explicit pdf.close() is needed.
with PdfPages(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRA_kidera_pca_assocPCs_boxplot.pdf') as pdf:
    for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
        plot_df = kidera_pca_tra_ct[ct].copy()
        # The two PCs with the highest age-group KW statistic for this cell type
        pcoi = kidera_pca_tra_ct_assoc[ct].sort_values('age_group_KW_stat', ascending = False)['PC'][:2].tolist()

        fig, axes = plt.subplots(2, 2, figsize=calc_figsize(width=60, height=65), sharex=False, sharey=False,
                                width_ratios=[1, 0.15], height_ratios=[0.15, 1], gridspec_kw={'wspace': 0, 'hspace': 0.1})
        plt.subplots_adjust(wspace=0, hspace=0)

        # Top marginal: first selected PC by age group (horizontal boxes)
        sns.boxplot(
            data=plot_df,
            x=pcoi[0],
            y='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[0, 0],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        axes[0, 0].set_xlim(-0.25, 0.25)  # same x-range as the scatter below
        axes[0, 0].set_ylabel('Age Group')
        # Hide every spine except the left one
        for spine in ['top', 'right', 'bottom']:
            axes[0, 0].spines[spine].set_visible(False)
        axes[0, 0].set_xticks([])
        axes[0, 0].set_xticklabels([])
        axes[0, 0].set_xlabel('')
        if axes[0, 0].legend_:
            axes[0, 0].legend_.remove()
        sns.despine(ax=axes[0, 0], offset=2, trim=True)

        # Right marginal: second selected PC by age group (vertical boxes)
        sns.boxplot(
            data=plot_df,
            y=pcoi[1],
            x='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[1, 1],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        axes[1, 1].set_xlabel('Age group')
        # Hide every spine except the bottom one
        for spine in ['top', 'right', 'left']:
            axes[1, 1].spines[spine].set_visible(False)
        axes[1, 1].set_yticks([])
        axes[1, 1].set_yticklabels([])
        axes[1, 1].set_ylabel('')
        axes[1, 1].set_ylim(-0.2, 0.2)  # same y-range as the scatter
        axes[1, 1].tick_params(axis='x', rotation=90, labeltop=False, labelbottom=True)
        axes[1, 1].set_xticklabels(axes[1, 1].get_xticklabels(), rotation=90, ha='center', va='top')
        sns.despine(ax=axes[1, 1], offset=2, trim=True)

        # Main panel: scatter of the two selected PCs, coloured by age group.
        # The axes handle is deliberately NOT called `sc`, which is the scanpy alias.
        ax_scatter = sns.scatterplot(
            data=plot_df,
            x=pcoi[0],
            y=pcoi[1],
            hue='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            legend=False,
            alpha=1,
            s=10,
            zorder=2,
            ax=axes[1, 0]
        )
        ax_scatter.set_xlabel(f'{pcoi[0]} - {kidera_pca_tra_ct_var[ct][pcoi[0]]}')
        ax_scatter.set_ylabel(f'{pcoi[1]} - {kidera_pca_tra_ct_var[ct][pcoi[1]]}')
        ax_scatter.set_xlim(-0.25, 0.25)
        ax_scatter.set_ylim(-0.2, 0.2)
        ax_scatter.spines['top'].set_visible(False)
        ax_scatter.spines['right'].set_visible(False)

        # PCA loadings as arrows from the origin, labelled with the upper-cased factor name.
        # Index by the TRA table's columns, since these loadings come from the TRA PCA.
        x_loadings = pd.Series(kidera_pca_tra_ct_loadings[ct][pcoi[0]], index=kidera_df_tra.columns)
        y_loadings = pd.Series(kidera_pca_tra_ct_loadings[ct][pcoi[1]], index=kidera_df_tra.columns)
        loadings_df = pd.DataFrame({pcoi[0]: x_loadings, pcoi[1]: y_loadings})
        loadings_df['kf'] = loadings_df.index.str.upper()
        for _, row in loadings_df.iterrows():
            # Arrows are scaled by 0.25 and labels by 0.28 so the text sits just past the tip
            ax_scatter.arrow(0, 0, row[pcoi[0]] * 0.25, row[pcoi[1]] * 0.25, color='black', width=0.001, head_width=0.005, length_includes_head=True, alpha=0.5, zorder=1)
            ax_scatter.text(row[pcoi[0]] * 0.28, row[pcoi[1]] * 0.28, row['kf'], fontsize=6, color='black', ha='center', va='center', alpha=0.7, zorder=2)
        sns.despine(ax=ax_scatter, offset=2, trim=True)

        # Top-right corner panel: no data, just the cell-type name as a page label
        axes[0, 1].text(0.5, 0.5, ct, ha='center', va='center', fontsize=8, fontweight='bold', transform=axes[0, 1].transAxes)
        axes[0,1].set_xticks([])
        axes[0,1].set_yticks([])
        sns.despine(ax=axes[0, 1], left =True, bottom=True)

        plt.tight_layout()
        pdf.savefig(fig)

#### TRB

Overall:

In [None]:
from sklearn.decomposition import PCA
import itertools
from matplotlib.backends.backend_pdf import PdfPages

# PCA (5 components) on the TRB Kidera-factor table, then one panel per pair of PCs.
# NOTE(review): run_pca is unpacked into two values here but into three
# (scores, variance, loadings) in the per-cell-type cells - confirm its return signature.
pc_pairs = list(itertools.combinations(range(5), 2))
trb_pairwise_path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca.pdf'
plot_df, explained_variance = run_pca(kidera_df_trb, n_components=5)
plot_pairwise_pcs(plot_df, pairs = pc_pairs, path = trb_pairwise_path)

In [None]:
from scipy.stats import f_oneway, kruskal

# Kruskal-Wallis association of each TRB principal component with age group and with the
# fine cell-type annotation (column named by `col_cell_type_fine`).
def _kw_across_levels(frame, column, pc):
    """Return (statistic, p-value) of a Kruskal-Wallis test of `pc` across the levels of `column`.

    Returns (None, None) when any level has fewer than two observations.
    """
    samples = [frame.loc[frame[column] == level, pc].values for level in frame[column].unique()]
    if not all(len(s) > 1 for s in samples):
        return (None, None)
    return kruskal(*samples)


results = []
for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    stat_age_kw, pval_age_kw = _kw_across_levels(plot_df, 'age_group', pc)
    stat_ct_kw, pval_ct_kw = _kw_across_levels(plot_df, col_cell_type_fine, pc)
    results.append({
        'PC': pc,
        'age_group_KW_stat': stat_age_kw,
        'age_group_KW_p': pval_age_kw,
        'cell_type_KW_stat': stat_ct_kw,
        'cell_type_KW_p': pval_ct_kw
    })

results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Multi-page PDF with one horizontal barplot of Kidera-factor loadings per principal component.
# NOTE(review): `pca` is not assigned in this cell or by the run_pca call above - confirm it
# is the model fitted on kidera_df_trb and not a stale object from an earlier cell.
trb_loadings_path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_loadings.pdf'
kf_levels = [f'KF{i}' for i in range(1, 11)]

with PdfPages(trb_loadings_path) as pdf:
    for pc_num, pc_oi in enumerate(['PC1', 'PC2', 'PC3', 'PC4', 'PC5']):
        # Component row -> long table with columns 'kf' and 'loading'
        # (assumes the columns index of kidera_df_trb is named 'kf' - TODO confirm)
        pc_loadings = pd.Series(pca.components_[pc_num], index=kidera_df_trb.columns)
        pc_loadings = pc_loadings.reset_index(name ='loading')
        # Ordered categorical so the bars run KF1..KF10 instead of lexicographically
        pc_loadings['kf'] = pd.Categorical(pc_loadings['kf'].str.upper(), categories=kf_levels, ordered=True)
        pc_loadings = pc_loadings.sort_values(by='kf')

        fig = plt.figure(figsize=calc_figsize(width=60, height=50))
        bar_ax = sns.barplot(data=pc_loadings, x='loading', y='kf', color=thyAgeing_colors['magenta'], orient='h')
        bar_ax.set_title(f'PCA Loadings for {pc_oi}')
        plt.axvline(x=0, color='black', linestyle='solid', linewidth=0.5)
        plt.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)


In [None]:
from plotting.utils import calc_figsize, get_tint_palette, thyAgeing_colors, thyAgeing_greys
# TRB overall figure: PC1 vs PC4 scatter with a marginal boxplot of PC1 by age group (top),
# a marginal box/scatter of PC4 by cell type (right) and Kidera-factor loadings as arrows.
fig, axes = plt.subplots(2, 2, figsize=calc_figsize(width=60, height=65), sharex=False, sharey=False,
                         width_ratios=[1, 0.15], height_ratios=[0.15, 1], gridspec_kw={'wspace': 0, 'hspace': 0.1})
plt.subplots_adjust(wspace=0, hspace=0)

# Top marginal: PC1 by age_group
sns.boxplot(
    data=plot_df,
    x='PC1',
    y='age_group',
    palette=get_tint_palette(thyAgeing_colors['magenta']),
    ax=axes[0, 0],
    showcaps = False,
    medianprops={"color": "white"},
    whiskerprops = {"color": 'black', 'linestyle': 'solid'},
    showfliers=False,
    boxprops = {'linewidth' : 0},
    width=0.5
)
axes[0, 0].set_xlim(-0.25, 0.25)  # same x-range as the scatter below
axes[0, 0].set_ylabel('Age Group')
# Hide every spine except the left one
for spine in ['top', 'right', 'bottom']:
    axes[0, 0].spines[spine].set_visible(False)
axes[0, 0].set_xticks([])
axes[0, 0].set_xticklabels([])
axes[0, 0].set_xlabel('')
if axes[0, 0].legend_:
    axes[0, 0].legend_.remove()
sns.despine(ax=axes[0, 0], offset=2, trim=True)

# Right marginal: PC4 by cell_type (unfilled boxes with the individual points on top)
# NOTE(review): this uses a literal 'cell_type' column while the association test uses
# `col_cell_type_fine` - confirm plot_df carries both.
sns.boxplot(
    data=plot_df,
    x='cell_type',
    y='PC4',
    color=thyAgeing_greys['grey2'],
    ax=axes[1, 1],
    showfliers=False,
    showcaps = False,
    fill = False,
    medianprops={"color": "black"},
    whiskerprops = {"color": 'black', 'linestyle': 'solid'},
    boxprops = {'linewidth' : 0.25, 'color': 'black'},
    width=0.5
)
sns.scatterplot(
    data=plot_df,
    x='cell_type',
    y='PC4',
    hue='cell_type',
    style='cell_type',
    palette=['black', 'black', 'black'],
    legend=False,
    ax=axes[1, 1],
    s=10,
    alpha=1,
    zorder=2
)
axes[1, 1].set_xlabel('Cell Type')
# Hide every spine except the bottom one
for spine in ['top', 'right', 'left']:
    axes[1, 1].spines[spine].set_visible(False)
axes[1, 1].set_yticks([])
axes[1, 1].set_yticklabels([])
axes[1, 1].set_ylabel('')
axes[1, 1].set_ylim(-0.2, 0.2)  # same y-range as the scatter
axes[1, 1].tick_params(axis='x', rotation=90, labeltop=False, labelbottom=True)
axes[1, 1].set_xticklabels(axes[1, 1].get_xticklabels(), rotation=90, ha='center', va='top')
sns.despine(ax=axes[1, 1], offset=2, trim=True)

# Main panel: PC1 vs PC4, colour = age group, marker = cell type.
# The axes handle is deliberately NOT called `sc`, which is the scanpy alias.
ax_scatter = sns.scatterplot(
    data=plot_df,
    x='PC1',
    y='PC4',
    hue='age_group',
    palette=get_tint_palette(thyAgeing_colors['magenta']),
    style='cell_type',
    legend=False,
    alpha=1,
    s=10,
    zorder=2,
    ax=axes[1, 0]
)
ax_scatter.set_xlabel('PC1')
ax_scatter.set_ylabel('PC4')
ax_scatter.set_xlim(-0.25, 0.25)
ax_scatter.set_ylim(-0.2, 0.2)
ax_scatter.spines['top'].set_visible(False)
ax_scatter.spines['right'].set_visible(False)

# PCA loadings as arrows from the origin (components 0 and 3 = PC1 and PC4).
# NOTE(review): `pca` is not assigned in this cell - confirm it is the TRB fit.
pc1_loadings = pd.Series(pca.components_[0], index=kidera_df_trb.columns)
pc4_loadings = pd.Series(pca.components_[3], index=kidera_df_trb.columns)
loadings_df = pd.DataFrame({'PC1': pc1_loadings, 'PC4': pc4_loadings})
loadings_df['kf'] = loadings_df.index.str.upper()
for _, row in loadings_df.iterrows():
    # Arrows are scaled by 0.25 and labels by 0.28 so the text sits just past the tip
    ax_scatter.arrow(0, 0, row['PC1'] * 0.25, row['PC4'] * 0.25, color='black', width=0.001, head_width=0.005, length_includes_head=True, alpha=0.5, zorder=1)
    ax_scatter.text(row['PC1'] * 0.28, row['PC4'] * 0.28, row['kf'], fontsize=6, color='black', ha='center', va='center', alpha=0.7, zorder=2)
sns.despine(ax=ax_scatter, offset=2, trim=True)

# Top-right corner panel is unused
axes[0, 1].axis('off')

plt.tight_layout()
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_PC1_PC4_boxplot.pdf', bbox_inches='tight', dpi=300)

Split by cell type:

In [None]:
# Per-cell-type PCA of the TRB Kidera-factor table (rows labelled 'aged' are excluded),
# followed by a Kruskal-Wallis test of every principal component against age group.
# Results are collected in dicts keyed by cell type.
kidera_pca_trb_ct = {}
kidera_pca_trb_ct_var = {}
kidera_pca_trb_ct_loadings = {}
kidera_pca_trb_ct_assoc = {}

for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
    ct_mask = ((kidera_df_trb.index.get_level_values(col_cell_type_fine) == ct) &
               (kidera_df_trb.index.get_level_values(col_age_group) != 'aged'))
    plot_df, var, loadings = run_pca(kidera_df_trb.loc[ct_mask], n_components=5)
    kidera_pca_trb_ct[ct] = plot_df
    kidera_pca_trb_ct_var[ct] = var
    kidera_pca_trb_ct_loadings[ct] = loadings

    # Plot only the current cell type's projection. Concatenating the dict here would mix
    # scores from independently fitted PCAs of the cell types processed so far into a file
    # that is named after a single cell type.
    plot_pairwise_pcs(plot_df, pairs = list(itertools.combinations(range(5), 2)), hue = 'age_group', path = f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_{ct}.pdf')

    results = []
    for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
        # PC scores split by age group (categorical)
        groups_age = [plot_df.loc[plot_df['age_group'] == ag, pc].values for ag in plot_df['age_group'].unique()]
        # Kruskal-Wallis needs at least two groups; groups with a single observation are
        # treated as untestable. NaN (rather than None) keeps the columns numeric so the
        # significance comparison below cannot fail on an all-missing column.
        if len(groups_age) > 1 and all(len(g) > 1 for g in groups_age):
            stat_age_kw, pval_age_kw = kruskal(*groups_age)
        else:
            stat_age_kw, pval_age_kw = np.nan, np.nan

        results.append({
            'PC': pc,
            'age_group_KW_stat': stat_age_kw,
            'age_group_KW_p': pval_age_kw,
        })

    assoc_df = pd.DataFrame(results)
    assoc_df['age_group_KW_significant'] = assoc_df['age_group_KW_p'] < 0.05
    kidera_pca_trb_ct_assoc[ct] = assoc_df

In [None]:
# Stack the per-cell-type TRB results (dict key becomes the outer index level), write both
# tables to CSV and display the association table.
trb_assoc_all = pd.concat(kidera_pca_trb_ct_assoc)
trb_scores_all = pd.concat(kidera_pca_trb_ct)
trb_assoc_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_assoc_perCellType.csv')
trb_scores_all.to_csv(f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_perCellType.csv')
pd.concat(kidera_pca_trb_ct_assoc)

In [None]:
from plotting.utils import calc_figsize, get_tint_palette, thyAgeing_colors


# One PDF page per cell type (TRB): scatter of the two PCs with the largest age-group
# Kruskal-Wallis statistic, a marginal boxplot of each of those PCs by age group, and the
# Kidera-factor loadings drawn as arrows on the scatter.
# The `with` block finalises the PDF on exit, so no explicit pdf.close() is needed.
with PdfPages(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_newTSplit_TRB_kidera_pca_assocPCs_boxplot.pdf') as pdf:
    for ct in ['T_CD8_naive', 'T_CD4_naive', 'T_Treg']:
        plot_df = kidera_pca_trb_ct[ct].copy()
        # The two PCs with the highest age-group KW statistic for this cell type
        pcoi = kidera_pca_trb_ct_assoc[ct].sort_values('age_group_KW_stat', ascending = False)['PC'][:2].tolist()

        fig, axes = plt.subplots(2, 2, figsize=calc_figsize(width=60, height=65), sharex=False, sharey=False,
                                width_ratios=[1, 0.15], height_ratios=[0.15, 1], gridspec_kw={'wspace': 0, 'hspace': 0.1})
        plt.subplots_adjust(wspace=0, hspace=0)

        # Top marginal: first selected PC by age group (horizontal boxes)
        sns.boxplot(
            data=plot_df,
            x=pcoi[0],
            y='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[0, 0],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        axes[0, 0].set_xlim(-0.25, 0.25)  # same x-range as the scatter below
        axes[0, 0].set_ylabel('Age Group')
        # Hide every spine except the left one
        for spine in ['top', 'right', 'bottom']:
            axes[0, 0].spines[spine].set_visible(False)
        axes[0, 0].set_xticks([])
        axes[0, 0].set_xticklabels([])
        axes[0, 0].set_xlabel('')
        if axes[0, 0].legend_:
            axes[0, 0].legend_.remove()
        sns.despine(ax=axes[0, 0], offset=2, trim=True)

        # Right marginal: second selected PC by age group (vertical boxes)
        sns.boxplot(
            data=plot_df,
            y=pcoi[1],
            x='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            ax=axes[1, 1],
            showcaps = False,
            medianprops={"color": "white"},
            whiskerprops = {"color": 'black', 'linestyle': 'solid'},
            showfliers=False,
            boxprops = {'linewidth' : 0},
            width=0.5
        )
        axes[1, 1].set_xlabel('Age group')
        # Hide every spine except the bottom one
        for spine in ['top', 'right', 'left']:
            axes[1, 1].spines[spine].set_visible(False)
        axes[1, 1].set_yticks([])
        axes[1, 1].set_yticklabels([])
        axes[1, 1].set_ylabel('')
        axes[1, 1].set_ylim(-0.2, 0.2)  # same y-range as the scatter
        axes[1, 1].tick_params(axis='x', rotation=90, labeltop=False, labelbottom=True)
        axes[1, 1].set_xticklabels(axes[1, 1].get_xticklabels(), rotation=90, ha='center', va='top')
        sns.despine(ax=axes[1, 1], offset=2, trim=True)

        # Main panel: scatter of the two selected PCs, coloured by age group.
        # The axes handle is deliberately NOT called `sc`, which is the scanpy alias.
        ax_scatter = sns.scatterplot(
            data=plot_df,
            x=pcoi[0],
            y=pcoi[1],
            hue='age_group',
            palette=get_tint_palette(thyAgeing_colors['magenta'])[:3],
            legend=False,
            alpha=1,
            s=10,
            zorder=2,
            ax=axes[1, 0]
        )
        ax_scatter.set_xlabel(f'{pcoi[0]} - {kidera_pca_trb_ct_var[ct][pcoi[0]]}')
        ax_scatter.set_ylabel(f'{pcoi[1]} - {kidera_pca_trb_ct_var[ct][pcoi[1]]}')
        ax_scatter.set_xlim(-0.25, 0.25)
        ax_scatter.set_ylim(-0.2, 0.2)
        ax_scatter.spines['top'].set_visible(False)
        ax_scatter.spines['right'].set_visible(False)

        # PCA loadings as arrows from the origin, labelled with the upper-cased factor name
        x_loadings = pd.Series(kidera_pca_trb_ct_loadings[ct][pcoi[0]], index=kidera_df_trb.columns)
        y_loadings = pd.Series(kidera_pca_trb_ct_loadings[ct][pcoi[1]], index=kidera_df_trb.columns)
        loadings_df = pd.DataFrame({pcoi[0]: x_loadings, pcoi[1]: y_loadings})
        loadings_df['kf'] = loadings_df.index.str.upper()
        for _, row in loadings_df.iterrows():
            # Arrows are scaled by 0.25 and labels by 0.28 so the text sits just past the tip
            ax_scatter.arrow(0, 0, row[pcoi[0]] * 0.25, row[pcoi[1]] * 0.25, color='black', width=0.001, head_width=0.005, length_includes_head=True, alpha=0.5, zorder=1)
            ax_scatter.text(row[pcoi[0]] * 0.28, row[pcoi[1]] * 0.28, row['kf'], fontsize=6, color='black', ha='center', va='center', alpha=0.7, zorder=2)
        sns.despine(ax=ax_scatter, offset=2, trim=True)

        # Top-right corner panel: no data, just the cell-type name as a page label
        axes[0, 1].text(0.5, 0.5, ct, ha='center', va='center', fontsize=8, fontweight='bold', transform=axes[0, 1].transAxes)
        axes[0,1].set_xticks([])
        axes[0,1].set_yticks([])
        sns.despine(ax=axes[0, 1], left =True, bottom=True)

        plt.tight_layout()
        pdf.savefig(fig)

### Kidera shift at transitions

In [None]:
# Build a wide table of mean Kidera-factor values: one row per
# donor / age group / sample / locus / productive / kf, one column per cell type.
dev_cell_types = ['T_DN(Q)', 'T_DP(Q)', 'T_αβT(entry)', 'T_CD8_naive', 'T_CD4_naive', 'T_Treg']
id_cols = ['donor', 'age_group', 'sample', col_cell_type_fine, 'locus', 'productive']

df = cdr3_df.copy()
# Keep the selected cell types and rows flagged productive == 'T'
keep_rows = (df[col_cell_type_fine].isin(dev_cell_types)) & (df['productive'] == 'T')
df = df.loc[keep_rows]
# Long format: one row per (record, Kidera factor)
df = df.melt(id_vars = id_cols, value_vars = kf_columns, var_name = 'kf', value_name = 'kf_value')
# Mean per sample and cell type
df = (
    df.groupby(id_cols + ['kf'], observed = True)
      .agg(mean_kf_value = ('kf_value', 'mean'))
      .reset_index()
)
# Cell types to columns
df = df.pivot_table(index = ['donor', col_age_group, 'sample', 'locus', 'productive', 'kf'], columns = [col_cell_type_fine], values = 'mean_kf_value', aggfunc = 'mean').reset_index()
df

In [None]:
# (start, end) cell-type pairs; the shift of a Kidera factor is mean(end) - mean(start).
transitions = [
    ('T_DN(Q)', 'T_DP(Q)'),
    ('T_DP(Q)', 'T_αβT(entry)'),
    ('T_αβT(entry)', 'T_CD8_naive'),
    ('T_αβT(entry)', 'T_CD4_naive'),
    ('T_αβT(entry)', 'T_Treg')
]

# Column label for every transition, e.g. 'T_DN(Q)->T_DP(Q)'
transitions_concat = [f'{start}->{end}' for start, end in transitions]
for label, (start, end) in zip(transitions_concat, transitions):
    df[label] = df[end] - df[start]

# Average the per-sample shifts per donor, then reshape to long format
shift_id_cols = ['donor', 'age_group', 'locus','productive', 'kf']
df = df.groupby(shift_id_cols, observed = True)[transitions_concat].mean().reset_index()
df = df.melt(id_vars = shift_id_cols, value_vars = transitions_concat, var_name = 'transition', value_name = 'kf_shift')
df.tail()

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot
# Faceted boxplots (one facet per Kidera factor) of the TRB Kidera shift per transition,
# grouped by age group.
# Explicit copy: `data` gets a column reassigned below, which on a bare .loc slice triggers
# pandas' SettingWithCopyWarning.
data = df.loc[(df['locus'] == 'TRB')].copy()
# Drop the 'T_' prefix for shorter axis labels
data['transition'] = data['transition'].apply(lambda x: x.replace('T_', ''))

p = plot_faceted_grouped_boxplot(data = data, x = 'transition', y = 'kf_shift', hue = col_age_group, hue_order = col_age_group_levels, order = [t.replace('T_', '') for t in transitions_concat], 
                             facet_kwargs = dict(col = 'kf', col_wrap = 5, col_order = ['kf1', 'kf2', 'kf3', 'kf4', 'kf5', 'kf6', 'kf7', 'kf8', 'kf9', 'kf10']),
                             add_stats = True, save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kideraShift_', 
                             format_percent = False, format_log = False, x_label = 'Transition', y_label = 'Kidera mean shift TRB', legend_title='Age group',
                             ylim = None,
                             palette = get_tint_palette(thyAgeing_colors['magenta']), figsize = calc_figsize(width = 150, height = 80),
                             )
plt.subplots_adjust(wspace=0.2)
# Light grey facet background
for ax in p.axes.flat:
    ax.set_facecolor("#efefef")
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_TRB_kideraShift.pdf', bbox_inches='tight', dpi=300)

In [None]:
from plotting.utils import plot_faceted_grouped_boxplot
# Faceted boxplots (one facet per Kidera factor) of the TRA Kidera shift per transition,
# grouped by age group; the 'T_DN(Q)->T_DP(Q)' transition is left out of this plot.
# Both conditions go into ONE mask: filtering `df` twice in a row would discard the locus
# filter and plot every locus under the TRA label. The explicit copy avoids pandas'
# SettingWithCopyWarning on the column reassignment below.
data = df.loc[(df['locus'] == 'TRA') & (df['transition'] != 'T_DN(Q)->T_DP(Q)')].copy()

# Drop the 'T_' prefix for shorter axis labels
data['transition'] = data['transition'].apply(lambda x: x.replace('T_', ''))

# Keep the canonical transition order, restricted to transitions still present in `data`
present_transitions = set(data['transition'])
transition_order = [t.replace('T_', '') for t in transitions_concat if t.replace('T_', '') in present_transitions]

p = plot_faceted_grouped_boxplot(data = data, x = 'transition', y = 'kf_shift', hue = col_age_group, hue_order = col_age_group_levels, order = transition_order, 
                             facet_kwargs = dict(col = 'kf', col_wrap = 5, col_order = ['kf1', 'kf2', 'kf3', 'kf4', 'kf5', 'kf6', 'kf7', 'kf8', 'kf9', 'kf10']),
                             add_stats = True, save_stats = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_devSplit_TRA_kideraShift_', 
                             format_percent = False, format_log = False, x_label = 'Transition', y_label = 'Kidera mean shift TRA', legend_title='Age group', figsize = calc_figsize(width = 150, height = 80),
                             ylim = None,
                             palette = get_tint_palette(thyAgeing_colors['magenta'])
                             )
plt.subplots_adjust(wspace=0.2)
# Light grey facet background
for ax in p.axes.flat:
    ax.set_facecolor("#efefef")
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_TRA_kideraShift.pdf', bbox_inches='tight', dpi=300)

## Expression of VDJ recombination enzymes

In [None]:
# Genes of interest, grouped under a descriptive label (dict key -> list of gene symbols).
vdj_enzymes = {
    'β-selection': ['PTCRA'],
    'Recombinase': ['RAG1', 'RAG2'],
    'DNA helicase': ['XRCC5', 'XRCC6'],
    'X-link repair': ['PRKDC', 'DCLRE1C', 'LIG4', 'XRCC4'],
    'TDT': ['DNTT'],
}

# Flat list of all gene symbols, in dict order
vdj_enzymes_genes = []
for group_genes in vdj_enzymes.values():
    vdj_enzymes_genes.extend(group_genes)

### Age effect

In [None]:
import pickle

# Read the pickled differential expression analysis (DEA) results for the age effect.
# pickle executes code on load, so this must only ever point at trusted project files.
deg_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data/analyses/dea/thyAgeing_dea_taa_l4_adult_vs_infant_ageEffect.pkl'
with open(deg_path, 'rb') as handle:
    deg = pickle.load(handle)

# Stack into one table; the two index levels become the 'cell_type' and 'gene_name' columns
# (presumably `deg` maps cell type -> per-gene result table - verify against the DEA script).
deg_df = pd.concat(deg)
deg_df = deg_df.reset_index(names = ['cell_type', 'gene_name'])

deg_df.head()

In [None]:
# Restrict the DEA table to the VDJ-recombination genes in cell types whose label starts
# with 'T_D'.
degs_oi = deg_df.loc[deg_df['gene_name'].isin(vdj_enzymes_genes) & (deg_df['cell_type'].str.startswith('T_D'))]

# Show hits with padj < 0.1, strongest positive fold change first. The mask is built from
# degs_oi itself so that it is aligned with the frame being indexed.
degs_oi.loc[degs_oi['padj'] < 0.1].sort_values(by = 'log2FoldChange', ascending = False)

In [None]:
# Write the filtered age-effect DEA rows to CSV without the row index
degs_age_effect_csv = f'{data_path}/analyses/vdj/cdr3Analysis/thyAgeing_vdjEnzymes_ageEffect_degs.csv'
degs_oi.to_csv(degs_age_effect_csv, index=False)

In [None]:
# Summary statistics of the log2 fold changes in the filtered table
degs_oi.loc[:, 'log2FoldChange'].describe()

In [None]:
from scipy.cluster.hierarchy import linkage, leaves_list
from plotting.utils import thyAgeing_colors

# Heatmap of age-effect log2 fold changes (rows = cell types, columns = genes); cells with
# padj < 0.05 are marked with '*'.
lfc_wide = degs_oi.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'log2FoldChange')
padj_wide = degs_oi.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'padj')

# Fixed display order: genes in the order of vdj_enzymes_genes, cell types as listed
col_order = [g for g in vdj_enzymes_genes if g in lfc_wide.columns]
row_order = [#'T_DN(early)',
             'T_DN(P)', 'T_DN(Q)', 'T_DP(P)', 'T_DP(Q)']

# Reorder the fold-change table
df = lfc_wide.loc[row_order, col_order]

# Align padj to exactly the plotted rows/columns (pivot_table may drop all-NaN columns, so
# the two pivots are not guaranteed to share a shape). Missing/NaN padj compares False and
# therefore gets no star. np.where replaces the deprecated DataFrame.applymap.
padj_wide = padj_wide.reindex(index = row_order, columns = col_order)
df_annot = pd.DataFrame(np.where(padj_wide < 0.05, '*', ''), index = padj_wide.index, columns = padj_wide.columns)

# Plot the reordered heatmap
p = sns.heatmap(df, cmap=sns.blend_palette([thyAgeing_colors['teal'],'white',thyAgeing_colors['orange']], as_cmap=True),
                center=0, vmin=-2, vmax=2, cbar_kws={'label': 'log2FC'}, xticklabels=True, yticklabels=True,
                annot = df_annot, fmt='', annot_kws={'size': 8, 'weight': 'bold'},)
# Columns are genes and rows are cell types, so label the axes accordingly
p.set_xlabel('Gene')
p.set_ylabel('Cell type')
p.tick_params(axis='x', rotation=90)
p.tick_params(axis='y', rotation=0)
p.tick_params(axis='both', which='both', length=0)
p.figure.set_size_inches(calc_figsize(width=60, height=30))
p.figure.tight_layout(rect=[0, 0, 1, 0.95], pad = 0)
plt.savefig(f'{plots_path}/vdj/cdr3Analysis/thyAgeing_devSplit_vdjRecomb_ageEffect_degs.pdf', bbox_inches='tight')

### Gender effect (adult)

In [None]:
import pickle

# Load the differential expression analysis (DEA) results for the F vs M comparison in adults.
# pickle executes code on load, so this must only ever point at trusted project files.
deg_path = f'{general_data_path}/analyses/dea/thyAgeing_dea_taa_l4_F_vs_M_genderEffect_adult.pkl'
with open(deg_path, 'rb') as file:
    deg = pickle.load(file)

# Stack into one table; the two index levels become the 'cell_type' and 'gene_name' columns
deg_df = pd.concat(deg).reset_index(names = ['cell_type', 'gene_name'])

# Restrict to the VDJ-recombination genes in cell types whose label starts with 'T_D'.
# vdj_enzymes_genes already contains PTCRA; the extra misspelled 'PTRCA' entry was dropped.
degs_oi = deg_df.loc[deg_df['gene_name'].isin(vdj_enzymes_genes) & (deg_df['cell_type'].str.startswith('T_D'))]

# Show hits with padj < 0.1, strongest positive fold change first. The mask is built from
# degs_oi itself so that it is aligned with the frame being indexed.
degs_oi.loc[degs_oi['padj'] < 0.1].sort_values(by = 'log2FoldChange', ascending = False)

In [None]:
# Display the filtered DEA table (notebook cell output)
degs_oi

In [None]:
from scipy.cluster.hierarchy import linkage, leaves_list

# Heatmap of F-vs-M log2 fold changes (rows = cell types, columns = genes), rows and columns
# ordered by Ward hierarchical clustering; cells with padj < 0.05 are marked with '*'.
df = degs_oi.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'log2FoldChange')
padj_wide = degs_oi.pivot_table(index = 'cell_type', columns = 'gene_name', values = 'padj')

# Perform hierarchical clustering on the columns (missing fold changes treated as 0)
linkage_matrix = linkage(df.T.fillna(0), method='ward')
column_order = leaves_list(linkage_matrix)

# Perform hierarchical clustering on the rows
row_linkage_matrix = linkage(df.fillna(0), method='ward')
row_order = leaves_list(row_linkage_matrix)

# Reorder the fold-change table by leaf order
df = df.iloc[row_order, column_order]

# Align padj by LABEL to the reordered table: the padj pivot may have dropped all-NaN
# columns, in which case positional iloc indexing would shift or fail. Missing/NaN padj
# compares False and gets no star. np.where replaces the deprecated DataFrame.applymap.
padj_wide = padj_wide.reindex(index = df.index, columns = df.columns)
df_annot = pd.DataFrame(np.where(padj_wide < 0.05, '*', ''), index = padj_wide.index, columns = padj_wide.columns)

# Plot the reordered heatmap
p = sns.heatmap(df, cmap=sns.blend_palette([thyAgeing_colors['teal'],'white',thyAgeing_colors['orange']], as_cmap=True),
                center=0, vmin=-2, vmax=2, cbar_kws={'label': 'log2FC'}, xticklabels=True, yticklabels=True,
                annot = df_annot, fmt='', annot_kws={'size': 8, 'weight': 'bold'},)
# Columns are genes and rows are cell types, so label the axes accordingly
p.set_xlabel('Gene')
p.set_ylabel('Cell type')
p.tick_params(axis='x', rotation=90)
p.tick_params(axis='y', rotation=0)
p.figure.set_size_inches(calc_figsize(width=60, height=40))
p.figure.tight_layout(rect=[0, 0, 1, 0.95], pad = 0)
plt.savefig(f'{plots_path}/vdj/thyAgeing_devSplit_vdjRecomb_genderEffect_degs.pdf', bbox_inches='tight')

In [None]:
# Print the session/package version report for reproducibility
session_info.show()