# Thymus ageing atlas: T cell clonotype analysis

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin
import dandelion as ddl

import warnings
warnings.filterwarnings("ignore", category=ad.ImplicitModificationWarning)

# Add repo path to sys path (allows to access scripts and metadata from repo)
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Autoreload custom scripts
%load_ext autoreload
%autoreload 2

# Define paths
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'
model_path = os.path.join(repo_path, 'models')
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

print('Dir for plots: {}'.format(plots_path))
print('Dir for data: {}'.format(data_path))

# Formatting
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import get_ct_levels, get_ct_palette, age_group_levels, age_group_palette, t_nk_groupings
from plotting.utils import plot_grouped_boxplot, calc_figsize, thyAgeing_colors, get_tint_palette, get_chroma_palette, create_blend_palette

In [None]:
# Output locations, redefined without a trailing slash so that downstream
# f-strings such as f'{plots_path}/vdj/...' do not contain '//'
plots_path = os.path.join(repo_path, 'plots')
data_path = os.path.join(repo_path, 'data')
model_path = f'{repo_path}/models'
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

## Load data

In [None]:
# Load adata
# NOTE(review): the path ends in '.zarr' but is opened with read_h5ad — confirm the
# on-disk format of this object
object_version = 'v5_2025-04-03'
adata = ad.read_h5ad(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}.zarr')

# Add new annotations to adata
# Columns that the curated table also provides are dropped from obs first, so the
# join below cannot fail on overlapping column names
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
for c in ct_anno.columns:
    if c in adata.obs.columns:
        adata.obs.drop(c, axis = 1, inplace = True)
adata.obs = adata.obs.join(ct_anno)

# Filter data (only include annotated cells)
# Cells whose taa_l5 label contains 'locnt', '-sp' or 'explore' are removed;
# na = True makes cells without a taa_l5 label match too, so they are removed as well
adata = adata[~adata.obs['taa_l5'].str.contains('locnt|-sp|explore', na = True)]

# Update metadata
# The newest 'Thymus_ageing_metadata' file is located by the project helper and
# applied to adata by update_obs (keyed on 'index')
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Add vdj data
# Joined on the obs index (first CSV column is used as the index)
meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_v9_2025-03-28_tcrab_v6.csv', index_col = 0)
adata.obs = adata.obs.join(meta_tcr)

# Exclude non-T cells
# (implemented as: drop cells with a missing rearrangement_status_VDJ)
adata = adata[~adata.obs['rearrangement_status_VDJ'].isna()]

# Load TCRab data
vdj_version = 'v6_2025-04-03'
tcrab_info = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_tcrabFiltered_{vdj_version}.h5ddl')

In [None]:
# Define columns
# Annotation levels used throughout this notebook: taa_l3 (broad) and taa_l4 (fine),
# with their level lists restricted to the T and NK compartments
col_cell_type_broad = 'taa_l3'
col_cell_type_fine = 'taa_l4'
col_cell_type_broad_levels = get_ct_levels(col_cell_type_broad, taa_l1 = ['T', 'NK'])
col_cell_type_fine_levels = get_ct_levels(col_cell_type_fine, taa_l1 = ['T', 'NK'])
col_age_group = 'age_group'
# Resolve '<col_age_group>_levels' (e.g. age_group_levels imported from anno_levels)
# by name in the notebook namespace instead of eval()-ing a constructed string
col_age_group_levels = globals()[f'{col_age_group}_levels']

## Recirculating T cells

In [None]:
# Keep most expressed productive chains for each cell
# (restricted to TRA/TRB contigs of cells annotated as taa_l2 == 'T_recirc')
tcrab_df = tcrab_info.data.copy()
tcrab_df = tcrab_df.loc[tcrab_df['productive'] == 'T']
tcrab_df = tcrab_df.loc[tcrab_df['locus'].isin(['TRA', 'TRB'])]
# Rebuild the cell name from sequence_id: 'index' is the text before the first '-',
# 'barcode' is what follows that prefix up to the first '_'
# (assumes sequence_id looks like '<index>-<barcode>_<contig...>' — TODO confirm)
tcrab_df['index'] = tcrab_df['sequence_id'].str.split('-').str[0]
tcrab_df['barcode'] = tcrab_df.apply(lambda x: x['sequence_id'].removeprefix(f"{x['index']}-").split('_')[0], axis=1)
tcrab_df['names'] = tcrab_df['index'] + '-' + tcrab_df['barcode']
tcrab_df = tcrab_df.loc[tcrab_df['names'].isin(adata.obs_names[adata.obs['taa_l2'] == 'T_recirc'])]
# One contig per cell and locus: the one with the highest umi_count
tcrab_df = tcrab_df.loc[tcrab_df.groupby(['names', 'locus'])['umi_count'].idxmax()]
tcrab_df = tcrab_df[['names','index','barcode', 'umi_count', 'locus', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3']]
# NOTE(review): 'index' exists on both sides of this merge, so pandas suffixes the two
# copies (index_x / index_y); neither is used by the pivot below
tcrab_df = tcrab_df.merge(adata.obs[['donor', 'index', 'taa_l4', 'age_group']], left_on='names', right_index=True, how = 'left')

# Pivot table: one row per cell, one column per (value, locus) pair
tcrab_df = tcrab_df.pivot_table(index=['names','barcode', 'donor', 'taa_l4'], columns='locus', values=['umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3'], aggfunc = 'first')
# Collapse columns names, e.g. ('cdr3_aa', 'TRB') -> 'cdr3_aa_TRB'
tcrab_df.columns = ['_'.join(col).strip() for col in tcrab_df.columns.values]

tcrab_df

In [None]:
# Convert to cell2tcr format: map the pivoted column names onto the cell2tcr ones
recirc_to_cell2tcr = {
    'donor': 'individual',
    'cdr3_aa_TRB': 'IR_VDJ_1_junction_aa',
    'v_call_TRB': 'IR_VDJ_1_v_call',
    'j_call_TRB': 'IR_VDJ_1_j_call',
    'cdr3_aa_TRA': 'IR_VJ_1_junction_aa',
    'v_call_TRA': 'IR_VJ_1_v_call',
    'j_call_TRA': 'IR_VJ_1_j_call',
}
cell2tcr_columns = list(recirc_to_cell2tcr.values())
cell2tcr_df = tcrab_df.reset_index().rename(columns = recirc_to_cell2tcr)
# Keep the cell2tcr columns plus the cell name, everything as pandas strings
cell2tcr_df = cell2tcr_df[[*cell2tcr_columns, 'names']].astype('string')

cell2tcr_df

In [None]:
# Read tsv of reference V(D)J genes and keep the human entries
vdj_genes = pd.read_csv(f'{data_path}/analyses/clonotypes/alphabeta_gammadelta_db.tsv', sep = '\t')
vdj_genes = vdj_genes.loc[vdj_genes['organism'] == 'human',:]
vdj_genes['id'] = vdj_genes['id'].astype(str)

# For each V/J column: resolve ambiguous calls, drop cells without a call, then drop
# cells whose gene is not among the reference ids. The excluded names are printed and
# collected (one list per column) in excluded_genes_list
excluded_genes_list = []
for gene in ['IR_VDJ_1_v_call','IR_VDJ_1_j_call','IR_VJ_1_v_call','IR_VJ_1_j_call']:
    
    # For genes with ambiguous assignments, take first assignment
    cell2tcr_df[gene] = cell2tcr_df[gene].str.split(',').str[0]
    cell2tcr_df.dropna(subset=[gene], inplace = True)
    
    # Gene names present in the data but absent from the reference table
    excluded_genes = np.setdiff1d(cell2tcr_df[gene].unique(), vdj_genes['id']).tolist()
    print(f'Excluded for {gene}: {excluded_genes}')
    excluded_genes_list.append(excluded_genes)
    
    cell2tcr_df = cell2tcr_df.loc[~cell2tcr_df[gene].isin(excluded_genes),:]

### TRB with known antigen specificity

In [None]:
import cell2tcr
# get all TCR-beta chain matches with IEDB.org database
# (cells without a TRB junction sequence are removed first)
cell2tcr_df = cell2tcr_df.loc[cell2tcr_df['IR_VDJ_1_junction_aa'].notnull(),:]
scores = cell2tcr.db_match(cell2tcr_df['IR_VDJ_1_junction_aa'].values)

# annotate original df
# NOTE(review): the return value of db_annotate is not used, so it presumably adds its
# columns to cell2tcr_df in place — confirm against the cell2tcr documentation
cell2tcr.db_annotate(cell2tcr_df, scores, 'IR_VDJ_1_junction_aa')
# Cache the annotated table; the next cell reloads it from this file
cell2tcr_df.to_csv(f'{data_path}/analyses/clonotypes/thyAgeing_recircSplit_TRB_knownEpitopes.csv', index = False)

In [None]:
# Reload the cached IEDB-annotated table and flag cells that have a match
# (a non-missing 'organism' entry)
known_epitopes_csv = f'{data_path}/analyses/clonotypes/thyAgeing_recircSplit_TRB_knownEpitopes.csv'
cell2tcr_df = pd.read_csv(known_epitopes_csv)
cell2tcr_df['has_iedb'] = cell2tcr_df['organism'].notna()
# Bring in donor, age group and fine cell type from the RNA object, matched on cell name
obs_subset = adata.obs[['donor', col_age_group, col_cell_type_fine]]
cell2tcr_df = cell2tcr_df.merge(obs_subset, how = 'left', left_on = 'names', right_index = True)

In [None]:
# Proportion of cells with known epitopes, per donor / age group / cell population.
# The mean of the boolean has_iedb flag is the matched fraction. Unlike
# value_counts(normalize=True) followed by keeping only the True rows, this keeps
# groups without any IEDB match as 0 instead of silently dropping them from the plot
# and the statistics.
group_cols = ['donor', col_age_group, col_cell_type_fine]
df = cell2tcr_df.groupby(group_cols, observed=True)['has_iedb'].mean().to_frame('freq').reset_index()

# Only plot populations that are present, in the canonical level order
present_cell_types = set(df[col_cell_type_fine])
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'freq', hue = col_age_group, order = [c for c in col_cell_type_fine_levels if c in present_cell_types], hue_order = col_age_group_levels,
                     x_label = 'Cell population', y_label = 'Proportion of known TRB', legend_title = 'Age group', add_stats = True, format_percent = True, figsize = calc_figsize(width = 70, height = 60),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_recircSplit_TRB_propKnownEpitopes', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     )
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_propKnownEpitopes.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Number of cells with known epitopes, per donor / age group / cell population.
# Summing the boolean has_iedb flag counts the matched cells. Unlike value_counts()
# followed by keeping only the True rows, this keeps groups without any IEDB match
# as 0 instead of silently dropping them from the plot and the statistics.
group_cols = ['donor', col_age_group, col_cell_type_fine]
df = cell2tcr_df.groupby(group_cols, observed=True)['has_iedb'].sum().to_frame('n').reset_index()

# Only plot populations that are present, in the canonical level order
present_cell_types = set(df[col_cell_type_fine])
plot_grouped_boxplot(data = df, x = col_cell_type_fine, y = 'n', hue = col_age_group, order = [c for c in col_cell_type_fine_levels if c in present_cell_types], hue_order = col_age_group_levels,
                     x_label = 'Cell population', y_label = 'Number of known TRB', legend_title = 'Age group', add_stats = True, format_percent = False, figsize = calc_figsize(width = 70, height = 60),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_recircSplit_TRB_nKnownEpitopes', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     )
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_nKnownEpitopes.png', bbox_inches='tight', dpi=300)

In [None]:
# Distinct source organisms among the epitope annotations (missing = no IEDB match)
cell2tcr_df['organism'].unique()

In [None]:
# Human (self) antigens: number of matched cells per antigen and age group
cell2tcr_df.loc[cell2tcr_df['organism'] == 'Homo sapiens (human)'].groupby(['antigen', 'age_group']).size()

In [None]:
# Inspect the cells whose TRB is annotated with the antigen 'Insulin'
cell2tcr_df.loc[cell2tcr_df['antigen'] == 'Insulin']


In [None]:
# Matched antigen organisms per age group within T_CD8_age-assoc cells
is_cd8_age_assoc = cell2tcr_df[col_cell_type_fine] == 'T_CD8_age-assoc'
has_match = cell2tcr_df['has_iedb'] == True
cell2tcr_df[is_cd8_age_assoc & has_match].groupby(['age_group'])['organism'].value_counts()

In [None]:
# Matched antigen organisms per age group within T_Treg_tr cells
is_treg_tr = cell2tcr_df[col_cell_type_fine] == 'T_Treg_tr'
has_match = cell2tcr_df['has_iedb'] == True
cell2tcr_df[is_treg_tr & has_match].groupby(['age_group'])['organism'].value_counts()

In [None]:
# Matched antigen organisms per age group within T_CD8_rm cells
is_cd8_rm = cell2tcr_df[col_cell_type_fine] == 'T_CD8_rm'
has_match = cell2tcr_df['has_iedb'] == True
cell2tcr_df[is_cd8_rm & has_match].groupby(['age_group'])['organism'].value_counts()

In [None]:
# Number of cells per matched antigen organism for every donor.
# observed=True (as in the other groupby calls on these columns) restricts the result
# to donor / age-group combinations that actually occur when the columns are categorical
df = cell2tcr_df.groupby(['donor','age_group'], observed=True)['organism'].value_counts().to_frame('n').reset_index()
# Rows: (age_group, donor); columns: organism — the orientation drawn by the heatmap
donor_by_organism = df.pivot(index=['age_group', 'donor'], columns='organism', values='n')
# organism x donor table, kept under its original name and orientation
df_wide = donor_by_organism.transpose()

plt.figure(figsize=calc_figsize(width = 180, height = 100))
ax = sns.heatmap(
    donor_by_organism, vmax=50, linewidths=.05, xticklabels=True, yticklabels=True,
    cmap=sns.blend_palette(
        [thyAgeing_colors['blue'], thyAgeing_colors['purple'], thyAgeing_colors['magenta'], thyAgeing_colors['orange'], thyAgeing_colors['yellow']],
        as_cmap=True, n_colors=10
    ),
    cbar_kws={'label': 'N(cells)'}
)
plt.xlabel('Antigen organism')
plt.ylabel('Donor')
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_recircSplit_TRB_knownEpitopes_heatmap.pdf', bbox_inches='tight', dpi=300)

Compare with proportion of known TRB in healthy blood:

In [None]:
# Open the public PBMC dataset in backed mode, keep healthy/control cells of the
# 'Yoshida' study, and only then load that subset into memory
adata_pbmc = ad.read_h5ad('/nfs/team205/ld21/public/PBMC_COVID_19_6_studies_GEX_VDJ.h5ad', backed = 'r')
adata_pbmc = adata_pbmc[(adata_pbmc.obs['severity'].isin(['healthy', 'control'])) & (adata_pbmc.obs['study'] == 'Yoshida'),:]
adata_pbmc = adata_pbmc.to_memory()

# Bin rounded ages into age groups; ages matching none of the ranges (missing ages and
# ages >= 100 included) fall back to 'unknown'
# NOTE(review): confirm these cut-offs match the age_group definition of the thymus data
adata_pbmc.obs['age'] = round(adata_pbmc.obs['age'].astype(float))
age_mapping = {range(0, 4): 'infant', range(4, 21): 'paed', range(21, 56): 'adult', range(56, 100): 'aged'}
adata_pbmc.obs['age_group'] = adata_pbmc.obs['age'].apply(lambda x: next((v for k, v in age_mapping.items() if x in k), 'unknown'))

# Number of subjects per age group
adata_pbmc.obs[['subject', 'age_group']].drop_duplicates().groupby('age_group').size()

In [None]:
# Convert to cell2tcr format: the PBMC TCR columns are renamed position by position
pbmc_tcr_columns = ['subject', 'cdr3_b_aa','v_b_gene', 'j_b_gene', 'cdr3_a_aa', 'v_a_gene', 'j_a_gene']
cell2tcr_columns = ['individual', 'IR_VDJ_1_junction_aa','IR_VDJ_1_v_call','IR_VDJ_1_j_call','IR_VJ_1_junction_aa','IR_VJ_1_v_call','IR_VJ_1_j_call']
pbmc_to_cell2tcr = dict(zip(pbmc_tcr_columns, cell2tcr_columns))

tcrab_df = adata_pbmc.obs[pbmc_tcr_columns + ['age_group']].copy()
# reset_index() turns the obs index into a regular column before everything is cast to string
cell2tcr_df = tcrab_df.reset_index().rename(columns = pbmc_to_cell2tcr).astype('string')

cell2tcr_df

In [None]:
# Read tsv of reference V(D)J genes and keep the human entries
vdj_genes = pd.read_csv(f'{data_path}/analyses/clonotypes/alphabeta_gammadelta_db.tsv', sep = '\t')
vdj_genes = vdj_genes.loc[vdj_genes['organism'] == 'human',:]
vdj_genes['id'] = vdj_genes['id'].astype(str)

# Same filtering as for the thymus cells: per V/J column, resolve ambiguous calls, drop
# cells without a call, then drop cells whose gene is not among the reference ids
excluded_genes_list = []
for gene in ['IR_VDJ_1_v_call','IR_VDJ_1_j_call','IR_VJ_1_v_call','IR_VJ_1_j_call']:
    
    # For genes with ambiguous assignments, take first assignment
    cell2tcr_df[gene] = cell2tcr_df[gene].str.split(',').str[0]
    cell2tcr_df.dropna(subset=[gene], inplace = True)
    
    # Gene names present in the data but absent from the reference table
    excluded_genes = np.setdiff1d(cell2tcr_df[gene].unique(), vdj_genes['id']).tolist()
    print(f'Excluded for {gene}: {excluded_genes}')
    excluded_genes_list.append(excluded_genes)
    
    cell2tcr_df = cell2tcr_df.loc[~cell2tcr_df[gene].isin(excluded_genes),:]

In [None]:
# get all TCR-beta chain matches with IEDB.org database
# (cells without a TRB junction sequence are removed first; relies on cell2tcr having
# been imported in an earlier cell)
cell2tcr_df = cell2tcr_df.loc[cell2tcr_df['IR_VDJ_1_junction_aa'].notnull(),:]
scores = cell2tcr.db_match(cell2tcr_df['IR_VDJ_1_junction_aa'].values)

# annotate original df
# NOTE(review): the return value of db_annotate is not used, so it presumably adds its
# columns to cell2tcr_df in place — confirm against the cell2tcr documentation
cell2tcr.db_annotate(cell2tcr_df, scores, 'IR_VDJ_1_junction_aa')
# Cache the annotated PBMC table; the next cell reloads it from this file
cell2tcr_df.to_csv(f'{data_path}/analyses/clonotypes/thyAgeing_pbmcYoshida_TRB_knownEpitopes.csv', index = False)

In [None]:
# Reload the cached PBMC table and use its 'index' column as the index again
# (presumably the cell names produced by reset_index() before saving — verify)
yoshida_cell2tcr_df = pd.read_csv(f'{data_path}/analyses/clonotypes/thyAgeing_pbmcYoshida_TRB_knownEpitopes.csv')
yoshida_cell2tcr_df = yoshida_cell2tcr_df.set_index('index')
# A cell has an IEDB match when its 'organism' annotation is not missing
yoshida_cell2tcr_df['has_iedb'] = yoshida_cell2tcr_df['organism'].notnull()
# Add the cell type label from the PBMC object, joined on the index
yoshida_cell2tcr_df = yoshida_cell2tcr_df.join(adata_pbmc.obs[['celltypist_majority']])

# Proportion of cells with known epitopes
# (rows for both has_iedb == True and False are kept here for inspection)
df = yoshida_cell2tcr_df.groupby(['individual', 'age_group', 'celltypist_majority'], observed=True)['has_iedb'].value_counts(normalize=True).to_frame('freq').reset_index()

df

In [None]:
# Proportion of cells with known epitopes, per individual / age group / cell type.
# The mean of the boolean has_iedb flag is the matched fraction. Unlike
# value_counts(normalize=True) followed by keeping only the True rows, this keeps
# groups without any IEDB match as 0 instead of silently dropping them from the plot
# and the statistics.
group_cols = ['individual', 'age_group', 'celltypist_majority']
df = yoshida_cell2tcr_df.groupby(group_cols, observed=True)['has_iedb'].mean().to_frame('freq').reset_index()

plot_grouped_boxplot(data = df, x = 'celltypist_majority', y = 'freq', hue = col_age_group, order = df['celltypist_majority'].unique(), hue_order = col_age_group_levels,
                     x_label = 'Cell population', y_label = 'Proportion of known TRB', legend_title = 'Age group', add_stats = True, format_percent = True, figsize = calc_figsize(width = 110, height = 60),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_pbmcYoshida_TRB_propKnownEpitopes', palette = get_tint_palette(thyAgeing_colors['magenta']),
                     )
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_pbmcYoshida_TRB_propKnownEpitopes.png', bbox_inches='tight', dpi=300)

### Clonotype analysis

In [None]:
# Prepare TCR df
tcr_df = tcrab_info.data[['sequence_id','locus', 'productive','umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa','cdr3']].copy()
# Parse identifiers out of sequence_id:
#   barcode   = third-from-last token when splitting on '_' or '-'
#   contig_id = text after the last '_'
#   index     = text before the first '-'
# NOTE(review): this depends on the exact sequence_id layout — confirm that the
# third-from-last token is the cell barcode
tcr_df['barcode'] = tcr_df['sequence_id'].str.split('_|-').str[-3]
tcr_df['contig_id'] = tcr_df['sequence_id'].str.split('_').str[-1]
tcr_df['index'] = tcr_df['sequence_id'].str.split('-').str[0]
# Attach donor and age group from the metadata table, matched on 'index'
tcr_df = tcr_df.merge(latest_meta[['index', 'donor', col_age_group]], on = 'index', how = 'left')
tcr_df.head()

#### Beta chain analysis

In [None]:
# Column layout expected by cell2tcr; tcrdist_columns holds the same names as a separate list
cell2tcr_columns = [
    'individual',
    'IR_VDJ_1_junction_aa', 'IR_VDJ_1_v_call', 'IR_VDJ_1_j_call',
    'IR_VJ_1_junction_aa', 'IR_VJ_1_v_call', 'IR_VJ_1_j_call',
]
tcrdist_columns = list(cell2tcr_columns)

In [None]:
# Beta-chain contigs only, with columns renamed to the *_b_* naming used downstream
trb_source_cols = ['barcode','index','donor', 'umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3', 'productive']
trb_renames = {
    'donor': 'subject',
    'umi_count': 'count',
    'v_call': 'v_b_gene',
    'd_call': 'd_b_gene',
    'j_call': 'j_b_gene',
    'cdr3_aa': 'cdr3_b_aa',
    'cdr3': 'cdr3_b_nucseq',
}
is_trb = tcr_df['locus'] == 'TRB'
trb_df = tcr_df.loc[is_trb, trb_source_cols].rename(columns = trb_renames)
# Row labels are '<index>-<barcode>'
trb_df.index = trb_df['index'] + '-' + trb_df['barcode']
# The D gene is not carried forward
trb_df = trb_df.drop(columns = ['d_b_gene'])

trb_df.head()

Identify clonotypes

In [None]:
# Read allowed vdj genes tsv (human entries only)
vdj_genes = pd.read_csv(f'{data_path}/analyses/clonotypes/alphabeta_gammadelta_db.tsv', sep = '\t')
vdj_genes = vdj_genes.loc[vdj_genes['organism'] == 'human',:].copy()
# Compare ids as strings, as the other gene-filtering cells of this notebook do
vdj_genes['id'] = vdj_genes['id'].astype(str)

# Per gene column: resolve ambiguous calls, drop contigs without a call, then drop
# contigs whose gene is not among the reference ids. The excluded names are printed
# and collected (one list per column) in excluded_genes_list
excluded_genes_list = []
for gene in ['v_b_gene','j_b_gene']:

    # For genes with ambiguous assignments, take first assignment
    trb_df[gene] = trb_df[gene].str.split(',').str[0]
    # Contigs without a call cannot be validated against the reference; a NaN among the
    # gene-name strings would also make np.setdiff1d fail when it sorts the unique values
    trb_df = trb_df.dropna(subset=[gene])

    # Gene names present in the data but absent from the reference table
    excluded_genes = np.setdiff1d(trb_df[gene].unique(), vdj_genes['id']).tolist()
    print(f'Excluded for {gene}: {excluded_genes}')
    excluded_genes_list.append(excluded_genes)

    trb_df = trb_df.loc[~trb_df[gene].isin(excluded_genes),:]

In [None]:
# The export is commented out; the table is reloaded from the previously saved file,
# which replaces the trb_df built in the cells above
#trb_df.to_csv(f'{data_path}/analyses/clonotypes/thyAgeing_TRB_clonotypes.csv')
trb_df = pd.read_csv(f'{data_path}/analyses/clonotypes/thyAgeing_TRB_clonotypes.csv', index_col = 0)

In [None]:
# Run TRB clonotypes analysis: submit scripts/trb_clonotypes/run_motifs.sh with bsub
# (presumably the job writes the clone table read in the next cell — wait for it to finish)
!bsub < /nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment/scripts/trb_clonotypes/run_motifs.sh 

In [None]:
# Load the TRB clone table and annotate every clone with its donor's age group and sex
donor_meta = latest_meta[['donor', col_age_group, 'sex']].drop_duplicates()
trb_clones_csv = f'{data_path}/analyses/clonotypes/thyAgeing_trb_clones.csv'
trb_clones = pd.read_csv(trb_clones_csv, index_col=0).merge(
    donor_meta, how='left', left_on = 'subject', right_on = 'donor',
)
trb_clones.head()

Number of clones:

In [None]:
# Plot boxplot of trb_clones with specified parameters
# Rows are sorted by age group first, presumably so that donors appear grouped by age
# along the x axis
trb_clones = trb_clones.sort_values(by=col_age_group, key=lambda x: pd.Categorical(x, categories=col_age_group_levels, ordered=True))
plt.figure(figsize=calc_figsize(width=110, height=60))
sns.boxplot(data=trb_clones, x='donor', y='count', hue='age_group',
            hue_order = col_age_group_levels, 
            palette=get_tint_palette(thyAgeing_colors['magenta']),
            showfliers=False)
plt.xlabel('Donor')
# NOTE(review): y is the per-clone 'count' column (labelled 'clone size' in the summary
# plots of the next cell) while this label reads 'N(clones)' — confirm the label
plt.ylabel('N(clones)')
plt.legend(title='Age Group')
plt.xticks(rotation=90)
plt.tight_layout()
plt.yscale('log')
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_trb_nClonesByDonor_boxplot.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Per-donor mean and median of the clone 'count', then one age-group boxplot per statistic
df = trb_clones.groupby(['donor', col_age_group], observed=True)['count'].agg(['mean', 'median']).reset_index()
df['dummyVar'] = 'dummy'  # single x category: only the hue (age group) separates the boxes
for stat in ('mean', 'median'):
    out_stem = f'thyAgeing_trb_cloneSizeByDonor_{stat}'
    plot_grouped_boxplot(
        data = df, x = 'dummyVar', y = stat, hue = col_age_group,
        order = ['dummy'], hue_order = col_age_group_levels,
        x_label = '', y_label = f'{stat.capitalize()} clone size', legend_title = 'Age group',
        add_stats = True, format_percent = False,
        figsize = calc_figsize(width = 30, height = 40),
        save_stats = f'{data_path}/analyses/vdj/clonotypes/{out_stem}',
        palette = get_tint_palette(thyAgeing_colors['magenta']),
    )
    plt.savefig(f'{plots_path}/vdj/clonotypes/{out_stem}.pdf', bbox_inches='tight', dpi=300)

Number of neighbors:

In [None]:
# Plot boxplot of the per-clone n_neighbors values for every donor (log-scaled y axis)
# Rows are sorted by age group first, presumably so that donors appear grouped by age
# along the x axis
trb_clones = trb_clones.sort_values(by=col_age_group, key=lambda x: pd.Categorical(x, categories=col_age_group_levels, ordered=True))
plt.figure(figsize=calc_figsize(width=110, height=60))
sns.boxplot(data=trb_clones, x='donor', y='n_neighbors', hue='age_group',
            hue_order = col_age_group_levels, 
            palette=get_tint_palette(thyAgeing_colors['magenta']),
            showfliers=False)
plt.xlabel('Donor')
plt.ylabel('N(neighbours)')
plt.legend(title='Age Group')
plt.xticks(rotation=90)
plt.tight_layout()
plt.yscale('log')
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_trb_nNeighboursByDonor_boxplot.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Per-donor mean and median of n_neighbors, then one age-group boxplot per statistic
df = trb_clones.groupby(['donor', col_age_group], observed=True)['n_neighbors'].agg(['mean', 'median']).reset_index()
df['dummyVar'] = 'dummy'  # single x category: only the hue (age group) separates the boxes
for stat in ('mean', 'median'):
    out_stem = f'thyAgeing_trb_nNeighboursByDonor_{stat}'
    plot_grouped_boxplot(
        data = df, x = 'dummyVar', y = stat, hue = col_age_group,
        order = ['dummy'], hue_order = col_age_group_levels,
        x_label = '', y_label = f'{stat.capitalize()} N(neighbours)', legend_title = 'Age group',
        add_stats = True, format_percent = False,
        figsize = calc_figsize(width = 30, height = 40),
        save_stats = f'{data_path}/analyses/vdj/clonotypes/{out_stem}',
        palette = get_tint_palette(thyAgeing_colors['magenta']),
    )
    plt.savefig(f'{plots_path}/vdj/clonotypes/{out_stem}.pdf', bbox_inches='tight', dpi=300)

Public clones:

In [None]:
# Per-donor proportion of public clones (qpublic == True), drawn as one bar per donor
# NOTE(review): a donor without any public clone has no True row and therefore drops
# out of df here instead of being shown as 0 — confirm this is intended
df = trb_clones.groupby(['donor', col_age_group], observed=True)['qpublic'].value_counts(normalize=True).to_frame('freq').reset_index()
df = df.loc[df['qpublic'] == True,:] 
# Order rows by age group before plotting
df = df.sort_values(by=col_age_group, key=lambda x: pd.Categorical(x, categories=col_age_group_levels, ordered=True))
plt.figure(figsize=calc_figsize(width=110, height=60))
sns.barplot(data=df, x='donor', y='freq', hue='age_group',
            hue_order = col_age_group_levels, 
            palette=get_tint_palette(thyAgeing_colors['magenta']))
plt.xlabel('Donor')
plt.ylabel('Proportion of public clones')
plt.legend(title='Age Group')
plt.xticks(rotation=90)
# Show the y ticks as whole percentages
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))
plt.tight_layout()
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_trb_publicClonesByDonor_barplot.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Boxplot of the per-donor public-clone proportions by age group (reuses df from the
# previous cell); qpublic is cast to str so it can serve as the single x category 'True'
df['qpublic'] = df['qpublic'].astype(str)
plot_grouped_boxplot(data = df, x = 'qpublic', y = 'freq', hue = col_age_group, order = ['True'], hue_order = col_age_group_levels, 
                     x_label = '', y_label = 'Prop(Public TRB)', legend_title = 'Age group', add_stats = True, format_percent = True, 
                     figsize = calc_figsize(width = 30, height = 40),
                     save_stats = f'{data_path}/analyses/vdj/clonotypes/thyAgeing_trb_propPublicClonesByDonor', 
                     palette = get_tint_palette(thyAgeing_colors['magenta']),
                     )
plt.savefig(f'{plots_path}/vdj/clonotypes/thyAgeing_trb_propPublicClonesByDonor_boxplot.pdf', bbox_inches='tight', dpi=300)

In [None]:
# object_version = 'v8_2024-11-07'
# adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr')

# leiden_clus = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv', index_col=0)
# if any(leiden_clus.columns.isin(adata.obs.columns)):
#     adata.obs.drop(leiden_clus.columns, axis = 1, inplace = True)
# adata.obs = adata.obs.join(leiden_clus)
# adata.obs[leiden_clus.columns] = adata.obs[leiden_clus.columns].astype('category')

# # Update metadata
# latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
# latest_meta = pd.read_excel(latest_meta_path)
# update_obs(adata, latest_meta, on = 'index', ignore_warning = True)
# # Add vdj data
# meta_tcr = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_tcrab.csv')
# adata.obs = adata.obs.join(meta_tcr.set_index('names'))

In [None]:
# Preview the main VDJ gene calls stored in adata.obs, skipping rows whose J call is
# the placeholder 'No_contig'
df = adata.obs[['v_call_VDJ_main', 'j_call_VDJ_main', 'productive_abT_VDJ', 'donor']].copy()
df.loc[df['j_call_VDJ_main'] != 'No_contig'].head()

In [None]:
# Preview the TRB contig table before clone statistics are merged onto it
trb_df.head()

In [None]:
# Attach clone-level statistics (n_neighbors, n_subject, qpublic) to the TRB contigs by
# matching on subject, V/J gene, CDR3 (aa + nt) and productivity. No 'how' is given, so
# this is pandas' default inner join: contigs without a matching clone are dropped
tcr_neighbors = trb_df.reset_index(names='name').merge(trb_clones[['subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'cdr3_b_nucseq', 'productive', 'n_neighbors', 'n_subject', 'qpublic']], on = ['subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'cdr3_b_nucseq', 'productive']).set_index('name')
# Keep one row per cell name: the one with the highest 'count'. reset_index() yields a
# 0..n-1 index, so the labels returned by idxmax are valid positions for iloc
tcr_neighbors = tcr_neighbors.iloc[tcr_neighbors.reset_index().groupby('name')['count'].idxmax().to_list()]

tcr_neighbors

In [None]:
# Attach the per-cell TRB clone statistics to the RNA object and show them on the UMAP.
neighbor_cols = ['n_neighbors', 'n_subject', 'qpublic']
# Drop columns left over from a previous run of this cell; otherwise the join below
# fails on overlapping column names when the cell is re-executed
adata.obs = adata.obs.drop(columns = [c for c in neighbor_cols if c in adata.obs.columns])
adata.obs = adata.obs.join(tcr_neighbors[neighbor_cols])
# Vectorised natural log; cells without an entry in tcr_neighbors stay NaN
adata.obs['log_n_neighbors'] = np.log(adata.obs['n_neighbors'])

sc.pl.umap(adata, color = ['log_n_neighbors', 'n_subject', 'qpublic'], ncols = 3, cmap = 'Reds')

In [None]:
# Sanity check: number of duplicated cell names in tcr_neighbors (duplicates would
# multiply rows in the join onto adata.obs)
tcr_neighbors.index.duplicated().sum()

#### Paired chain analysis

In [None]:
# Identify cells with paired productive chains
tcrab_df = tcr_df.loc[tcr_df['productive'] == 'T',:].copy()
# A barcode counts as paired when its productive contigs cover exactly two distinct loci
# NOTE(review): grouping uses 'barcode' alone, whereas the TRB table is keyed by
# index + '-' + barcode — confirm barcodes cannot collide between samples
tcrab_barcodes = tcrab_df.groupby('barcode').agg(n_chains = ('locus', 'nunique'))
tcrab_barcodes = tcrab_barcodes.index[tcrab_barcodes['n_chains'] == 2].tolist()
tcrab_df = tcrab_df[tcrab_df['barcode'].isin(tcrab_barcodes)]

tcrab_df

In [None]:
# Keep most expressed chains for each cell
# (per barcode and locus, the contig with the highest umi_count)
tcrab_df = tcrab_df.loc[tcrab_df.groupby(['barcode', 'locus'])['umi_count'].idxmax()]
tcrab_df = tcrab_df[['barcode','donor', 'umi_count', 'locus', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3']]

# Pivot table: one row per (barcode, donor), one column per (value, locus) pair
tcrab_df = tcrab_df.pivot_table(index=['barcode', 'donor'], columns='locus', values=['umi_count', 'v_call', 'd_call', 'j_call', 'cdr3_aa', 'cdr3'], aggfunc = 'first')

# Rename relevant columns
# First flatten the two-level column names (e.g. ('cdr3_aa', 'TRB') -> 'cdr3_aa_TRB'),
# then map them onto the cell2tcr names
tcrab_df.columns = ['_'.join(col).strip() for col in tcrab_df.columns.values]
tcrab_df = tcrab_df.reset_index().rename(columns = {'donor': 'individual', 'cdr3_aa_TRB':'IR_VDJ_1_junction_aa', 'v_call_TRB':'IR_VDJ_1_v_call', 'j_call_TRB':'IR_VDJ_1_j_call', 'cdr3_aa_TRA':'IR_VJ_1_junction_aa', 'v_call_TRA':'IR_VJ_1_v_call', 'j_call_TRA':'IR_VJ_1_j_call'})

# Subset data frame to the cell2tcr columns, indexed by barcode
cell2tcr_cols = ['barcode','individual', 'IR_VDJ_1_junction_aa', 'IR_VDJ_1_v_call', 'IR_VDJ_1_j_call', 'IR_VJ_1_junction_aa', 'IR_VJ_1_v_call', 'IR_VJ_1_j_call']
tcrab_df = tcrab_df[cell2tcr_cols].set_index('barcode')

tcrab_df.head()

In [None]:
# Read tsv of reference V(D)J genes and keep the human entries
vdj_genes = pd.read_csv(f'{data_path}/analyses/clonotypes/alphabeta_gammadelta_db.tsv', sep = '\t')
vdj_genes = vdj_genes.loc[vdj_genes['organism'] == 'human',:]
vdj_genes['id'] = vdj_genes['id'].astype(str)

# For each V/J column: resolve ambiguous calls, drop cells without a call, then drop
# cells whose gene is not among the reference ids. The excluded names are printed and
# collected (one list per column) in excluded_genes_list
excluded_genes_list = []
for gene in ['IR_VDJ_1_v_call','IR_VDJ_1_j_call','IR_VJ_1_v_call','IR_VJ_1_j_call']:
    
    # For genes with ambiguous assignments, take first assignment
    tcrab_df[gene] = tcrab_df[gene].str.split(',').str[0]
    tcrab_df.dropna(subset=[gene], inplace = True)
    
    # Gene names present in the data but absent from the reference table
    excluded_genes = np.setdiff1d(tcrab_df[gene].unique(), vdj_genes['id']).tolist()
    print(f'Excluded for {gene}: {excluded_genes}')
    excluded_genes_list.append(excluded_genes)
    
    tcrab_df = tcrab_df.loc[~tcrab_df[gene].isin(excluded_genes),:]

In [None]:
# Save TCR data (the 'barcode' index is written as the first CSV column)
tcrab_df.to_csv(f'{data_path}/analyses/clonotypes/thyAgeing_tcrabPaired_cell2tcr.csv')

In [None]:
# Run TCR paired chain clonotype analysis: submit scripts/cell2tcr_clonotypes/run_motifs.sh
# with bsub (presumably the job writes the clone table read in the next cell — wait for it to finish)
!bsub < /nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment/scripts/cell2tcr_clonotypes/run_motifs.sh

In [None]:
# Load the paired-chain clone table
# NOTE(review): the file name says 'cell2target' while the input table was saved as
# '..._cell2tcr.csv' — confirm this is the file the clonotype job writes
tcrab_clones = pd.read_csv(f'{data_path}/analyses/clonotypes/thyAgeing_tcrabPaired_cell2target_clones.csv')
tcrab_clones.head()

In [None]:
# List the columns available in the clone table
tcrab_clones.columns

In [None]:
# Number of rows per motif within each individual; show the largest first
motif_sizes = tcrab_clones.groupby(['motif', 'individual']).size()
motif_count = motif_sizes.reset_index(name = 'count')
motif_count.sort_values('count', ascending = False).head()

In [None]:
# NOTE(review): this replaces the clone table loaded from disk with tcrab_df, which was
# subset to the cell2tcr columns earlier and has no 'motif' column, so the
# groupby('motif') below looks like it cannot work as written — confirm which table was intended
tcrab_clones = tcrab_df.copy()
# Add cell state and age group per cell ('barcode' is matched against the obs names)
motifs = tcrab_clones.merge(adata.obs[['taa_l3', 'age_group2']].reset_index(names = 'index'), left_on = 'barcode', right_on = 'index')
# Summarise each motif: number of cells and donors, and the cell states / age groups it spans
motifs = motifs.groupby('motif').agg(n_cells = ('barcode', 'nunique'),
                                  n_donors = ('individual', 'nunique'),
                                  cell_states = ('taa_l3', lambda x: ', '.join(x.unique())),
                                  age_groups = ('age_group2', lambda x: ', '.join(x.unique())))
# Keep motifs found in more than one cell
motifs = motifs.loc[motifs['n_cells'] > 1,:]

motifs.iloc[:50]

In [None]:
# Number of motifs found in more than one cell
(motifs['n_cells'] > 1).sum()

In [None]:
# Number of motifs shared by more than one donor
(motifs['n_donors'] > 1).sum()

Motifs are rarely shared between CD4 and CD8 populations

In [None]:
session_info.show()