# Thymus ageing | B cell compartment: Load IG data

In [None]:
import sys 
import os
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import hdf5plugin
import dandelion as ddl

import matplotlib.pyplot as plt
import seaborn as sns

# Add repo path to sys path (allows to access scripts and metadata from repo)
# Both entries are inserted right after position 0 so they take precedence over later sys.path entries.
repo_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Add R libs path (set before the rpy2 extension is loaded below)
os.environ['R_LIBS_USER'] = '/nfs/team205/lm25/condaEnvs/thymusAgeing/lib/R/library'

# Formatting: register the Arial font file and apply the project matplotlib style sheet
from matplotlib import font_manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
plt.style.use('/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

# Enable %%R cells (rpy2) and automatic reloading of edited modules
%load_ext rpy2.ipython
%reload_ext autoreload
%autoreload 2

In [None]:
%%capture output
%%R

# R session setup; the printed output of this cell is captured into `output` by %%capture
library(tidyverse)
library(patchwork)
library(magrittr)

# Custom plotting theme shared across projects
source('/nfs/team205/lm25/customScripts/visualisation/customTheme.R')

# Limit how much R prints
options(max.print=150)

In [None]:
# Project directories: plots, data and models live directly under the repo root
plots_path, data_path, model_path = (
    os.path.join(repo_path, subdir) for subdir in ('plots', 'data', 'models')
)
# Data shared with the general (non compartment-specific) analysis
general_data_path = '/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

for _label, _dir in (('Plot', plots_path), ('Data', data_path)):
    print(f'{_label} path: {_dir}')

In [None]:
from plotting.utils import plot_grouped_boxplot,calc_figsize,thyAgeing_colors,thyAgeing_greys
from utils import get_latest_version,update_obs,freq_by_donor

## Read in BCR data

In [None]:
# Read the most recent version of the sample metadata sheet
from utils import get_latest_version, update_obs

latest_meta_path = get_latest_version(
    dir = os.path.join(general_data_path, 'metadata'),
    file_prefix = 'Thymus_ageing_metadata',
)
meta = pd.read_excel(latest_meta_path)

In [None]:
# BCR: read the dandelion contig table of every BCR library listed in the metadata
bcr = []
no_bcr = {'missing_dandelion' : [],  # library folder exists but has no `dandelion` subfolder
          'missing_raw' : []}        # no folder for the library at all
bcr_dir = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_general/thymus_vdj/BCR'

# Skip empty entries and visit each library only once: a library that appears on
# several metadata rows would otherwise be read repeatedly and its contigs
# duplicated when the per-library objects are concatenated.
for lib in meta['library_bcr'].dropna().unique():
    bcr_path = f'{bcr_dir}/{lib}'

    if not os.path.isdir(bcr_path):
        no_bcr['missing_raw'].append(lib)
        print(f'No BCR path for {lib}')
        continue

    if not os.path.isdir(f'{bcr_path}/dandelion'):
        no_bcr['missing_dandelion'].append(lib)
        print(f'No BCR dandelion results folder for {lib}')
        continue

    vdj = ddl.read_10x_airr(f'{bcr_path}/dandelion/all_contig_dandelion.tsv')
    bcr.append(vdj)

In [None]:
# Number of BCR libraries
len(bcr)

In [None]:
import pprint

# Libraries that could not be loaded, grouped by reason
pprint.pprint(no_bcr) # Missing raw is due to wrong library type

In [None]:
# Concatenate all libraries into one object and write it to a file stamped with today's date
ddl.concat(bcr).write(f'{data_path}/objects/vdj/thymusAgeing_bcr_v2_{today}.h5ddl')

## Aligning BCR data with B cell adata

In [None]:
# Load data
object_version = 'v4_2024-11-06'
# NOTE(review): the path ends in `.zarr` but is read with `read_h5ad` - confirm the on-disk format
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')
# `vdj_version` is not used for this load; it names the outputs written in the cells below
vdj_version = 'v3_2025-02-19'
# The file name depends on `today`, so this only finds a file written by the concat cell on the same date
bcr = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_bcr_v2_{today}.h5ddl')

In [None]:
# Check whether there are overlapping barcodes
np.intersect1d(np.array(bcr.metadata.index), adata.obs_names)

In [None]:
# BCR: Align bcr and adata
# Non-productive contigs are kept (productive_only = False)
bcr_filtered, adata_filtered = ddl.pp.check_contigs(bcr, adata, productive_only = False, library_type = 'ig')

In [None]:
# Save new metadata
# Label-based column slice: inclusive of both ends and dependent on the column order of `obs`
adata_filtered.obs.loc[:, 'has_contig':'rearrangement_status_VJ'].to_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_bcr_{vdj_version}.csv')

In [None]:
# Save the aligned BCR object under the VDJ version name
bcr_filtered.write(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl')

In [None]:
# Number of BCRs per study
adata_filtered.obs.loc[adata_filtered.obs.has_contig != 'No_contig'].groupby('study').size()

## BCR properties

In [None]:
# Load VDJ data
vdj_version = 'v3_2025-02-19'
bcr = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl')

In [None]:
# Load adata
object_version = 'v4_2024-11-06'
# NOTE(review): the path ends in `.zarr` but is read with `read_h5ad` - confirm the on-disk format
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# The BCR metadata join and the contig filter below are disabled, so `adata` is used as loaded
# # Add BCR meta
# bcr_meta = pd.read_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_bcr_{vdj_version}.csv', index_col = 0)
# adata.obs = adata.obs.join(bcr_meta)

# adata = adata[~pd.isna(adata.obs.has_contig), :]

### Clonotype calling

In [None]:
# Read the most recent version of the sample metadata sheet
from utils import get_latest_version, update_obs

latest_meta_path = get_latest_version(
    dir = os.path.join(general_data_path, 'metadata'),
    file_prefix = 'Thymus_ageing_metadata',
)
latest_meta = pd.read_excel(latest_meta_path)

In [None]:
# Add donor info to BCR data
bcr.data = bcr.data.merge(latest_meta[['donor', 'library_bcr']], left_on = 'sample_id', right_on = 'library_bcr')
bcr.metadata = bcr.metadata.merge(latest_meta[['donor', 'library_bcr']], left_on = 'sample_id', right_on = 'library_bcr')
bcr.data['donor'].head()

In [None]:
# Check all seqs have donor info
bcr.data['donor'].isna().sum()

In [None]:
# Automatically determine threshold (computed per donor; diagnostic plot is saved to the plots folder)
threshold_kwargs = {
    'mode': 'single-cell',
    'fields': 'donor',
    'threshold_method': 'gmm',
    'threshold_model': 'gamma-gamma',
    'cutoff': 'optimal',
    # human Ig-specific single nucleotide model similar to a transition/transversion model
    'model': 'hh_s1f',
    'ncpu': 4,
    'save_plot': f'{plots_path}/preprocessing/vdj/thyAgeing_bSplit_bcr_changeoThreshold.png',
    'figsize': (20, 10),
}
ddl.pp.calculate_threshold(bcr, **threshold_kwargs)

In [None]:
# Define clones using changeo; clone IDs are stored under `changeo_clone_id`
clone_out_dir = f'{data_path}/analysis/vdj/clonotypes'
# Extra command-line arguments passed through (presumably group sequences by donor - confirm)
clone_extra_args = ['--gf', 'donor']
ddl.tl.define_clones(
    bcr,
    model = 'hh_s1f',
    doublets = 'count',
    key_added = "changeo_clone_id",
    additional_args = clone_extra_args,
    ncpu = 4,
    out_dir = clone_out_dir,
)

In [None]:
# Number of metadata rows per clone, largest clones first
bcr.metadata.groupby('changeo_clone_id').size().sort_values(ascending=False)

In [None]:
# Store clone sizes under `changeo_clone_size`
ddl.tl.clone_size(bcr, clone_key = 'changeo_clone_id', key_added = 'changeo_clone_size')

In [None]:
# Build the clone network, stored under the `changeo_network` key
ddl.tl.generate_network(bcr, clone_key = 'changeo_clone_id', network_key = 'changeo_network', num_cores = 4)

### Mutation quantification

In [None]:
# Check germline mask is present in data
bcr.data[["v_call_genotyped", "germline_alignment_d_mask"]]

In [None]:
# Quantify mutational load, replacement and silent mutations reported separately
ddl.pp.quantify_mutations(bcr, 
                          combine=False, # Returns silent and replacement mutations separately
                          split_locus = True # Returns results for each locus separately
                          )

# Quantify mutational load, replacement and silent mutations combined
# (this call used to be issued twice with identical arguments; once is enough)
ddl.pp.quantify_mutations(bcr, 
                          combine=True, # Returns silent and replacement mutations combined
                          split_locus = True # Returns results for each locus separately
                          )



### Clonal diversity

In [None]:
# Rarefaction curves per donor, saved to the plots folder
# NOTE(review): this uses `adata`, but `ddl.tl.transfer(adata, bcr)` only runs further down and the
# BCR metadata join in the loading cell is commented out - confirm `adata.obs` already contains
# `changeo_clone_id` at this point, or pass `bcr` instead.
ddl.pl.clone_rarefaction(adata, color="donor", clone_key="changeo_clone_id", figsize = (10, 5),
                         save = f'{plots_path}/preprocessing/vdj/thyAgeing_bSplit_bcr_cloneRarefaction.png'
                         )

In [None]:
# Gini index of clone centrality, computed per donor
ddl.tl.clone_diversity(
    bcr, groupby="donor", method="gini", metric="clone_centrality", clone_key="changeo_clone_id",
)

### Transfer to adata and save

In [None]:
# Copy the BCR annotations from the dandelion object onto adata
ddl.tl.transfer(adata, bcr)

In [None]:
# Save updated BCR data
# Same path the object was loaded from at the start of this section, so that file is overwritten
bcr.write(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl', compression="gzip")

In [None]:
# Save updated metadata
# Label-based column slice (inclusive, depends on column order). The file name matches the CSV written
# in the alignment section for the same object/VDJ versions, so that file is replaced.
adata.obs.loc[:, 'changeo_clone_id':'clone_centrality_gini'].to_csv(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}_bcr_{vdj_version}.csv')

In [None]:
# Cell counts per isotype / isotype status combination
adata.obs[['isotype', 'isotype_status']].value_counts()

In [None]:
from scanpy.plotting.palettes import default_28, default_102

# Clone network coloured by donor, clone size and per-locus mutation counts
network_colors = ["donor", "changeo_clone_size", "mu_count_IGH", "mu_count_IGL"]
# Both scanpy default palettes concatenated into one long list of colours
network_palette = default_28 + default_102

sc.set_figure_params(figsize=[4, 4])
ddl.pl.clone_network(
    adata,
    color=network_colors,
    ncols=2,
    legend_loc="none",
    legend_fontoutline=3,
    edges_width=1,
    palette=network_palette,
    color_map="viridis",
    size=20,
)

In [None]:
# Clone network coloured by locus, chain and isotype status
status_columns = ["locus_status", "chain_status", 'isotype_status']

sc.set_figure_params(figsize=[4, 4.5])
ddl.pl.clone_network(
    adata,
    color=status_columns,
    ncols=2,
    legend_fontoutline=3,
    edges_width=1,
    size=20,
    wspace = 0.5,
)

### Somatic hypermutation analysis

In [None]:
# Load adata
object_version = 'v5_2025-04-16'
adata = ad.read_h5ad(f'{data_path}/objects/rna/thyAgeing_bSplit_scvi_{object_version}.zarr')

# Add new annotations to adata: columns already present in obs are replaced by the curated versions
ct_anno = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v4_2025-02-04_curatedAnno_v10.csv', index_col = 0)
stale_columns = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs.drop(columns = stale_columns, inplace = True)
adata.obs = adata.obs.join(ct_anno, how = 'left')

# Keep cells that are flagged 'include' OR pass QC
# NOTE(review): the two conditions are combined with `|`, not `&` - confirm this is intended
keep_cells = (adata.obs['anno_status'] == 'include') | (adata.obs['qc_status'] == 'PASS')
adata = adata[keep_cells, :]

# Update metadata
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Load the BCR object written by the clonotype-calling section
vdj_version = 'v3_2025-02-19'
bcr = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_bcrFiltered_{vdj_version}.h5ddl')

In [None]:
# Specify cell type columns
from anno_levels import get_ct_levels,age_group_levels
col_cell_type_broad = 'taa_l4'
col_cell_type_fine = 'taa_l5'
col_cell_type_broad_levels = get_ct_levels(level = col_cell_type_broad, taa_l1 = 'B')
col_cell_type_fine_levels = get_ct_levels(level = col_cell_type_fine, taa_l1 = 'B')
col_age_group = 'age_group'
col_age_group_levels = eval(f'{col_age_group}_levels')

%R -i plots_path,col_cell_type_broad,col_cell_type_fine,col_cell_type_broad_levels,col_cell_type_fine_levels,col_age_group,col_age_group_levels,data_path

In [None]:
# Only include annotated and non-developing B cells
# ('taa_l5' is the same column that `col_cell_type_fine` is set to in the previous cell)
adata = adata[~adata.obs['taa_l5'].isin(['B_GC-like-locnt','B_dev_thy','B_dev'])]
# Keep only contigs whose cell is still in adata
# NOTE(review): `bcr.metadata` is passed on unfiltered - confirm this is intended
bcr_subset = ddl.Dandelion(bcr.data[bcr.data['cell_id'].isin(adata.obs_names)], bcr.metadata)

# Add age group and fine cell type columns (matched on cell_id) and write to airr
bcr_subset.data = bcr_subset.data.merge(adata.obs[[col_age_group, col_cell_type_fine]], left_on = 'cell_id',right_index = True, how = 'left')
bcr_subset.write_airr(filename=f'{data_path}/analyses/vdj/hypermutation/thymusAgeing_bcrFiltered_{vdj_version}_airr.tsv')

Perform somatic hypermutation analysis using the Immcantation framework; see:
- https://immcantation.readthedocs.io/en/stable/getting_started/10x_tutorial.html
- https://dowser.readthedocs.io/en/latest/vignettes/Resolve-Light-Chains-Vignette/

In [None]:
%%R -i data_path,vdj_version

# Load bcr data (the AIRR table written by the preceding Python cell)
bcr = read_tsv(file.path(data_path, paste0('/analyses/vdj/hypermutation/thymusAgeing_bcrFiltered_', vdj_version, '_airr.tsv')))

# Keep only rows that have a cell type and no stop codon
# (`col_cell_type_fine` holds the column name and was passed into R by an earlier `%R -i` line)
bcr = bcr %>% filter(!is.na(!!rlang::sym(col_cell_type_fine)),
                      !stop_codon)

In [None]:
%%R

# List available columns
bcr %>% colnames()

In [None]:
%%R

# Remove cells with multiple heavy chains
# Count IGH rows per cell and collect the cells that have more than one
multi_heavy <- table(dplyr::filter(bcr, locus == "IGH")$cell_id)
multi_heavy_cells <- names(multi_heavy)[multi_heavy > 1]

print(paste('Removing', length(multi_heavy_cells), 'cells with multiple heavy chains'))

bcr = bcr %>%
dplyr::filter(!cell_id %in% multi_heavy_cells)

# Number of rows left
nrow(bcr)

In [None]:
%%R

# Remove cells without any heavy chain
# `light_cells` has one entry per light-chain row, so the count printed below is per row, not per unique cell
heavy_cells <- dplyr::filter(bcr, locus == "IGH")$cell_id
light_cells <- dplyr::filter(bcr, locus == "IGK" | locus == "IGL")$cell_id
no_heavy_cells <- light_cells[which(!light_cells %in% heavy_cells)]

print(paste('Removing', length(no_heavy_cells), 'cells without heavy chain'))

bcr = bcr %>%
dplyr::filter(!cell_id %in% no_heavy_cells)

# Number of rows left
nrow(bcr)

In [None]:
%%R

# Number and proportion of rows per donor
bcr %>%
rstatix::freq_table(donor)

In [None]:
%%R
# calculate and plot the rank-abundance curve
abund <- alakazam::estimateAbundance(bcr, clone = 'changeo_clone_id',group = "donor")

abund_plot <- alakazam::plot(abund, silent=T)
abund_plot
# No `plot` argument is given, so ggsave falls back to the last plot
ggsave(file.path(plots_path, 'vdjAnalysis/clonotypes/thyAgeing_bSplit_bcr_rankAbundance.png'), width = 10, height = 5, dpi = 300)

In [None]:
%%R

# Disabled: calculate and plot the alpha diversity curve per donor
# div <- alakazam::alphaDiversity(data.frame(abund@abundance), clone = 'changeo_clone_id',group = "donor", nboot = 100)
# plot(div, silent=T) + facet_wrap("donor", ncol = 3)
# ggsave(file.path(plots_path, 'vdjAnalysis/clonotypes/thyAgeing_bSplit_bcr_diversity.png'), width = 10, height = 5, dpi = 300)

In [None]:
%%R

# Remove sequences assigned to no clones
bcr <- dplyr::filter(bcr, !is.na(changeo_clone_id))

# Resolve light chains within each heavy-chain clone
# (the `clone_subgroup_id` column used in the next cell is presumably added here - confirm)
bcr <- dowser::resolveLightChains(bcr,
                                  nproc = 4,
                                  clone = "changeo_clone_id",
                                  cell = "cell_id",
)

In [None]:
%%R
# Remove sequences assigned to no clones
# (same filter as in the previous cell; presumably repeated so this cell can be re-run on its own)
bcr <- dplyr::filter(bcr, !is.na(changeo_clone_id))

# Read in IMGT reference sequences
references <- dowser::readIMGT(dir = "/nfs/team205/lm25/thymus_projects/thymus_ageing_atlas/B_compartment/data/references/immcantation_imgt/human/vdj")

# Reconstruct germlines
results <- dowser::createGermlines(bcr, references, 
                                   clone = 'clone_subgroup_id', # clone_subgroup_id for heavy and light paired analysis, changeo_clone_id for heavy only
                                   fields = c("donor"), 
                                   nproc = 1)

In [None]:
%%R

# Format clones
# Donor is carried along as a trait and the fine cell type column as a text field;
# only clones with at least 3 sequences are kept (minseq = 3)
clones <- dowser::formatClones(results,
                               clone = 'changeo_clone_id',
                               traits = c('donor'),
                               text_fields = c(col_cell_type_fine),
                               cell = 'cell_id',
                               chain="HL", # HL for paired analysis
                               nproc=1, 
                               collapse = FALSE, 
                               split_light = TRUE, 
                               minseq = 3,
                               )

head(clones)

In [None]:
%%R 

# Save clones
clones %>% saveRDS(file.path(data_path, paste0('analyses/vdj/hypermutation/thymusAgeing_bcrFiltered_', vdj_version,'_shmClones.rds')))

In [None]:
%%R

# Point igphyml to its motif files, then build lineage trees with the igphyml executable
Sys.setenv(IGPHYML_PATH = "/nfs/team205/lm25/condaEnvs/thymusAgeing/share/igphyml/motifs")

trees <- dowser::getTrees(clones, nproc = 8, build="igphyml", exec="/nfs/team205/lm25/condaEnvs/thymusAgeing/bin/igphyml", collapse = TRUE)

trees

In [None]:
%%R

# Inspect the trees table
trees 

In [None]:
%%R

# Add donor info and save
# `clone_id` of the trees is matched against `clone_subgroup_id` of the sequence table
trees <- trees %>%
dplyr::left_join(bcr %>% dplyr::distinct(donor, clone_subgroup_id), by = c('clone_id' = 'clone_subgroup_id')) 

# Save trees
trees %>% saveRDS(file.path(data_path, paste0('analyses/vdj/hypermutation/thymusAgeing_bcrFiltered_', vdj_version,'_shmTrees.rds')))

In [None]:
%%R

# Plot every tree with tips labelled by the `taa_l5` column and write them to one PDF, 2 x 2 per page
tree_plots <- dowser::plotTrees(trees, tips = "taa_l5", tipsize = 2)

dowser::treesToPDF(tree_plots, file.path(plots_path, 'vdjAnalysis', 'clonotypes', 'thyAgeing_bSplit_bLineageTrees.pdf'), nrow = 2, ncol = 2)

In [None]:
%%R -o trees_df

# Table with clone ID, donor and `seqs` per tree, exported to Python as `trees_df`
trees_df <- trees %>% dplyr::select(clone_id, donor, seqs)

In [None]:
trees_df = trees_df.merge(adata.obs[['donor', col_age_group, 'age_months']].drop_duplicates(), on = 'donor')

trees_df.head()

In [None]:
trees_df.to_csv(f'{data_path}/analyses/vdj/hypermutation/thymusAgeing_bcrFiltered_{vdj_version}_shmTrees.csv', index = False)

In [None]:
# Number of SHM trees per donor (one row of `trees_df` per tree)
df = trees_df.groupby(['donor', col_age_group], observed=True).size().reset_index(name='count')
# Donors ordered by age, used as the x-axis order in the plot below
df_order = trees_df.drop_duplicates(subset = ['donor', 'age_months']).sort_values('age_months')['donor']
df[col_age_group] = pd.Categorical(df[col_age_group], categories=['infant', 'paed', 'adult'], ordered=True)
# remove_unused_categories() returns a new Series, so the result has to be assigned back
df[col_age_group] = df[col_age_group].cat.remove_unused_categories()
df

In [None]:
from plotting.utils import calc_figsize,get_tint_palette,thyAgeing_colors
from anno_levels import age_group_palette

# Bar plot: number of SHM trees per donor, donors ordered by age and coloured by age group
plt.figure(figsize=calc_figsize(width = 50, height = 35))
sns.barplot(data = df, x = 'donor', y = 'count', hue = 'age_group', order = df_order, palette = get_tint_palette(thyAgeing_colors['magenta']),
            linewidth = 0)
plt.legend(title='Age group', bbox_to_anchor=(1.2, 1), loc='upper center')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Donor')
plt.ylabel('N(SHM trees)')
sns.despine(offset=2, trim = True)
# `+ 1` makes the stop inclusive, so the top tick is kept when the maximum is a multiple of the step
count_ticks = np.arange(0, df['count'].max() + 1, 5)
plt.yticks(ticks=count_ticks, labels=count_ticks)
plt.savefig(f'{plots_path}/vdjAnalysis/clonotypes/thyAgeing_bSplit_bLineageTrees_count.pdf')

In [None]:
# Total `seqs` over all SHM trees of each donor
df = trees_df.groupby(['donor', col_age_group], observed=True).agg({'seqs' : 'sum'}).reset_index()
# Donors ordered by age, used as the x-axis order in the plot below
df_order = trees_df.drop_duplicates(subset = ['donor', 'age_months']).sort_values('age_months')['donor']
df[col_age_group] = pd.Categorical(df[col_age_group], categories=['infant', 'paed', 'adult'], ordered=True)
# remove_unused_categories() returns a new Series, so the result has to be assigned back
df[col_age_group] = df[col_age_group].cat.remove_unused_categories()
df

In [None]:
from plotting.utils import calc_figsize,get_tint_palette,thyAgeing_colors
from anno_levels import age_group_palette

# Bar plot: number of sequences contained in SHM trees per donor, donors ordered by age
plt.figure(figsize=calc_figsize(width = 50, height = 35))
sns.barplot(data = df, x = 'donor', y = 'seqs', hue = 'age_group', order = df_order, palette = get_tint_palette(thyAgeing_colors['magenta']),
            linewidth = 0)
plt.legend(title='Age group', bbox_to_anchor=(1.2, 1), loc='upper center')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Donor')
# The y axis shows the summed `seqs` column, not the number of trees
plt.ylabel('N(sequences in SHM trees)')
sns.despine(offset=2, trim = True)
# `+ 1` makes the stop inclusive, so the top tick is kept when the maximum is a multiple of the step
seq_ticks = np.arange(0, df['seqs'].max() + 1, 10)
plt.yticks(ticks=seq_ticks, labels=seq_ticks)
plt.savefig(f'{plots_path}/vdjAnalysis/clonotypes/thyAgeing_bSplit_bLineageTreesCells_count.pdf')

In [None]:
%%R

# calculate switches along trees compared to 100 random permutations 
# this may take a while, and can be parallelized using nproc
# NOTE(review): the trait is 'donor', while clones were called per donor - confirm this is the intended
# trait (e.g. rather than the cell type column)
switches = dowser::findSwitches(trees, permutations=100, trait='donor', 
  igphyml="/nfs/team205/lm25/condaEnvs/thymusAgeing/bin/igphyml", fixtrees=TRUE)

# Test on the switch counts; print the summary means
ps = dowser::testPS(switches$switches)

print(ps$means)