# VDJ data integration

In [None]:
import os
import sys
import session_info
from datetime import datetime
# Date stamp (YYYY-MM-DD) that is interpolated into the names of files written by this notebook
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import dandelion as ddl
import hdf5plugin

# Add repo path to sys path (allows to access scripts and metadata from repo)
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Add R libs path
#os.environ['LD_LIBRARY_PATH'] = '' # Uncomment on jhub
#os.environ['R_HOME'] = '/nfs/team205/lm25/condaEnvs/thymusAgeing/lib/R' # Uncomment on jhub
# NOTE(review): the R library folder is derived from the parent directory of sys.path[0],
# so it depends on how the kernel populated sys.path -- confirm it resolves to the intended environment
os.environ['R_LIBS_USER'] = f'{os.path.split(sys.path[0])[0]}/R/library'

# Load the rpy2 extension (needed for the %%R cells below) and reload edited modules automatically
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

# Helper imported from a module on the paths inserted above
from utils import get_latest_version

In [None]:
%%capture output
%%R

# Attach the R packages used by the plotting cells of this notebook
library(tidyverse)
library(patchwork)
library(magrittr)

# Run the custom theme script (presumably defines theme_simple(), which the plots call -- TODO confirm)
source('/nfs/team205/lm25/customScripts/visualisation/customTheme.R')

# Cap the number of entries R prints
options(max.print=150)

In [None]:
# Plot, data and model folders all live directly under the repository folder
plot_path, data_path, model_path = (
    os.path.join(repo_path, subdir) for subdir in ('plots', 'data', 'models')
)
# Data shared with the general analysis is kept outside this repository folder
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

## Read in TCR data

In [None]:
# Helpers for locating the newest metadata file and applying it to adata
from utils import get_latest_version, update_obs

# Open the RNA object in read-only backed mode
object_version = 'v9_2025-03-28'
adata_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(adata_file, backed='r')

# Read the most recent metadata spreadsheet and pass it to update_obs
latest_meta_path = get_latest_version(dir=f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on='index', ignore_warning=True)

In [None]:
# Generate list of TCR libraries
tcr_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_general/thymus_vdj/TCR'

# Collect the abTCR and gdTCR library IDs listed in adata.obs once, so that the
# filter below is a set lookup instead of rebuilding two lists for every directory entry
expected_tcr_libs = set(adata.obs['library_tcr_ab'].unique().tolist()) | set(adata.obs['library_tcr_gd'].unique().tolist())

# Keep the entries of tcr_path whose name is one of those library IDs (os.listdir order is kept)
tcr_libs = [p for p in os.listdir(tcr_path) if p in expected_tcr_libs]

np.array(tcr_libs)

In [None]:
# Number of entries in tcr_libs
len(tcr_libs)

In [None]:
# Store the library names, one per line, in a date-stamped text file
tcr_libs_file = f'{data_path}/objects/vdj/thymusAgeing_tcrLibs_v6_{today}.txt'
with open(tcr_libs_file, 'w') as f:
    f.writelines("%s\n" % item for item in tcr_libs)

In [None]:
# Read the dandelion contig table of every library; libraries without a
# 'dandelion' folder are reported and skipped
tcr_ddl = []
for lib in tcr_libs:
    path = f'{tcr_path}/{lib}'
    has_dandelion = os.path.exists(path) and 'dandelion' in os.listdir(path)
    if not has_dandelion:
        print(f'No dandelion folder for {lib} in {path}')
        continue
    vdj = ddl.read_10x_airr(f'{path}/dandelion/all_contig_dandelion.tsv')
    tcr_ddl.append(vdj)
    # Also write a date-stamped copy of the object into the library's own folder
    vdj.write(f'{path}/{lib}_tcr_{today}.h5ddl')

In [None]:
import pickle

# Persist the list of per-library objects in a single date-stamped pickle
tcr_pickle_file = f'{data_path}/objects/vdj/thymusAgeing_tcrLibs_v6_{today}.pkl'
with open(tcr_pickle_file, 'wb') as pkl_file:
    pickle.dump(tcr_ddl, pkl_file)

## Aligning TCR data with T cell adata

In [None]:
import pickle

# Load adata (fully into memory this time, no backed mode)
object_version = 'v9_2025-03-28'
adata_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(adata_file)

# Load the pickle file
# NOTE(review): the file name carries a fixed date, whereas the cell that writes the
# pickle stamps it with `today` -- confirm this is the intended snapshot.
# pickle.load can execute arbitrary code, so only load files produced by this notebook.
tcr_pickle_file = f'{data_path}/objects/vdj/thymusAgeing_tcrLibs_v6_2025-04-03.pkl'
with open(tcr_pickle_file, 'rb') as pkl_file:
    tcr_ddl = pickle.load(pkl_file)

# Combine the per-library objects into one
tcr = ddl.concat(tcr_ddl, check_unique=False)

In [None]:
# Count the barcodes shared between the TCR metadata and the cells in adata
tcr_barcodes = np.array(tcr.metadata.index)
shared_barcodes = np.intersect1d(tcr_barcodes, adata.obs_names)
shared_barcodes.shape

In [None]:
# Display the concatenated TCR object
tcr

### abTCR

In [None]:
# abTCR: match the TCR contigs to the cells in adata; unproductive contigs are
# kept as well (productive_only=False)
tcr_ab, adata_ab = ddl.pp.check_contigs(
    tcr,
    adata,
    productive_only=False,
    library_type='tr-ab',
)

In [None]:
# Export the obs columns from 'has_contig' through 'rearrangement_status_VJ'
# (label-based slice, both ends included)
tcrab_obs = adata_ab.obs.loc[:, 'has_contig':'rearrangement_status_VJ']
tcrab_obs.to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_tcrab_v6.csv')

In [None]:
# Write the aligned abTCR object to a date-stamped file
tcrab_file = f'{data_path}/objects/vdj/thymusAgeing_tcrabFiltered_v6_{today}.h5ddl'
tcr_ab.write(tcrab_file)

In [None]:
# Number of cells with an abTCR contig, per study
ab_has_contig = adata_ab.obs['has_contig'] != 'No_contig'
adata_ab.obs.loc[ab_has_contig].groupby('study').size()

### gdTCR

In [None]:
# gdTCR: match the TCR contigs to the cells in adata; unproductive contigs are
# kept as well (productive_only=False)
tcr_gd, adata_gd = ddl.pp.check_contigs(
    tcr,
    adata,
    productive_only=False,
    library_type='tr-gd',
)

In [None]:
# Export the obs columns from 'has_contig' through 'rearrangement_status_VJ'
# (label-based slice, both ends included)
# NOTE(review): unlike the abTCR export ('..._tcrab_v6.csv') this file name has no
# '_v6' suffix -- confirm which naming is intended
tcrgd_obs = adata_gd.obs.loc[:, 'has_contig':'rearrangement_status_VJ']
tcrgd_obs.to_csv(f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_tcrgd.csv')

In [None]:
# Write the aligned gdTCR object to a date-stamped file
# NOTE(review): this name says 'v5' while the abTCR object is written as 'v6' -- confirm
tcrgd_file = f'{data_path}/objects/vdj/thymusAgeing_tcrgdFiltered_v5_{today}.h5ddl'
tcr_gd.write(tcrgd_file)

In [None]:
# Number of cells with a gdTCR contig, per study
gd_has_contig = adata_gd.obs['has_contig'] != 'No_contig'
adata_gd.obs.loc[gd_has_contig].groupby('study').size()

## Create summary of (un)productive TCRab and TCRgd

In [None]:
# Read the per-cell gdTCR and abTCR metadata tables (first CSV column becomes the index)
# NOTE(review): these are v5 files directly under objects/, while the exports earlier
# in this notebook go to objects/rna/ with the current object_version -- confirm
# these are the intended inputs
tcrgd_meta_file = f'{data_path}/objects/thyAgeing_tSplit_scvi_v5_2024-04-03_tcrgd.csv'
tcrab_meta_file = f'{data_path}/objects/thyAgeing_tSplit_scvi_v5_2024-04-03_tcrab.csv'
meta_tcrgd = pd.read_csv(tcrgd_meta_file, index_col=0)
meta_tcrab = pd.read_csv(tcrab_meta_file, index_col=0)

In [None]:
# Count the barcodes that have a contig in both the gdTCR and the abTCR table
gd_contig_barcodes = meta_tcrgd.loc[meta_tcrgd['has_contig'] == 'True'].index
ab_contig_barcodes = meta_tcrab.loc[meta_tcrab['has_contig'] == 'True'].index
np.intersect1d(gd_contig_barcodes, ab_contig_barcodes).shape

In [None]:
# There seem to be quite a few cells which have TCRgd and TCRab contigs
# Show the abTCR metadata of the cells that are flagged as having a gdTCR contig
gd_contig_mask = meta_tcrgd['has_contig'] == 'True'
meta_tcrab.loc[gd_contig_mask]

In [None]:
# Show the gdTCR metadata of the cells that are flagged as having a gdTCR contig
gd_contig_mask = meta_tcrgd['has_contig'] == 'True'
meta_tcrgd.loc[gd_contig_mask]

In [None]:
# Put the abTCR and gdTCR status columns side by side; the suffixes tell them apart
meta_cols = ['locus_status', 'chain_status']
tcrab_status = meta_tcrab[meta_cols]
tcrgd_status = meta_tcrgd[meta_cols]
meta_tcr = tcrab_status.join(tcrgd_status, lsuffix='_tcrab', rsuffix='_tcrgd')

In [None]:
# Save the combined status table (index included)
locus_status_file = f'{data_path}/objects/thyAgeing_tSplit_scvi_v5_2024-04-03_tcr_locusStatus.csv'
meta_tcr.to_csv(locus_status_file)

## Check availability/coverage for TCR data

### TCR matched TCR

In [None]:
# Load latest metadata
latest_meta = get_latest_version(f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
meta = pd.read_excel(latest_meta, index_col = False)

# Subset to ageing samples: healthy status and any age group other than fetal
is_healthy = meta['health_status'] == 'healthy'
not_fetal = meta['age_group2'] != 'fetal'
meta = meta.loc[is_healthy & not_fetal]
meta.head()

In [None]:
# Load TCR data: the full contig set is stored in five files (part1 ... part5)
# that are read one by one and then concatenated
tcr_parts = []
for i in range(1, 6):
    tcr_parts.append(ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_tcr_v4_2024-05-28_part{i}.h5ddl'))
tcr = ddl.concat(tcr_parts, check_unique=False)

# abTCR object saved after alignment with the RNA data
tcr_ab = ddl.read_h5ddl(f'{data_path}/objects/vdj/thymusAgeing_tcrabFiltered_v4_2024-05-28.h5ddl')

In [None]:
# Add donor info to TCR metadata
donor_cols = ['library', 'library_tcr_ab', 'library_tcr_gd', 'donor', 'age_group2']

# Drop donor columns left over from a previous run of this cell; otherwise the merge
# would create '_x'/'_y' duplicates and the later groupby on these columns would fail
tcr_meta = tcr.metadata.drop(columns=donor_cols, errors='ignore')

# The merge key is the part of each metadata index entry before the first '-'
tcr_meta['index'] = [l.split('-')[0] for l in tcr_meta.index]

# Inner merge: rows whose key is missing from `meta` are dropped.
# The original row labels are carried through the merge as 'names' and restored as index.
tcr.metadata = (
    tcr_meta
    .reset_index(names='names')
    .merge(meta[['index'] + donor_cols], on='index')
    .set_index('names')
)

In [None]:
# Add donor info to abTCR metadata
donor_cols = ['library', 'library_tcr_ab', 'library_tcr_gd', 'donor', 'age_group2']

# Drop donor columns left over from a previous run of this cell; otherwise the merge
# would create '_x'/'_y' duplicates and the later groupby on these columns would fail
tcr_ab_meta = tcr_ab.metadata.drop(columns=donor_cols, errors='ignore')

# The merge key is the part of each metadata index entry before the first '-'
tcr_ab_meta['index'] = [l.split('-')[0] for l in tcr_ab_meta.index]

# Inner merge: rows whose key is missing from `meta` are dropped.
# The original row labels are carried through the merge as 'names' and restored as index.
tcr_ab.metadata = (
    tcr_ab_meta
    .reset_index(names='names')
    .merge(meta[['index'] + donor_cols], on='index')
    .set_index('names')
)

In [None]:
# Inspect number of sequenced and matched TCRs per age group, donor and library
group_cols = ['age_group2', 'donor', 'library']
tcr_sequenced = tcr.metadata.groupby(group_cols).size().to_frame('n_sequenced')
tcr_matched = tcr_ab.metadata.groupby(group_cols).size().to_frame('n_matched')

# Outer join keeps groups that occur in only one table; their missing count becomes 0
tcr_overview = tcr_sequenced.join(tcr_matched, how='outer').fillna(0)
tcr_overview = tcr_overview.assign(
    prop_matched=tcr_overview['n_matched'] / tcr_overview['n_sequenced']
).reset_index()

tcr_overview

In [None]:
%%R -i tcr_overview -h 200 -w 300 -u mm

# Bars per library for the sequenced count, matched count and matched proportion
# (one facet row each), with one facet column per donor.
# 'adult(late)' is part of the factor levels, as in the tcr_stats plots of this
# notebook, so that rows from that age group are not converted to NA.
tcr_overview %>%
dplyr::mutate(age_group2 = factor(age_group2, levels = c('infant', 'paed(early)', 'paed(mid)', 'paed(late)', 'adult(early)', 'adult(mid)', 'adult(late)'))) %>%
tidyr::pivot_longer(cols = c('n_sequenced', 'n_matched', 'prop_matched'), names_to = 'type', values_to = 'count') %>%
ggplot(aes(x = library, y = count, fill = age_group2)) +
geom_bar(stat = 'identity') +
facet_grid(type ~ donor, scales = 'free', space = 'free_x') +
ggsci::scale_fill_locuszoom() +
scale_y_continuous(expand = expansion(add = 0, mult = c(0,0.1))) +
theme_simple(facet = T) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
      strip.text.x = element_text(angle = 90, hjust = 0),
      legend.position = 'right')

In [None]:
%%R -i tcr_overview -h 200 -w 300 -u mm

# Boxplots per age group of the sequenced count, matched count and matched
# proportion (one facet each, free y axis).
# 'adult(late)' is part of the factor levels, as in the tcr_stats plots of this
# notebook, so that rows from that age group are not converted to NA.
tcr_overview %>%
dplyr::mutate(age_group2 = factor(age_group2, levels = c('infant', 'paed(early)', 'paed(mid)', 'paed(late)', 'adult(early)', 'adult(mid)', 'adult(late)'))) %>%
tidyr::pivot_longer(cols = c('n_sequenced', 'n_matched', 'prop_matched'), names_to = 'type', values_to = 'count') %>%
ggplot(aes(x = age_group2, y = count, fill = age_group2)) +
geom_boxplot(outlier.size = 0.5) +
ggforce::facet_col(~ type, scales = 'free_y', strip.position = 'right') +
ggsci::scale_fill_locuszoom(guide = 'none') +
scale_y_continuous(expand = expansion(add = 0, mult = c(0,0.1))) +
theme_simple(facet = T) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Flag the rows where fewer than 30% of the sequenced TCRs were matched, then save the table
low_match = tcr_overview['prop_matched'] < 0.3
tcr_overview['resend'] = low_match
tcr_stats_file = f'{data_path}/metadata/thyAgeing_tcrStats_v4_2024-05-28.csv'
tcr_overview.to_csv(tcr_stats_file, index=False)

### TCR matched GEX

In [None]:
# Load data
scvi_embed_version = 'v2_2024-05-10'
adata_file = f'{data_path}/objects/rna/thyAgeing_tSplitPbmc_scvi_{scvi_embed_version}.zarr'
adata = ad.read_h5ad(adata_file)

# Add knn predictions to adata (original HTSA reference does not have uncertainties)
ct_labels = pd.read_csv(f'{data_path}/objects/thyAgeing_tSplit_scvi_v5_2024-04-03_curatedAnnot_v3.csv')
ct_labels_by_name = ct_labels.set_index('names')
adata.obs = adata.obs.join(ct_labels_by_name)

# Add vdj data (abTCR metadata, joined on the 'names' column)
meta_tcr = pd.read_csv(f'{data_path}/objects/thyAgeing_tSplit_scvi_v5_2024-04-03_tcrab.csv')
meta_tcr_by_name = meta_tcr.set_index('names')
adata.obs = adata.obs.join(meta_tcr_by_name)

In [None]:
# Helpers for locating the newest metadata file and applying it to adata
from utils import get_latest_version, update_obs

# Read the most recent metadata spreadsheet
latest_meta_path = get_latest_version(dir=f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
meta = pd.read_excel(latest_meta_path)

# Remove the 'names' column from obs, then pass the metadata to update_obs
obs_without_names = adata.obs.drop(columns=['names'])
adata.obs = obs_without_names
update_obs(adata, meta, on='index', ignore_warning=True)

In [None]:
# Subset to only have cells from thymus ageing (not COVID) study
# (implemented by keeping the cells whose chain_status is not missing)
stats_cols = ['donor', 'sample', 'library', 'library_tcr_ab', 'age_group2', 'sex', 'chain_status']
has_chain_status = ~pd.isna(adata.obs['chain_status'])
tcr_stats = adata[has_chain_status].obs.copy()[stats_cols]
# A cell counts as having a TCR unless its chain_status is 'No_contig'
tcr_stats['has_tcr'] = [status != 'No_contig' for status in tcr_stats['chain_status']]

# Fraction of cells per group with TCR
per_sample = tcr_stats.groupby(['donor', 'age_group2', 'sample'], observed=True)
tcr_stats = per_sample.agg(n_cells=('has_tcr', 'size'), n_tcr=('has_tcr', 'sum')).reset_index()
tcr_stats['prop_tcr'] = tcr_stats['n_tcr'] / tcr_stats['n_cells']

tcr_stats.head()

In [None]:
%%R -i tcr_stats -h 200 -w 300 -u mm

# Boxplot of the per-sample proportion of cells with a TCR (prop_tcr) by age group.
# Samples without any TCR (n_tcr == 0) are excluded before plotting.
tcr_stats %>%
dplyr::mutate(age_group2 = factor(age_group2, levels = c('infant', 'paed(early)', 'paed(mid)', 'paed(late)', 'adult(early)', 'adult(mid)', 'adult(late)'))) %>%
dplyr::filter(n_tcr > 0) %>%
ggplot(aes(x = age_group2, y = prop_tcr, fill = age_group2)) +
geom_boxplot(outlier.size = 0.5) +
ggsci::scale_fill_locuszoom() +
scale_y_continuous(expand = expansion(add = 0, mult = c(0,0.1))) +
theme_simple() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) 


In [None]:
%%R -i tcr_stats

# Counts, per age group, of samples and of donors with / without any TCR.
# tcr_stats is passed in explicitly (-i) so that this cell does not rely on an
# earlier %%R cell having imported it into the R session.

# Sample-level: a sample has a TCR when n_tcr > 0
tcr_stats %>%
dplyr::mutate(has_tcr = ifelse(n_tcr > 0, T, F)) %>%
rstatix::freq_table(age_group2, has_tcr) %>%
dplyr::select(age_group2, has_tcr, n, prop) %>%
dplyr::mutate(label = 'N(samples)') %>%
dplyr::bind_rows(
    # Donor-level: a donor has a TCR when any of its samples has n_tcr > 0
    tcr_stats %>%
    dplyr::group_by(donor) %>%
    dplyr::mutate(has_tcr = ifelse(any(n_tcr > 0), T, F)) %>%
    dplyr::ungroup() %>%
    dplyr::distinct(donor, age_group2, has_tcr) %>%
    rstatix::freq_table(age_group2, has_tcr) %>%
    dplyr::select(age_group2, has_tcr, n, prop) %>%
    dplyr::mutate(label = 'N(donors)')) %>%
dplyr::mutate(age_group2 = factor(age_group2, levels = c('infant', 'paed(early)', 'paed(mid)', 'paed(late)', 'adult(early)', 'adult(mid)', 'adult(late)'))) %>%
ggplot(aes(x = age_group2, y = n, fill = has_tcr)) +
geom_bar(stat = 'identity', position = position_dodge(width = 0.9)) +
geom_text(aes(label = n), position = position_dodge(width = 0.9), vjust = -0.5) +
ggforce::facet_col(~label, scales = 'free_y', strip.position = 'right') +
labs(y = 'N', x = 'Age group', fill = 'Has TCR') +
# Named colours keep FALSE grey and TRUE black even if only one of the two values occurs
scale_fill_manual(values = c('FALSE' = 'grey', 'TRUE' = 'black')) +
scale_y_continuous(expand = expansion(add = 0, mult = c(0,0.1))) +
theme_simple(facet = T) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))


### Generate spreadsheet with sample info for re-sequencing

In [None]:
# Load latest metadata
latest_meta = get_latest_version(f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
meta = pd.read_excel(latest_meta, index_col = False)

# Per-library TCR statistics saved earlier in this notebook
tcr_overview = pd.read_csv(f'{data_path}/metadata/thyAgeing_tcrStats_v4_2024-05-28.csv')

# Subset to ageing samples: healthy status and any age group other than fetal
is_healthy = meta['health_status'] == 'healthy'
not_fetal = meta['age_group2'] != 'fetal'
meta = meta.loc[is_healthy & not_fetal]
meta.head()

In [None]:
# Unique sample-level rows of the metadata, annotated with the TCR statistics;
# the left join keeps rows that have no matching statistics
sample_cols = ['sample', 'library', 'donor', 'age_group2', 'sort', 'type', 'chemistry_simple']
sample_info = meta[sample_cols].drop_duplicates()
to_seq = sample_info.merge(tcr_overview, on=['library', 'donor', 'age_group2'], how='left')

In [None]:
# Display the merged sample / TCR statistics table
to_seq

In [None]:
# Keep only the rows whose 'type' is 'cells'
is_cells = to_seq['type'] == 'cells'
to_seq = to_seq.loc[is_cells]

In [None]:
# Save the re-sequencing table (row index included)
reseq_file = f'{data_path}/metadata/thyAgeing_tcrReseq_v4_2024-05-28.csv'
to_seq.to_csv(reseq_file)

In [None]:
# Report session information via the session_info package
session_info.show()