# Thymus ageing atlas: T/NK cell annotations version 4

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import mudata as mu
import hdf5plugin

# Make the project code importable: the T/NK compartment repo and the shared
# General_analysis scripts directory are inserted into sys.path.
# Alternative (disabled): derive the repo path from the current working directory.
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

# Silence pandas' SettingWithCopyWarning for the whole session
pd.options.mode.chained_assignment = None  # default='warn'

%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Input/output locations used throughout the notebook.
# `general_data_path` points at the shared General_analysis data directory.
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'
model_path = os.path.join(repo_path, 'models')
plots_path = f'{repo_path}/plots/'
data_path = f'{repo_path}/data/'

# Echo the resolved directories
print(f'Dir for plots: {plots_path}')
print(f'Dir for data: {data_path}')

In [None]:
# Import custom scripts
from utils import get_latest_version,update_obs,freq_by_donor
from anno_levels import age_group_levels, taa_l4_t_levels, taa_l3_t_levels
from plotting import plot_grouped_boxplot

## Load data

In [None]:
# Load the scVI-integrated T-split object. Note the file carries a '.zarr'
# suffix but is read with the h5ad reader.
object_version = 'v8_2024-11-07'
# Shared prefix of every file derived from this object version, so that the
# annotation and TCR tables always match the loaded object.
object_prefix = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}'
adata = ad.read_h5ad(f'{object_prefix}.zarr')


def _decode_if_bytes(value):
    """Return `value` decoded as UTF-8 when it is bytes, otherwise unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Decode byte-string obs columns back to text. Only columns that actually
# contain bytes are touched: `Series.str.decode` applied to values that are
# already `str` yields NaN rather than raising, so decoding every object
# column unconditionally risks blanking genuine text columns.
for column in adata.obs.columns:
    values = adata.obs[column]
    if not pd.api.types.is_object_dtype(values):
        continue
    if values.map(lambda v: isinstance(v, bytes)).any():
        adata.obs[column] = values.map(_decode_if_bytes)

# Add curated cell-type annotations (v7 table) to obs, joined on the cell index
ct_anno = pd.read_csv(f'{object_prefix}_curatedAnno_v7.csv', index_col = 0)
adata.obs = adata.obs.join(ct_anno)

# Update metadata from the most recent 'Thymus_ageing_metadata' file
latest_meta_path = get_latest_version(dir = f'{general_data_path}/metadata', file_prefix='Thymus_ageing_metadata')
latest_meta = pd.read_excel(latest_meta_path)
update_obs(adata, latest_meta, on = 'index', ignore_warning = True)

# Add TCR data (path derived from `object_version` instead of a hard-coded copy)
tcrab = pd.read_csv(f'{object_prefix}_tcrab.csv', index_col = 0)
adata.obs = adata.obs.join(tcrab)

adata

In [None]:
# Remove non-T cells: keep cells whose 'taa_l5' label starts with 'T_'.
# `na=False` treats cells without a label as non-T, so the mask stays strictly
# boolean instead of containing NaN.
is_t_cell = adata.obs['taa_l5'].str.startswith('T_', na=False)

# Materialise the subset: later cells add obs columns, which should not be
# done on an AnnData view.
adata = adata[is_t_cell].copy()

adata.shape

In [None]:
# Record whether a cell has non-productive rearrangements: a cell is flagged
# when its 'productive_VJ' or 'productive_VDJ' entry contains the letter 'F'.
# Values are stringified first (missing entries become 'nan', which has no 'F').
# Vectorised string matching replaces a Python-level row-wise `apply` over the
# whole obs table, which is slow on atlas-sized objects.
productive_calls = adata.obs[['productive_VJ', 'productive_VDJ']].astype(str)
nonproductive_flag = (
    productive_calls['productive_VJ'].str.contains('F', regex=False)
    | productive_calls['productive_VDJ'].str.contains('F', regex=False)
)
adata.obs['has_nonproductive'] = nonproductive_flag.astype('category')

In [None]:
# Overview UMAPs: Leiden clusters, previous annotation, non-productive flag
# and the two age groupings, saved as a single multi-panel figure.
adata.obs[['leiden_r2.5']] = adata.obs[['leiden_r2.5']].astype('category')

overview_keys = ['leiden_r2.5', 'taa_l4','has_nonproductive', 'age_group', 'age_group2']
sc.pl.umap(
    adata,
    color = overview_keys,
    ncols = 2,
    wspace = 0.5,
    legend_fontsize=6,
    return_fig = True,
    show = False,
)
overview_png = f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplit_leidenAndPrevAnnot_umap.png'
plt.savefig(overview_png, dpi=300, bbox_inches='tight')

In [None]:
# samples_to_exclude = adata.obs.groupby('sample')['productive_VJ'].value_counts(normalize=True).to_frame(name='prop').reset_index()
# samples_to_exclude = samples_to_exclude.loc[samples_to_exclude['productive_VJ'] == 'No_contig']
# samples_to_exclude = samples_to_exclude.loc[samples_to_exclude['prop'] > 0.6]['sample'].tolist()

# adata = adata[~adata.obs['sample'].isin(samples_to_exclude)]

## Construct nhoods

In [None]:
import pertpy

# Milo workflow object and the MuData container wrapping `adata`
milo = pertpy.tl.Milo()
mdata = milo.load(adata)

# Build the kNN graph on the scVI latent space, then define neighbourhoods
# on the RNA modality
rna = mdata["rna"]
sc.pp.neighbors(rna, use_rep="X_scVI", n_neighbors=100)
milo.make_nhoods(rna, prop=0.1)

In [None]:
# Count cells per neighbourhood and donor
mdata = milo.count_nhoods(mdata, sample_col="donor")

# Binary age covariate for the DA test: any 'age_group2' label containing
# 'adult' becomes "adult", everything else "paed".
# Categories are ordered paed -> adult because, by default, the last category
# is taken as the condition of interest.
rna = mdata["rna"]
da_labels = pd.Categorical(
    ["adult" if 'adult' in group else "paed" for group in rna.obs["age_group2"]]
)
rna.obs["da_age_group"] = da_labels.reorder_categories(["paed", "adult"])

# Differential abundance testing, adjusting for sex
milo.da_nhoods(mdata, design="~sex+da_age_group")

# Build nhood graph
milo.build_nhood_graph(mdata)

In [None]:
# Display a summary of the MuData object after the Milo steps
mdata

In [None]:
# Inspect the unstructured annotations stored on the 'milo' modality
mdata['milo'].uns

In [None]:
# Identify samples which were not TCR sequenced: samples in which the
# 'No_contig' chain status accounts for every counted cell.
chain_status_by_sample = adata.obs.groupby('sample')['chain_status']
sample_freq = chain_status_by_sample.apply(lambda status: status.value_counts(normalize = True))
sample_freq = sample_freq.reset_index(name = 'prop')
sample_freq = sample_freq.rename(columns = {'level_1' : 'chain_status'})

only_no_contig = (sample_freq['chain_status'] == 'No_contig') & (sample_freq['prop'] == 1)
exclude_samples = sample_freq.loc[only_no_contig, 'sample'].unique()

np.array(exclude_samples)

In [None]:
# Calculate the fraction of cells with non-productive rearrangements in each
# neighbourhood, using only cells from samples that have TCR data.
tcr_rna = mdata['rna'][~mdata['rna'].obs['sample'].isin(exclude_samples)]  # subset once, reuse below
nhoods = tcr_rna.obsm['nhoods']  # cells x nhoods membership matrix
nhoods_ncells = np.asarray(nhoods.sum(axis = 0)).ravel()
has_nonproductive = tcr_rna.obs['has_nonproductive'].astype(int).to_numpy()

# Explicit matrix-vector product: number of flagged cells per neighbourhood.
# (`array * sparse` only means a matrix product for scipy's legacy matrix classes.)
n_nonproductive = np.asarray(nhoods.T @ has_nonproductive).ravel()

# `where=` without `out=` leaves the skipped entries uninitialised, so
# neighbourhoods with no remaining cells are explicitly set to NaN.
prop_nonproductive = np.divide(
    n_nonproductive,
    nhoods_ncells,
    out=np.full(nhoods_ncells.shape, np.nan, dtype=float),
    where=nhoods_ncells != 0,
)

# Histogram over neighbourhoods with a defined proportion
plt.hist(prop_nonproductive[np.isfinite(prop_nonproductive)], bins = 20)

In [None]:
# Store the per-neighbourhood proportion as a var column of the 'milo' modality
mdata['milo'].var['prop_nonproductive'] = prop_nonproductive   

In [None]:
# Check the column dtypes of the 'milo' var table before writing to disk
mdata['milo'].var.dtypes

In [None]:
# Cast object-dtype columns of RNA obs and Milo var to byte strings ('|S')
# before writing.
for frame in (mdata['rna'].obs, mdata['milo'].var):
    for col in frame.columns:
        if frame[col].dtypes == 'object':
            frame[col] = frame[col].astype('|S')

# Write the MuData object (zstd-compressed, level 5)
milo_out = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_milo.zarr'
mdata.write_h5mu(
    milo_out,
    compression=hdf5plugin.FILTERS["zstd"],
    compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
)

In [None]:
# Display the 'milo' var table, which now includes 'prop_nonproductive'
mdata['milo'].var

In [None]:
# Neighbourhood graph coloured by the non-productive fraction and by the
# age-associated logFC; point size scales with neighbourhood size.
nhood_adata = mdata["milo"].T
sc.pl.embedding(
    nhood_adata,
    "X_milo_graph",
    color=["prop_nonproductive", 'logFC'],
    cmap="magma",
    size=nhood_adata.obs["Nhood_size"]*0.5,
    neighbors_key="nhood",
    sort_order=False,
    frameon=False,
    return_fig = True,
    ncols=2,
    show=False,
)
nhood_graph_png = f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplitAllEmbed_miloNhoodGraph_prodAndAge_umap.png'
plt.savefig(nhood_graph_png, dpi=300, bbox_inches='tight')

In [None]:
# Calculate, for every cell, the mean non-productive proportion over the
# neighbourhoods the cell belongs to.
import scipy.sparse

nhoods = scipy.sparse.csr_matrix(mdata['rna'].obsm['nhoods'])  # cells x nhoods membership
prop_nonproductive = mdata['milo'].var['prop_nonproductive'].to_numpy(dtype=float)

# Neighbourhoods without a defined proportion (NaN) are left out of the mean
# so that one undefined neighbourhood does not turn a cell's value into NaN.
defined_nhoods = np.isfinite(prop_nonproductive)
nhoods_defined = nhoods[:, defined_nhoods]
nhoods_per_cell = np.asarray(nhoods_defined.sum(axis = 1)).ravel()
prop_sum_per_cell = np.asarray(nhoods_defined @ prop_nonproductive[defined_nhoods]).ravel()

# Cells that belong to no (defined) neighbourhood get NaN rather than a 0/0 warning
mdata['rna'].obs['nhood_prop_nonproductive'] = np.divide(
    prop_sum_per_cell,
    nhoods_per_cell,
    out=np.full(prop_sum_per_cell.shape, np.nan, dtype=float),
    where=nhoods_per_cell != 0,
)

In [None]:
# UMAP coloured by the per-cell mean neighbourhood non-productive proportion
sc.pl.umap(
    mdata['rna'],
    color = 'nhood_prop_nonproductive',
    cmap = 'magma',
    return_fig = True,
    show = False,
)
prop_umap_png = f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplit_allEmbed_propNonproductive_umap.png'
plt.savefig(prop_umap_png, dpi=300, bbox_inches='tight')

In [None]:
# Cast object-dtype columns of RNA obs and RNA var to byte strings ('|S')
# before writing.
for frame in (mdata['rna'].obs, mdata['rna'].var):
    for col in frame.columns:
        if frame[col].dtypes == 'object':
            frame[col] = frame[col].astype('|S')

# Write the MuData object (zstd-compressed, level 5) to the same '_milo' path
milo_out = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}_milo.zarr'
mdata.write_h5mu(
    milo_out,
    compression=hdf5plugin.FILTERS["zstd"],
    compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
)

In [None]:
# Per-cell table for plotting, restricted to cells with a 'taa_l4' annotation
tcr_columns = ['donor', 'age_group', 'taa_l4', 'nhood_prop_nonproductive']
tcr_df = mdata['rna'].obs[tcr_columns].dropna(subset=['taa_l4'])
tcr_df.head()

In [None]:
# Boxplot of the per-cell neighbourhood proportion by T cell compartment and
# age group; the x order follows `taa_l4_t_levels`, limited to levels present.
compartments_present = tcr_df['taa_l4'].unique()
compartment_order = [level for level in taa_l4_t_levels if level in compartments_present]
plot_grouped_boxplot(
    tcr_df,
    x = 'taa_l4',
    y = 'nhood_prop_nonproductive',
    hue = 'age_group',
    order = compartment_order,
    hue_order = ['infant','paed','adult', 'geriatric'],
    x_label = 'T cell compartment',
    y_label = 'Mean proportion of cells with non-productive rearrangements',
    legend_title = 'Age group',
    add_stats = True,
    format_percent = True,
)

In [None]:
# List all obs column names as a NumPy array
adata.obs.columns.to_numpy()

In [None]:
# Samples in which more than 60% of cells have 'No_contig' as productive_VJ
# status are excluded from the productivity summary.
vj_status_freq = adata.obs.groupby('sample')['productive_VJ'].value_counts(normalize=True).to_frame(name='prop').reset_index()
mostly_no_contig = (vj_status_freq['productive_VJ'] == 'No_contig') & (vj_status_freq['prop'] > 0.6)
samples_to_exclude = vj_status_freq.loc[mostly_no_contig, 'sample'].tolist()

# Per-cell flags: does the VDJ / VJ status string contain a 'T' entry?
df = adata.obs[['productive_VDJ', 'productive_VJ', 'taa_l4', 'taa_l2', 'age_group', 'donor', 'sample']].copy()
df = df.loc[~df['sample'].isin(samples_to_exclude)]
df['has_prod_VDJ'] = df['productive_VDJ'].apply(lambda status: 'T' in status)
df['has_prod_VJ'] = df['productive_VJ'].apply(lambda status: 'T' in status)

# Aggregate per annotation / age group / donor / sample: cell count and the
# mean of each flag (VJ reported as TRA, VDJ as TRB)
group_keys = ['taa_l4', 'taa_l2', 'age_group', 'donor', 'sample']
df_grouped = df.groupby(group_keys, observed=True).agg(
    n_cells = ('donor', 'count'),
    prop_prod_TRA = ('has_prod_VJ', 'mean'),
    prop_prod_TRB = ('has_prod_VDJ', 'mean'),
).reset_index()
df_grouped

In [None]:
# Boxplot of the per-sample proportion of productive TRA rearrangements by
# T cell compartment and age group.
# The x order is derived from the table being plotted (`df_grouped`) rather
# than from `tcr_df`, which is built in a different cell from a different set
# of cells and may contain compartments that are absent here.
tra_levels_present = set(df_grouped['taa_l4'].unique())
tra_order = [level for level in taa_l4_t_levels if level in tra_levels_present]
plot_grouped_boxplot(df_grouped, x = 'taa_l4', y = 'prop_prod_TRA', hue = 'age_group', order = tra_order, hue_order = ['infant','paed','adult', 'geriatric'],
                     x_label = 'T cell compartment', y_label = 'Proportion of productive TRA rearrangements', legend_title = 'Age group', add_stats = True, format_percent = True)
# NOTE(review): the file name ends in '_umap' although this is a boxplot — kept to avoid breaking existing outputs
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplit_prodTRA_umap.png', dpi=300, bbox_inches='tight')

In [None]:
# Boxplot of the per-sample proportion of productive TRB rearrangements by
# T cell compartment and age group.
# The x order is derived from the table being plotted (`df_grouped`) rather
# than from `tcr_df`, which is built in a different cell from a different set
# of cells and may contain compartments that are absent here.
trb_levels_present = set(df_grouped['taa_l4'].unique())
trb_order = [level for level in taa_l4_t_levels if level in trb_levels_present]
plot_grouped_boxplot(df_grouped, x = 'taa_l4', y = 'prop_prod_TRB', hue = 'age_group', order = trb_order, hue_order = ['infant','paed','adult', 'geriatric'],
                     x_label = 'T cell compartment', y_label = 'Proportion of productive TRB rearrangements', legend_title = 'Age group', add_stats = True, format_percent = True)
# NOTE(review): the file name ends in '_umap' although this is a boxplot — kept to avoid breaking existing outputs
plt.savefig(f'{plots_path}/ctAnnotation/v5/thyAgeing_tSplit_prodTRB_umap.png', dpi=300, bbox_inches='tight')

In [None]:
# Report package versions of the current session for reproducibility
session_info.show()