# Build neighbourhood VDJ feature space

In [None]:
import palantir
import dandelion as ddl

import numpy as np
import seaborn as sns
import os
import pandas as pd
import scanpy as sc
from collections import Counter
# Print dandelion's logging header (presumably package/dependency versions, for provenance — confirm).
ddl.logging.print_header()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Figures are saved under `fig_path`; the working directory is its parent VDJ folder.
fig_path = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure5/VDJ/figures'
os.chdir('/nfs/team205/ny1/ThymusSpatialAtlas/Figure5/VDJ/')

In [None]:
# Print scanpy's logging header (presumably package/dependency versions, for provenance — confirm).
sc.logging.print_header()

In [None]:
# Notebook-wide plotting defaults: scanpy figure parameters, a 6x6 default
# figure size and seaborn's colour-blind-friendly palette.
sc.settings.set_figure_params(
    dpi=160,
    dpi_save=300,
    color_map='RdYlBu_r',
    format='pdf',
)
plt.rcParams.update({"figure.figsize": [6, 6]})
sns.set_palette('colorblind')

In [None]:
# Re-enable inline rendering: per the original author's note, this line is needed
# to restore scanpy's plotting functions once palantir has been imported.
%matplotlib inline

# Load data

In [None]:
# Read the T-lineage thymus atlas from its h5ad file and display a summary of it.
adata = sc.read('/nfs/team205/ny1/ThymusSpatialAtlas/Figure5/Latest_version/Thymus_Atlas_v17_cite_Tv4_T_lin.h5ad')
adata

In [None]:
# List the annotation_level_3 labels present in the atlas (value_counts orders them by frequency).
adata.obs['annotation_level_3'].value_counts().keys()

In [None]:
# Full ordered list of T-lineage annotation labels, and one "husl" colour per
# label (labels and colours are paired by position).
ct_all_order = [
    'ETP', 'T_DN(early)', 'T_DN(P)', 'T_DN(Q)-early', 'T_DN(Q)', 'T_DN(Q)-CD99',
    'T_DN(CD4)', 'T_DN(CD4)-CD99',
    'T_DP(P)', 'T_DP(Q)-early', 'T_DP(Q)', 'T_DP(Q)-HSPH1', 'T_DP(Q)-CD99',
    'T_αβT(entry)', 'T_SP-HSP', 'T_CD4', 'T_CD8', 'T_CD8-Prolif',
    'T_reg(agonist)', 'T_reg(diff)-FOXP3hi', 'T_reg(diff)-FOXP3Lo', 'T_reg', 'T_NK',
]

# Each value is one RGB row of the palette array.
ct_color_map = dict(
    zip(ct_all_order, np.array(sns.color_palette("husl", len(ct_all_order))))
)

# Load abTCR

In [None]:
# Sample metadata lives in a Google Sheet; read it through the sheet's CSV export endpoint.
sheet_url = 'https://docs.google.com/spreadsheets/d/1A9FAZ3_hNgBfsgf4aiZ38Wae4zXyk1y1AgmIp3lFw0o/edit?usp=sharing'
# Replace everything from '/edit' onwards with the export suffix. The previous
# str.replace call used typographic quotes (a SyntaxError) and searched for
# '/edit#gid=', which this sharing URL does not contain, so the export URL was
# never built. No gid is given here, so this relies on the sheet's default tab.
url_1 = sheet_url.split('/edit', 1)[0] + '/export?format=csv'
meta = pd.read_csv(url_1)
# meta = pd.read_csv('/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/Thymus_SingleCell_data_15122022.csv')

In [None]:
# Keep only the rows that have an abTCR path recorded, then display the table.
meta = meta.loc[meta['path_TCRab'].notna()]
meta

In [None]:
import os.path
from tqdm import tqdm

# Read the 10x contig annotations of every abTCR library into a dict keyed by
# the library's path, keeping only contigs whose cell is present in `adata`.
tcrab = {}
# zip() has no length, so pass total= to give tqdm a proper progress bar.
# NOTE(review): the column name 'library ' carries a trailing space — confirm it matches the sheet header.
for x, y in tqdm(zip(meta['path_TCRab'], meta['library ']), total=len(meta)):
    file1 = '/'+x+'/all_contig_annotations.json'
    file2 = '/'+x+'/outs/all_contig_annotations.json'

    # The annotations sit either directly in the library folder or under outs/.
    # os.path.exists is called directly instead of binding the generic name `path`,
    # which is easily overwritten by a plain string elsewhere in a notebook.
    if os.path.exists(file1):
        tmp = ddl.read_10x_vdj(file1)
    else:
        tmp = ddl.read_10x_vdj(file2)

    # update cell_id to library-barcode (the part of the barcode before '-1' is kept)
    tmp.data['cell_id'] = [y + '-' + z.split('-1')[0] for z in tmp.data['cell_id']]
    ddl.utl.update_metadata(tmp) # update the metadata_names
    # only leave contigs with cell_id in adata
    tmp = tmp[tmp.data['cell_id'].isin(adata.obs_names)].copy()

    tcrab[x] = tmp
len(tcrab)

In [None]:
# Concatenate the per-library objects into a single one, passing the library
# paths (the dict keys) as prefixes; `tcrab` is rebound from the dict to the result.
tcrab = ddl.concat(list(tcrab.values()), prefixes=list(tcrab))
tcrab

In [None]:
# Call dandelion's transfer on adata and the combined contigs
# (presumably copies the TCR metadata into adata — confirm against dandelion docs).
ddl.tl.transfer(adata, tcrab)

In [None]:
# Display the contig-level table of the combined abTCR object.
tcrab.data

In [None]:
# library_type is set to filter out genes that are not TRA/TRB (in 'locus' column) as this library is abTCR
# 'sequence_alignment' is filled with a copy of 'sequence' before the check
# (presumably because check_contigs expects that column — confirm).
tcrab.data['sequence_alignment'] = tcrab.data['sequence']
# productive_only = False is passed, so the check is not restricted to productive contigs.
tcrab_checked, trab_adata = ddl.pp.check_contigs(tcrab, adata, productive_only = False, library_type = 'tr-ab')
tcrab_checked

# Filter cells

In [None]:
# Restrict to cells from samples with TCR sequencing, i.e. a non-missing
# 'path_TCRab'; copy so the result is independent of trab_adata.
adata_abtcr = trab_adata[trab_adata.obs['path_TCRab'].notna()].copy()
adata_abtcr

In [None]:
# Display the full atlas object for comparison with the abTCR subset.
adata

In [None]:
# Save the abTCR subset to disk.
# NOTE(review): this writes under Figure1/, whereas the following cell re-reads a file
# of the same name from Figure3/VDJ/ — confirm which location is intended.
adata_abtcr.write('/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/Thymus_Atlas_v15_abTCR.h5ad')

In [None]:
# Reload the abTCR subset from disk (lets the notebook restart from this point).
# NOTE(review): this reads from Figure3/VDJ/, whereas the preceding cell writes the
# file under Figure1/ — confirm both refer to the same object.
adata_abtcr = sc.read('/nfs/team205/ny1/ThymusSpatialAtlas/Figure3/VDJ/Thymus_Atlas_v15_abTCR.h5ad')

In [None]:
# Large default canvas and a sans-serif font for the figures that follow.
plt.rcParams.update({
    "figure.figsize": [20, 20],
    'font.family': 'sans-serif',
})

## Subset cells to DP onwards, keeping only cells with paired TCRab

In [None]:
# Cell types kept for this analysis (DP onwards), in plotting order, with one
# "husl" colour per label.
ct_order = ['T_DP(P)','T_DP(Q)-early','T_DP(Q)','T_DP(Q)-HSPH1', 'T_DP(Q)-CD99',
                'T_αβT(entry)', 'T_CD4', 'T_CD8']

ct_color_map = dict(zip(ct_order, np.array(sns.color_palette("husl", len(ct_order)))))

# The four main V/J call columns that are cleaned below.
tcr_call_cols = ['v_call_abT_VDJ_main', 'j_call_abT_VDJ_main',
                 'v_call_abT_VJ_main', 'j_call_abT_VJ_main']

# subset cells to celltypes within ct_order
# .copy() makes bdata a real AnnData: the column writes below would otherwise hit
# a view and trigger an implicit copy with a warning.
bdata = adata_abtcr[adata_abtcr.obs['annotation_level_3'].isin(ct_order)].copy()

# Mark every unusable call as '<column>_None': entries containing ',' (inconfident
# mappings) and the placeholders 'None', '' and 'No_contig'. Done column-wise in
# one vectorised pass instead of a per-cell .loc loop.
for chain in tcr_call_cols:
    calls = bdata.obs[chain].astype('str')
    unusable = calls.str.contains(',', regex=False) | calls.isin(['None', '', 'No_contig'])
    bdata.obs[chain] = calls.mask(unusable, chain + '_None')

In [None]:
# option for DP onwards - only leave cells with all 4 chains
# A cell is dropped when any of its four main V/J calls ends with 'None'.
required_calls = ['v_call_abT_VDJ_main', 'j_call_abT_VDJ_main',
                  'v_call_abT_VJ_main', 'j_call_abT_VJ_main']
missing_any = (
    bdata.obs[required_calls]
    .apply(lambda calls: calls.str.endswith('None'))
    .any(axis=1)
    .to_numpy()
)
# .copy() so that the in-place steps that follow (neighbours, UMAP, milo) write
# into a real AnnData rather than a view of the previous object.
bdata = bdata[~missing_any].copy()

In [None]:
# Display the filtered object (cells in ct_order with all four V/J calls).
bdata

# Select neighbourhoods 

In [None]:
## need to redo neighborhood graph after subsetting cells before milo
# n_neighbors decides the minimum neighbourhood size 
# here use_rep = 'X_scVI' (the key must match the obsm entry exactly) as data integration was done using scVI
sc.pp.neighbors(bdata, use_rep = "X_scVI", n_neighbors = 100)
# fixed random_state so the embedding is reproducible
sc.tl.umap(bdata, random_state = 1712)

In [None]:
# take a look at the UMAP to make sure it looks reasonable i.e. different cell types are clustered separately
import matplotlib.pyplot as plt
# smaller canvas for this single-panel plot
plt.rcParams["figure.figsize"] = [5.5,5]
# colour cells by annotation using the ct_color_map palette, legend in the right margin
sc.pl.umap(bdata, color=['annotation_level_3'], palette = ct_color_map, legend_loc = 'right margin', legend_fontsize=10)

In [None]:
import milopy
import milopy.core as milo

# Sample cell neighbourhoods with milo and annotate them; the calls below are
# kept in this order because each step builds on the previous one's output.
# use milo to sample neighbourhood
milo.make_nhoods(bdata)
# build neighbourhood adata in bdata.uns['nhood_adata']
milo.count_nhoods(bdata, sample_col='Sample') # this step is needed to build bdata.uns['nhood_adata'] and sample_col can be anything
# this step is needed for plotting below
milopy.utils.build_nhood_graph(bdata)
# assign neighbourhood celltype by majority voting
# results are in bdata.uns['nhood_adata'].obs['nhood_annotation'] & bdata.uns['nhood_adata'].obs['nhood_annotation_frac'] 
milopy.utils.annotate_nhoods(bdata, anno_col='annotation_level_3')
bdata

The neighbourhood AnnData is now stored in bdata.uns['nhood_adata'].

# Create neighbourhood VDJ feature space

In [None]:
#### this option for DP
# Build the neighbourhood VDJ feature space: pseudobulk the four main V/J call
# columns over the milo neighbourhoods stored in bdata.obsm['nhoods'].
nhood_adata = ddl.tl.vdj_pseudobulk(
    bdata,
    pbs=bdata.obsm['nhoods'],
    obs_to_take='annotation_level_3',
    extract_cols=[
        'v_call_abT_VDJ_main',
        'j_call_abT_VDJ_main',
        'v_call_abT_VJ_main',
        'j_call_abT_VJ_main',
    ],
)
nhood_adata

     nhood_adata is the new neighbourhood VDJ feature space, in which each observation is a cell neighbourhood:
     - VDJ usage frequencies are stored in nhood_adata.X
     - VDJ genes are stored in nhood_adata.var
     - neighbourhood metadata is stored in nhood_adata.obs
     The data can be visualised using PCA or UMAP (see below).

In [None]:
# Sort out the annotation colour order: make the neighbourhood annotation
# categorical, with its categories in ct_order.
nhood_adata.obs['annotation_level_3'] = (
    nhood_adata.obs['annotation_level_3']
    .astype('category')
    .cat.reorder_categories(ct_order)
)

## Run Pseudotime on VDJ feature space

In [None]:
# make sure you install palantir if you don't already have it

# The diffusion maps are built on the PCA of the neighbourhood VDJ feature space.
# Nothing earlier in this notebook computes 'X_pca' for nhood_adata, so compute it
# here when it is missing; an existing PCA is left untouched.
if 'X_pca' not in nhood_adata.obsm.keys():
    sc.tl.pca(nhood_adata)

# Run diffusion maps
pca_projections = pd.DataFrame(nhood_adata.obsm['X_pca'], index=nhood_adata.obs_names)
dm_res = palantir.utils.run_diffusion_maps(pca_projections, n_components=10)
dm_res

In [None]:
# based on plot above, choose n_eigs
# NOTE(review): no such plot is produced in the visible cells — confirm how n_eigs=5 was chosen.
ms_data = palantir.utils.determine_multiscale_space(dm_res, n_eigs=5)

In [None]:
# for DN only
#plt.rcParams["figure.figsize"] = [4,4]
#sc.pl.umap(nhood_adata, color=[col + '_None' for col in cols],color_map = 'RdYlBu_r')

In [None]:
# select the start and end points
# start: among the 'T_DP(P)' neighbourhoods, take the one with the largest value
# in the second UMAP coordinate
tmp = nhood_adata[nhood_adata.obs['annotation_level_3'] == 'T_DP(P)']
#tmp = nhood_adata[nhood_adata.obs['mapping_anno_v3'] == 'DN(early)']
rootcell = tmp.obs_names[np.argmax(tmp.obsm['X_umap'][:,1])]
# 0/1 indicator column, used later to show the root on the UMAP
nhood_adata.obs['rootcell'] = 0
nhood_adata.obs.loc[rootcell,'rootcell'] = 1

In [None]:
# ends: one terminal neighbourhood per lineage, chosen as the T_CD8 / T_CD4
# neighbourhood with the largest value in the first UMAP coordinate
tmp1 = nhood_adata[nhood_adata.obs['annotation_level_3'] == 'T_CD8']
tmp2 = nhood_adata[nhood_adata.obs['annotation_level_3'] == 'T_CD4']
endcell1 = np.argmax(tmp1.obsm['X_umap'][:,0])
endcell1 = tmp1.obs_names[endcell1]
# Index a coordinate column ([:,0]); the previous [0,:] selected the two UMAP
# coordinates of the first neighbourhood only, so argmax could only return 0 or 1.
# NOTE(review): column 0 mirrors endcell1 — check on the UMAP that this extreme is
# the intended T_CD4 end point (it may need column 1 or argmin instead).
endcell2 = np.argmax(tmp2.obsm['X_umap'][:,0])
endcell2 = tmp2.obs_names[endcell2]

# Series mapping terminal neighbourhood name -> lineage label
terminal_states = pd.Series(['T_CD8', 'T_CD4'],
                           index=[endcell1,endcell2])

In [None]:
# plot rootcell and terminal states
# 0/1 indicator: 1 for the neighbourhoods chosen as terminal states
nhood_adata.obs['terminal_states'] = (
    nhood_adata.obs_names.isin(terminal_states.index).astype('int64')
)
plt.rcParams["figure.figsize"] = [4,4]
sc.pl.umap(
    nhood_adata,
    color=['rootcell', 'terminal_states', 'annotation_level_3'],
    title=['root cell', 'terminal states', 'nhood annotation'],
    color_map='OrRd',
)

In [None]:
# Run palantir on the multiscale space, starting from the root neighbourhood and
# using the two selected neighbourhoods as terminal states.
pr_res = palantir.core.run_palantir(ms_data,  rootcell, num_waypoints=500, 
                                    terminal_states = terminal_states.index)

In [None]:
# Relabel the branch-probability columns from terminal neighbourhood names to
# their lineage labels ('T_CD8' / 'T_CD4') via the terminal_states lookup.
pr_res.branch_probs.columns = terminal_states[pr_res.branch_probs.columns]

## Visualise the data

In [None]:
# Transfer the palantir results onto nhood_adata; the cells below read the result
# from columns ending in '_nhood_vdj' (e.g. 'pseudotime_nhood_vdj').
ddl.tl.pseudotime_transfer(adata = nhood_adata, pr_res = pr_res, suffix = '_nhood_vdj')

In [None]:
# UMAP of the neighbourhoods coloured by pseudotime and the two branch
# probabilities. `plot` holds the column stems; the '_nhood_vdj' suffix is
# appended when plotting.
plt.rcParams["figure.figsize"] = [4,4]
plot = ['pseudotime', 'prob_T_CD8', 'prob_T_CD4']
sc.pl.umap(
    nhood_adata,
    color=[term + '_nhood_vdj' for term in plot],
    title=[
        'pseudotime',
        'branch probability to T_CD8',
        'branch probability to T_CD4',
    ],
    frameon=False,
    wspace=0.1,
    color_map='RdYlBu_r',
)

## Project pseudotime and branch probabilities back to cells

In [None]:
# project the nhood level pseudotime to cell level pseudotime.
cdata = ddl.tl.project_pseudotime_to_cell(adata = bdata, 
                               pb_adata = nhood_adata, 
                               term_states=['T_CD8','T_CD4'], 
                               suffix = '_nhood_vdj')

In [None]:
# Cell-level UMAP coloured by the projected pseudotime, both branch probabilities
# and the cell annotation (relies on `plot` defined in an earlier cell).
sc.pl.umap(cdata, 
           color=[term + '_nhood_vdj' for term in plot]+['annotation_level_3'],
           color_map = 'RdYlBu_r')

In [None]:
# Scatter of T_CD8 branch probability against pseudotime, one point per cell,
# coloured by annotation and saved as a PDF under fig_path.
sns.set_theme(style='white')
fig, ax = plt.subplots(figsize=(15,5))
cdata.obs['annotation_level_3'] = cdata.obs['annotation_level_3'].cat.reorder_categories(ct_order)
df = cdata.obs.copy()

# Add a small uniform jitter (width `sigma`, centred on 0) to the plotted
# probabilities; only the plotting copy `df` is modified. The generator is seeded
# so the saved figure is identical between runs.
sigma = 0.01
rng = np.random.default_rng(1712)
df['prob_T_CD8_nhood_vdj'] = df['prob_T_CD8_nhood_vdj'] + (rng.random(len(df)) - 0.5) * sigma

ax = sns.scatterplot(data=df,
                     x='pseudotime_nhood_vdj',
                     y='prob_T_CD8_nhood_vdj',
                     s=4,
                     hue='annotation_level_3',
                     palette=ct_color_map)
ax.set_ylabel('probability to T_CD8')
ax.set_xlabel('pseudotime')
ax.set_title('')

# Rebuild the legend outside the axes, using at most the first 10 entries.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(handles[:10], labels[:10], loc='upper right',
                   bbox_to_anchor=(1.2, 1), frameon=False, fontsize='small')
plt.savefig(fig_path+'/pseudotime_scatterplot_nhood_vdj.pdf',bbox_inches='tight')

In [None]:
# Display the cell-level annotation column (and its category order).
cdata.obs['annotation_level_3']

In [None]:
# Strip plot of pseudotime per cell type, coloured by binned T_CD8 branch probability.
# Upper bin edges 0.05, 0.10, ..., 1.00; np.digitize with right=True assigns each
# probability the index of the first edge that is >= the value.
bins = np.linspace(start=0, stop=1, num=21)[1:]
cdata.obs["prob_T_CD8_nhood_vdj_bin"]=np.digitize(cdata.obs["prob_T_CD8_nhood_vdj"], bins, right=True)

fig, ax = plt.subplots(figsize=(18,5))

# plot color bar
# A stand-alone mappable on a 0-1 scale replaces the discrete hue legend.
norm = plt.Normalize(0, 1)
sm = plt.cm.ScalarMappable(cmap="rocket_r", norm=norm)
sm.set_array([])

ax=sns.stripplot(data=cdata.obs, x="pseudotime_nhood_vdj", y="annotation_level_3", hue="prob_T_CD8_nhood_vdj_bin",
                   palette='rocket_r',dodge=True, size=1)
ax.set_ylabel('')    
ax.set_xlabel('pseudotime')
ax.set_title('')
ax.get_legend().remove()
# `sm` is not attached to any Axes, so state explicitly which Axes the colorbar
# takes its space from; without ax= recent matplotlib versions cannot resolve it.
ax.figure.colorbar(sm, ax=ax)

#plt.savefig(fig_path+'/vdj_pseudotime_stripplot.pdf',bbox_inches='tight')

In [None]:
# DP to SPT, combined option
# 'annotation_labels' in the nested nhood_adata is overwritten with NaN before
# saving (presumably a workaround so the object can be written to h5ad — confirm).
cdata.uns['nhood_adata'].uns['annotation_labels'] = np.nan
# Save the cell-level object with projected pseudotime, then display it.
cdata.write('/nfs/team205/ny1/ThymusSpatialAtlas/Figure3/VDJ/DP_combined_pseudotime.h5ad')
cdata

# Look at TRAV/TRAJ usage in DP(Q) cells that are beyond the bifurcation point

In [None]:
# look at TRAV/TRAJ expression 
bulk_adata = ddl.tl.vdj_pseudobulk(adata = cdata, obs_to_bulk = ['annotation_level_3'], obs_to_take = ['annotation_level_3'],
                                   extract_cols= ['v_call_abT_VDJ_main', 'j_call_abT_VDJ_main','v_call_abT_VJ_main', 'j_call_abT_VJ_main'])

In [None]:
# load TCR list by location - lists in github folder metadata/TCR_genes
TCR_list_by_loc_dict = {}
for chain in ['TRAV','TRAJ']:
    path = '/lustre/scratch117/cellgen/team205/cs42/VDJ_collab_manuscript/gene_list/'+chain+'_list_by_location.csv'
    TCR_list_by_loc_dict[chain] = list(pd.read_csv(path, header=None)[0])

In [None]:
# Genes from the location lists that are present in the pseudobulk: TRAV genes
# first, then TRAJ genes, each in the order given by its list.
gene_intersection = [
    gene
    for segment in ('TRAV', 'TRAJ')
    for gene in TCR_list_by_loc_dict[segment]
    if gene in bulk_adata.var_names
]
# Rows are the pseudobulk groups (annotation labels), columns are the selected genes.
trav = pd.DataFrame(
    bulk_adata[:, gene_intersection].X,
    index=bulk_adata.obs['annotation_level_3'],
    columns=gene_intersection,
)

In [None]:
# Reorder the rows into the order used for the heatmap. reindex yields all-NaN
# rows for labels absent from the index and drops index labels that are not listed.
# NOTE(review): several labels here ('T_alpha_vj_*', 'T_αβT(entry)_CD4', 'T_αβT(entry)_CD8')
# are not in the ct_order defined in this notebook, and 'T_DP(Q)' is not listed —
# confirm this matches the annotation actually present in bulk_adata.
trav = trav.reindex(['T_DP(P)','T_DP(Q)-early','T_alpha_vj_init','T_alpha_vj_inter','T_alpha_vj_adv','T_alpha_vj_late','T_DP(Q)-HSPH1','T_DP(Q)-CD99','T_αβT(entry)','T_αβT(entry)_CD4','T_αβT(entry)_CD8', 'T_CD4','T_CD8'])
# trav = trav.reindex(['T_DP(P)','T_DP(Q)-early', 'T_DP(Q)', 'T_DP(Q)-HSPH1','T_DP(Q)-CD99','T_DP_late','T_αβT(entry)','T_αβT(entry)_CD4', 'T_CD4','T_αβT(entry)_CD8','T_CD8'])

In [None]:
# Display the figure output directory.
fig_path

In [None]:
# Heatmap of the pseudobulk values: rows are cell types, columns are the TRAV
# genes followed by the TRAJ genes.
plt.rcParams["figure.figsize"] = [15,8]
sns.heatmap(trav)
# DP_late is using the end of TRAV/TRAJ

In [None]:
# generate some vdj plots 

In [None]:
# Restrict the full atlas to the cell types in ct_order (rebinds `adata` to the subset).
adata = adata[adata.obs['annotation_level_3'].isin(ct_order)]

In [None]:
# Restore the scanpy figure defaults, then plot pseudotime and branch probabilities on the atlas UMAP.
# NOTE(review): in the visible cells the '*_nhood_vdj' columns are added to cdata, not
# to adata — confirm that adata.obs already carries them, otherwise plot cdata instead.
sc.settings.set_figure_params(dpi = 160, color_map = 'RdYlBu_r', dpi_save = 300, format = 'pdf')
sc.pl.umap(adata, color=['pseudotime_nhood_vdj','prob_T_CD4_nhood_vdj','prob_T_CD8_nhood_vdj'],legend_loc='on data',frameon=False)