# Combine Visium data post Tissue Tag processing

In [1]:
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import re
import os
import scipy.stats
from numpy import asarray as ar
from collections import Counter
import scvi
import anndata as ad
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to show output from all the lines in a cells
pd.set_option('display.max_column',None) # display all the columns in pandas
pd.options.display.max_rows = 100

from datetime import date
today = str(date.today())
sc.settings.verbosity = 1
sc.logging.print_version_and_date()
%load_ext autoreload
%autoreload 2

Global seed set to 0


Running Scanpy 1.9.1, on 2023-08-02 00:46.


  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
## Figure settings so that exported images plot correctly
import matplotlib
from matplotlib import rcParams

# Font type used when figures are written to PDF.
rcParams['pdf.fonttype'] = 42

# Scanpy-wide figure defaults: resolution on screen and on disk, colour map,
# vector-friendly rendering and PDF as the save format.
sc.settings.set_figure_params(
    dpi=150,
    color_map='RdPu',
    dpi_save=150,
    vector_friendly=True,
    format='pdf',
)

In [3]:
# Show the current working directory (echoed as the cell output)
import os
os.getcwd()

'/nfs/team205/ny1/ThymusSpatialAtlas/Figure2'

In [4]:
def select_slide(adata, s, s_col='SampleID'):
    r""" Select the data for one slide from a multi-slide spatial anndata object.

    The rows whose ``adata.obs[s_col]`` equals ``s`` are kept, and
    ``uns['spatial']`` of the result is reduced to the single entry whose key
    contains ``s`` as a substring (the first such key, in stored order).

    :param adata: Anndata object with multiple spatial experiments
    :param s: name of selected experiment
    :param s_col: column in adata.obs listing experiment name for each location
    :return: the selected slide, with only its own entry left in ``uns['spatial']``
    :raises IndexError: if no key of ``uns['spatial']`` contains ``s``
    """

    slide = adata[adata.obs[s_col].isin([s]), :]

    # Keys of uns['spatial'] are matched by substring, because they may carry
    # extra text around the experiment name; the first match wins.
    s_spatial = next((k for k in slide.uns['spatial'].keys() if s in k), None)
    if s_spatial is None:
        # Same exception type as the former empty-array lookup, but with a
        # message that says what is actually missing.
        raise IndexError(
            f"no key in adata.uns['spatial'] contains {s!r}; "
            f"available keys: {list(slide.uns['spatial'].keys())}"
        )

    slide.uns['spatial'] = {s_spatial: slide.uns['spatial'][s_spatial]}

    return slide

# Organize analysis object
This is divided into 4 steps:
1) load the cell2location object "sp", which has only HVG genes
2) load the merged raw Visium object "adata_vis"
3) create a new object with all annotations and all raw genes
4) update the Tissue Tag annotations to generate a final annotated object

In [5]:
# Load the sample metadata CSV into a dataframe; its 'path' and 'SampleID'
# columns are used to locate and label each slide when loading.
meta_df = pd.read_csv('/nfs/team205/ny1/ThymusSpatialAtlas/Figure2/HTSA_All_Spatial_Data_v4-visium.csv')

In [6]:
# Load the raw slides: one AnnData object per row of the metadata table.
slides = []
# Iterate row-wise so each path is paired with the SampleID from its own row
# (a lookup by path would return the first match if a path were repeated).
for path, sample_id in zip(meta_df['path'].tolist(), meta_df['SampleID'].tolist()):
    # load the anndata object from the path
    adata = sc.read_h5ad(f'{path}/adata_cma_v2.h5ad')
    adata.obs['SampleID'] = sample_id
    # Keep the original gene names before var_names are de-duplicated below.
    adata.var['SYMBOL'] = adata.var_names
    adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
    adata.var_names_make_unique()
    sc.pp.calculate_qc_metrics(adata, percent_top = None, inplace=True)
    # Flag mitochondrial genes by their 'MT-' symbol prefix.
    adata.var['mt'] = [gene.startswith('MT-') for gene in adata.var['SYMBOL']]
    # Per-spot fraction of counts coming from mitochondrial genes.
    # np.asarray(...).ravel() flattens the row sums whether X is a sparse
    # matrix (sum returns a 2-D matrix) or a dense array.
    mito_counts = np.asarray(adata[:, adata.var['mt'].tolist()].X.sum(1)).ravel()
    adata.obs['percent_mito'] = mito_counts/adata.obs['total_counts']

    # add sample name to obs names
    adata.obs_names = adata.obs["SampleID"] + '-' + adata.obs_names
    adata.obs.index.name = 'spot_id'
    print(f'{path} has been read')
    slides.append(adata)

  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11486161 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11486162 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11486163 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11486164 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11556492 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11556493 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11556494 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11556495 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/TA11556496 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp10864183 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604685 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604686 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604687 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604688 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604689 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11604690 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11765867 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11765868 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp11765870 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp9838711 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_F_IMMsp9838716 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9142086 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9142087 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9142088 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9142089 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9518030 has been read


  utils.warn_names_duplicates("var")


/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9518032 has been read
/nfs/team205/vk8/irods_data/09_thymus/visium/all_thymus_visium/raw_data2/Align_and_detect_output/hsta/WSSS_THYst9518033 has been read


  utils.warn_names_duplicates("var")


In [7]:
# Stack all loaded slides into a single AnnData object.
# index_unique=None leaves the obs names as they are (the SampleID prefix was
# already added to them when each slide was loaded).
# NOTE(review): uns_merge="unique" is presumably what keeps each slide's own
# uns['spatial'] entry in the combined object — confirm against the anndata docs.
adata_full = slides[0].concatenate(
        slides[1:],
        uns_merge="unique",
        index_unique=None
    )

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [8]:
# Inspect the obs columns of the concatenated object
adata_full.obs.columns

Index(['in_tissue', 'array_row', 'array_col', 'y', 'x', 'annotations_level_0',
       'annotations_level_0_number', 'annotations_level_1',
       'annotations_level_1_number', 'annotations_lobules_0',
       'annotations_lobules_0_number', 'L2_dist_annotations_level_0_Artifacts',
       'L2_dist_annotations_level_0_Cortex',
       'L2_dist_annotations_level_0_Edge',
       'L2_dist_annotations_level_0_Medulla', 'L2_dist_annotations_level_1_HS',
       'L2_dist_annotations_level_1_PVS', 'L2_dist_annotations_level_1_fat',
       'L2_dist_annotations_level_1_unassigned',
       'L2_dist_annotations_level_1_vessels', 'cma_v2', 'manual_bin_cma_v2',
       'manual_bin_cma_v2_int', 'SampleID', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'percent_mito', 'L2_dist_annotations_level_0_lymph', 'cm', 'cc',
       'batch'],
      dtype='object')

In [9]:
# Attach the per-sample metadata to every spot (left join on SampleID).
# validate='many_to_one' makes pandas raise a MergeError if a SampleID occurs
# more than once in meta_df, which would otherwise duplicate spots in the result.
obs_merged = pd.merge(adata_full.obs, meta_df, how = "left", on = 'SampleID', validate = 'many_to_one')

In [10]:
# Put the spot IDs back as the index of the merged frame.
# NOTE(review): this is positional — it assumes the left merge returned exactly
# one row per spot, in the same order as adata_full.obs; confirm.
obs_merged.index = adata_full.obs_names

In [11]:
# Set the column names of the merged frame from a fixed list.
# NOTE(review): the assignment is positional, so the list must match the merged
# column order exactly; a change in either input would silently mislabel
# columns — confirm against obs_merged.columns before relying on it.
obs_merged.columns = ['in_tissue', 'array_row', 'array_col', 'y', 'x', 'annotations_level_0',
       'annotations_level_0_number', 'annotations_level_1',
       'annotations_level_1_number', 'annotations_lobules_0',
       'annotations_lobules_0_number', 'L2_dist_annotations_level_0_Artifacts',
       'L2_dist_annotations_level_0_Cortex',
       'L2_dist_annotations_level_0_Edge',
       'L2_dist_annotations_level_0_Medulla', 'L2_dist_annotations_level_1_HS',
       'L2_dist_annotations_level_1_PVS', 'L2_dist_annotations_level_1_fat',
       'L2_dist_annotations_level_1_unassigned',
       'L2_dist_annotations_level_1_vessels', 'cma_v2', 'manual_bin_cma_v2',
       'manual_bin_cma_v2_int', 'SampleID', 'n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'percent_mito', 'L2_dist_annotations_level_0_lymph', 'cm', 'cc',
       'batch', 'Sample', 'Sample_hr', 'Funding', 'Sequencer', 'SlideID',
       'Position', 'Image_name', 'Image_jpg', 'section_thickness (um)',
       'permebialisation(min)', 'RIN/DV200', 'Visium_type', 'DonorID',
       'Donor_type', 'Age_group', 'Age(misc)', 'Age(numeric)', 'Source',
       'Study', 'Study name ', 'Study ID', 'Research_no', 'Deposited before ',
       'Sex', 'QC', 'Batch', 'Spaceranger', 'old_sample_name',
       'annotation version', 'path', 'updated pipeline', 'L1_knn1', 'L0_KNN10',
       'for mapping ']

In [12]:
# Replace the obs table of the combined object with a copy of the merged frame
adata_full.obs = obs_merged.copy()

In [13]:
# Sanity check: list the obs columns that are NOT named in the column list
# below (the result is echoed as the cell output).
set(adata_full.obs.columns).difference(set(['Sample','Sample_hr','SampleID', 'SlideID', 'Position','in_tissue', 'array_row', 'array_col', 'x', 'y','n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'percent_mito', 'annotations_level_0',
       'annotations_level_0_number', 'annotations_level_1',
       'annotations_level_1_number', 'annotations_lobules_0',
       'annotations_lobules_0_number', 'L2_dist_annotations_level_0_Cortex',
       'L2_dist_annotations_level_0_Edge',
       'L2_dist_annotations_level_0_Medulla', 'L2_dist_annotations_level_1_HS',
       'L2_dist_annotations_level_1_PVS', 'L2_dist_annotations_level_1_fat',
       'L2_dist_annotations_level_1_vessels', 'L2_dist_annotations_level_0_lymph',
       'L2_dist_annotations_level_1_unassigned', 'cm', 'cc', 'cma_v2',
       'manual_bin_cma_v2', 'manual_bin_cma_v2_int','Image_name', 'Image_jpg', 'section_thickness (um)',
       'permebialisation(min)', 'RIN/DV200', 'Visium_type', 'DonorID',
       'Donor_type', 'Age_group', 'Age(misc)', 'Age(numeric)', 'Source',
       'Study', 'Study name ', 'Study ID', 'Research_no', 'Deposited before ',
       'Sex', 'QC', 'Batch', 'Spaceranger', 'old_sample_name',
       'annotation version', 'path', 'updated pipeline', 'L1_knn1', 'batch']))

{'Funding',
 'L0_KNN10',
 'L2_dist_annotations_level_0_Artifacts',
 'Sequencer',
 'for mapping '}

In [14]:
# Reorder the obs columns into a fixed order: sample identifiers first, then
# spot coordinates, QC metrics, annotations, and finally the sample metadata.
# Selecting by name means any obs column missing from this list is dropped.
# NOTE(review): the list looks like it names every merged column — confirm.
adata_full.obs = adata_full.obs[['Sample','Sample_hr','SampleID', 'SlideID', 'Position','in_tissue', 'array_row', 'array_col', 'x', 'y','n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'percent_mito', 'annotations_level_0',
       'annotations_level_0_number', 'annotations_level_1',
       'annotations_level_1_number', 'annotations_lobules_0',
       'annotations_lobules_0_number', 'L2_dist_annotations_level_0_Artifacts',
                                 'L2_dist_annotations_level_0_Cortex',
       'L2_dist_annotations_level_0_Edge',
       'L2_dist_annotations_level_0_Medulla', 'L2_dist_annotations_level_1_HS',
       'L2_dist_annotations_level_1_PVS', 'L2_dist_annotations_level_1_fat',
       'L2_dist_annotations_level_1_vessels', 'L2_dist_annotations_level_0_lymph',
       'L2_dist_annotations_level_1_unassigned', 'cm', 'cc', 'cma_v2',
       'manual_bin_cma_v2', 'manual_bin_cma_v2_int','Image_name', 'Image_jpg', 'section_thickness (um)',
       'permebialisation(min)', 'RIN/DV200', 'Visium_type', 'Funding', 'Sequencer', 'DonorID',
       'Donor_type', 'Age_group', 'Age(misc)', 'Age(numeric)', 'Source',
       'Study', 'Study name ', 'Study ID', 'Research_no', 'Deposited before ',
       'Sex', 'QC', 'Batch', 'Spaceranger', 'old_sample_name',
       'annotation version', 'path', 'updated pipeline', 'L1_knn1', 'L0_KNN10', 'for mapping ','batch']].copy()

In [15]:
# Preview the gene annotation columns that will be kept
adata_full.var[['ENSEMBL', 'feature_types', 'genome', 'SYMBOL', 'mt']]

Unnamed: 0,ENSEMBL,feature_types,genome,SYMBOL,mt
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38,MIR1302-2HG,False
FAM138A,ENSG00000237613,Gene Expression,GRCh38,FAM138A,False
OR4F5,ENSG00000186092,Gene Expression,GRCh38,OR4F5,False
AL627309.1,ENSG00000238009,Gene Expression,GRCh38,AL627309.1,False
AL627309.3,ENSG00000239945,Gene Expression,GRCh38,AL627309.3,False
...,...,...,...,...,...
AC141272.1,ENSG00000277836,Gene Expression,GRCh38,AC141272.1,False
AC023491.2,ENSG00000278633,Gene Expression,GRCh38,AC023491.2,False
AC007325.1,ENSG00000276017,Gene Expression,GRCh38,AC007325.1,False
AC007325.4,ENSG00000278817,Gene Expression,GRCh38,AC007325.4,False


In [16]:
# Keep only these five gene annotation columns; every other var column is dropped
adata_full.var = adata_full.var[['ENSEMBL', 'feature_types', 'genome', 'SYMBOL', 'mt']].copy()

In [17]:
# Keep only the spots flagged in_tissue == 1 and store the subset as a copy
adata_full = adata_full[adata_full.obs['in_tissue']==1].copy()

In [18]:
# Load the previously saved raw Visium object (file dated 2022-09-07) to compare with the new one
adata_full_old = sc.read("/nfs/team205/vk8/projects/thymus_atlas/results/Thymus_atlas_v2_Visium_raw_2022-09-07.h5ad")

In [19]:
# Spot IDs that occur in only one of the two objects.
new_spot_ids = set(adata_full.obs_names)
old_spot_ids = set(adata_full_old.obs_names)
dif1 = new_spot_ids - old_spot_ids  # present in the new object only
dif2 = old_spot_ids - new_spot_ids  # present in the old object only

In [20]:
# Number of spot IDs found only in the new object (dif1) and only in the old object (dif2)
len(dif1)
len(dif2)

2481

0

In [21]:
# Carry over the old 'Cortico_Medullar_Axis' values as 'cma_v1', aligned on spot ID;
# spots that are absent from the old object get a missing value.
old_cma = adata_full_old.obs['Cortico_Medullar_Axis']
adata_full.obs['cma_v1'] = old_cma.reindex(adata_full.obs_names)

In [22]:
# Compare the dimensions (spots x genes) of the new and the old object
adata_full.shape
adata_full_old.shape

(55653, 36601)

(53172, 36601)

In [24]:
# List the image entries stored under one sample's key in uns['spatial']
adata_full.uns['spatial']['spaceranger130_count_42337_TA11486161_GRCh38-2020-A']['images'].keys()

dict_keys(['hires', 'hires5K', 'lowres'])

In [25]:
# Save the combined object; the file name carries the date string held in `today`
adata_full.write(f'/nfs/team205/ny1/ThymusSpatialAtlas/Figure2/data/Thymus_atlas_v2_Visium_raw_{today}.h5ad')

In [26]:
# Dimensions (spots x genes) of the new and the old object
adata_full.shape
adata_full_old.shape

(55653, 36601)

(53172, 36601)

In [27]:
# Number of distinct SampleID values in the combined object
len(adata_full.obs['SampleID'].unique())

28

In [28]:
# Number of distinct SampleID values among spots whose Age_group is not 'fetal'
len(adata_full.obs.loc[adata_full.obs['Age_group']!='fetal', 'SampleID'].unique())

16