## Notebook for the Reference Map preparation for ExpiMap

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 13th February 2023

#### Import required modules

In [1]:
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown
import anndata as ad

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi = 160, color_map = 'magma_r', dpi_save = 300, vector_friendly = True)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 python-igraph==0.10.4 pynndescent==0.5.8


### Upload All Datasets


In [4]:
# Smilie Dataset
input_Smilie = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Smillie_ulcerative_colitis/Anndata/Smillie_ulcerative_colitis_anndata.h5ad'  # the file that will store the analysis results
output_Smilie = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Smillie_ulcerative_colitis/Anndata/Smillie_ulcerative_colitis_anndata_output.h5ad'  # the file that will store the analysis results

In [5]:
#Anndata upload
Smilie = sc.read_h5ad(input_Smilie)
Smilie.X

<365492x21784 sparse matrix of type '<class 'numpy.float32'>'
	with 386767544 stored elements in Compressed Sparse Row format>

In [6]:
# Wang Raw Dataset
input_Wang = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Wang_2022/Wang_2022_raw_anndata.h5ad'  # the file that will store the analysis results
output_Wang = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Wang_2022/Wang_2022_raw_anndata_output.h5ad'  # the file that will store the analysis results

In [7]:
#Anndata upload
Wang = sc.read_h5ad(input_Wang)
Wang.X

array([[0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., ..., 0., 0., 0., 0., 0., 0., 0.]],
      d

In [8]:
#Gut Cell Atlas Data Loading
input_Normal_file = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Gut_Cell_Atlas_Data/Gut_cell_atlas.h5ad'  # the file that will store the analysis results
output_Normal_file = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Gut_Cell_Atlas_Data/Gut_cell_atlas_output.h5ad'  # the file that will store the analysis results

In [9]:
#Anndata upload
control_ad = sc.read_h5ad(input_Normal_file)
control_ad.X

<428469x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 760344941 stored elements in Compressed Sparse Row format>

In [10]:
#Pham 2022 Data Loading
input_pham_2022 = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/Anndata/Pham_2022_naiveTSC_adata.h5ad'
output_pham_2022 = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/Anndata/Pham_2022_naiveTSC_adata_output.h5ad'

In [11]:
#Anndata upload
pham_2022 = sc.read_h5ad(input_pham_2022)

### Data Preprocessing

#### Gut Cell Atlas Data

In [12]:
#Remove Pediatric Crohn Disease
control_ad_clean = control_ad[~control_ad.obs['Diagnosis'].isin(['Pediatric Crohn Disease']),:]
del control_ad
#Deleting the lymph node samples
control_ad_clean = control_ad_clean[~control_ad_clean.obs['Region'].isin(['lymph node']),:]
#Adding percentage of ribosomial genes
control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(control_ad_clean, qc_vars=['ribo'], percent_top=None, log1p=False, inplace=True)
#Filtering by genes
control_ad_clean = control_ad_clean[control_ad_clean.obs.n_genes_by_counts < 5000, :]
control_ad_clean = control_ad_clean[control_ad_clean.obs.n_genes_by_counts > 200, :]
#Filtering by counts
control_ad_clean = control_ad_clean[control_ad_clean.obs.total_counts < 50000, :]

  control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'


#### Smilie Data

In [13]:
#Extract stem cells from Smilie dataset
Smilie_stem = Smilie[Smilie.obs['cell_type'].isin(['Stem']),:]
#Filter only not inflamed stem cells from the Smilie_stem 
Smilie_stem = Smilie_stem[Smilie_stem.obs['Health'].isin(['Healthy', 'Non-inflamed']),:]

In [14]:
Smilie_stem.obs

Unnamed: 0_level_0,cell_id,cell_type,nGene,nUMI,Subject,Sample,Health,Location
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N9.EpiA.AAAGACGATCCCGT,N9.EpiA.AAAGACGATCCCGT,Stem,2574,14616,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAATGTTGAAAGTG,N9.EpiA.AAATGTTGAAAGTG,Stem,3670,19529,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AACATTGATTGTCT,N9.EpiA.AACATTGATTGTCT,Stem,1878,10891,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAGTCTCTCTCCAC,N9.EpiA.AAGTCTCTCTCCAC,Stem,1040,3535,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAGTTATGAGAGTA,N9.EpiA.AAGTTATGAGAGTA,Stem,1350,5972,N9,N9.EpiA,Non-inflamed,Epi
...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,N110.LPA.TCGGGACGTCAACTGT,Stem,1232,4384,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TGAGCATTCCAGTAGT,N110.LPA.TGAGCATTCCAGTAGT,Stem,3181,15655,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TGGCCAGAGAGGACGG,N110.LPA.TGGCCAGAGAGGACGG,Stem,4380,26921,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TTCTTAGCAGTCCTTC,N110.LPA.TTCTTAGCAGTCCTTC,Stem,3634,22468,N110,N110.LPA,Non-inflamed,LP


#### Wang Data

In [15]:
#Extract stem cells from Wang colon dataset
Wang_stem = Wang[Wang.obs['CellType'].isin(['Stem Cell']),:]

In [16]:
Wang_stem.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType
UniqueCell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell
...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,TCAGGATGTTCCACAA_Rectum-2_Stem Cell,Rectum-2,Stem Cell
TCCACACAGCAATATG_Rectum-2_Stem Cell,TCCACACAGCAATATG_Rectum-2_Stem Cell,Rectum-2,Stem Cell
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,Rectum-2,Stem Cell
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,Rectum-2,Stem Cell


### Concatenate all anndata files

In [17]:
#Add column with study name to each anndata object
Wang_stem.obs['Study_name'] = 'Wang'
control_ad_clean.obs['Study_name'] = 'Gut Cell Atlas'
Smilie_stem.obs['Study_name'] = 'Smilie'

  Wang_stem.obs['Study_name'] = 'Wang'
  control_ad_clean.obs['Study_name'] = 'Gut Cell Atlas'
  Smilie_stem.obs['Study_name'] = 'Smilie'


In [18]:
# Rename obs index to cell_id in Wang_stem
Wang_stem.obs.index.name = "cell_id"
control_ad_clean.obs.index.name = "cell_id"
Smilie_stem.obs.index.name = "cell_id"

In [19]:
# Rename columns in control_ad_clean as they are in Wang_stem
control_ad_clean.obs["Sample_ID"] = control_ad_clean.obs["sample name"]
del control_ad_clean.obs["sample name"]
control_ad_clean.obs["CellType"] = control_ad_clean.obs["category"]
del control_ad_clean.obs["category"]
control_ad_clean.obs["Location"] = control_ad_clean.obs["Region"]
del control_ad_clean.obs["Region"]
control_ad_clean.obs["n_counts"] = control_ad_clean.obs["total_counts"]
del control_ad_clean.obs["total_counts"]
control_ad_clean.obs["percent_mito"] = control_ad_clean.obs["pct_counts_mt"]
del control_ad_clean.obs["pct_counts_mt"]
control_ad_clean.obs["percent_ribo"] = control_ad_clean.obs["pct_counts_ribo"]
del control_ad_clean.obs["pct_counts_ribo"]
control_ad_clean.obs["Subject"] = control_ad_clean.obs["Sample name"]
del control_ad_clean.obs["Sample name"]

In [20]:
# Rename columns in Smilie_stem as they are in Wang_stem
del Smilie_stem.obs["cell_id"]
Smilie_stem.obs["Sample_ID"] = Smilie_stem.obs["Sample"]
del Smilie_stem.obs["Sample"]
Smilie_stem.obs["CellType"] = Smilie_stem.obs["cell_type"]
del Smilie_stem.obs["cell_type"]
Smilie_stem.obs["n_counts"] = Smilie_stem.obs["nUMI"]
del Smilie_stem.obs["nUMI"]
Smilie_stem.obs["n_genes"] = Smilie_stem.obs["nGene"]
del Smilie_stem.obs["nGene"]
Smilie_stem.obs["Diagnosis"] = Smilie_stem.obs["Health"]
del Smilie_stem.obs["Health"]
Smilie_stem.obs["CellType"] = 'Stem Cell'

In [21]:
Smilie_stem.var

7SK
A1BG
A1BG-AS1
A1CF
A2M
...
hsa-mir-5571
hsa-mir-6080
hsa-mir-8072
snoU109
snoU13


In [22]:
Wang_stem.var

Unnamed: 0_level_0,GENE
GENE,Unnamed: 1_level_1
RP11-34P13.7,RP11-34P13.7
FO538757.2,FO538757.2
AP006222.2,AP006222.2
RP4-669L17.10,RP4-669L17.10
RP11-206L10.9,RP11-206L10.9
...,...
GGT2,GGT2
RP3-510H16.3,RP3-510H16.3
RP4-669P10.16,RP4-669P10.16
BACH1-AS1,BACH1-AS1


In [23]:
control_ad_clean.var

Unnamed: 0,gene_ids,feature_types,ribo,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
MIR1302-2HG,ENSG00000243485,Gene Expression,False,49,0.000128,99.987172,49.0
FAM138A,ENSG00000237613,Gene Expression,False,2,0.000005,99.999476,2.0
OR4F5,ENSG00000186092,Gene Expression,False,3,0.000008,99.999215,3.0
AL627309.1,ENSG00000238009,Gene Expression,False,441,0.001157,99.884545,442.0
AL627309.3,ENSG00000239945,Gene Expression,False,32,0.000084,99.991622,32.0
...,...,...,...,...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,False,2759,0.271879,99.277686,103849.0
AC233755.1,ENSG00000275063,Gene Expression,False,3404,0.631719,99.108824,241296.0
AC240274.1,ENSG00000271254,Gene Expression,False,18267,0.053112,95.217650,20287.0
AC213203.1,ENSG00000277475,Gene Expression,False,12,0.000031,99.996858,12.0


In [24]:
# Rename var index to gene_id in Wang_stem, control_ad_clean and Smilie_stem
Wang_stem.var.index.name = "gene_id"
control_ad_clean.var.index.name = "gene_id"
Smilie_stem.var.index.name = "gene_id"

In [25]:
# Concatenate all datasets
Reference_map_full = ad.concat([Wang_stem, control_ad_clean, Smilie_stem], join = 'outer')

In [26]:
Reference_map_full.obs.index = Reference_map_full.obs.index.astype(str)
Reference_map_full.var.index = Reference_map_full.var.index.astype(str)

In [27]:
Reference_map_full.obs_names = Reference_map_full.obs_names.astype(str)
Reference_map_full.var_names = Reference_map_full.var_names.astype(str)

In [28]:
Reference_map_full.obs = Reference_map_full.obs.astype(str)

In [29]:
Reference_map_full.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType,Study_name,Diagnosis,Age,Region code,Fraction,Gender,10X,...,doublet_scores,predicted_doublets,Age_group,Integrated_05,total_counts_ribo,Location,n_counts,percent_mito,percent_ribo,Subject
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,,,,,,,...,,,,,,,,,,
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,,,,,,,...,,,,,,,,,,
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,,,,,,,...,,,,,,,,,,
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,,,,,,,...,,,,,,,,,,
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,,N110.LPA,Stem Cell,Smilie,Non-inflamed,,,,,,...,,,,,,LP,4384,,,N110
N110.LPA.TGAGCATTCCAGTAGT,,N110.LPA,Stem Cell,Smilie,Non-inflamed,,,,,,...,,,,,,LP,15655,,,N110
N110.LPA.TGGCCAGAGAGGACGG,,N110.LPA,Stem Cell,Smilie,Non-inflamed,,,,,,...,,,,,,LP,26921,,,N110
N110.LPA.TTCTTAGCAGTCCTTC,,N110.LPA,Stem Cell,Smilie,Non-inflamed,,,,,,...,,,,,,LP,22468,,,N110


In [31]:
# Write reference map to file
Reference_map_full.write('/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad')