## Notebook for the Reference Map preparation for ExpiMap

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 13th February 2023

#### Import required modules

In [1]:
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown
import anndata as ad

  from .autonotebook import tqdm as notebook_tqdm
 captum (see https://github.com/pytorch/captum).
INFO:lightning_fabric.utilities.seed:[rank: 0] Global seed set to 0


#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi = 160, color_map = 'magma_r', dpi_save = 300, vector_friendly = True)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 python-igraph==0.10.4 pynndescent==0.5.8


### Upload All Datasets


In [4]:
# Smilie Dataset
input_Smilie = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Smillie_ulcerative_colitis/Anndata/Smillie_ulcerative_colitis_anndata.h5ad'  # the file that will store the analysis results
output_Smilie = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Smillie_ulcerative_colitis/Anndata/Smillie_ulcerative_colitis_anndata_output.h5ad'  # the file that will store the analysis results

In [5]:
#Anndata upload
Smilie = sc.read_h5ad(input_Smilie)
Smilie.X

<365492x21784 sparse matrix of type '<class 'numpy.float32'>'
	with 386767544 stored elements in Compressed Sparse Row format>

In [6]:
# Wang Dataset
input_Wang_colon = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_2020_colon/wang20_colon.processed.h5ad'  # the file that will store the analysis results
output_Wang_colon = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_2020_colon//wang20_colon.processed_output.h5ad'  # the file that will store the analysis results

In [7]:
#Anndata upload
Wang_colon = sc.read_h5ad(input_Wang_colon)
Wang_colon.X

<4329x17181 sparse matrix of type '<class 'numpy.float32'>'
	with 12513556 stored elements in Compressed Sparse Row format>

In [8]:
#Data Loading 
input_ileum_anndata = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_2020_ileum/wang20_ileum.processed.h5ad'  # the file that will store the analysis results
output_ileum_anndata = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_2020_ileum/wang20_ileum.processed_output.h5ad'  # the file that will store the analysis results

In [9]:
#Anndata upload
Wang_ileum = sc.read_h5ad(input_ileum_anndata)
Wang_ileum.X

<5980x16977 sparse matrix of type '<class 'numpy.float32'>'
	with 12663073 stored elements in Compressed Sparse Row format>

In [10]:
#Data Loading 
input_rectum_anndata = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal//Wang_2020_rectum/wang20_rectum.processed.h5ad'  # the file that will store the analysis results
output_rectum_anndata = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_2020_rectum/wang20_rectum.processed_output.h5ad'  # the file that will store the analysis results

In [11]:
#Anndata upload
Wang_rectum = sc.read_h5ad(input_rectum_anndata)
Wang_rectum.X

<3797x17676 sparse matrix of type '<class 'numpy.float32'>'
	with 11593261 stored elements in Compressed Sparse Row format>

In [12]:
#Gut Cell Atlas Data Loading
input_Normal_file = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Gut_Cell_Atlas_Data/Gut_cell_atlas.h5ad'  # the file that will store the analysis results
output_Normal_file = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Gut_Cell_Atlas_Data/Gut_cell_atlas_output.h5ad'  # the file that will store the analysis results

In [13]:
#Anndata upload
control_ad = sc.read_h5ad(input_Normal_file)
control_ad.X

<428469x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 760344941 stored elements in Compressed Sparse Row format>

In [14]:
#Pham 2022 Data Loading
input_pham_2022 = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/Anndata/Pham_2022_naiveTSC_adata.h5ad'
output_pham_2022 = '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/Anndata/Pham_2022_naiveTSC_adata_output.h5ad'

In [15]:
#Anndata upload
pham_2022 = sc.read_h5ad(input_pham_2022)

### Data Preprocessing

#### Gut Cell Atlas Data

In [16]:
#Remove Pediatric Crohn Disease
control_ad_clean = control_ad[~control_ad.obs['Diagnosis'].isin(['Pediatric Crohn Disease']),:]
del control_ad
#Deleting the lymph node samples
control_ad_clean = control_ad_clean[~control_ad_clean.obs['Region'].isin(['lymph node']),:]
#Adding percentage of ribosomial genes
control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(control_ad_clean, qc_vars=['ribo'], percent_top=None, log1p=False, inplace=True)
#Filtering by genes
control_ad_clean = control_ad_clean[control_ad_clean.obs.n_genes_by_counts < 5000, :]
control_ad_clean = control_ad_clean[control_ad_clean.obs.n_genes_by_counts > 200, :]
#Filtering by counts
control_ad_clean = control_ad_clean[control_ad_clean.obs.total_counts < 50000, :]

  control_ad_clean.var['ribo'] = control_ad_clean.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'


#### Smilie Data

In [17]:
#Extract stem cells from Smilie dataset
Smilie_stem = Smilie[Smilie.obs['cell_type'].isin(['Stem']),:]
#Filter only not inflamed stem cells from the Smilie_stem 
Smilie_stem = Smilie_stem[Smilie_stem.obs['Health'].isin(['Healthy', 'Non-inflamed']),:]


In [18]:
Smilie_stem.obs

Unnamed: 0_level_0,cell_id,cell_type,nGene,nUMI,Subject,Sample,Health,Location
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N9.EpiA.AAAGACGATCCCGT,N9.EpiA.AAAGACGATCCCGT,Stem,2574,14616,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAATGTTGAAAGTG,N9.EpiA.AAATGTTGAAAGTG,Stem,3670,19529,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AACATTGATTGTCT,N9.EpiA.AACATTGATTGTCT,Stem,1878,10891,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAGTCTCTCTCCAC,N9.EpiA.AAGTCTCTCTCCAC,Stem,1040,3535,N9,N9.EpiA,Non-inflamed,Epi
N9.EpiA.AAGTTATGAGAGTA,N9.EpiA.AAGTTATGAGAGTA,Stem,1350,5972,N9,N9.EpiA,Non-inflamed,Epi
...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,N110.LPA.TCGGGACGTCAACTGT,Stem,1232,4384,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TGAGCATTCCAGTAGT,N110.LPA.TGAGCATTCCAGTAGT,Stem,3181,15655,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TGGCCAGAGAGGACGG,N110.LPA.TGGCCAGAGAGGACGG,Stem,4380,26921,N110,N110.LPA,Non-inflamed,LP
N110.LPA.TTCTTAGCAGTCCTTC,N110.LPA.TTCTTAGCAGTCCTTC,Stem,3634,22468,N110,N110.LPA,Non-inflamed,LP


#### Wang Data

In [19]:
#Concatenate all Wang datasets
Wang = ad.concat([Wang_ileum, Wang_colon, Wang_rectum], join = 'outer')

In [20]:
#Extract stem cells from Wang colon dataset
Wang_stem = Wang[Wang.obs['CellType'].isin(['Stem Cell']),:]

In [21]:
Wang_stem.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Ileum,15829.0,9.669662,3128,8.048469,17.196285,32.219345,0.006318,45.574578
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Ileum,14058.0,9.551018,2857,7.957877,15.194196,37.423531,0.007113,43.740219
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Ileum,7034.0,8.858653,1915,7.557995,9.980097,38.612453,0.000000,39.806653
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Ileum,19544.0,9.880475,3821,8.248529,11.579000,36.476669,0.020467,41.148178
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Ileum,21366.0,9.969603,3577,8.182559,19.109800,36.122810,0.004680,48.207432
...,...,...,...,...,...,...,...,...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17962.0,9.796069,2953,7.990915,11.897339,45.763279,0.000000,47.260884
TCCACACAGCAATATG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,21999.0,9.998797,3771,8.235361,12.205100,38.410835,0.013637,40.510932
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17397.0,9.764111,3101,8.039802,10.766224,41.392197,0.005748,41.346209
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,4892.0,8.495561,1223,7.109879,26.185608,30.498774,0.020442,58.053966


### Concatenate all anndata files

In [22]:
#Add column with study name to each anndata object
Wang_stem.obs['Study_name'] = 'Wang'
control_ad_clean.obs['Study_name'] = 'Gut Cell Atlas'
Smilie_stem.obs['Study_name'] = 'Smilie'

  Wang_stem.obs['Study_name'] = 'Wang'
  control_ad_clean.obs['Study_name'] = 'Gut Cell Atlas'
  Smilie_stem.obs['Study_name'] = 'Smilie'


In [28]:
#Rename columns in control_ad_clean as they are in Wang_stem
control_ad_clean.obs_names = ['cell_id']

ValueError: Length of passed value for obs_names is 1, but this AnnData has shape: (372785, 33538)

In [24]:
list(control_ad_clean.obs)

['Sample name',
 'Diagnosis',
 'Age',
 'sample name',
 'Region code',
 'Fraction',
 'Gender',
 'Region',
 '10X',
 'batch',
 'n_genes',
 'n_genes_by_counts',
 'total_counts',
 'total_counts_mt',
 'pct_counts_mt',
 'doublet_scores',
 'predicted_doublets',
 'category',
 'Age_group',
 'Integrated_05',
 'total_counts_ribo',
 'pct_counts_ribo',
 'Study_name']

In [27]:
control_ad_clean.obs

Unnamed: 0,Sample name,Diagnosis,Age,sample name,Region code,Fraction,Gender,Region,10X,batch,...,total_counts_mt,pct_counts_mt,doublet_scores,predicted_doublets,category,Age_group,Integrated_05,total_counts_ribo,pct_counts_ribo,Study_name
AAACCTGAGAACAACT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,46.228199,2.416790,0.016442,False,B cells,Pediatric,DZ GC cell,491.0,24.623873,Gut Cell Atlas
AAACCTGAGCGATATA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,0.122579,0.005696,0.027389,False,B cells,Pediatric,Cycling B cell,1048.0,46.974449,Gut Cell Atlas
AAACCTGAGGAGTTTA-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,80.195816,4.754082,0.014817,False,T cells,Pediatric,gdT,652.0,37.066517,Gut Cell Atlas
AAACCTGAGTATCTCG-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,104.506760,4.894509,0.039416,False,B cells,Pediatric,Memory B,996.0,44.404812,Gut Cell Atlas
AAACCTGAGTGACTCT-1-4918STDY7333456,T036,Pediatric healthy,4,T036-TIL-SC-EPCAMP,TIL,SC-EPCAMP,M,SmallInt,3',4918STDY7333456,...,95.846077,3.088770,0.025763,False,B cells,Pediatric,DZ GC cell,1121.0,34.460495,Gut Cell Atlas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCGCTCATGCATG-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,0.031344,0.000948,0.097068,False,Epithelial,Adult,Colonocyte,816.0,24.220839,Gut Cell Atlas
TTTGGTTTCAACACGT-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,144.966095,2.996148,0.027510,False,Mesenchymal,Adult,Stromal 1 (CCL11+),1170.0,23.669836,Gut Cell Atlas
TTTGGTTTCCAAATGC-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,85.408028,1.600296,0.014349,False,Mesenchymal,Adult,Stromal 1 (CCL11+),1454.0,26.698494,Gut Cell Atlas
TTTGTCAGTGAAAGAG-1-WTDAtest7770719,A30 (398B),Healthy adult,20-25,A30-SCL-6-SC-45N-2,SCL,SC-45N,F,LargeInt,3',WTDAtest7770719,...,600.362671,5.534037,0.091348,False,Epithelial,Adult,Stem cells,4846.0,43.847267,Gut Cell Atlas


In [None]:
control_ad_clean.obs['CellType'] = control_ad_clean.obs['category']
Smilie_stem.obs['CellType'] = Smilie_stem.obs['cell_type']


In [None]:
#Rename columns in Smilie_stem as it is in the Gut Cell Atlas Data 
Smilie_stem.obs['CellType'] = Smilie_stem.obs['cell_type']