## Notebook for the Healthy reference counts preparation

**Developed by**: Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**19 June 2023**  

#### Load required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import scipy as sci

In [2]:
import scrublet

#### Setup Cells

In [3]:
%matplotlib inline

In [4]:
# Scanpy logging level used for the whole notebook (3 = most verbose, includes hints)
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its main dependencies so they are recorded in the cell output
sc.logging.print_header()
# Default figure appearance: 80 dpi on a white background
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [5]:
def X_is_raw(adata):
    """Check whether ``adata.X`` holds integer-valued (raw count) data.

    Every stored value is tested individually. Comparing only the per-gene
    column sums reports ``True`` for normalised matrices whose fractional
    entries happen to add up to whole numbers (e.g. ``0.5 + 0.5``), so the
    sums are not used here.

    Parameters
    ----------
    adata
        Object exposing a dense array or a scipy sparse matrix as ``.X``.

    Returns
    -------
    bool
        ``True`` if all entries are finite whole numbers, ``False`` otherwise.
    """
    # Imported locally: the notebook only has the top-level ``scipy`` import.
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Implicit zeros are integers already, so only the stored values matter.
        # CSR/CSC expose them directly; any other sparse format is converted first.
        values = X.data if X.format in ("csr", "csc") else X.tocsr().data
    else:
        values = np.asarray(X)

    # NaN/inf produce a NaN remainder and therefore fail the comparison.
    with np.errstate(invalid='ignore'):
        return bool(np.all(np.mod(values, 1) == 0))

### Prepare GCA part of the object

In [23]:
# Integrated healthy object; it supplies the cell-level metadata (obs) for the reference built below
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)
# Sanity check with X_is_raw that X looks like raw counts (result is shown as the cell output)
X_is_raw(Healthy_adata)

True

In [19]:
# Raw GCA object; its X and var are used below to build the GCA part of the reference
GCA_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/GCA/GCA_raw_anndata.h5ad')
# Sanity check with X_is_raw that X looks like raw counts
X_is_raw(GCA_adata)

True

In [24]:
# Keep only the cells whose obs['Study_name'] is 'Gut Cell Atlas' (the result is a view of the integrated object)
Healthy_adata = Healthy_adata[Healthy_adata.obs['Study_name'] == 'Gut Cell Atlas']

In [25]:
# Restrict the raw GCA object to the cell barcodes that are present in Healthy_adata
GCA_adata = GCA_adata[GCA_adata.obs_names.isin(Healthy_adata.obs_names), :]

In [26]:
# Sanity checks before aligning the two objects.
# The study filter must have selected at least one cell: if the 'Study_name' label did not
# match anything, both objects would be empty and the set comparison below would still
# pass (set() == set()), silently producing an empty reference part.
assert Healthy_adata.n_obs > 0, "Healthy_adata is empty - check the 'Study_name' filter"
# Both objects must describe exactly the same set of cell barcodes
assert set(Healthy_adata.obs.index) == set(GCA_adata.obs.index)

# Reorder the cells of Healthy_adata so they follow the cell order of GCA_adata
Healthy_adata = Healthy_adata[GCA_adata.obs_names]

In [27]:
# Assemble the GCA part of the reference: the count matrix and gene table come from
# GCA_adata, the cell-level metadata comes from Healthy_adata
Healthy_GCA_adata = an.AnnData(
    X=GCA_adata.X,
    var=GCA_adata.var,
    obs=Healthy_adata.obs,
)

In [28]:
# Drop the intermediate objects; only Healthy_GCA_adata is needed from here on
del Healthy_adata, GCA_adata

### Prepare Kong part of the object

In [9]:
# Reload the integrated healthy object: the GCA section subset it to a single study and then deleted it
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)

In [10]:
# Raw Kong 2023 object; its X and var are used below to build the Kong part of the reference
Kong_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/Kong_2023/Kong_2023_raw_anndata.h5ad')
# Sanity check with X_is_raw that X looks like raw counts
X_is_raw(Kong_adata)

True

In [11]:
# Keep only the cells whose obs['Study_name'] is 'Kong 2023' (the result is a view of the integrated object)
Healthy_adata = Healthy_adata[Healthy_adata.obs['Study_name'] == 'Kong 2023']

In [12]:
# Restrict the raw Kong object to the cell barcodes that are present in Healthy_adata
Kong_adata = Kong_adata[Kong_adata.obs_names.isin(Healthy_adata.obs_names), :]

In [13]:
# Sanity checks before aligning the two objects.
# The study filter must have selected at least one cell: if the 'Study_name' label did not
# match anything, both objects would be empty and the set comparison below would still
# pass (set() == set()), silently producing an empty reference part.
assert Healthy_adata.n_obs > 0, "Healthy_adata is empty - check the 'Study_name' filter"
# Both objects must describe exactly the same set of cell barcodes
assert set(Healthy_adata.obs.index) == set(Kong_adata.obs.index)

# Reorder the cells of Healthy_adata so they follow the cell order of Kong_adata
Healthy_adata = Healthy_adata[Kong_adata.obs_names]

In [14]:
# Assemble the Kong part of the reference: the count matrix and gene table come from
# Kong_adata, the cell-level metadata comes from Healthy_adata
Healthy_Kong_adata = an.AnnData(
    X=Kong_adata.X,
    var=Kong_adata.var,
    obs=Healthy_adata.obs,
)

In [15]:
# Drop the intermediate objects; only Healthy_Kong_adata is needed from here on
del Kong_adata, Healthy_adata

### Prepare Wang part of the object

In [35]:
# Reload the integrated healthy object: the Kong section subset it to a single study and then deleted it
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)

In [36]:
# Raw Wang object; its X and var are used below to build the Wang part of the reference
Wang_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/Wang/Wang_2022_raw_anndata.h5ad')
# Sanity check with X_is_raw that X looks like raw counts
X_is_raw(Wang_adata)

True

In [37]:
# Keep only the cells whose obs['Study_name'] is 'Wang' (the result is a view of the integrated object)
Healthy_adata = Healthy_adata[Healthy_adata.obs['Study_name'] == 'Wang']

In [38]:
# Restrict the raw Wang object to the cell barcodes that are present in Healthy_adata
Wang_adata = Wang_adata[Wang_adata.obs_names.isin(Healthy_adata.obs_names), :]

In [39]:
# Sanity checks before aligning the two objects.
# The study filter must have selected at least one cell: if the 'Study_name' label did not
# match anything, both objects would be empty and the set comparison below would still
# pass (set() == set()), silently producing an empty reference part.
assert Healthy_adata.n_obs > 0, "Healthy_adata is empty - check the 'Study_name' filter"
# Both objects must describe exactly the same set of cell barcodes
assert set(Healthy_adata.obs.index) == set(Wang_adata.obs.index)

# Reorder the cells of Healthy_adata so they follow the cell order of Wang_adata
Healthy_adata = Healthy_adata[Wang_adata.obs_names]

In [40]:
# Assemble the Wang part of the reference: the count matrix and gene table come from
# Wang_adata, the cell-level metadata comes from Healthy_adata
Healthy_Wang_adata = an.AnnData(
    X=Wang_adata.X,
    var=Wang_adata.var,
    obs=Healthy_adata.obs,
)

In [41]:
# Drop the intermediate objects; only Healthy_Wang_adata is needed from here on
del Healthy_adata, Wang_adata

### Prepare Smillie part of the object

In [42]:
# Reload the integrated healthy object: the Wang section subset it to a single study and then deleted it
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)

In [43]:
# Raw Smillie object; its X and var are used below to build the Smillie part of the reference
Smillie_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/Smillie/Smillie_ulcerative_colitis_anndata.h5ad')
# Sanity check with X_is_raw that X looks like raw counts
X_is_raw(Smillie_adata)

True

In [44]:
# Keep only the cells whose obs['Study_name'] is 'Smilie' (the result is a view of the integrated object).
# NOTE(review): the label is spelled 'Smilie' (single "l") while the dataset is called Smillie elsewhere;
# it has to match the value stored in Study_name exactly - verify against the integrated object.
Healthy_adata = Healthy_adata[Healthy_adata.obs['Study_name'] == 'Smilie']

In [45]:
# Restrict the raw Smillie object to the cell barcodes that are present in Healthy_adata
Smillie_adata = Smillie_adata[Smillie_adata.obs_names.isin(Healthy_adata.obs_names), :]

In [46]:
# Sanity checks before aligning the two objects.
# The study filter must have selected at least one cell: if the 'Study_name' label did not
# match anything, both objects would be empty and the set comparison below would still
# pass (set() == set()), silently producing an empty reference part.
assert Healthy_adata.n_obs > 0, "Healthy_adata is empty - check the 'Study_name' filter"
# Both objects must describe exactly the same set of cell barcodes
assert set(Healthy_adata.obs.index) == set(Smillie_adata.obs.index)

# Reorder the cells of Healthy_adata so they follow the cell order of Smillie_adata
Healthy_adata = Healthy_adata[Smillie_adata.obs_names]

In [47]:
# Assemble the Smillie part of the reference: the count matrix and gene table come from
# Smillie_adata, the cell-level metadata comes from Healthy_adata
Healthy_Smillie_adata = an.AnnData(
    X=Smillie_adata.X,
    var=Smillie_adata.var,
    obs=Healthy_adata.obs,
)

In [48]:
# Drop the intermediate objects; only Healthy_Smillie_adata is needed from here on
del Healthy_adata, Smillie_adata

### Concatenate 4 datasets

In [30]:
# Re-index the GCA gene table by its 'gene_ids' column, keeping the previous index
# as a regular 'gene_name' column so that it is not lost
Healthy_GCA_adata.var = (
    Healthy_GCA_adata.var
    .assign(gene_name=Healthy_GCA_adata.var.index)
    .set_index('gene_ids')
)

In [32]:
# Give the var index of both objects the same name before merging.
# NOTE(review): for the GCA object the index was just set from 'gene_ids', so the index
# labelled 'gene_name' here appears to hold gene IDs rather than names - confirm.
Healthy_GCA_adata.var.index.name = 'gene_name'
Healthy_Kong_adata.var.index.name = 'gene_name'

# Give the obs index of both objects the same name ('cell_id')
Healthy_GCA_adata.obs.index.name = 'cell_id'
Healthy_Kong_adata.obs.index.name = 'cell_id'

# Rename a 'gene_ids' column of the GCA gene table to 'gene_id', if such a column exists.
# NOTE(review): after set_index('gene_ids') in the previous cell that column has become the
# index, so this rename looks like a no-op - confirm and consider removing it.
Healthy_GCA_adata.var = Healthy_GCA_adata.var.rename(columns={'gene_ids': 'gene_id'})

In [33]:
# Merge the GCA and Kong parts into one object (Wang and Smillie are appended further below).
# index_unique=None keeps the cell barcodes unchanged; join is left at its default.
# NOTE(review): newer anndata releases deprecate AnnData.concatenate in favour of anndata.concat.
adata = Healthy_GCA_adata.concatenate(Healthy_Kong_adata, index_unique = None)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [51]:
# Inspect the gene table of the merged GCA + Kong object.
# NOTE(review): this cell carries execution count 51 while the re-indexing cell below carries 50,
# so the stored output appears to show var after that re-indexing - confirm on a fresh run.
adata.var

Unnamed: 0_level_0,feature_types-0,gene_name-1,gene_id
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL627309.1,Gene Expression,RP11-34P13.7,ENSG00000238009
AL627309.3,Gene Expression,RP11-34P13.8,ENSG00000239945
AL627309.4,Gene Expression,RP11-34P13.9,ENSG00000241599
AL732372.1,Gene Expression,RP4-669L17.2,ENSG00000236601
AC114498.1,Gene Expression,RP5-857K21.2,ENSG00000235146
...,...,...,...
MT-ND4L,Gene Expression,MT-ND4L,ENSG00000212907
MT-ND4,Gene Expression,MT-ND4,ENSG00000198886
MT-ND5,Gene Expression,MT-ND5,ENSG00000198786
MT-ND6,Gene Expression,MT-ND6,ENSG00000198695


In [50]:
# Swap the roles of index and column on the merged gene table: the current index is
# stored as the 'gene_id' column, the 'gene_name-0' column becomes the new index, and
# that index is then labelled 'gene_name'
adata.var = (
    adata.var
    .assign(gene_id=adata.var.index)
    .set_index('gene_name-0')
    .rename_axis('gene_name')
)

In [52]:
# Name the var index of Healthy_Smillie_adata and Healthy_Wang_adata 'gene_name', matching the merged object
Healthy_Smillie_adata.var.index.name = 'gene_name'
Healthy_Wang_adata.var.index.name = 'gene_name'

# Name the obs index of Healthy_Smillie_adata and Healthy_Wang_adata 'cell_id', matching the merged object
Healthy_Smillie_adata.obs.index.name = 'cell_id'
Healthy_Wang_adata.obs.index.name = 'cell_id'

In [53]:
# Drop the genes of Healthy_Wang_adata that are not present in adata.
# common_genes is a boolean mask over the var_names of the object being filtered.
common_genes = Healthy_Wang_adata.var_names.isin(adata.var_names)
Healthy_Wang_adata = Healthy_Wang_adata[:, common_genes]

# Same for Healthy_Smillie_adata (the mask variable is reused)
common_genes = Healthy_Smillie_adata.var_names.isin(adata.var_names)
Healthy_Smillie_adata = Healthy_Smillie_adata[:, common_genes]

In [54]:
# Append the Wang part and then the Smillie part to the merged object.
# join='outer' keeps the union of genes; both parts were restricted to genes already in adata
# in the previous cell. index_unique=None keeps the cell barcodes unchanged.
# NOTE(review): with an outer join, genes missing from one part are filled in (per the anndata
# docs: 0 for sparse X, NaN for dense X) - confirm X is sparse so the result stays integer counts.
adata = adata.concatenate(Healthy_Wang_adata, index_unique = None, join = 'outer')
adata = adata.concatenate(Healthy_Smillie_adata, index_unique = None, join = 'outer')

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [55]:
# Inspect the gene table of the final merged object; the chained concatenate calls have
# stacked batch suffixes onto the column names (e.g. 'feature_types-0-0-0')
adata.var

Unnamed: 0_level_0,feature_types-0-0-0,gene_name-1-0-0,gene_id-0-0,GENE-1-0
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1BG,Gene Expression,A1BG,ENSG00000121410,A1BG
A1BG-AS1,Gene Expression,A1BG-AS1,ENSG00000268895,A1BG-AS1
A1CF,Gene Expression,A1CF,ENSG00000148584,A1CF
A2M,Gene Expression,A2M,ENSG00000175899,A2M
A2M-AS1,Gene Expression,A2M-AS1,ENSG00000245105,A2M-AS1
...,...,...,...,...
ZXDC,Gene Expression,ZXDC,ENSG00000070476,ZXDC
ZYG11A,Gene Expression,ZYG11A,ENSG00000203995,ZYG11A
ZYG11B,Gene Expression,ZYG11B,ENSG00000162378,ZYG11B
ZYX,Gene Expression,ZYX,ENSG00000159840,ZYX


In [56]:
# Write the merged healthy reference object to an h5ad file
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad')

: 