## Notebook for Kong 2023 anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 4th April 2023

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


## Data Upload

In [4]:
#Upload barcodes, features, and matrix - CO_STR
CO_STR_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_STR/CO_STR.scp.barcodes.tsv', sep='\t', header=None)
CO_STR_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_STR/CO_STR.scp.features.tsv', sep='\t', header=None)
CO_STR_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_STR/CO_STR.scp.raw.mtx')

In [5]:
#Upload barcodes, features, and matrix - CO_Epi
CO_Epi_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_EPI/CO_Epi.scp.barcodes.tsv', sep='\t', header=None)
CO_Epi_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_EPI/CO_Epi.scp.features.tsv', sep='\t', header=None)
CO_Epi_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_EPI/CO_EPI.scp.raw.mtx')

In [6]:
#Upload barcodes, features, and matrix - CO_IMM
CO_IMM_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_IMM/CO_IMM.scp.barcodes.tsv', sep='\t', header=None)
CO_IMM_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_IMM/CO_IMM.scp.features.tsv', sep='\t', header=None)
CO_IMM_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/CO_IMM/CO_IMM.scp.raw.mtx')


In [7]:
#Upload barcodes, features, and matrix - Ti_Epi
Ti_Epi_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_EPI/Ti_Epi.scp.barcodes.tsv', sep='\t', header=None)
Ti_Epi_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_EPI/Ti_Epi.scp.features.tsv', sep='\t', header=None)
Ti_Epi_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_EPI/Ti_Epi.scp.raw.mtx')

#Upload barcodes, features, and matrix - Ti_Str
Ti_Str_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_STR/Ti_Str.scp.barcodes.tsv', sep='\t', header=None)
Ti_Str_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_STR/Ti_Str.scp.features.tsv', sep='\t', header=None)
Ti_Str_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_STR/Ti_Str.scp.raw.mtx')

#Upload barcodes, features, and matrix - Ti_IMM
Ti_IMM_barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_IMM/Ti_IMM.scp.barcodes.tsv', sep='\t', header=None)
Ti_IMM_features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_IMM/Ti_IMM.scp.features.tsv', sep='\t', header=None)
Ti_IMM_matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/expression/TI_IMM/Ti_IMM.scp.raw.mtx')

### Create anndata files

In [8]:
# Convert countmatrixes to sparse matrices
CO_STR_matrix = sci.sparse.csr_matrix(CO_STR_matrix)
CO_Epi_matrix = sci.sparse.csr_matrix(CO_Epi_matrix)
CO_IMM_matrix = sci.sparse.csr_matrix(CO_IMM_matrix)
Ti_Epi_matrix = sci.sparse.csr_matrix(Ti_Epi_matrix)
Ti_Str_matrix = sci.sparse.csr_matrix(Ti_Str_matrix)
Ti_IMM_matrix = sci.sparse.csr_matrix(Ti_IMM_matrix)

In [9]:
# Set indexes for barcodes and features
# Rename columns for barcodes and features
CO_STR_features.rename(columns={0: 'gene_id'}, inplace=True)
CO_STR_features.rename(columns={1: 'gene_name'}, inplace=True)
CO_STR_barcodes.rename(columns={0: 'cell_id'}, inplace=True)
CO_Epi_features.rename(columns={0: 'gene_id'}, inplace=True)
CO_Epi_features.rename(columns={1: 'gene_name'}, inplace=True)
CO_Epi_barcodes.rename(columns={0: 'cell_id'}, inplace=True)
CO_IMM_features.rename(columns={0: 'gene_id'}, inplace=True)
CO_IMM_features.rename(columns={1: 'gene_name'}, inplace=True)
CO_IMM_barcodes.rename(columns={0: 'cell_id'}, inplace=True)
Ti_Epi_features.rename(columns={0: 'gene_id'}, inplace=True)
Ti_Epi_features.rename(columns={1: 'gene_name'}, inplace=True)
Ti_Epi_barcodes.rename(columns={0: 'cell_id'}, inplace=True)
Ti_Str_features.rename(columns={0: 'gene_id'}, inplace=True)
Ti_Str_features.rename(columns={1: 'gene_name'}, inplace=True)
Ti_Str_barcodes.rename(columns={0: 'cell_id'}, inplace=True)
Ti_IMM_features.rename(columns={0: 'gene_id'}, inplace=True)
Ti_IMM_features.rename(columns={1: 'gene_name'}, inplace=True)
Ti_IMM_barcodes.rename(columns={0: 'cell_id'}, inplace=True)

# Set indexes for barcodes and features
CO_STR_barcodes = CO_STR_barcodes.set_index('cell_id')
CO_STR_features = CO_STR_features.set_index('gene_id')
CO_Epi_barcodes = CO_Epi_barcodes.set_index('cell_id')
CO_Epi_features = CO_Epi_features.set_index('gene_id')
CO_IMM_barcodes = CO_IMM_barcodes.set_index('cell_id')
CO_IMM_features = CO_IMM_features.set_index('gene_id')
Ti_Epi_barcodes = Ti_Epi_barcodes.set_index('cell_id')
Ti_Epi_features = Ti_Epi_features.set_index('gene_id')
Ti_Str_barcodes = Ti_Str_barcodes.set_index('cell_id')
Ti_Str_features = Ti_Str_features.set_index('gene_id')
Ti_IMM_barcodes = Ti_IMM_barcodes.set_index('cell_id')
Ti_IMM_features = Ti_IMM_features.set_index('gene_id')

In [10]:
# Create anndata objects 
CO_STR = an.AnnData(X=np.transpose(CO_STR_matrix), obs=CO_STR_barcodes, var=CO_STR_features)
CO_STR.obs['cell_type'] = 'Stromal'
CO_STR.obs['tissue'] = 'Colon'

CO_Epi = an.AnnData(X=np.transpose(CO_Epi_matrix), obs=CO_Epi_barcodes, var=CO_Epi_features)
CO_Epi.obs['cell_type'] = 'Epithelial'
CO_Epi.obs['tissue'] = 'Colon'

CO_IMM = an.AnnData(X=np.transpose(CO_IMM_matrix), obs=CO_IMM_barcodes, var=CO_IMM_features)
CO_IMM.obs['cell_type'] = 'Immune'
CO_IMM.obs['tissue'] = 'Colon'

Ti_Epi = an.AnnData(X=np.transpose(Ti_Epi_matrix), obs=Ti_Epi_barcodes, var=Ti_Epi_features)
Ti_Epi.obs['cell_type'] = 'Epithelial'
Ti_Epi.obs['tissue'] = 'Terminal Ileum'

Ti_Str = an.AnnData(X=np.transpose(Ti_Str_matrix), obs=Ti_Str_barcodes, var=Ti_Str_features)
Ti_Str.obs['cell_type'] = 'Stromal'
Ti_Str.obs['tissue'] = 'Terminal Ileum'

Ti_IMM = an.AnnData(X=np.transpose(Ti_IMM_matrix), obs=Ti_IMM_barcodes, var=Ti_IMM_features)
Ti_IMM.obs['cell_type'] = 'Immune'
Ti_IMM.obs['tissue'] = 'Terminal Ileum'


  CO_STR = an.AnnData(X=np.transpose(CO_STR_matrix), obs=CO_STR_barcodes, var=CO_STR_features)
  CO_Epi = an.AnnData(X=np.transpose(CO_Epi_matrix), obs=CO_Epi_barcodes, var=CO_Epi_features)
  CO_IMM = an.AnnData(X=np.transpose(CO_IMM_matrix), obs=CO_IMM_barcodes, var=CO_IMM_features)
  Ti_Epi = an.AnnData(X=np.transpose(Ti_Epi_matrix), obs=Ti_Epi_barcodes, var=Ti_Epi_features)
  Ti_Str = an.AnnData(X=np.transpose(Ti_Str_matrix), obs=Ti_Str_barcodes, var=Ti_Str_features)
  Ti_IMM = an.AnnData(X=np.transpose(Ti_IMM_matrix), obs=Ti_IMM_barcodes, var=Ti_IMM_features)


In [11]:
del CO_STR_matrix, CO_Epi_matrix, CO_IMM_matrix, Ti_Epi_matrix, Ti_Str_matrix, Ti_IMM_matrix

In [12]:
# Concatenate all anndata objects without batch categories and batch key
adata = an.AnnData.concatenate(CO_STR, CO_Epi, CO_IMM, Ti_Epi, Ti_Str, Ti_IMM, index_unique = None)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [13]:
adata.obs

Unnamed: 0_level_0,cell_type,tissue,batch
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N105446_L-GTGTGGCTCCGTCAAA,Stromal,Colon,0
N105446_L-CAATACGAGTCCCTAA,Stromal,Colon,0
N105446_L-CCCTGATAGTGTTCCA,Stromal,Colon,0
N105446_L-CATTGTTAGAGCCCAA,Stromal,Colon,0
N105446_L-TCCATGCGTTCGTTCC,Stromal,Colon,0
...,...,...,...
N119540_L2-CACTGGGAGCTGACCC,Immune,Terminal Ileum,5
N119540_L2-CGAGGCTTCTCACTCG,Immune,Terminal Ileum,5
N119540_L2-CTCAAGATCTACGCGG,Immune,Terminal Ileum,5
N119540_L2-TATTCCATCGCCTAGG,Immune,Terminal Ileum,5


In [14]:
# Upload metadata
metadata = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/SCP1884/metadata/scp_metadata_combined.v2.txt', sep='\t', header=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# Set index for metadata
metadata = metadata.set_index('NAME')

# Drop row 'TYPE' from metadata
metadata = metadata.drop('TYPE')

In [16]:
# Join metadata to anndata object by index
adata.obs = adata.obs.join(metadata)


In [17]:
adata.obs

Unnamed: 0_level_0,cell_type,tissue,batch,biosample_id,n_genes,n_counts,Chem,Site,Type,donor_id,...,Celltype,sex,species,species__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,organ,organ__ontology_label,disease,disease__ontology_label
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N105446_L-GTGTGGCTCCGTCAAA,Stromal,Colon,0,N105446_L,5135,19014,v3,CO,NonI,105446,...,Fibroblasts ADAMDEC1,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0001155,colon,MONDO_0005011,Crohn's disease
N105446_L-CAATACGAGTCCCTAA,Stromal,Colon,0,N105446_L,5119,18425,v3,CO,NonI,105446,...,Endothelial cells CD36,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0001155,colon,MONDO_0005011,Crohn's disease
N105446_L-CCCTGATAGTGTTCCA,Stromal,Colon,0,N105446_L,5024,18305,v3,CO,NonI,105446,...,Fibroblasts ADAMDEC1,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0001155,colon,MONDO_0005011,Crohn's disease
N105446_L-CATTGTTAGAGCCCAA,Stromal,Colon,0,N105446_L,4817,17791,v3,CO,NonI,105446,...,Fibroblasts ADAMDEC1,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0001155,colon,MONDO_0005011,Crohn's disease
N105446_L-TCCATGCGTTCGTTCC,Stromal,Colon,0,N105446_L,4690,17548,v3,CO,NonI,105446,...,Fibroblasts KCNN3 LY6H,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0001155,colon,MONDO_0005011,Crohn's disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N119540_L2-CACTGGGAGCTGACCC,Immune,Terminal Ileum,5,N119540_L2,270,460,v3,TI,NonI,119540,...,Plasma cells,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0002116,ileum,MONDO_0005011,Crohn's disease
N119540_L2-CGAGGCTTCTCACTCG,Immune,Terminal Ileum,5,N119540_L2,267,454,v3,TI,NonI,119540,...,Cycling cells,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0002116,ileum,MONDO_0005011,Crohn's disease
N119540_L2-CTCAAGATCTACGCGG,Immune,Terminal Ileum,5,N119540_L2,327,453,v3,TI,NonI,119540,...,Plasma cells,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0002116,ileum,MONDO_0005011,Crohn's disease
N119540_L2-TATTCCATCGCCTAGG,Immune,Terminal Ileum,5,N119540_L2,268,430,v3,TI,NonI,119540,...,Cycling cells,unknown,NCBITaxon_9606,Homo sapiens,EFO_0009922,10x 3' v3,UBERON_0002116,ileum,MONDO_0005011,Crohn's disease


In [19]:
adata.var

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000237683,AL627309.1
ENSG00000228463,AP006222.2
ENSG00000237094,RP4-669L17.10
ENSG00000235373,RP11-206L10.3
ENSG00000228327,RP11-206L10.2
...,...
ENSG00000234854,LINC00676
ENSG00000264151,RP11-739N10.1
ENSG00000268287,CTB-60B18.18
ENSG00000254760,CTD-2616J11.3


In [20]:
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)
adata.obs_names = adata.obs_names.astype(str)
adata.var.index.name = str(adata.var.index.name) if adata.var.index.name is not None else None
adata.var_names = adata.var_names.astype(str)
adata.obs = adata.obs.astype(str)
adata.var = adata.var.astype(str)

In [21]:
# Save anndata object
adata.write('/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/Kong_2023_raw_anndata.h5ad')