## Notebook for Smillie 2019 anndata file creation

+ Developed by: Anna Maguza
+ Institute of Computational Biology - Computational Health Centre - Helmholtz Munich
+ Date created: 10th February 2023
+ Date modified: 22nd May 2024

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

#### Setup Cells

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Logging verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.settings.verbosity = 3

# Print the versions of the main packages for reproducibility
sc.logging.print_header()

# Default figure appearance used by scanpy plots
figure_options = {'dpi': 80, 'facecolor': 'white'}
sc.settings.set_figure_params(**figure_options)

scanpy==1.9.5 anndata==0.9.2 umap==0.5.4 numpy==1.24.4 scipy==1.13.0 pandas==2.1.1 scikit-learn==1.3.1 statsmodels==0.14.0 igraph==0.11.2 pynndescent==0.5.10


### Data Upload

In [14]:
# Root folder of the raw Smillie 2019 data (SCP259)
input_dir = "/mnt/LaCIE/annaM/gut_project/raw_data/Smillie_2019/SCP259/"

# Sub-folders (relative to input_dir) with the expression files of each compartment
epi_dir = "expression/5cdc540d328cee7a2efc2348"  # epithelial cells
imm_dir = "expression/5cdc540d328cee7a2efc234a"  # immune cells
fib_dir = "expression/5cdc540d328cee7a2efc2349"  # fibroblasts

# Sub-folder (relative to input_dir) with the cell-level metadata table
meta_data = "metadata"

In [5]:
# Load the cell barcodes and gene names (headerless TSV files) of each compartment
def _read_headerless_tsv(subdir, filename):
    """Read a tab-separated file without a header row from `input_dir/subdir`."""
    return pd.read_csv(f'{input_dir}/{subdir}/{filename}', sep='\t', header=None)


# Epithelial cells
epi_barcodes = _read_headerless_tsv(epi_dir, 'Epi.barcodes2.tsv')
epi_genes = _read_headerless_tsv(epi_dir, 'Epi.genes.tsv')

# Immune cells
imm_barcodes = _read_headerless_tsv(imm_dir, 'Imm.barcodes2.tsv')
imm_genes = _read_headerless_tsv(imm_dir, 'Imm.genes.tsv')

# Fibroblasts
fib_barcodes = _read_headerless_tsv(fib_dir, 'Fib.barcodes2.tsv')
fib_genes = _read_headerless_tsv(fib_dir, 'Fib.genes.tsv')

In [7]:
# Load the count matrices (Matrix Market files) of each compartment as CSR matrices
def _load_counts_as_csr(subdir, filename):
    """Read a Matrix Market file from `input_dir/subdir` and convert it to CSR format."""
    return sci.sparse.csr_matrix(mmread(f'{input_dir}/{subdir}/{filename}'))


# Epithelial cells
epi_mtx = _load_counts_as_csr(epi_dir, 'gene_sorted-Epi.matrix.mtx')

# Immune cells
imm_mtx = _load_counts_as_csr(imm_dir, 'gene_sorted-Imm.matrix.mtx')

# Fibroblasts
fib_mtx = _load_counts_as_csr(fib_dir, 'gene_sorted-Fib.matrix.mtx')

In [9]:
# Name the single barcode column 'cell_id' in all three barcode tables
for barcode_table in (epi_barcodes, fib_barcodes, imm_barcodes):
    barcode_table.columns = ['cell_id']

In [10]:
# Build one AnnData object per compartment. Each count matrix is transposed so that
# its rows line up with the barcodes passed as obs and its columns with the genes passed as var.

# Epithelial cells
epi_adata = an.AnnData(X=epi_mtx.T, obs=epi_barcodes, var=epi_genes)

# Fibroblasts
fib_adata = an.AnnData(X=fib_mtx.T, obs=fib_barcodes, var=fib_genes)

# Immune cells
imm_adata = an.AnnData(X=imm_mtx.T, obs=imm_barcodes, var=imm_genes)



In [11]:
# Use the gene names (column 0 of each .var table) as the variable index
for compartment_adata in (fib_adata, imm_adata, epi_adata):
    compartment_adata.var_names = compartment_adata.var[0]

In [12]:
# Stack the three compartments into a single AnnData object
# (fibroblasts, then immune, then epithelial cells); join='outer' keeps the union of genes
compartment_adatas = [fib_adata, imm_adata, epi_adata]
merged = an.concat(compartment_adatas, join='outer')

  if pd.api.types.is_categorical_dtype(dtype):
  utils.warn_names_duplicates("obs")


In [13]:
# Index the cells by barcode: obs_names is taken from the 'cell_id' column
cell_ids = merged.obs['cell_id']
merged.obs_names = cell_ids

In [15]:
# Load the cell-level metadata (tab-separated text file; the first line holds the column names).
# The first data row is a second header ('TYPE' / 'group' / 'numeric'), so the numeric columns
# mix text and numbers. low_memory=False parses each column in a single pass, which gives every
# column one consistent dtype and avoids the mixed-dtype warning raised by chunked parsing.
all_meta2 = pd.read_csv(
    f'{input_dir}/{meta_data}/all.meta2.txt',
    sep='\t',
    header=0,
    low_memory=False,
)

  all_meta2 = pd.read_csv(f'{input_dir}/{meta_data}/all.meta2.txt', sep='\t', header=0)


In [16]:
# Display the metadata table as loaded (the first row is the 'TYPE' annotation row)
all_meta2

Unnamed: 0,NAME,Cluster,nGene,nUMI,Subject,Health,Location,Sample
0,TYPE,group,numeric,numeric,group,group,group,group
1,N7.EpiA.AAACATACACACTG,TA 1,328,891,N7,Non-inflamed,Epi,N7.EpiA
2,N7.EpiA.AAACCGTGCATCAG,TA 1,257,663,N7,Non-inflamed,Epi,N7.EpiA
3,N7.EpiA.AAACGCACAATCGC,TA 2,300,639,N7,Non-inflamed,Epi,N7.EpiA
4,N7.EpiA.AAAGATCTAACCGT,Enterocyte Progenitors,250,649,N7,Non-inflamed,Epi,N7.EpiA
...,...,...,...,...,...,...,...,...
365488,N110.LPB.TTTGGTTAGGATGGTC,Macrophages,635,1366,N110,Inflamed,LP,N110.LPB
365489,N110.LPB.TTTGGTTCACCTCGTT,Plasma,610,2730,N110,Inflamed,LP,N110.LPB
365490,N110.LPB.TTTGGTTTCGGAAACG,Macrophages,859,1979,N110,Inflamed,LP,N110.LPB
365491,N110.LPB.TTTGTCAGTTGACGTT,Macrophages,965,2696,N110,Inflamed,LP,N110.LPB


In [17]:
# Index the metadata by cell name: 'NAME' becomes the index and is removed from the columns
all_meta2 = all_meta2.set_index('NAME')

In [18]:
# Remove the 'TYPE' row, which describes the column kinds ('group' / 'numeric') rather than a cell
all_meta2 = all_meta2.drop(index=['TYPE'])

In [19]:
# Convert nUMI and nGene to numeric values.
# These columns are read as text/mixed objects because of the 'TYPE' annotation row; casting
# them to str (as before) stored the per-cell counts as strings, contradicting the stated
# intent of making them integers. pd.to_numeric raises if a value is not a valid number.
for count_col in ('nUMI', 'nGene'):
    all_meta2[count_col] = pd.to_numeric(all_meta2[count_col])

In [20]:
# Copy the per-cell annotations from all_meta2 into merged.obs.
# Each assignment aligns on the index labels (obs_names of merged vs. the index of all_meta2).
obs_from_meta = {
    'cell_type': 'Cluster',
    'nGene': 'nGene',
    'nUMI': 'nUMI',
    'Subject': 'Subject',
    'Sample': 'Sample',
    'Health': 'Health',
    'Location': 'Location',
}
for obs_col, meta_col in obs_from_meta.items():
    merged.obs[obs_col] = all_meta2[meta_col]


In [21]:
# Display the cell annotations after adding the metadata columns
merged.obs

Unnamed: 0_level_0,cell_id,cell_type,nGene,nUMI,Subject,Sample,Health,Location
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N7.LPA.AAACATACCATTTC,N7.LPA.AAACATACCATTTC,WNT2B+ Fos-lo 1,550,1118,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAACGCACCCGATA,N7.LPA.AAACGCACCCGATA,Myofibroblasts,303,516,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAACGCTGTATCGG,N7.LPA.AAACGCTGTATCGG,Myofibroblasts,636,1312,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAAGACGAGACAGG,N7.LPA.AAAGACGAGACAGG,WNT2B+ Fos-hi,1030,2594,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAAGATCTAGTCTG,N7.LPA.AAAGATCTAGTCTG,WNT2B+ Fos-hi,707,1769,N7,N7.LPA,Non-inflamed,LP
...,...,...,...,...,...,...,...,...
N110.LPB.TTTGGTTGTGTGGCTC,N110.LPB.TTTGGTTGTGTGGCTC,Immature Enterocytes 2,2553,11705,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGGTTTCCTTAATC,N110.LPB.TTTGGTTTCCTTAATC,TA 2,3234,16164,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGGTTTCTTACCTA,N110.LPB.TTTGGTTTCTTACCTA,Enterocyte Progenitors,258,384,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGTCAAGGATGGAA,N110.LPB.TTTGTCAAGGATGGAA,TA 1,487,772,N110,N110.LPB,Inflamed,LP


In [22]:
# Keep only cells whose 'Health' annotation is 'Healthy' or 'Non-inflamed'.
# Subsetting an AnnData object returns a view; .copy() materialises it so that later edits
# to merged.obs act on a real object instead of implicitly modifying a view.
healthy_mask = merged.obs['Health'].isin(['Healthy', 'Non-inflamed'])
merged = merged[healthy_mask, :].copy()

  if not is_categorical_dtype(df_full[k]):


In [23]:
# Label the obs index as 'cell_id'
merged.obs.index.rename("cell_id", inplace=True)

In [24]:
# Harmonise the obs column names with those used in Wang_stem (per the original note).
# The barcode column is dropped first because the same values are already the obs index.
del merged.obs["cell_id"]

# old column name -> new column name; each new column is appended, then the old one removed
obs_renames = {
    "Subject": "Donor_ID",
    "Sample": "Sample_ID",
    "cell_type": "CellType",
    "nUMI": "n_counts",
    "nGene": "n_genes",
    "Health": "Diagnosis",
}
for old_name, new_name in obs_renames.items():
    merged.obs[new_name] = merged.obs[old_name]
    del merged.obs[old_name]

  merged.obs["Donor_ID"] = merged.obs["Subject"]


In [25]:
# Label the var index as 'gene_id'
merged.var.index.rename("gene_id", inplace=True)

In [26]:
# Write the merged AnnData object to an .h5ad file inside input_dir
output_path = f'{input_dir}/Smillie_ulcerative_colitis_anndata.h5ad'
merged.write(output_path)