## Notebook for Smilie anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 10 February 2022

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.2 pynndescent==0.5.8


#### Data Upload

In [4]:
#Upload barcodes (tsv file) and gene names (tsv file) 
#Epithelial cells
epi_barcodes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Epi.barcodes2.tsv', sep='\t', header=None)
epi_genes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Epi.genes.tsv', sep='\t', header=None)

#Immune cells
imm_barcodes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Imm.barcodes2.tsv', sep='\t', header=None)
imm_genes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Imm.genes.tsv', sep='\t', header=None)

#Fibroblasts
fib_barcodes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Fib.barcodes2.tsv', sep='\t', header=None)
fib_genes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Fib.genes.tsv', sep='\t', header=None)

In [5]:
#Upload countmatrix (mtx file)
#Epithelial cells
epi_mtx = mmread('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/gene_sorted-Epi.matrix.mtx')
epi_mtx = sci.sparse.csr_matrix(epi_mtx)

#Immune cells
imm_mtx = mmread('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/gene_sorted-Imm.matrix.mtx')
imm_mtx = sci.sparse.csr_matrix(imm_mtx)

#Fibroblasts
fib_mtx = mmread('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/gene_sorted-Fib.matrix.mtx')
fib_mtx = sci.sparse.csr_matrix(fib_mtx)

In [6]:
#Rename epi_barcode column to 'cell_id'
epi_barcodes.columns = ['cell_id']
fib_barcodes.columns = ['cell_id']
imm_barcodes.columns = ['cell_id']

In [7]:
#Create epithelial anndata object
epi_adata = an.AnnData(X = np.transpose(epi_mtx), obs = epi_barcodes, var = epi_genes)
#Create fibroblasts anndata object
fib_adata = an.AnnData(X = np.transpose(fib_mtx), obs = fib_barcodes, var = fib_genes)
#Create immune anndata object
imm_adata = an.AnnData(X = np.transpose(imm_mtx), obs = imm_barcodes, var = imm_genes)

  epi_adata = an.AnnData(X = np.transpose(epi_mtx), obs = epi_barcodes, var = epi_genes)
  fib_adata = an.AnnData(X = np.transpose(fib_mtx), obs = fib_barcodes, var = fib_genes)
  imm_adata = an.AnnData(X = np.transpose(imm_mtx), obs = imm_barcodes, var = imm_genes)


In [8]:
#Make gene names as index
fib_adata.var_names = fib_adata.var[0]
imm_adata.var_names = imm_adata.var[0]
epi_adata.var_names = epi_adata.var[0]

In [9]:
#Create a merged anndata object
merged = an.concat([fib_adata, imm_adata, epi_adata], join = 'outer')

  utils.warn_names_duplicates("obs")


In [10]:
#Create an obs_names with cell_id 
merged.obs_names = merged.obs['cell_id']

In [11]:
#Upload metadata (txt file)
all_meta2 = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/all.meta2.csv', sep=';')

  all_meta2 = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/all.meta2.csv', sep=';')


In [12]:
#Make NAME as index and remove the first column
all_meta2.index = all_meta2['NAME']
all_meta2 = all_meta2.drop(['NAME'], axis = 1)

In [13]:
#Change nUMI to int type in all_meta2
all_meta2['nUMI'] = all_meta2['nUMI'].astype(str)
all_meta2['nGene'] = all_meta2['nGene'].astype(str)

In [14]:
#Add all_meta2 to merged anndata object by cell_id in merged.obs and NAME in all_meta2
merged.obs['cell_type'] = all_meta2['Cluster']
merged.obs['nGene'] = all_meta2['nGene']
merged.obs['nUMI'] = all_meta2['nUMI']
merged.obs['Subject'] = all_meta2['Subject']
merged.obs['Sample'] = all_meta2['Sample']
merged.obs['Health'] = all_meta2['Health']
merged.obs['Location'] = all_meta2['Location']


In [15]:
merged.obs

Unnamed: 0_level_0,cell_id,cell_type,nGene,nUMI,Subject,Sample,Health,Location
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N7.LPA.AAACATACCATTTC,N7.LPA.AAACATACCATTTC,WNT2B+ Fos-lo 1,550,1118,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAACGCACCCGATA,N7.LPA.AAACGCACCCGATA,Myofibroblasts,303,516,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAACGCTGTATCGG,N7.LPA.AAACGCTGTATCGG,Myofibroblasts,636,1312,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAAGACGAGACAGG,N7.LPA.AAAGACGAGACAGG,WNT2B+ Fos-hi,1030,2594,N7,N7.LPA,Non-inflamed,LP
N7.LPA.AAAGATCTAGTCTG,N7.LPA.AAAGATCTAGTCTG,WNT2B+ Fos-hi,707,1769,N7,N7.LPA,Non-inflamed,LP
...,...,...,...,...,...,...,...,...
N110.LPB.TTTGGTTGTGTGGCTC,N110.LPB.TTTGGTTGTGTGGCTC,Immature Enterocytes 2,2553,11705,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGGTTTCCTTAATC,N110.LPB.TTTGGTTTCCTTAATC,TA 2,3234,16164,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGGTTTCTTACCTA,N110.LPB.TTTGGTTTCTTACCTA,Enterocyte Progenitors,258,384,N110,N110.LPB,Inflamed,LP
N110.LPB.TTTGTCAAGGATGGAA,N110.LPB.TTTGTCAAGGATGGAA,TA 1,487,772,N110,N110.LPB,Inflamed,LP


In [16]:
#Save anndata object
merged.write('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Smillie_ulcerative_colitis/Smillie_ulcerative_colitis_anndata.h5ad')