## Notebook for creating plots of the integrated healthy reference dataset

+ Developed by: Anna Maguza
+ Institute of Computational Biology - Computational Health Centre - Helmholtz Munich
+ Date created: 22nd May 2024
+ Last modified: 22nd May 2024

#### Import required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

### Data Upload

In [2]:
# Directory that holds the integrated object. The load cell builds the file path by plain string
# concatenation, so the trailing slash is required.
input_dir = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Healthy_reference/Integrated/'

In [3]:
# Load the integrated object. The '.h5ad' extension is spelled out on purpose: sc.read() treats a
# name without an extension as a "file key" that it resolves through sc.settings.writedir and
# sc.settings.file_format_data, so the bare name only loaded as long as those settings kept their
# defaults.
adata = sc.read_h5ad(input_dir + 'Integrated_4_datasets_05042024.h5ad')

### UMAP

In [None]:
# Display the summary of the loaded object (the last expression of a cell is rendered as output).
adata

In [None]:
# Folder that receives every figure saved by this notebook.
fig_dir = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Plots/Finding_stem_cells'

# Nine custom hex colours for the 'Cell_Type' legend; they are stored under the
# '<obs column>_colors' key in adata.uns.
new_palette = [
    '#43766C', '#759EB8', '#824670',
    '#FF76CE', '#E9A8F2', '#94FFD8',
    '#A3D8FF', '#FFA27F', '#97BE5A',
]
adata.uns['Cell_Type_colors'] = new_palette

# rc_context() keeps the matplotlib settings changed by set_figure_params() local to this block.
# NOTE(review): dpi=300 is the on-screen figure resolution; the saved PNG follows scanpy's separate
# `dpi_save` setting (default 150) - confirm that this is the intended output resolution.
with plt.rc_context():
    sc.set_figure_params(figsize=(15, 15), dpi=300)
    sc.pl.umap(
        adata,
        color='Cell_Type',
        size=10,
        ncols=3,
        legend_fontsize=5,
        frameon=False,
        show=False,
    )
    plt.savefig(f"{fig_dir}/Integrated_reference_umap.png", bbox_inches="tight")

+ UMAP coloured by the stem-cell marker score

In [6]:
# Normalise and log-transform a copy, so that `adata` itself is not modified by these two steps.
adata_log = adata.copy()

# Scale every cell to a total of 1e6 counts (exclude_highly_expressed=True is passed so that,
# per the scanpy documentation, very highly expressed genes do not enter the size factor),
# then apply log1p.
sc.pp.normalize_total(
    adata_log,
    exclude_highly_expressed=True,
    target_sum=1e6,
)
sc.pp.log1p(adata_log)

In [None]:
# Marker genes used for the stem-cell score and for the dot plot.
stem_cells_markers = [
    'AXIN2', 'ASCL2', 'ATOH1', 'BMI1', 'CA12',
    'CLU', 'GPX2', 'HMGCS2', 'LEFTY1', 'LGR5',
    'LRIG1', 'MYC', 'OLFM4', 'SMOC2', 'TERT',
]

In [None]:
# Score every cell for the marker list; the result is written to
# adata_log.obs['Stem_cells_markers_score'], which the UMAP cell below colours by.
sc.tl.score_genes(
    adata_log,
    gene_list=stem_cells_markers,
    score_name='Stem_cells_markers_score',
)

In [None]:
# UMAP coloured by the 'Stem_cells_markers_score' column, saved as PNG into fig_dir.
# rc_context() keeps the matplotlib settings changed by set_figure_params() local to this block.
with plt.rc_context():
    sc.set_figure_params(figsize=(15, 15), dpi=300)
    sc.pl.umap(
        adata_log,
        color=['Stem_cells_markers_score'],
        color_map="magma_r",
        size=10,
        frameon=False,
        show=False,
    )
    plt.savefig(f"{fig_dir}/Integrated_reference_stem_markers.png", bbox_inches="tight")

+ UMAP with stem cells highlighted

In [None]:
# 'Cell States' labels that are counted as stem cells.
stem_cells = [
    'Stem cells OLFM4',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
    'Stem_Cells_GCA',
    'Stem_Cells_ext',
]

In [None]:
# Add a boolean column 'stem_cells' to adata.obs: True where the cell's 'Cell States' label is one
# of the entries of `stem_cells`, False otherwise.
adata.obs['stem_cells'] = adata.obs['Cell States'].isin(stem_cells)

In [None]:
# Convert the boolean flag to the strings 'True' / 'False'.
# NOTE(review): presumably done so that the UMAP treats the column as a discrete category with a
# legend rather than as a numeric value - confirm.
adata.obs['stem_cells'] = adata.obs['stem_cells'].astype(str)

In [None]:
# Two custom hex colours for the 'stem_cells' column, stored under '<obs column>_colors'
# (presumably applied in category order, i.e. 'False' first, then 'True').
new_palette = [
    '#759EB8',
    '#824670',
]
adata.uns['stem_cells_colors'] = new_palette

# rc_context() keeps the matplotlib settings changed by set_figure_params() local to this block.
with plt.rc_context():
    sc.set_figure_params(figsize=(15, 15), dpi=300)
    sc.pl.umap(
        adata,
        color='stem_cells',
        size=10,
        ncols=3,
        legend_fontsize=5,
        frameon=False,
        show=False,
    )
    plt.savefig(f"{fig_dir}/Integrated_reference_stem_umap.png", bbox_inches="tight")

+ Dot plot

In [None]:
# Number of cells per 'Cell States' label.
# NOTE(review): the result is stored in `df` but is neither displayed nor used in the cells visible
# here - confirm it is still needed.
df = adata_log.obs['Cell States'].value_counts()

In [7]:
# Initialise the 'states_for_figure' column as a copy of the 'Cell_Type' column.
adata_log.obs['states_for_figure'] = adata_log.obs['Cell_Type'].copy()

In [8]:
# Cells per label in 'states_for_figure'.
adata_log.obs['states_for_figure'].value_counts()

states_for_figure
Epithelial         210075
Mesenchymal        172657
T cells             47043
Plasma cells        46681
Myeloid             25587
Neuronal            19307
B cells             17772
Endothelial         16631
Red blood cells      1346
Name: count, dtype: int64

In [9]:
# Build the grouping column used by the dot plot: start from the 'Cell_Type' label and overwrite it
# with a finer label wherever 'Cell States' matches one of the states listed in `state_groups`.
#
# The category list is the nine 'Cell_Type' labels followed by the finer labels. The finer labels
# have to be registered as categories first, because pandas refuses to write a value that is not a
# known category into a categorical column.
adata_log.obs['states_for_figure'] = (
    adata_log.obs['Cell_Type']
    .copy()
    .cat.set_categories([
        'Epithelial', 'Mesenchymal', 'T cells', 'Plasma cells', 'Myeloid',
        'Neuronal', 'B cells', 'Endothelial', 'Red blood cells',
        'Colonocyte', 'Goblet cells', 'Enterocyte', 'Paneth cells',
        'Stem cells', 'TA', 'Tuft cells',
    ])
)

# Figure label -> 'Cell States' values that are collapsed into it. Every state appears under exactly
# one label, so the order in which the groups are applied does not matter.
state_groups = {
    'Goblet cells': [
        'Goblet cell',
        'BEST2+ Goblet cell',
        'Goblet cells MUC2 TFF1',
        'Goblet cells MUC2 TFF1-',
        'Goblet cells SPINK4',
    ],
    'Paneth cells': ['Paneth', 'Paneth cells'],
    'TA': ['TA'],
    'Tuft cells': ['Tuft', 'Tuft cells'],
    'Enterocyte': [
        'Enterocyte',
        'Enterocytes BEST4',
        'Enterocytes TMIGD1 MEP1A GSTA1',
        'Enterocytes TMIGD1 MEP1A',
        'Enterocytes CA1 CA2 CA4-',
    ],
    'Stem cells': [
        'Stem cells OLFM4',
        'Stem cells OLFM4 GSTA1',
        'Stem cells OLFM4 LGR5',
        'Stem cells OLFM4 PCNA',
        'Stem_Cells_GCA',
        'Stem_Cells_ext',
    ],
    'Colonocyte': ['Colonocyte'],
}

# NOTE(review): the match is on 'Cell States' only and ignores 'Cell_Type'. In the stored outputs
# the 'Mesenchymal' count drops from 172657 to 172164 after this cell, i.e. some cells typed as
# Mesenchymal receive one of the finer labels - confirm that this is intended.
for figure_label, cell_states in state_groups.items():
    in_group = adata_log.obs['Cell States'].isin(cell_states)
    adata_log.obs.loc[in_group, 'states_for_figure'] = figure_label

In [None]:
# Dot plot of the marker genes, one row per 'states_for_figure' group, saved as PNG into fig_dir.
# rc_context() keeps the matplotlib settings changed by set_figure_params() local to this block.
with plt.rc_context():
    sc.set_figure_params(figsize=(15, 15), dpi=300)
    sc.pl.dotplot(
        adata_log,
        stem_cells_markers,
        groupby='states_for_figure',
        cmap='magma_r',
        show=False,
    )
    plt.savefig(f"{fig_dir}/Integrated_stem_markers_dotplot.png", bbox_inches="tight")

In [10]:
# Cells per label in 'states_for_figure' (sanity check of the grouping used for the dot plot).
adata_log.obs['states_for_figure'].value_counts()

states_for_figure
Mesenchymal        172164
Enterocyte          75420
T cells             47043
Plasma cells        46681
TA                  39556
Stem cells          35828
Myeloid             25587
Epithelial          24703
Neuronal            19307
B cells             17772
Endothelial         16631
Goblet cells        16375
Colonocyte          13614
Paneth cells         3743
Red blood cells      1346
Tuft cells           1329
Name: count, dtype: int64

In [11]:
# Display the per-cell metadata table, including the 'states_for_figure' column.
adata_log.obs

Unnamed: 0_level_0,Sample_ID,Cell_Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Sex,Library_Preparation_Protocol,...,total_counts,total_counts_mito,pct_counts_mito,total_counts_ribo,pct_counts_ribo,Cell_ID,_scvi_batch,_scvi_labels,C_scANVI,states_for_figure
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Epithelial,"Wang, 2020",Wang_Donor_1,Healthy adult,,,,Male,10x 3' v1,...,15733.0,2722.0,17.301214,5100.0,32.415943,AACACGTTCTTGCATT_Ileum-1_Stem Cell,0,8,Stem Cell,Stem cells
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Epithelial,"Wang, 2020",Wang_Donor_1,Healthy adult,,,,Male,10x 3' v1,...,13928.0,2136.0,15.336015,5261.0,37.772831,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,0,8,Stem Cell,Stem cells
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Epithelial,"Wang, 2020",Wang_Donor_1,Healthy adult,,,,Male,10x 3' v1,...,6978.0,702.0,10.060189,2716.0,38.922329,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,0,8,Stem Cell,Stem cells
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Epithelial,"Wang, 2020",Wang_Donor_1,Healthy adult,,,,Male,10x 3' v1,...,19378.0,2263.0,11.678192,7129.0,36.789143,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,0,8,Stem Cell,Stem cells
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Epithelial,"Wang, 2020",Wang_Donor_1,Healthy adult,,,,Male,10x 3' v1,...,21237.0,4083.0,19.225880,7718.0,36.342232,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,0,8,Stem Cell,Stem cells
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H180844_N4-GATCGATCATATACGC,H180844_N4,Plasma cells,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,228.0,22.0,9.649122,29.0,12.719299,H180844_N4-GATCGATCATATACGC,0,6,Plasma cells,Plasma cells
H180844_N4-ACTGCTCAGAAACCTA,H180844_N4,Myeloid,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,310.0,8.0,2.580645,88.0,28.387096,H180844_N4-ACTGCTCAGAAACCTA,0,4,Myeloid,Myeloid
H180844_N4-CATCAGACACGGCCAT,H180844_N4,Myeloid,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,305.0,14.0,4.590164,66.0,21.639343,H180844_N4-CATCAGACACGGCCAT,0,4,Myeloid,Myeloid
H180844_N4-TATGCCCCAATGACCT,H180844_N4,Myeloid,"Kong, 2023",180844,Healthy adult,,,,Male,10x 3' v2,...,232.0,8.0,3.448276,17.0,7.327586,H180844_N4-TATGCCCCAATGACCT,0,4,Myeloid,Myeloid
