## Notebook for plots describing datasets

+ Developed by: Anna Maguza
+ Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich
+ Date created: 7th July 2024
+ Last modified: 30th July 2024

#### Import required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

### Data Upload

In [2]:
input_dir = '/../../../gut_project/Processed_data/Gut_data/Healthy_reference/Integrated/'

In [3]:
adata = sc.read(input_dir + 'Integrated_4_datasets_05042024')

In [4]:
fig_dir = '/../../../gut_project/Processed_data/Gut_data/Plots/Datasets_description/'

In [5]:
input_dir = '/../../../gut_project/raw_data/Elmentaite_2021/'
Elmentaite = sc.read(f'{input_dir}/GCA_filtered_raw.h5ad')

In [6]:
input_dir = '/../../../gut_project/raw_data/Kong_2023'
Kong = sc.read_h5ad(f'{input_dir}/adata_Kong_2023_healthy_with_QC.h5ad')

In [7]:
input_dir = '/../../../gut_project/raw_data/Smillie_2019/SCP259/'
Smillie = sc.read(f'{input_dir}/Smillie_with_QC_raw.h5ad')

In [8]:
input_dir = '/../../../gut_project/raw_data/Wang_2020/'
Wang = sc.read(f'{input_dir}/Wang_2022_raw_anndata.h5ad')

### Adult vs Fetal

In [31]:
df = pd.DataFrame(adata.obs['Age_group'].value_counts())
# rename First trim into 1-3 months
df.rename(index={'First trim':'First Trimester'}, inplace=True)
df.rename(index={'Second trim':'Second Trimester'}, inplace=True)

In [None]:
# Define custom colors
colors = ['#E9A8F2', '#FFA27F', '#A3D8FF', '#FF76CE']

# Ensure the order of categories
df = df.loc[['Adult', 'Pediatric', 'Second Trimester', 'First Trimester']]

# Plot the stacked bar plot
fig, ax = plt.subplots(figsize=(8, 12))
df.T.plot(kind='bar', stacked=True, color=colors, ax=ax, width=0.1)
plt.ylabel('Number of cells')
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')

# Improve quality and adjust plot aesthetics
plt.grid(False)
plt.box(True)
plt.tight_layout()

# Save the plot to the specified directory 
plt.savefig(f"{fig_dir}/age_distribution.png", dpi=300)
plt.show()


### Visualize datasets

In [44]:
df = adata.obs[['Study_name']].value_counts()

In [None]:
colors =  ['#E9A8F2']

# Plot the stacked bar plot
fig, ax = plt.subplots(figsize=(5, 5))
df.plot(kind='bar', stacked=True, color=colors, ax=ax, width=1, )

# Improve quality and adjust plot aesthetics
plt.grid(False)
plt.box(True)
plt.tight_layout()

# Save the plot to the specified directory 
#plt.savefig(f"{fig_dir}/age_distribution.png", dpi=300)
plt.show()

### Visualize cell types

In [85]:
adata_copy = adata.copy()

In [86]:
adata_copy = adata_copy[adata_copy.obs['Study_name'] != 'Smilie, 2019']
adata_copy = adata_copy[adata_copy.obs['Study_name'] != 'Wang, 2020']

  if not is_categorical_dtype(df_full[k]):


In [87]:
Smillie.obs['Study_name'] = 'Smillie, 2019'
Wang.obs['Study_name'] = 'Wang, 2020'

# rename 'CellType' column to 'Cell_Type'
Smillie.obs.rename(columns={'CellType':'Cell_Type'}, inplace=True)
Wang.obs['Cell_Type'] = 'Epithelial'
Wang.obs['Diagnosis'] = 'Healthy adult'

In [88]:
# concatenate Smillie to the adata_copy
adata_copy = an.concat([adata_copy, Smillie], join='outer')
adata_copy = an.concat([adata_copy, Wang], join='outer')

  if pd.api.types.is_categorical_dtype(dtype):
  if pd.api.types.is_categorical_dtype(dtype):
  if pd.api.types.is_categorical_dtype(dtype):
  if pd.api.types.is_categorical_dtype(dtype):
  if pd.api.types.is_categorical_dtype(dtype):
  if pd.api.types.is_categorical_dtype(dtype):


In [90]:
df = adata_copy.obs[['Diagnosis', 'Cell_Type', 'Study_name']]

In [91]:
df['Diagnosis'] = df['Diagnosis'].astype(str)
df['Study_name'] = df['Study_name'].astype(str)
df['Study_Diagnosis'] = df['Study_name'] + '-' + df['Diagnosis']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Diagnosis'] = df['Diagnosis'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Study_name'] = df['Study_name'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Study_Diagnosis'] = df['Study_name'] + '-' + df['Diagnosis']


In [92]:
df_grouped = df.groupby(['Study_Diagnosis', 'Cell_Type']).size().reset_index(name='Count')

In [93]:
df_pivot = df_grouped.pivot(index='Study_Diagnosis', columns='Cell_Type', values='Count').fillna(0)

In [94]:
colors =  ['#43766C', '#759EB8', '#824670', '#FF76CE', '#E9A8F2', '#94FFD8',
               '#A3D8FF', '#FFA27F', '#97BE5A']

In [None]:
# Plot the stacked bar plot
fig, ax = plt.subplots(figsize=(8, 12))
df_pivot.plot(kind='bar', stacked=True, color=colors, ax=ax, width=1, )

# Improve quality and adjust plot aesthetics
plt.grid(False)
plt.box(True)
plt.tight_layout()

# Save the plot to the specified directory 
plt.savefig(f"{fig_dir}/cell_types_distribution_across_datasets.png", dpi=300)
plt.show()

### Stem cells markers in each dataset

In [146]:
adata.obs['Cell States Kong'] = adata.obs['Cell States Kong'].astype(str)
adata.obs['Cell States GCA'] = adata.obs['Cell States GCA'].astype(str)

# Create a new column 'Original stem cells' based on the specified conditions
adata.obs['Original stem cells'] = (
    (adata.obs['Cell States Kong'].isin(['Stem cells OLFM4', 'Stem cells OLFM4 GSTA1', 'Stem cells OLFM4 LGR5', 'Stem cells OLFM4 PCNA'])) |
    (adata.obs['Cell States GCA'].isin(['Stem_Cells_GCA', 'Stem_Cells_ext']))
)

In [149]:
adata_log = adata.copy()
sc.pp.normalize_total(adata_log, target_sum = 1e6, exclude_highly_expressed = True)
sc.pp.log1p(adata_log)

In [150]:
stem_cells_markers = ['AXIN2', 'ASCL2', 'ATOH1', 'BMI1', 'CA12', 'CLU', 'GPX2', 'HMGCS2', 'LEFTY1', 'LGR5', 'LRIG1', 'MYC', 'OLFM4', 'SMOC2', 'TERT']

In [151]:
adata_log = adata_log[adata_log.obs['Original stem cells'] == True]

  if not is_categorical_dtype(df_full[k]):


In [152]:
# create new column study-condition with the name of the study and the condition
adata_log.obs['Study_name'] = adata_log.obs['Study_name'].astype(str)
adata_log.obs['Diagnosis'] = adata_log.obs['Diagnosis'].astype(str)

#rename 'Adult Ulcerative Colitis' into 'UC' in the Diagnosis column
adata_log.obs['Diagnosis'] = adata_log.obs['Diagnosis'].replace('Adult Ulcerative Colitis Non-inflamed', 'Non-inflamed UC adult')

adata_log.obs['Study_condition'] = adata_log.obs['Study_name'] + '-' + adata_log.obs['Diagnosis']

  adata_log.obs['Study_name'] = adata_log.obs['Study_name'].astype(str)


In [None]:
with plt.rc_context():
    sc.set_figure_params(dpi=300, figsize=(15, 15))
    sc.pl.dotplot(adata_log, stem_cells_markers, groupby='Study_condition', cmap = 'magma_r', show=False, swap_axes=True) 
    plt.savefig(f"{fig_dir}/original_stem_markers_dotplot.png", bbox_inches="tight")

### Visualize markers in all cell types + stem cells

In [49]:
adata_log = adata.copy()
adata_log.layers["sqrt_norm"] = np.sqrt(
    sc.pp.normalize_total(adata_log, inplace=False)["X"]
)
adata_log.X = adata_log.layers["sqrt_norm"].copy()

In [58]:
adata_log = adata.copy()
sc.pp.normalize_total(adata_log, target_sum = 1e6, exclude_highly_expressed = True)
sc.pp.log1p(adata_log)

In [59]:
adata_log.obs['states_for_figure'] = adata_log.obs['Cell_Type'].copy()

In [60]:
adata_log.obs['states_for_figure'] = adata_log.obs['states_for_figure'].cat.set_categories(['Stem cells', 'Paneth cells', 'TA', 'Monocytes', 'Macrophages', 'B cells', 'T cells', 'Fibroblasts', 'Neuronal'])

In [61]:
mapping_states = {
    'Stromal 3 (C7+)': 'Fibroblasts',
    'Stromal 3 (KCNN3+)': 'Fibroblasts',
    'Stromal 1 (ADAMDEC1+)': 'Fibroblasts',
    'Fibroblasts ADAMDEC1': 'Fibroblasts',
    'Stromal 2 (NPY+)': 'Fibroblasts',
    'Fibroblasts SMOC2 PTGIS': 'Fibroblasts',
    'Stromal 1 (CCL11+)': 'Fibroblasts',
    'Fibroblasts KCNN3 LY6H': 'Fibroblasts',
    'Fibroblasts SFRP2 SLPI': 'Fibroblasts',
    'Fibroblasts NPY SLITRK6': 'Fibroblasts',
    'Activated fibroblasts CCL19 ADAMADEC1': 'Fibroblasts',
    'Inflammatory fibroblasts IL11 CHI3L1': 'Fibroblasts',
    'Stromal 4 (MMP1+)': 'Fibroblasts',
    'Stromal 2 (CH25H+)': 'Fibroblasts',
    'Transitional Stromal 3 (C3+)': 'Fibroblasts',
    'cycling stromal': 'Fibroblasts',
    'mLN Stroma (FMO2+)': 'Fibroblasts',
    'Inflammatory fibroblasts IL11 CHI3L1': 'Fibroblasts',
    'Stromal Cycling cells': 'Fibroblasts',

    'Macrophages': 'Macrophages',
    'Macrophages CCL3 CCL4': 'Macrophages',
    'LYVE1+ Macrophage': 'Macrophages',
    'Macrophages LYVE1': 'Macrophages',
    'Macrophages Metallothionein': 'Macrophages',
    'MMP9+ Inflammatory macrophage': 'Macrophages',
    'Macrophages PLA2G2D': 'Macrophages',
    'Macrophages CXCL9 CXCL10': 'Macrophages',

    'Monocytes': 'Monocytes',
    'Monocytes HBB': 'Monocytes',
    'Monocytes CHI3L1 CYP27A1': 'Monocytes',
    'Monocytes S100A8 S100A9': 'Monocytes',
    'MPO+ mono-neutrophil': 'Monocytes',
    'Paneth': 'Paneth cells',
    'Paneth cells': 'Paneth cells',
    'TA': 'TA',
    'Stem cells OLFM4': 'Stem cells',
    'Stem cells OLFM4 GSTA1': 'Stem cells',
    'Stem cells OLFM4 LGR5': 'Stem cells',
    'Stem cells OLFM4 PCNA': 'Stem cells',
    'Stem_Cells_GCA': 'Stem cells',
    'Stem_Cells_ext': 'Stem cells'
}

In [62]:
# Update 'states_for_figure' based on the mapping dictionary
adata_log.obs.loc[adata_log.obs['Cell States'].isin(mapping_states.keys()), 'states_for_figure'] = \
    adata_log.obs['Cell States'].map(mapping_states)

In [63]:
marker_genes_dict = {
    "Stem cells": ['LGR5', 'ASCL2', 'SMOC2', 'RGMB'],
    "Paneth": ['DEFA5', 'DEFA6', 'REG3A'],
    "TA": ['MKI67', 'TOP2A', 'UBE2C'],
    "Monocytes": ['FCN1', 'S100A4', 'CD14'],
    "Macrophages": ['MERTK', 'CTSC', 'CTSD'],
    "B cells": ['MS4A1', 'CD19'],
    "T cell": ["CD3D"],
    "Stromal": ['ADAMDEC1', 'ADAM28', 'PDGFRA', 'BMP4'],
    "Neuronal": ['ETV1','BNC2']
}


In [None]:
with plt.rc_context():
    sc.set_figure_params(dpi=300, figsize=(15, 15))
    sc.pl.dotplot(adata_log, marker_genes_dict, "states_for_figure", dendrogram=False, cmap = 'magma_r', show=False) 
    plt.savefig(f"{fig_dir}/all_markers_dotplot.png", bbox_inches="tight")