#### This particular notebook details the preprocessing pipeline for our Merscope data

#### Required input files:

* Raw Merscope data object (available via FigShare)

Environment: Please create and activate the conda environment provided in default_env.yaml before running this notebook

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import squidpy as sq

import gzip
import anndata

import os

from scipy.cluster import hierarchy as sch
from copy import deepcopy

from pathlib import Path

### Read in raw data file

In [None]:
# Path to the raw Merscope AnnData file
raw_h5ad_path = '/path/25_11_12_Merscope_Raw.h5ad'
adata = sc.read_h5ad(raw_h5ad_path)

# Show the object summary; expected size: 124,938 cells and 280 genes
adata

### Examine object

In [None]:
# Display the per-cell annotation table (one row per observation)
adata.obs

In [None]:
# List the multi-dimensional per-cell annotations stored on the object
adata.obsm

In [None]:
# Inspect the array stored under the 'spatial' key
# (presumably per-cell coordinates -- confirm against the data source)
adata.obsm['spatial']

In [None]:
# Inspect the per-cell table stored under the 'blank_genes' key;
# its total is used below to estimate the false positive rate
adata.obsm['blank_genes']

In [None]:
# Total number of blank-barcode counts, summed over every cell and blank probe
blank_counts_matrix = adata.obsm["blank_genes"].to_numpy()
blank_counts_matrix.sum()

In [None]:
# Total transcript count summed over all cells
adata.obs["transcript_count"].sum()

In [None]:
# Blank-barcode counts as a percentage of all transcript counts
blank_count_total = adata.obsm["blank_genes"].to_numpy().sum()
transcript_count_total = adata.obs["transcript_count"].sum()
blank_count_total / transcript_count_total * 100

# Result: 0.7% false positive rate

In [None]:
# Number of cells in which at least one transcript was detected
count_greater_than_0 = adata.obs["transcript_count"].gt(0).sum()
count_greater_than_0

# 29710 of 124938 total cells (23.8%) had >0 transcripts

In [None]:
# Number of distinct fields of view (FOVs) represented in the data
adata.obs["fov"].nunique()

In [None]:
# Mean number of transcripts per cell
print(adata.obs["transcript_count"].mean())
# Mean number of transcripts per FOV. Select the column *before* summing so that
# only transcript counts are aggregated; summing every obs column first is
# wasteful and can raise on columns that do not support sum (e.g. categoricals).
print(adata.obs.groupby("fov")["transcript_count"].sum().mean())
# Mean volume of the segmented cells
print(adata.obs["volume"].mean())

In [None]:
# QC histograms for the raw (unfiltered) data, one panel per metric
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Select the column before summing so only transcript counts are aggregated;
# summing every obs column first is wasteful and can raise on non-summable columns.
raw_transcripts_per_fov = adata.obs.groupby("fov")["transcript_count"].sum()

raw_panels = [
    ("Total transcripts per cell", adata.obs["transcript_count"]),
    ("Transcripts per FOV", raw_transcripts_per_fov),
    ("Volume of segmented cells", adata.obs["volume"]),
]
for ax, (panel_title, panel_values) in zip(axs, raw_panels):
    ax.set_title(panel_title)
    sns.histplot(panel_values, kde=False, ax=ax)

# Add an overall title to the figure
fig.suptitle('Raw data', fontsize=16)

plt.show()

### Filtering

In [None]:
# Re-display the per-cell annotations before any filtering is applied
adata.obs

In [None]:
## Our filtering schema (matches CosMx and Xenium)
# Filter a copy so the raw object stays untouched.
adata_filtered = adata.copy()

# Cells: drop any cell with fewer than 50 total counts or fewer than 10 detected genes.
# filter_cells takes one threshold per call, so the thresholds are applied in sequence.
for cell_threshold in ({"min_counts": 50}, {"min_genes": 10}):
    sc.pp.filter_cells(adata_filtered, **cell_threshold)

# Genes: drop any gene with fewer than 1 total count or detected in fewer than 10 cells.
for gene_threshold in ({"min_counts": 1}, {"min_cells": 10}):
    sc.pp.filter_genes(adata_filtered, **gene_threshold)

In [None]:
# View the per-cell annotations that remain after the strict filter
adata_filtered.obs

# Only 5 cells made it through

In [None]:
## Lite filtering scheme (since this dataset is so small)

# Filter a fresh copy of the raw object, independent of the strict scheme.
adata_filtered_lite = adata.copy()

# Cells: drop any cell with fewer than 10 total counts or fewer than 10 detected genes.
# filter_cells takes one threshold per call, so the thresholds are applied in sequence.
for lite_cell_threshold in ({"min_counts": 10}, {"min_genes": 10}):
    sc.pp.filter_cells(adata_filtered_lite, **lite_cell_threshold)

# Genes: drop any gene with fewer than 1 total count or detected in fewer than 10 cells.
for lite_gene_threshold in ({"min_counts": 1}, {"min_cells": 10}):
    sc.pp.filter_genes(adata_filtered_lite, **lite_gene_threshold)

In [None]:
# View the per-cell annotations that remain after the lite filter
adata_filtered_lite.obs

# 212 cells

In [None]:
## Filtered stats

# Print the same summary statistics for both filtering schemes.
filtered_reports = (
    ("adata_filtered", adata_filtered),
    ("adata_filtered_lite", adata_filtered_lite),
)
for report_index, (report_label, report_adata) in enumerate(filtered_reports):
    if report_index:
        print("")  # blank line between the two reports
    print(report_label)

    report_obs = report_adata.obs
    # Select the column before summing so only transcript counts are aggregated;
    # summing every obs column first is wasteful and can raise on non-summable columns.
    report_per_fov = report_obs.groupby("fov")["transcript_count"].sum()

    print(f'mean number of transcripts per cell: {report_obs["transcript_count"].mean()}')
    print(f'total number of transcripts: {report_obs["transcript_count"].sum()}')
    print(f'mean number of transcripts per fov: {report_per_fov.mean()}')
    print(f'mean number of genes per cell: {report_obs["n_genes"].mean()}')
    print(f'mean volume per cell: {report_obs["volume"].mean()}')

In [None]:
# QC histograms for the strictly filtered data, one panel per metric
fig, axs = plt.subplots(1, 4, figsize=(15, 4))

# Select the column before summing so only transcript counts are aggregated;
# summing every obs column first is wasteful and can raise on non-summable columns.
filtered_transcripts_per_fov = adata_filtered.obs.groupby("fov")["transcript_count"].sum()

filtered_panels = [
    ("Total transcripts per cell", adata_filtered.obs["transcript_count"]),
    ("Transcripts per FOV", filtered_transcripts_per_fov),
    ("Total genes per cell", adata_filtered.obs["n_genes"]),
    ("Volume of segmented cells", adata_filtered.obs["volume"]),
]
for ax, (panel_title, panel_values) in zip(axs, filtered_panels):
    ax.set_title(panel_title)
    sns.histplot(panel_values, kde=False, ax=ax)

# Add an overall title to the figure
fig.suptitle('Filtered data', fontsize=16)

plt.show()

In [None]:
# QC histograms for the lite-filtered data, one panel per metric
fig, axs = plt.subplots(1, 4, figsize=(15, 4))

# Select the column before summing so only transcript counts are aggregated;
# summing every obs column first is wasteful and can raise on non-summable columns.
lite_transcripts_per_fov = adata_filtered_lite.obs.groupby("fov")["transcript_count"].sum()

lite_panels = [
    ("Total transcripts per cell", adata_filtered_lite.obs["transcript_count"]),
    ("Transcripts per FOV", lite_transcripts_per_fov),
    ("Total genes per cell", adata_filtered_lite.obs["n_genes"]),
    ("Volume of segmented cells", adata_filtered_lite.obs["volume"]),
]
for ax, (panel_title, panel_values) in zip(axs, lite_panels):
    ax.set_title(panel_title)
    sns.histplot(panel_values, kde=False, ax=ax)

# Add an overall title to the figure
fig.suptitle('Filtered (lite) data', fontsize=16)

plt.show()

#### Note: We're choosing to continue with the filtered (lite) version, because there are more cells

Visualize genes with the highest expression levels

In [None]:
# Plot the 20 genes with the highest expression in the lite-filtered data
sc.pl.highest_expr_genes(
    adata_filtered_lite,
    n_top=20,
)

Make a copy of the original raw counts (post-filtering; pre-normalization)

In [None]:
# Keep an independent copy of the post-filtering, pre-normalization counts in a layer,
# so they remain available after .X is normalized and transformed.
adata_filtered_lite.layers['raw_counts'] = adata_filtered_lite.X.copy() # .copy() so the layer does not alias .X
adata_filtered_lite

In [None]:
# Display the per-cell annotations of the lite-filtered object
adata_filtered_lite.obs

### Continue with analysis

Normalize counts per cell using scanpy.pp.normalize_total.

Logarithmize, do principal component analysis, compute a neighborhood graph of the observations using scanpy.pp.log1p, scanpy.pp.pca and scanpy.pp.neighbors respectively.

Use scanpy.tl.umap to embed the neighborhood graph of the data and cluster the cells into subgroups employing scanpy.tl.leiden.

In [None]:
# Normalize counts per cell (no target_sum given, so scanpy's default scaling is used),
# then apply log1p; both operate on .X in place.
sc.pp.normalize_total(adata_filtered_lite, inplace=True)
sc.pp.log1p(adata_filtered_lite)
# Save log_normalized_counts as a layer.
# Store an independent copy (as done for 'raw_counts'): without .copy() the layer
# would alias .X, and any later in-place change to .X would silently alter it.
adata_filtered_lite.layers['log_normalized_counts'] = adata_filtered_lite.X.copy()

Calculate and plot the top highly variable genes

In [None]:
# Flag highly variable genes using mean-expression and dispersion cutoffs
sc.pp.highly_variable_genes(
    adata_filtered_lite,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
)

In [None]:
# Plot the result of the highly-variable-gene selection
sc.pl.highly_variable_genes(adata_filtered_lite)

Making a copy of the data object

In [None]:
# Separate copy to be used for regression and scaling, so adata_filtered_lite itself
# keeps its log-normalized .X (the "_R" suffix presumably stands for "regressed")
adata_filtered_lite_R = adata_filtered_lite.copy()

In [None]:
# Regress out the per-cell transcript count and number of detected genes
sc.pp.regress_out(adata_filtered_lite_R, ["transcript_count","n_genes"])

In [None]:
## Scale data (values above 10 are clipped via max_value)
sc.pp.scale(
    adata_filtered_lite_R,
    max_value=10,
)

## Run PCA, then plot the PCA embedding coloured by a few genes
sc.pp.pca(
    adata_filtered_lite_R,
    svd_solver='arpack',
)

pca_colour_genes = ['MUC2','ACTG1','APP']
sc.pl.pca(
    adata_filtered_lite_R,
    color=pca_colour_genes,
)

In [None]:
# Variance ratio per principal component on a log scale (default number of PCs shown)
sc.pl.pca_variance_ratio(adata_filtered_lite_R, log=True)

In [None]:
# Same plot, restricted to the first 20 principal components
sc.pl.pca_variance_ratio(adata_filtered_lite_R, n_pcs = 20, log=True)

In [None]:
# Display the object summary after regression, scaling and PCA
adata_filtered_lite_R

### Compute neighbor graph and plot UMAP

In [None]:
# Directory where scanpy writes figures saved through the `save=` argument of its plotting functions
sc.settings.figdir = '/path/UMAP_pngs/'

#### Parameter descriptions:

n_neighbors
* A value between 2 and 100, representing the number of neighboring data points used for manifold approximation. Larger values give a manifold with a more global view of the dataset, while smaller values preserve more of the local structure.
* Default value is 15

n_pcs
* Use this many PCs
* Default value is None

min_dist
* The minimum distance between two points in the UMAP embedding.
* Default value is 0.5 (in scanpy.tl.umap)

spread
* A scaling factor for distance between embedded points.
* Default value is 1.0

Helpful resource: https://smorabit.github.io/blog/2020/umap/

#### Testing vR4

In [None]:
# Work on a copy so this parameter test (vR4) does not modify adata_filtered_lite_R
adata_filtered_lite_R_v4 = adata_filtered_lite_R.copy()

In [None]:
# Neighbor graph: 10 neighbors per cell, computed on the first 10 PCs
sc.pp.neighbors(
    adata_filtered_lite_R_v4,
    n_neighbors=10,
    n_pcs=10,
)
# UMAP embedding of that graph with the vR4 layout parameters
sc.tl.umap(
    adata_filtered_lite_R_v4,
    min_dist=0.02,
    spread=1.75,
)
# Leiden clustering with default settings
sc.tl.leiden(adata_filtered_lite_R_v4)

In [None]:
# UMAP coloured by two QC metrics and by Leiden cluster; the figure is also saved via `save=`
umap_colour_keys = ["transcript_count", "n_genes", "leiden"]
sc.pl.umap(
    adata_filtered_lite_R_v4,
    color=umap_colour_keys,
    wspace=0.4,
    save='_vR4_Merscopedata_240203.png',
)

In [None]:
# Plot the cells at their spatial positions, coloured by Leiden cluster
spatial_colour_keys = ["leiden"]
sq.pl.spatial_scatter(
    adata_filtered_lite_R_v4,
    color=spatial_colour_keys,
    shape=None,
    size=25,
    wspace=0.4,
)

In [None]:
# Save object with UMAP
umap_h5ad_path = Path('/path/DataObjects_withUMAP/Merscopedata_umapvR4_240203.h5ad')
# Create the output directory first so the write does not fail on a missing folder
# after the whole pipeline has already run.
umap_h5ad_path.parent.mkdir(parents=True, exist_ok=True)
adata_filtered_lite_R_v4.write_h5ad(umap_h5ad_path)

In [None]:
# Display the summary of the final object (with neighbor graph, UMAP and Leiden results)
adata_filtered_lite_R_v4