# 0. Basic processing and QC for Visium

In [None]:
import sys
import warnings

import anndata as ad
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

# The search path must be extended BEFORE importing from helper_functions;
# otherwise the insertion cannot influence how that import is resolved.
sys.path.insert(1, "../../helper_functions")

from helper_functions import select_slide  # noqa: E402  (needs the path tweak above)

# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")

In [None]:
# Registry of sample IDs. Every value starts as None and is replaced by the
# loaded AnnData object once the corresponding Visium run has been read.
samples = dict.fromkeys(
    (
        "B123",
        "B154",
        "B156",
        "B175",
        "B178",
        "B183",
        "B22",
        "B24",
        "B32",
        "B4",
        "B42",
        "B60",
    )
)

In [None]:
# Show the scanpy version in use as this cell's output.
sc.__version__

In [None]:
# Read the Space Ranger output of every registered sample and store the
# resulting AnnData object in `samples` under its sample ID.
for sample_id in samples:
    outs_dir = f"chromothripsis/j462r/spatial_transcriptomics/spaceranger_results/{sample_id}/outs/"
    adata = sc.read_visium(outs_dir, library_id=sample_id)
    # Disambiguate duplicated gene names in var_names.
    adata.var_names_make_unique()
    samples[sample_id] = adata

In [None]:
# Compute scanpy's QC metrics for each sample; inplace=True stores them on
# the object itself instead of returning them.
for adata in samples.values():
    sc.pp.calculate_qc_metrics(adata, inplace=True)

In [None]:
def make_qc_df(samples):
    """Summarise per-sample QC statistics in one table.

    Parameters
    ----------
    samples
        Mapping of sample name to an object exposing ``n_obs`` and an ``obs``
        table with ``total_counts`` and ``n_genes_by_counts`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per sample, sorted by sample name with a fresh 0..n-1 index.
        Columns are ``sample``, ``spots`` and the median / mean / stddev of
        both QC columns. An empty mapping yields an empty frame that still
        carries these columns.
    """
    stats = (
        ("total_counts", "total counts"),
        ("n_genes_by_counts", "detected genes"),
    )
    # Fix the column layout up front so that empty input still produces a
    # well-formed (and sortable) table.
    columns = ["sample", "spots"]
    for _, label in stats:
        columns += [f"median {label}", f"mean {label}", f"stddev {label}"]

    rows = []
    for sname, sample in samples.items():
        row = {"sample": sname, "spots": sample.n_obs}
        for stat, label in stats:
            values = sample.obs[stat]
            row[f"median {label}"] = values.median()
            row[f"mean {label}"] = values.mean()
            row[f"stddev {label}"] = values.std()
        rows.append(row)

    # Build the table once from all rows instead of concatenating many
    # single-row frames.
    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values("sample").reset_index(drop=True)

In [None]:
# Write the QC summary of the samples in their current state as a
# tab-separated file without the index column.
qc_raw = make_qc_df(samples)
qc_raw.to_csv("../../data/QC_raw.tsv", sep="\t", index=False)

In [None]:
# One figure per sample with four histograms: total counts and detected genes,
# each shown over the full range and again restricted to the low end. The
# restricted panels state their cutoff in the title so they can be told apart
# from the full-range panels.
for i, sample in samples.items():
    total_counts = sample.obs["total_counts"]
    n_genes = sample.obs["n_genes_by_counts"]

    fig, axs = plt.subplots(1, 4, figsize=(20, 4))

    axs[0].hist(total_counts, bins=100)
    axs[0].set_title("total counts")
    axs[1].hist(total_counts[total_counts < 15000], bins=50)
    axs[1].set_title("total counts (< 15000)")

    axs[2].hist(n_genes, bins=100)
    axs[2].set_title("detected genes")
    axs[3].hist(n_genes[n_genes < 6000], bins=50)
    axs[3].set_title("detected genes (< 6000)")

    fig.suptitle(f"sample {i}")

In [None]:
# Spatial maps of the QC metrics for every sample, without the tissue image
# (img_key=None) and with the colour scale capped via vmax="p99".
for i, sample in samples.items():
    # NOTE(review): sc.pl.spatial may open its own figure when several colours
    # are requested, which would leave this one empty -- confirm before removing.
    plt.figure(figsize=(12, 6))
    panel_titles = {
        "total_counts": f"sample {i}: total counts",
        "log1p_total_counts": f"sample {i}: log1p(total counts)",
        "n_genes_by_counts": f"sample {i}: detected genes",
    }
    sc.pl.spatial(
        sample,
        color=list(panel_titles),
        title=list(panel_titles.values()),
        size=1.5,
        img_key=None,
        vmax="p99",
    )

In [None]:
# Remove these samples from all further processing.
# NOTE(review): presumably excluded based on the QC plots -- the reason is not
# recorded in this notebook.
for excluded in ("B183", "B32"):
    del samples[excluded]

In [None]:
# Per-sample minimum total counts an observation needs in order to be kept.
min_counts_per_sample = {
    "B123": 5000,
    "B154": 2500,
    "B156": 4000,
    "B175": 4000,
    "B178": 2500,
    "B22": 2000,
    "B24": 2000,
    "B4": 6000,
    "B42": 5000,
    "B60": 2500,
}

# Filter each sample in place with its own threshold.
for sample_id, threshold in min_counts_per_sample.items():
    sc.pp.filter_cells(samples[sample_id], min_counts=threshold)

In [None]:
# Lenient gene filter (min_cells=10), chosen to keep as many genes as possible.
# NOTE(review): the QC columns in .obs are not recomputed after this step --
# confirm that is intended for the downstream QC summary.
for adata in samples.values():
    sc.pp.filter_genes(adata, min_cells=10)

In [None]:
# Write the QC summary of the filtered samples as a tab-separated file
# without the index column.
qc_filtered = make_qc_df(samples)
qc_filtered.to_csv("../../data/QC_filtered.tsv", sep="\t", index=False)

In [None]:
# Spatial maps of the remaining observations per sample, drawn without the
# tissue image (img_key=None).
for i, sample in samples.items():
    panel_titles = {
        "log1p_total_counts": f"sample {i}: log1p(total counts)",
        "n_genes_by_counts": f"sample {i}: detected genes",
    }
    sc.pl.spatial(
        sample,
        color=list(panel_titles),
        title=list(panel_titles.values()),
        size=1.5,
        img_key=None,
    )

In [None]:
# Keep a copy of the current X matrix in the "counts" layer before X is
# normalised below.
for adata in samples.values():
    counts_snapshot = adata.X.copy()
    adata.layers["counts"] = counts_snapshot

In [None]:
# Normalise every sample to a target sum of 1e4, log1p-transform it, and keep
# the transformed matrix both as the "log_counts" layer and in .raw.
for adata in samples.values():
    sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
    sc.pp.log1p(adata)
    log_matrix = adata.X.copy()
    adata.layers["log_counts"] = log_matrix
    # .raw is assigned from the object in its current, log-transformed state.
    adata.raw = adata

In [None]:
# Merge all samples into one AnnData object. The dict keys are written to the
# "sample" column and appended to the observation names with "_".
concat_options = {
    "label": "sample",
    "merge": "same",
    "uns_merge": "unique",
    "index_unique": "_",
}
adata_vis = ad.concat(samples, **concat_options)

In [None]:
# Write one gzip-compressed .h5ad file per sample, using the subset that
# select_slide returns from the merged object.
for sample_id in samples:
    print(sample_id)
    slide_adata = select_slide(adata_vis, sample_id)
    slide_path = f"../../data/{sample_id}.h5ad"
    slide_adata.write_h5ad(slide_path, compression="gzip", compression_opts=9)

# Finally write the merged object itself.
print("Saving merged adata object")
merged_path = "../../data/merged_samples.h5ad"
adata_vis.write_h5ad(merged_path, compression="gzip", compression_opts=9)