## Analysis of FF Human Ovarian Cancer Dataset

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import seaborn as sns
import squidpy as sq
import csv
from sklearn.metrics.pairwise import cosine_distances

plt.rcParams['figure.figsize']=(8,8)


%load_ext autoreload
%autoreload 2


### First pre-process data with scanpy

In [None]:
##Load in data
# Fetch the 10x Genomics Visium sample through scanpy's dataset helper.
ff_human_ovarian_cancer = sc.datasets.visium_sge(
    sample_id='Parent_Visium_Human_OvarianCancer'
)
# Make sure every entry in var_names is unique before genes are looked up by name.
ff_human_ovarian_cancer.var_names_make_unique()

100%|██████████████████████████████████████| 6.84M/6.84M [00:02<00:00, 3.03MB/s]
  1%|▎                                       | 136k/16.1M [00:00<00:44, 380kB/s]

In [None]:
# Display the summary of the freshly loaded object as the cell output.
ff_human_ovarian_cancer

In [None]:
#Calculate some QC metrics for the data
# Flag genes whose name starts with "MT-" so the QC step can report their share of counts.
is_mito_gene = ff_human_ovarian_cancer.var_names.str.startswith("MT-")
ff_human_ovarian_cancer.var["mt"] = is_mito_gene

# Writes the QC columns (e.g. total_counts, n_genes_by_counts, pct_counts_mt) into .obs/.var.
sc.pp.calculate_qc_metrics(
    ff_human_ovarian_cancer,
    qc_vars=["mt"],
    inplace=True,
)

In [None]:
##Filter by MT values
# Keep spots whose pct_counts_mt lies strictly between 2 and 15.
# Both bounds are applied as a single boolean mask, and the subset is materialised
# with .copy(): slicing an AnnData returns a view, and the in-place scanpy filters
# that follow should operate on a real object rather than on a view.
pct_mt = ff_human_ovarian_cancer.obs["pct_counts_mt"]
ff_human_ovarian_cancer = ff_human_ovarian_cancer[(pct_mt > 2) & (pct_mt < 15)].copy()

In [None]:
# Show the gene annotation table ordered by the boolean "mt" flag (False rows first).
# Only the last expression of a cell is rendered, so a single expression is kept.
ff_human_ovarian_cancer.var.sort_values(by=['mt'])

In [None]:
# Display the per-spot annotation table, including the QC columns added above.
ff_human_ovarian_cancer.obs

In [None]:
#Visualise some QC metrics for the data
%matplotlib notebook

fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.distplot(ff_human_ovarian_cancer.obs["total_counts"], kde=False, ax=axs[0])
sns.distplot(ff_human_ovarian_cancer.obs["total_counts"][ff_human_ovarian_cancer.obs["total_counts"] < 20000], kde=False, bins=40, ax=axs[1])
sns.distplot(ff_human_ovarian_cancer.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.distplot(ff_human_ovarian_cancer.obs["n_genes_by_counts"][ff_human_ovarian_cancer.obs["n_genes_by_counts"] < 4000], kde=False, bins=60, ax=axs[3])
sc.pl.violin(ff_human_ovarian_cancer,['pct_counts_mt'])

In [None]:
# Display the object summary again before the count/gene filters are applied.
ff_human_ovarian_cancer

In [None]:
#Perform some filtering
# Drop spots with fewer than 2000 or more than 20000 total counts.
sc.pp.filter_cells(ff_human_ovarian_cancer, min_counts=2000)
print(f'Number of cells after min count filter: {ff_human_ovarian_cancer.n_obs}')
sc.pp.filter_cells(ff_human_ovarian_cancer, max_counts=20000)
print(f'Number of cells after max count filter: {ff_human_ovarian_cancer.n_obs}')

# Keep spots whose pct_counts_mt lies strictly between 2 and 15. The subset is
# materialised with .copy() because slicing an AnnData yields a view, and the
# in-place gene/cell filters below should modify a real object, not a view.
pct_mt = ff_human_ovarian_cancer.obs["pct_counts_mt"]
ff_human_ovarian_cancer = ff_human_ovarian_cancer[(pct_mt > 2) & (pct_mt < 15)].copy()
print(f"#cells after MT filter: {ff_human_ovarian_cancer.n_obs}")

#Filter out genes that are detected in less than 10 cells
sc.pp.filter_genes(ff_human_ovarian_cancer, min_cells=10)
print(f'Number of genes after cell filter: {ff_human_ovarian_cancer.n_vars}')

# Finally drop spots in which fewer than 2000 genes are detected.
sc.pp.filter_cells(ff_human_ovarian_cancer, min_genes = 2000)
print(f'Number of cells after gene filter: {ff_human_ovarian_cancer.n_obs}')

In [None]:
##Do normalisation and find highly variable genes
# Scale each spot's counts to a common total, in place.
sc.pp.normalize_total(
    ff_human_ovarian_cancer,
    inplace=True,
)
# log(1 + x) transform of the normalised matrix.
sc.pp.log1p(ff_human_ovarian_cancer)
# Flag the 2000 most variable genes using the "seurat" flavour.
sc.pp.highly_variable_genes(
    ff_human_ovarian_cancer,
    n_top_genes=2000,
    flavor="seurat",
)

In [None]:
#Look at the same plots after filtering and normalisation
# NOTE: these obs columns were written by calculate_qc_metrics before normalisation
# and nothing in between recomputes them, so the histograms show the original
# QC values for the spots that survived filtering.
spot_qc = ff_human_ovarian_cancer.obs

# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
fig, axs = plt.subplots(1, 2, figsize=(15, 4))
sns.histplot(spot_qc["total_counts"], kde=False, ax=axs[0])
sns.histplot(spot_qc["n_genes_by_counts"], kde=False, bins=60, ax=axs[1])

In [None]:
#Perform next steps in cluster identification
# 1) Reduce the expression matrix to 20 principal components.
sc.pp.pca(
    ff_human_ovarian_cancer,
    n_comps=20,
)
# 2) Build the neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(ff_human_ovarian_cancer)
# 3) Compute the UMAP embedding from that graph.
sc.tl.umap(ff_human_ovarian_cancer)
# 4) Leiden clustering; labels are stored under obs["clusters"].
sc.tl.leiden(
    ff_human_ovarian_cancer,
    key_added='clusters',
)

In [None]:
#Plot UMAPs
# Smaller panels, since three UMAPs are drawn side by side.
plt.rcParams["figure.figsize"] = (4, 4)

umap_colourings = ["total_counts", "n_genes_by_counts", "clusters"]
sc.pl.umap(
    ff_human_ovarian_cancer,
    color=umap_colourings,
    wspace=0.4,
)

In [None]:
#Visualise metrics in spatial coordinates
# Back to the larger figure size for the tissue overlays.
plt.rcParams["figure.figsize"] = (8, 8)

spatial_qc_metrics = ["total_counts", "n_genes_by_counts"]
sc.pl.spatial(
    ff_human_ovarian_cancer,
    img_key="hires",
    color=spatial_qc_metrics,
)

In [None]:
#Visualise cell type clusters on the image of the tissue
# Spots are coloured by the "clusters" labels and drawn at 1.5x the default size.
sc.pl.spatial(
    ff_human_ovarian_cancer,
    color="clusters",
    img_key="hires",
    size=1.5,
)

In [None]:
#Visualise expression of a particular gene across tissue
# Colour each spot by its HLA-B expression on top of the hires tissue image.
sc.pl.spatial(
    ff_human_ovarian_cancer,
    color="HLA-B",
    img_key="hires",
)

### Now perform SVG analysis with Squidpy

In [None]:
##Run Moran's I on all genes in dataset
# Build the spatial neighbour graph from the spot coordinates (Delaunay triangulation).
sq.gr.spatial_neighbors(
    ff_human_ovarian_cancer,
    coord_type="generic",
    delaunay=True,
)
# Moran's I for every gene, computed on .X rather than .raw.
sq.gr.spatial_autocorr(
    ff_human_ovarian_cancer,
    mode="moran",
    genes=ff_human_ovarian_cancer.var_names,
    use_raw=False,
)
# Preview the first ten rows of the result table stored in .uns["moranI"].
ff_human_ovarian_cancer.uns["moranI"].head(10)

In [None]:
# Rank genes by Moran's I, highest first.
moran_ranked = ff_human_ovarian_cancer.uns['moranI'].sort_values(by=['I'], ascending=False)

##Keep genes whose FDR (BH) corrected p-value is at most 0.05 and whose Moran's I is positive
passes_fdr = moran_ranked['pval_norm_fdr_bh'] <= 0.05
has_positive_i = moran_ranked['I'] > 0
moran_I = moran_ranked[passes_fdr & has_positive_i]

##Write out results
moran_I.to_csv("ff_human_ovarian_cancer_squidpy_moranI_results_pval_fdr_bh_0.05_moransI_positive.csv")