In [1]:
import scanpy as sc
from pathlib import Path

# Data preparation (based on QC)

In [3]:
data_file = Path('../data/endometrium_all.h5ad')
data = sc.read_h5ad(data_file)

Filtering

In [4]:
sc.pp.filter_cells(data, min_genes=500)
sc.pp.filter_genes(data, min_cells=10)
print(len(data.obs_names), len(data.var_names))

100307 25839


The number of cells did not change, which indicates that the dataset was already filtered. The scientists stated, that they had filtered out the cells with less then 500 genes, but did not emphasized, that the data they provided had already been normalized.

Doublet detection

In [5]:
sc.pp.scrublet(data, n_neighbors=20, batch_key='SampleID', sim_doublet_ratio=3.0)

In [6]:
doublet_percent = data.obs['predicted_doublet'].sum() / len(data.obs)
print(doublet_percent)

0.0065299530441544455


In [7]:
data = data[~data.obs['predicted_doublet']].copy()

doublet_percent = data.obs['predicted_doublet'].sum() / len(data.obs)
print(doublet_percent)

0.0


## Normalization

In [8]:
print(data.X[0])

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 3842 stored elements and shape (1, 25839)>
  Coords	Values
  (0, 16)	1.5849624872207642
  (0, 23)	2.0
  (0, 28)	1.0
  (0, 34)	2.0
  (0, 38)	1.0
  (0, 41)	1.0
  (0, 43)	1.0
  (0, 44)	1.0
  (0, 47)	2.321928024291992
  (0, 48)	2.321928024291992
  (0, 50)	2.321928024291992
  (0, 60)	2.0
  (0, 62)	1.0
  (0, 64)	1.0
  (0, 73)	1.5849624872207642
  (0, 84)	2.321928024291992
  (0, 125)	4.247927665710449
  (0, 131)	1.0
  (0, 146)	1.5849624872207642
  (0, 152)	1.5849624872207642
  (0, 157)	3.8073549270629883
  (0, 165)	2.321928024291992
  (0, 173)	1.0
  (0, 174)	1.0
  (0, 175)	1.0
  :	:
  (0, 25737)	1.0
  (0, 25738)	1.5849624872207642
  (0, 25739)	1.5849624872207642
  (0, 25742)	1.0
  (0, 25754)	1.0
  (0, 25755)	1.0
  (0, 25767)	2.0
  (0, 25768)	1.5849624872207642
  (0, 25791)	2.0
  (0, 25795)	3.4594316482543945
  (0, 25806)	1.0
  (0, 25807)	1.0
  (0, 25811)	6.087462902069092
  (0, 25812)	6.087462902069092
  (0, 25813)	7.25738811492919

In [9]:
print(data.raw.X[0])

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 3842 stored elements and shape (1, 28614)>
  Coords	Values
  (0, 21)	2.0
  (0, 28)	3.0
  (0, 33)	1.0
  (0, 40)	3.0
  (0, 44)	1.0
  (0, 47)	1.0
  (0, 49)	1.0
  (0, 50)	1.0
  (0, 53)	4.0
  (0, 54)	4.0
  (0, 56)	4.0
  (0, 67)	3.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 81)	2.0
  (0, 92)	4.0
  (0, 137)	18.0
  (0, 143)	1.0
  (0, 159)	2.0
  (0, 165)	2.0
  (0, 170)	13.0
  (0, 179)	4.0
  (0, 187)	1.0
  (0, 188)	1.0
  (0, 189)	1.0
  :	:
  (0, 28498)	1.0
  (0, 28499)	2.0
  (0, 28500)	2.0
  (0, 28503)	1.0
  (0, 28515)	1.0
  (0, 28516)	1.0
  (0, 28532)	3.0
  (0, 28533)	2.0
  (0, 28560)	3.0
  (0, 28565)	10.0
  (0, 28577)	1.0
  (0, 28578)	1.0
  (0, 28582)	67.0
  (0, 28583)	67.0
  (0, 28584)	152.0
  (0, 28585)	136.0
  (0, 28587)	88.0
  (0, 28588)	160.0
  (0, 28589)	39.0
  (0, 28590)	3.0
  (0, 28591)	92.0
  (0, 28592)	10.0
  (0, 28593)	1.0
  (0, 28594)	93.0
  (0, 28608)	1.0


Teh data in data.X are normalized. The unnormalized data can be found in the data.raw.X.

## Saving filtered data

In [10]:
filtered = Path('../data/endometrium_filtered.h5ad')
data.write(filtered)