# 1. Import modules

In [1]:
import scanpy as sc
from scGeneClust.utils import load_PBMC3k
from scGeneClust import scGeneClust

# 2. Load the example PBMC3k dataset
GeneClust expects raw counts as input.

In [2]:
# Read the PBMC3k AnnData object from a local .h5ad file; the path is relative
# to the notebook's working directory. The file is expected to hold raw counts
# (see the note above) — verify if you swap in a different file.
pbmc3k = sc.read_h5ad("../data/pbmc3k_raw.h5ad")
# Bare expression: display the AnnData summary (shape and annotation keys).
pbmc3k

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'

# 3. Quality control

In [3]:
# Basic quality control, applied to `pbmc3k` in place:
#   * keep only cells with at least 300 detected genes,
#   * keep only genes detected in at least 10 cells.
sc.pp.filter_cells(pbmc3k, min_genes=300)
sc.pp.filter_genes(pbmc3k, min_cells=10)

# Convert the count matrix to a dense array. The guard keeps this cell
# re-runnable: after the first run `X` is already dense and no longer has a
# `.toarray()` method, so an unconditional call would raise AttributeError.
if hasattr(pbmc3k.X, "toarray"):
    pbmc3k.X = pbmc3k.X.toarray()

In [4]:
# Display the AnnData summary again to check the shape and the annotations
# left after the quality-control step.
pbmc3k

AnnData object with n_obs × n_vars = 2685 × 11135
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'

# 4. Use GeneClust-fast to select features

In [5]:
# Run GeneClust in its 'fast' version on the filtered data, requesting 300
# gene clusters. `random_stat` is presumably the random seed and `verbosity`
# the logging level — confirm against the scGeneClust documentation.
selected_genes_fast = scGeneClust(
    pbmc3k,
    version='fast',
    n_gene_clusters=300,
    random_stat=2022,
    verbosity=1,
)

INFO  | Preprocessing the data...
INFO  | Data preprocessing finished!
INFO  | Reducing data dimension...
INFO  | Dimension reduction finished!
INFO  | Clustering genes...
INFO  | Gene clustering finished!


In [6]:
# Preview the first 10 genes selected by the 'fast' run.
selected_genes_fast[:10]

array(['A1BG', 'ABCC10', 'ABHD15', 'ABHD4', 'ABHD5', 'AC012358.8',
       'AC013264.2', 'AC015987.2', 'AC016629.8', 'AC018816.3'],
      dtype=object)

# 5. Use GeneClust-ps to select features

In [7]:
# Run GeneClust in its 'ps' version on the filtered data with 7 cell clusters.
# The meanings of `scale` and `top_percent_relevance` are not visible from this
# notebook — see the scGeneClust documentation. `random_stat` is presumably
# the random seed (TODO confirm).
selected_genes_ps = scGeneClust(
    pbmc3k,
    version='ps',
    n_cell_clusters=7,
    scale=100,
    top_percent_relevance=5,
    random_stat=2022,
    verbosity=1,
)

INFO  | Preprocessing the data...
INFO  | Data preprocessing finished!
INFO  | Reducing data dimension...
INFO  | Dimension reduction finished!
INFO  | Finding high-confidence cells...
INFO  | High-confidence cell detection finished!
INFO  | Start to find relevant genes...
INFO  | Relevant gene detection finished!
INFO  | Clustering genes...
INFO  | Gene clustering finished!


In [8]:
# Preview the first 10 genes selected by the 'ps' run.
selected_genes_ps[:10]

Index(['HES4', 'ISG15', 'RPL22', 'ENO1', 'RBP7', 'AGTRAP', 'TNFRSF1B', 'EFHD2',
       'CDA', 'RPL11'],
      dtype='object', name='index')