# 1. Import modules

In [1]:
import scanpy as sc
import scGeneClust as gc
from scipy.sparse import issparse

# 2. Load the example PBMC3k dataset
GeneClust expects raw counts as input.

In [2]:
# Read the PBMC3k AnnData object from the local .h5ad file, then display its summary.
pbmc3k = sc.read_h5ad("../data/pbmc3k_raw.h5ad")
pbmc3k

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'

# 3. Quality control

In [3]:
# Quality control, applied to `pbmc3k` in place:
#   - drop cells with fewer than 300 detected genes
#   - drop genes detected in fewer than 10 cells
sc.pp.filter_cells(pbmc3k, min_genes=300)
sc.pp.filter_genes(pbmc3k, min_cells=10)

# Convert the count matrix to a dense array. The sparsity check keeps this cell
# re-runnable: after the first execution X is already dense and has no
# .toarray() method, so an unconditional call would raise AttributeError.
if issparse(pbmc3k.X):
    pbmc3k.X = pbmc3k.X.toarray()
pbmc3k

AnnData object with n_obs × n_vars = 2685 × 11135
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'

# 4. Feature selection
## Use GeneClust-fast to select features

In [4]:
# Run the 'fast' version of GeneClust with 200 gene clusters.
# With return_info=True the call yields a pair: an info object (contents not
# inspected in this notebook) and the array of selected gene names.
info, selected_genes_fast = gc.scGeneClust(
    pbmc3k,
    version='fast',
    n_var_clusters=200,
    return_info=True,
)

2023-02-12 15:14:18.107 | INFO  | Performing GeneClust-fast on scRNA-seq data, with 95 workers.
2023-02-12 15:14:18.156 | INFO  | Preprocessing data...
2023-02-12 15:14:19.207 | INFO  | Data preprocessing done.
2023-02-12 15:14:19.208 | INFO  | Clustering genes...
2023-02-12 15:14:19.657 | INFO  | Gene clustering done!
2023-02-12 15:14:20.182 | INFO  | Selected 396 genes.
2023-02-12 15:14:20.183 | INFO  | GeneClust-fast finished.

In [5]:
# Preview the first ten entries of the GeneClust-fast selection.
selected_genes_fast[:10]

array(['TSFM', 'MTF1', 'LONRF1', 'WDR13', 'RPL39', 'SEPT8', 'S100A11',
       'TTC7A', 'KDELC2', 'GNPTG'], dtype=object)

## Use GeneClust-ps to select features

In [6]:
# Run the 'ps' version of GeneClust.
# n_obs_clusters=7 is presumably the number of cell clusters to look for
# (TODO confirm against the GeneClust documentation).
# NOTE: `info` is rebound here, so the object returned by the earlier
# GeneClust-fast call is no longer reachable under that name.
info, selected_genes_ps = gc.scGeneClust(
    pbmc3k,
    version='ps',
    n_obs_clusters=7,
    return_info=True,
)

2023-02-12 15:14:20.563 | INFO  | Performing GeneClust-ps on scRNA-seq data, with 95 workers.
2023-02-12 15:14:20.613 | INFO  | Preprocessing data...
2023-02-12 15:14:21.913 | INFO  | Data preprocessing done.
2023-02-12 15:14:21.914 | INFO  | Clustering genes...
2023-02-12 15:14:21.915 | INFO  | Finding high-confidence cells...
2023-02-12 15:14:24.594 | INFO  | Found 1641 (61.0%) high-confidence cells.
2023-02-12 15:14:24.609 | INFO  | Finding relevant genes...
2023-02-12 15:14:28.836 | INFO  | 2227 (20%) genes are marked as relevant genes.
2023-02-12 15:14:28.839 | INFO  | Computing gene redundancy...
2023-02-12 15:15:52.055 | INFO  | Gene redundancy com

In [7]:
# Preview the first ten entries of the GeneClust-ps selection.
selected_genes_ps[:10]

array(['TNFRSF4', 'MIB2', 'PIK3CD', 'RBP7', 'EXOSC10', 'EFHD2', 'CDA',
       'RCAN3', 'LDLRAP1', 'FGR'], dtype=object)