In [2]:
# Standard library
import importlib

# Core scverse libraries and scientific stack
import anndata as ad
import numpy as np
import scanpy as sc
from sklearn.metrics.cluster import adjusted_rand_score

# Clustering package under development
import scanpy_clustering.clustering as cl
# Reload the clustering module so edits to its source take effect without a kernel restart.
importlib.reload(cl)



### Listing the different algorithms

In [2]:
# Display the algorithm names the clustering module exposes (e.g. for `algorithm=` below).
cl.list_algorithms()



### Reading data from our sample set

In [23]:
# Load the sample dataset. Forward slashes are used so the relative path resolves on
# Windows, Linux and macOS alike (a backslash-separated path only works on Windows).
adata = sc.read_h5ad('../data/symsim_observed_counts_5000genes_5000cells_complex.h5ad')

### **Optional** Preprocessing: log-transform, select the 2000 most variable genes, PCA and neighbors

In [24]:
# Log-transform only once: sc.pp.log1p leaves a marker in adata.uns['log1p'], so checking
# for it keeps this cell safe to re-run (a second pass would log already-logged values).
if 'log1p' not in adata.uns:
    sc.pp.log1p(adata)

# Flag the 2000 most variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# Dimensionality reduction followed by the neighborhood graph.
sc.tl.pca(adata)
sc.pp.neighbors(adata)

### Performing clustering (DBSCAN)

##### Option 1 using the cluster function

In [27]:
# Run DBSCAN through the package's generic entry point; the resulting labels are read
# back later from adata.obs['dbscan_labels'].
cl.cluster(
    adata,
    algorithm='DBScan_Base',
    key_added='dbscan_labels',
    eps=14,
    min_samples=8,
    metric='euclidean',
)

##### Option 2 using scanpy integration

In [115]:
# Enable the scanpy integration first; the algorithm is then called as sc.tl.DBScan_Base.
cl.enable_scanpy_integration()

# NOTE(review): eps is 10 here but 14 in Option 1, and both options write to
# 'dbscan_labels' — the score computed afterwards depends on which option ran last.
# Confirm which eps is intended.
sc.tl.DBScan_Base(
    adata,
    key_added='dbscan_labels',
    eps=10,
    min_samples=8,
    metric='euclidean',
)

#### Getting the score (Adjusted Rand Index, range [-0.5, 1.0])

- `-0.5` = worse than random
- `0.0` = might as well use a random generator
- `1.0` = does not get better!

In [28]:
# Adjusted Rand Index between the 'group' column (presumably the reference labels —
# confirm) and the DBSCAN labels stored in adata.obs['dbscan_labels'].
adjusted_rand_score(adata.obs['group'], adata.obs['dbscan_labels'])



#### Compare to Leiden

In [15]:
# Leiden baseline on the same AnnData object, scored with the same metric as DBSCAN.

# Log-transform only if it has not happened yet: sc.pp.log1p leaves a marker in
# adata.uns['log1p'], and applying it again would log the already-logged matrix and
# make the comparison with DBSCAN run on differently preprocessed data.
if 'log1p' not in adata.uns:
    sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
sc.tl.pca(adata)
sc.pp.neighbors(adata)

# Fix NumPy's global RNG before clustering.
np.random.seed(42)

# NOTE(review): casting the obs index to categorical looks like a workaround for an
# error seen elsewhere; its purpose is not visible here — confirm it is still needed.
adata.obs.index = adata.obs.index.astype("category")

# Cluster; the labels are read from adata.obs['leiden'] on the next line.
sc.tl.leiden(adata)
adjusted_rand_score(adata.obs['group'], adata.obs['leiden'])

