# Examples of running IHPF 

1. Download the scRNA-seq datasets in h5ad format and put them in the `Data/` folder (the cells below read them from `../Data/`).

In [3]:
import IHPF
import scanpy as sc

  data = yaml.load(f.read()) or {}


In [5]:
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd

In [1]:
# Fit IHPF on every dataset, store the cell/gene scores on the AnnData object
# and cluster the (row-normalised) cell scores with k-means.
# NOTE(review): unlike the INMF and scHPF cells further down, this cell never
# writes `adata` back to disk -- confirm whether an `adata.write(...)` is wanted.
datasets = ['humanpancreas','10Xmouse','10Xpbmc','mixedpbmc']
for dataset in datasets:
    ## Replace this with the location of your h5ad files
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    adataobs = adata.obs.copy()
    # Number of factors / clusters = number of distinct values in the 'actual' column.
    no_cell_types = len(adataobs['actual'].unique())
    # After the reset the index is positional, so it can be used to slice adata.X.
    adataobs.reset_index(inplace=True)
    Xlist = list()
    batch_rows = list()
    for _, df in adataobs.groupby('batch'):
        batchidx = df.index.to_numpy()
        batch_rows.append(batchidx)
        Xlist.append(coo_matrix(adata.X[batchidx,:]))
    model = IHPF.scIHPF(no_cell_types,max_iter=500)
    model.fit(Xlist)
    key = 'IHPF_{}'.format(no_cell_types)
    # The per-batch score blocks are stacked batch by batch, which only matches
    # the row order of `adata` when the file is already sorted by batch.
    # Scatter every row back to the position of the cell it belongs to.
    stacked_scores = np.concatenate(model.cell_scores(),axis=0)
    cell_scores = np.empty_like(stacked_scores)
    cell_scores[np.concatenate(batch_rows)] = stacked_scores
    adata.obsm[key] = cell_scores
    adata.varm[key] = model.shared_gene_scores()
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(normalize(adata.obsm[key]))
    adata.obs['IHPF_{}_kmeans_normalised'.format(no_cell_types)] = kmeans_cell.labels_
    print(adata)
    

NameError: name 'sc' is not defined

In [31]:
# Display the observation (per-cell) annotation table of the current `adata` object.
adata.obs

Unnamed: 0,actual,batch,IHPF_kmeans_normalised,HPF_kmeans_normalised,INMF_kmeans_normalised,IPCA_kmeans_normalised,IHPF_2_kmeans_normalised
0,293t,0,1,0,1,0,0
1,293t,0,1,0,1,0,0
2,293t,0,1,0,1,0,0
3,293t,0,1,0,1,0,0
4,293t,0,1,0,1,0,0
...,...,...,...,...,...,...,...
9525,jurkat,2,0,1,0,0,1
9526,293t,2,1,1,1,0,0
9527,293t,2,1,1,1,0,0
9528,293t,2,1,1,1,0,0


In [32]:
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score
# Datasets to score, plus one result container per reference labelling
# (filled as {dataset: {method: AMI}}).
datasets = [
    'humanpancreas',
    '10Xmouse',
    '10Xpbmc',
    'mixedpbmc',
]
batchAMI, actualAMI = {}, {}

In [33]:
# For every dataset, compare each method's k-means labels with the 'batch'
# column and with the 'actual' column using adjusted mutual information.
for dataset in datasets:
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    obs = adata.obs
    batch, actual = {}, {}
    for method in ('IHPF_2', 'IHPF', 'HPF', 'INMF', 'IPCA'):
        labels = obs['{}_kmeans_normalised'.format(method)]
        batch[method] = adjusted_mutual_info_score(obs['batch'], labels)
        actual[method] = adjusted_mutual_info_score(obs['actual'], labels)
    # Reference row: agreement between the two reference labellings themselves.
    batch['actual'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    actual['batch'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    batchAMI[dataset], actualAMI[dataset] = batch, actual

In [34]:
# Tabulate the AMI-vs-batch scores: one column per dataset, one row per method.
pd.DataFrame(batchAMI)

Unnamed: 0,humanpancreas,10Xmouse,newpbmc,hpfpbmc
IHPF_2,0.498353,0.498353,0.498353,0.498353
IHPF,0.498353,0.498353,0.498353,0.498353
HPF,0.744954,0.744954,0.744954,0.744954
INMF,0.288737,0.288737,0.288737,0.288737
IPCA,0.73875,0.73875,0.73875,0.73875
actual,0.498373,0.498373,0.498373,0.498373


In [35]:
# Tabulate the AMI-vs-'actual' scores: one column per dataset, one row per method.
pd.DataFrame(actualAMI)

Unnamed: 0,humanpancreas,10Xmouse,newpbmc,hpfpbmc
IHPF_2,0.976076,0.976076,0.976076,0.976076
IHPF,0.976076,0.976076,0.976076,0.976076
HPF,-6.7e-05,-6.7e-05,-6.7e-05,-6.7e-05
INMF,0.226908,0.226908,0.226908,0.226908
IPCA,0.447701,0.447701,0.447701,0.447701
batch,0.498373,0.498373,0.498373,0.498373


## INMF 

In [26]:
from INMF import INMF

In [27]:
class scINMF:
    """Small wrapper around ``INMF`` with reconstruction helpers.

    After ``fit`` the instance exposes the three results returned by
    ``INMF.fit_transform`` as ``cell_scores``, ``shared_gene_scores`` and
    ``dataset_gene_scores``.
    """

    def __init__(self, k, alpha=1, **kwargs):
        """Build the underlying ``INMF`` model.

        Args:
            k: number of components passed to ``INMF`` as ``n_components``.
            alpha: forwarded to ``INMF`` unchanged.
            **kwargs: any further keyword arguments forwarded to ``INMF``.
        """
        # Fix NumPy's global RNG seed before the model is created.
        np.random.seed(0)
        self.n_components = k
        self.method = INMF(
            n_components=self.n_components, solver="mu", alpha=alpha, **kwargs
        )

    def fit(self, X):
        """Fit the model on ``X`` and store the resulting factor matrices.

        Args:
            X: input handed to ``INMF.fit_transform`` (the notebook passes a
                list with one sparse count matrix per batch).
        """
        self.data = X
        (
            self.cell_scores,
            self.shared_gene_scores,
            self.dataset_gene_scores,
        ) = self.method.fit_transform(self.data)

    def count_matrix(self):
        """Reconstruct one matrix per dataset from the fitted factors.

        Returns:
            A list whose i-th entry is
            ``cell_scores[i] @ (shared_gene_scores + dataset_gene_scores[i])``.
        """
        # Use the attribute names that `fit` actually sets (plural); the
        # singular spellings never existed and raised AttributeError.
        return [
            np.dot(cells, self.shared_gene_scores + dataset_genes)
            for cells, dataset_genes in zip(
                self.cell_scores, self.dataset_gene_scores
            )
        ]

    def explained_deviance(self, X, X_rep, beta):
        """Fraction of the baseline divergence removed by the model.

        The baseline replaces every row of ``X`` by the column means of ``X``.

        Args:
            X: observed matrix.
            X_rep: model reconstruction of ``X``.
            beta: beta parameter forwarded to ``beta_divergence_ppc``.

        Returns:
            ``(baseline - model) / baseline``, or ``0`` if the computation
            fails (the failure is reported on stdout).
        """
        # NOTE(review): `beta_divergence_ppc` is not defined or imported in the
        # visible part of this notebook -- confirm it is in scope before use.
        try:
            X_avg = coo_matrix(
                np.matmul(np.ones((X.shape[0], 1)), X.mean(axis=0).reshape(1, -1))
            )
            average_divergence = beta_divergence_ppc(X, X_avg, beta)
            model_divergence = beta_divergence_ppc(X, X_rep, beta)
            ratio = (average_divergence - model_divergence) / average_divergence
            return ratio
        except Exception as exc:
            # Still best-effort, but no longer hides KeyboardInterrupt/SystemExit
            # and tells the reader what actually went wrong.
            print("Error in calculating deviance: {}".format(exc))
            return 0

In [32]:
# Fit INMF on every dataset, store the cell/gene scores on the AnnData object,
# cluster the (row-normalised) cell scores with k-means and save the file.
datasets = ['humanpancreas']
for dataset in datasets:
    ## Replace this with the location of your h5ad files
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    adataobs = adata.obs.copy()
    # Number of factors / clusters = number of distinct values in the 'actual' column.
    no_cell_types = len(adataobs['actual'].unique())
    # After the reset the index is positional, so it can be used to slice adata.X.
    adataobs.reset_index(inplace=True)
    Xlist = list()
    batch_rows = list()
    for _, df in adataobs.groupby('batch'):
        batchidx = df.index.to_numpy()
        batch_rows.append(batchidx)
        Xlist.append(coo_matrix(adata.X[batchidx,:]))
    model = scINMF(no_cell_types,max_iter=500)
    model.fit(Xlist)
    key = 'INMF_{}'.format(no_cell_types)
    # The per-batch score blocks are stacked batch by batch, which only matches
    # the row order of `adata` when the file is already sorted by batch.
    # Scatter every row back to the position of the cell it belongs to.
    stacked_scores = np.concatenate(model.cell_scores,axis=0)
    cell_scores = np.empty_like(stacked_scores)
    cell_scores[np.concatenate(batch_rows)] = stacked_scores
    adata.obsm[key] = cell_scores
    adata.varm[key] = model.shared_gene_scores.transpose()
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(normalize(adata.obsm[key]))
    adata.obs['INMF_{}_kmeans_normalised'.format(no_cell_types)] = kmeans_cell.labels_
    adata.write('../Data/{}.h5ad'.format(dataset))

Reconstruction error 1120212.068773359


In [33]:
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score
# Score the INMF run alongside the stored baselines: adjusted mutual
# information of each method's k-means labels against 'batch' and 'actual'.
datasets = ['humanpancreas']
batchAMI, actualAMI = {}, {}

for dataset in datasets:
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    obs = adata.obs
    batch, actual = {}, {}
    for method in ('INMF_10', 'IHPF', 'HPF', 'INMF', 'IPCA'):
        labels = obs['{}_kmeans_normalised'.format(method)]
        batch[method] = adjusted_mutual_info_score(obs['batch'], labels)
        actual[method] = adjusted_mutual_info_score(obs['actual'], labels)
    # Reference row: agreement between the two reference labellings themselves.
    batch['actual'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    actual['batch'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    batchAMI[dataset], actualAMI[dataset] = batch, actual

In [34]:
# Tabulate the AMI-vs-'actual' scores: one column per dataset, one row per method.
pd.DataFrame(actualAMI)

Unnamed: 0,humanpancreas
HPF,0.141297
IHPF,0.276381
INMF,0.238332
INMF_10,0.199625
IPCA,0.211085
batch,0.031719


## scHPF

In [38]:
from schpf import scHPF
from scipy.sparse import vstack

In [42]:
# Fit scHPF on every dataset (all batches stacked into one matrix), store the
# cell/gene scores on the AnnData object, cluster the (row-normalised) cell
# scores with k-means and save the file.
datasets = ['humanpancreas']
for dataset in datasets:
    ## Replace this with the location of your h5ad files
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    adataobs = adata.obs.copy()
    # Number of factors / clusters = number of distinct values in the 'actual' column.
    no_cell_types = len(adataobs['actual'].unique())
    # After the reset the index is positional, so it can be used to slice adata.X.
    adataobs.reset_index(inplace=True)
    Xlist = list()
    batch_rows = list()
    for _, df in adataobs.groupby('batch'):
        batchidx = df.index.to_numpy()
        batch_rows.append(batchidx)
        Xlist.append(coo_matrix(adata.X[batchidx,:]))
    model = scHPF(no_cell_types,max_iter=500)
    model.fit(vstack(Xlist))
    key = 'HPF_{}'.format(no_cell_types)
    # The model was fitted on the batch-by-batch stacked matrix, so its cell
    # scores are in that order, which only matches the row order of `adata`
    # when the file is already sorted by batch. Scatter every row back to the
    # position of the cell it belongs to.
    stacked_scores = np.asarray(model.cell_score())
    cell_scores = np.empty_like(stacked_scores)
    cell_scores[np.concatenate(batch_rows)] = stacked_scores
    adata.obsm[key] = cell_scores
    adata.varm[key] = model.gene_score()
    kmeans_cell = KMeans(n_clusters=no_cell_types, random_state=0).fit(normalize(adata.obsm[key]))
    adata.obs['HPF_{}_kmeans_normalised'.format(no_cell_types)] = kmeans_cell.labels_
    adata.write('../Data/{}.h5ad'.format(dataset))

[Iter.    0]  loss:707.795400  pct:100.000000000
[Iter.   10]  loss:27.256316  pct:-96.149124950
[Iter.   20]  loss:21.592846  pct:-20.778562361
[Iter.   30]  loss:20.945739  pct:-2.996855619
[Iter.   40]  loss:20.653689  pct:-1.394317453
[Iter.   50]  loss:20.561153  pct:-0.448038017
[Iter.   60]  loss:20.528952  pct:-0.156609043
[Iter.   70]  loss:20.508578  pct:-0.099245690
[Iter.   80]  loss:20.492615  pct:-0.077837435
[Iter.   90]  loss:20.479040  pct:-0.066243636
[Iter.  100]  loss:20.469607  pct:-0.046061907
[Iter.  110]  loss:20.460487  pct:-0.044551732
[Iter.  120]  loss:20.453325  pct:-0.035004752
[Iter.  130]  loss:20.448780  pct:-0.022220218
[Iter.  140]  loss:20.445213  pct:-0.017443994
[Iter.  150]  loss:20.442168  pct:-0.014894396
[Iter.  160]  loss:20.439343  pct:-0.013820324
[Iter.  170]  loss:20.436525  pct:-0.013784295
[Iter.  180]  loss:20.434212  pct:-0.011318608
[Iter.  190]  loss:20.431732  pct:-0.012139555
[Iter.  200]  loss:20.428680  pct:-0.014936018
[Iter.  2

In [45]:
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score
# Score the scHPF run alongside the stored baselines: adjusted mutual
# information of each method's k-means labels against 'batch' and 'actual'.
datasets = ['humanpancreas']
batchAMI, actualAMI = {}, {}

for dataset in datasets:
    adata = sc.read('../Data/{}.h5ad'.format(dataset))
    obs = adata.obs
    batch, actual = {}, {}
    for method in ('HPF_10', 'IHPF', 'HPF', 'INMF', 'IPCA'):
        labels = obs['{}_kmeans_normalised'.format(method)]
        batch[method] = adjusted_mutual_info_score(obs['batch'], labels)
        actual[method] = adjusted_mutual_info_score(obs['actual'], labels)
    # Reference row: agreement between the two reference labellings themselves.
    batch['actual'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    actual['batch'] = adjusted_mutual_info_score(obs['batch'], obs['actual'])
    batchAMI[dataset], actualAMI[dataset] = batch, actual

In [46]:
# Tabulate the AMI-vs-'actual' scores: one column per dataset, one row per method.
pd.DataFrame(actualAMI)

Unnamed: 0,humanpancreas
HPF,0.141297
HPF_10,0.133217
IHPF,0.276381
INMF,0.238332
IPCA,0.211085
batch,0.031719
