In [1]:
import h5py
import numpy as np
from scipy import sparse
from pca import pca
from pca import exact_pca
from _oja import oja_batch
from pathlib import Path

In [2]:
def read_matrix(path: Path):
  """Load a CSR sparse matrix stored as four datasets in an HDF5 file.

  The file at `path` is opened read-only and must contain the datasets
  'shape', 'data', 'indices' and 'indptr'; they are passed to
  `scipy.sparse.csr_matrix` in its (data, indices, indptr) form.

  Returns:
    The reconstructed `scipy.sparse.csr_matrix`.
  """
  dataset_names = ('shape', 'data', 'indices', 'indptr')
  with h5py.File(path, 'r') as h5_file:
    # `[()]` reads each dataset in full while the file is still open, so the
    # values remain usable after the `with` block closes it.
    shape, data, indices, indptr = (h5_file[name][()] for name in dataset_names)
  return sparse.csr_matrix((data, indices, indptr), shape=shape)

In [3]:
# Directory holding the benchmark matrices. Kept as a plain string so that any
# other cell that concatenates onto it keeps working.
# NOTE(review): this is a hardcoded absolute path; consider making it configurable.
data_path = '/data1/intern/pca_benchmark/'
# Build the file path with pathlib, matching read_matrix's `Path` annotation.
# The previous trailing `[:,:]` slice selected every row and column, i.e. it
# only copied the freshly loaded matrix, so it has been dropped.
csr_mtx = read_matrix(Path(data_path) / '11.h5')

In [4]:
# Display the dimensions of the loaded sparse matrix as (rows, columns).
csr_mtx.shape

(109995, 23636)

In [5]:
# Reference result: the project's `exact_pca` run on the sparse matrix with k=50.
# Presumably returns the data projected onto the top-k principal components
# (a later cell takes per-column variances of it) — TODO confirm against pca.py.
exact_pca_mtx = exact_pca(csr_mtx, k=50)

In [6]:
# Result of the project's `pca` on the same matrix with the same k=50.
# NOTE(review): presumably the approximate variant (hence the `appro_` name)
# that is being benchmarked against `exact_pca` — confirm against pca.py.
appro_pca_mtx = pca(csr_mtx, k=50)

In [7]:
# Densify the sparse matrix so it can be column-centered.
# NOTE: `todense()` returns an `np.matrix`, and the dense copy stores
# rows * cols values, so this cell is memory-hungry for large inputs.
dense = csr_mtx.todense()
# Subtract the per-column mean (axis=0) so every column has zero mean.
dense = dense - dense.mean(axis=0)

In [None]:
# Run the project's `oja_batch` on the centered dense matrix.
# The positional arguments (50, 2048, 1, 0.01) are presumably the number of
# components, batch size, number of passes and learning rate — TODO confirm
# against _oja.oja_batch.
oja_components = oja_batch(dense, 50, 2048, 1, 0.01)
# Project the original (uncentered) sparse matrix onto the returned components.
# Centering would only shift each projected column by a constant, which does
# not affect the per-column variances computed later.
oja_pca_mtx = csr_mtx @ oja_components

In [None]:
# Per-column variance (axis=0) of each projection, computed in the same order
# as before: exact, approximate, Oja.
exact_vars, appro_vars, oja_vars = (
    np.var(projection, axis=0)
    for projection in (exact_pca_mtx, appro_pca_mtx, oja_pca_mtx)
)

In [None]:
# Total variance of the `pca` projection relative to the `exact_pca` projection
# (sum of per-column variances, ratio of the two sums).
np.sum(appro_vars) / np.sum(exact_vars)

0.99530756

In [None]:
# Total variance of the Oja projection relative to the `exact_pca` projection
# (sum of per-column variances, ratio of the two sums).
np.sum(oja_vars) / np.sum(exact_vars)

0.9707506229378031