In [1]:
import random
import time

import numpy as np
import pandas as pd
import scipy.io
import scipy.sparse
from arboreto.algo import grnboost2
from distributed import Client, LocalCluster

# Input locations: table with a "Symbol" column of TF names, plus a dense
# (.tsv) and a sparse (.mtx) expression matrix file.
tfs_path = "/home/amorin/Data/Metadata/TFs_human.tsv"
mat_dense_path = "/space/scratch/amorin/R_objects/GSE180928_mcg_filt.tsv"
mat_sparse_path = "/space/scratch/amorin/R_objects/GSE180928_mcg_filt.mtx"

# Local Dask cluster (8 workers, one thread each) and the client that is
# handed to grnboost2 in the timing cells.
local_cluster = LocalCluster(
    n_workers=8,
    threads_per_worker=1,
)
custom_client = Client(local_cluster)

In [2]:
# TF symbols as a plain list, taken from the "Symbol" column of the TF table
tfs = pd.read_table(tfs_path)["Symbol"].tolist()

# Same expression data in two representations: a DataFrame (first file column
# used as the row index) and a sparse matrix read from Matrix Market format.
mat_dense = pd.read_table(mat_dense_path, index_col=0)
mat_sparse = scipy.io.mmread(mat_sparse_path).tocsc()  # CSC needed for arboreto

# Both objects are subset with the same positional column indices later on,
# so a shape mismatch would silently compare different data.
assert mat_sparse.shape == mat_dense.shape, (
    f"dense {mat_dense.shape} and sparse {mat_sparse.shape} matrices differ in shape"
)

In [3]:
# Random subset of columns for speed. The column labels are used as gene
# names below, so this keeps a random set of genes (all rows are retained).
random.seed(5)

# Never ask for more columns than exist: random.sample raises ValueError when
# the sample is larger than the population.
n_sub = min(5000, mat_dense.shape[1])
samp_ix = random.sample(range(mat_dense.shape[1]), n_sub)

# Apply the identical positional selection to both representations
mat_dense_sub = mat_dense.iloc[:, samp_ix]
mat_sparse_sub = mat_sparse[:, samp_ix]

# TFs that survived the subsetting, and the gene names in column order
# (the sparse matrix carries no labels, so these are passed to grnboost2).
tfs_sub = set(tfs).intersection(mat_dense_sub.columns)
genes_sub = mat_dense_sub.columns.tolist()

In [4]:
# Check that the dense and sparse subsets hold the same values in every
# column, not just the first one: the two GRNBoost2 runs below are only
# comparable if their inputs match. Evaluates to a single True/False.
np.allclose(mat_dense_sub.values, mat_sparse_sub.toarray())

True

In [5]:
# Time GRNBoost2 on the dense (DataFrame) input; no gene_names are passed here.
start = time.time()

network_dense = grnboost2(
    mat_dense_sub,
    client_or_address=custom_client,
    seed=4,
    tf_names=tfs_sub,
)

end = time.time()

# Elapsed wall-clock seconds
print(end - start)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


198.9852135181427


In [6]:
# Time GRNBoost2 on the sparse (CSC) input; gene names are supplied
# explicitly from the dense subset's column labels.
start = time.time()

network_sparse = grnboost2(
    mat_sparse_sub,
    gene_names=genes_sub,
    client_or_address=custom_client,
    seed=4,
    tf_names=tfs_sub,
)

end = time.time()

# Elapsed wall-clock seconds
print(end - start)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


189.47893118858337


In [7]:
network_dense

Unnamed: 0,TF,target,importance
1,HMGB1,SH3BGR,1.379992e+01
138,MYEF2,CEP19,1.238868e+01
137,MYEF2,MEIS1,1.224647e+01
246,RARG,SOX13,1.193636e+01
96,DEAF1,SYF2,1.156339e+01
...,...,...,...
260,ZBTB16,ZNF181,6.564108e-21
263,TSC22D2,SYT10,6.103199e-21
120,ARID1A,ALKBH6,5.852440e-21
97,IRF1,COL4A2,4.961053e-21


In [8]:
network_sparse

Unnamed: 0,TF,target,importance
1,HMGB1,SH3BGR,1.379992e+01
96,DEAF1,SYF2,1.289276e+01
138,MYEF2,CEP19,1.238868e+01
137,MYEF2,MEIS1,1.224647e+01
1,HMGB1,BHLHE40,1.145334e+01
...,...,...,...
50,ELK4,ZNF544,5.022568e-21
241,ZNF225,IKZF3,3.075267e-21
373,MAFB,MCHR2,2.893262e-21
139,ZNF26,NRBP2,2.688361e-21


In [9]:
ix = np.isclose(network_dense["importance"].tolist(), 
                network_sparse["importance"].tolist())

ValueError: operands could not be broadcast together with shapes (319451,) (319560,) 

In [None]:
network_dense.iloc[~ix, :]

In [None]:
network_sparse.iloc[~ix, :]