In [1]:
import pandas as pd
import numpy as np
import os
import sys
import h5py

In [2]:
def allkeys(obj):
    """Return the HDF5 name of ``obj`` followed by the names of all its descendants.

    Args:
        obj: An h5py object exposing ``.name``; when it is an ``h5py.Group``
            its members are walked recursively.

    Returns:
        A tuple of names: ``obj.name`` first, then each member's subtree in
        the order the group yields its items.
    """
    names = [obj.name]
    if isinstance(obj, h5py.Group):
        for _, child in obj.items():
            # For a non-group child the recursive call simply yields
            # (child.name,), so one code path covers both member kinds.
            names.extend(allkeys(child))
    return tuple(names)

def extract_data(fpath, verbose=True):
    """Read column names, row names and the covariate matrix from an HDF5 file.

    The file must contain three datasets reachable from its root:
    ``colnames``, ``rownames`` and ``covariates``.

    Args:
        fpath: Path of the HDF5 file; it is opened read-only.
        verbose: When true (the default, which keeps the previous behaviour)
            print every key found in the file together with the object
            stored under it.

    Returns:
        A ``(colnames, rownames, covariates)`` tuple, where the two name
        collections are lists of ``str`` and ``covariates`` is a NumPy array.

    Raises:
        KeyError: If any of the three datasets is absent from the file.
    """

    def _read(h5file, name):
        # Group.get() returns None for a missing key, which previously
        # surfaced later as an opaque "iteration over a 0-d array" error
        # (or, for covariates, as a silent array(None)). Fail here instead.
        if name not in h5file:
            raise KeyError(f"dataset {name!r} not found in {fpath}")
        return np.array(h5file[name])

    def _as_text(value):
        # Depending on the h5py version and on how the strings were written,
        # elements can come back as bytes or already as str; only bytes
        # have .decode().
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return str(value)

    with h5py.File(fpath, 'r') as f:
        if verbose:
            tempKeys = allkeys(f)
            print(tempKeys)

            for k in tempKeys:
                print(f"for k={k} \t value={f.get(k)}")

        # Materialise everything while the file is still open.
        colnames = [_as_text(x) for x in _read(f, 'colnames')]
        rownames = [_as_text(x) for x in _read(f, 'rownames')]
        covariates = _read(f, 'covariates')

    return colnames, rownames, covariates

In [4]:
# Every cached screen file lives in the same directory. Set the GD_CACHE_DIR
# environment variable to point the notebook at another copy of the cache;
# without it the original location is used, so the paths below are unchanged.
GD_CACHE_DIR = os.environ.get("GD_CACHE_DIR", "/raid/shadab/prateek/genedisco/gd_cache")

schmidt_ifng_path = os.path.join(GD_CACHE_DIR, "schmidt_2021_ifng.h5")
schmidt_il2_path  = os.path.join(GD_CACHE_DIR, "schmidt_2021_il2.h5")
zhuang_path       = os.path.join(GD_CACHE_DIR, "zhuang_2019.h5")
sanchez_path      = os.path.join(GD_CACHE_DIR, "sanchez_2021_neurons_tau.h5")
zhu_sarscov_path  = os.path.join(GD_CACHE_DIR, "zhu_2021_sarscov2_host_factors.h5")

In [5]:
# Load each screen; extract_data returns (colnames, rownames, covariates).
(
    schmidt_ifng_colnames,
    schmidt_ifng_rownames,
    schmidt_ifng_covariates,
) = extract_data(schmidt_ifng_path)

(
    schmidt_il2_colnames,
    schmidt_il2_rownames,
    schmidt_il2_covariates,
) = extract_data(schmidt_il2_path)

(
    zhuang_colnames,
    zhuang_rownames,
    zhuang_covariates,
) = extract_data(zhuang_path)

(
    sanchez_colnames,
    sanchez_rownames,
    sanchez_covariates,
) = extract_data(sanchez_path)

(
    zhu_sarscov_colnames,
    zhu_sarscov_rownames,
    zhu_sarscov_covariates,
) = extract_data(zhu_sarscov_path)

('/', '/colnames', '/covariates', '/rownames')
for k=/ 	 value=<HDF5 group "/" (3 members)>
for k=/colnames 	 value=<HDF5 dataset "colnames": shape (1,), type "|O">
for k=/covariates 	 value=<HDF5 dataset "covariates": shape (18421, 1), type "<f4">
for k=/rownames 	 value=<HDF5 dataset "rownames": shape (18421,), type "|O">
('/', '/colnames', '/covariates', '/rownames')
for k=/ 	 value=<HDF5 group "/" (3 members)>
for k=/colnames 	 value=<HDF5 dataset "colnames": shape (1,), type "|O">
for k=/covariates 	 value=<HDF5 dataset "covariates": shape (18421, 1), type "<f4">
for k=/rownames 	 value=<HDF5 dataset "rownames": shape (18421,), type "|O">
('/', '/colnames', '/covariates', '/rownames')
for k=/ 	 value=<HDF5 group "/" (3 members)>
for k=/colnames 	 value=<HDF5 dataset "colnames": shape (1,), type "|O">
for k=/covariates 	 value=<HDF5 dataset "covariates": shape (20147, 1), type "<f4">
for k=/rownames 	 value=<HDF5 dataset "rownames": shape (20147,), type "|O">
('/', '/colnames', '/c

In [6]:
def _print_summary(title, colnames, rownames, covariates):
    """Print a title, the first five column/row names with counts, and the matrix shape."""
    print(title)
    print(colnames[:5], "Len: ", len(colnames))
    print(rownames[:5], "Len: ", len(rownames))
    print(covariates.shape)


# One summary per loaded screen, in the same order they were loaded.
_print_summary("--- Schmidt IFNG ---", schmidt_ifng_colnames, schmidt_ifng_rownames, schmidt_ifng_covariates)
_print_summary("--- Schmidt IL2---", schmidt_il2_colnames, schmidt_il2_rownames, schmidt_il2_covariates)
_print_summary("--- Zhuang---", zhuang_colnames, zhuang_rownames, zhuang_covariates)
_print_summary("--- Sanchez---", sanchez_colnames, sanchez_rownames, sanchez_covariates)
_print_summary("--- Zhu_sarscov---", zhu_sarscov_colnames, zhu_sarscov_rownames, zhu_sarscov_covariates)

--- Schmidt IFNG ---
['log-fold-change'] Len:  1
['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2'] Len:  18421
(18421, 1)
--- Schmidt IL2---
['log-fold-change'] Len:  1
['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2'] Len:  18421
(18421, 1)
--- Zhuang---
['log-fold-change'] Len:  1
['1-Dec', '10-Mar', '10-Sep', '11-Mar', '11-Sep'] Len:  20147
(20147, 1)
--- Sanchez---
['RSA'] Len:  1
['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A4GALT'] Len:  17989
(17989, 1)
--- Zhu_sarscov---
['RSA'] Len:  1
['01.Dec', '01.Mar', '01.Sep', '02.Mar', '02.Sep'] Len:  19112
(19112, 1)


In [12]:
# Report the raw array shape between two ten-dash rules, then wrap the
# Schmidt IFNG covariates in a DataFrame labelled by row and column names.
print(10 * "-")
print("nparray: ", schmidt_ifng_covariates.shape)
print(10 * "-")
df_schmidt_ifng = pd.DataFrame(
    data=schmidt_ifng_covariates,
    index=schmidt_ifng_rownames,
    columns=schmidt_ifng_colnames,
)
df_schmidt_ifng  # bare last expression: the notebook renders the frame

----------
nparray:  (18421, 1)
----------


Unnamed: 0,log-fold-change
A1BG,-0.161214
A1CF,0.129081
A2M,-0.189340
A2ML1,0.005275
A3GALT2,0.183225
...,...
ZYG11A,0.140970
ZYG11B,-0.120866
ZYX,0.222654
ZZEF1,0.261937


In [16]:
# Preview the first five rows of the Schmidt IFNG frame.
df_schmidt_ifng.head(n=5)

Unnamed: 0,log-fold-change
A1BG,-0.161214
A1CF,0.129081
A2M,-0.18934
A2ML1,0.005275
A3GALT2,0.183225


In [8]:
# Report the raw array shape between two ten-dash rules, then wrap the
# Schmidt IL2 covariates in a DataFrame labelled by row and column names.
print(10 * "-")
print("nparray: ", schmidt_il2_covariates.shape)
print(10 * "-")
df_schmidt_il2 = pd.DataFrame(
    data=schmidt_il2_covariates,
    index=schmidt_il2_rownames,
    columns=schmidt_il2_colnames,
)
df_schmidt_il2  # bare last expression: the notebook renders the frame

----------
nparray:  (18421, 1)
----------


Unnamed: 0,log-fold-change
A1BG,-0.010116
A1CF,-0.038385
A2M,0.017750
A2ML1,0.173729
A3GALT2,0.188805
...,...
ZYG11A,-0.282410
ZYG11B,0.246458
ZYX,0.038386
ZZEF1,0.015188


In [9]:
# Report the raw array shape between two ten-dash rules, then wrap the
# Zhuang covariates in a DataFrame labelled by row and column names.
# NOTE(review): the first row names ('1-Dec', '10-Mar', '10-Sep', ...) look
# like gene symbols that a spreadsheet converted to dates - confirm against
# the upstream data before joining on the index.
print(10 * "-")
print("nparray: ", zhuang_covariates.shape)
print(10 * "-")
df_zhuang = pd.DataFrame(
    data=zhuang_covariates,
    index=zhuang_rownames,
    columns=zhuang_colnames,
)
df_zhuang  # bare last expression: the notebook renders the frame

----------
nparray:  (20147, 1)
----------


Unnamed: 0,log-fold-change
1-Dec,1.271158
10-Mar,-1.251360
10-Sep,2.733800
11-Mar,-1.522360
11-Sep,-0.884623
...,...
hsa-mir-95,-1.684410
hsa-mir-96,1.543140
hsa-mir-98,0.475710
hsa-mir-99a,1.175450


In [10]:
# Report the raw array shape between two ten-dash rules, then wrap the
# Sanchez covariates in a DataFrame labelled by row and column names.
print(10 * "-")
print("nparray: ", sanchez_covariates.shape)
print(10 * "-")
df_sanchez = pd.DataFrame(
    data=sanchez_covariates,
    index=sanchez_rownames,
    columns=sanchez_colnames,
)
df_sanchez  # bare last expression: the notebook renders the frame

----------
nparray:  (17989, 1)
----------


Unnamed: 0,RSA
A1BG,0.647879
A1CF,0.352316
A2M,1.354603
A2ML1,0.882487
A4GALT,0.691250
...,...
ZYG11A,0.232184
ZYG11B,1.919269
ZYX,5.815895
ZZEF1,1.697511


In [11]:
# Report the raw array shape between two ten-dash rules, then wrap the
# Zhu SARS-CoV-2 covariates in a DataFrame labelled by row and column names.
# NOTE(review): the first row names ('01.Dec', '01.Mar', '01.Sep', ...) look
# like gene symbols that a spreadsheet converted to dates - confirm against
# the upstream data before joining on the index.
print(10 * "-")
print("nparray: ", zhu_sarscov_covariates.shape)
print(10 * "-")
df_zhu_sarscov = pd.DataFrame(
    data=zhu_sarscov_covariates,
    index=zhu_sarscov_rownames,
    columns=zhu_sarscov_colnames,
)
df_zhu_sarscov  # bare last expression: the notebook renders the frame

----------
nparray:  (19112, 1)
----------


Unnamed: 0,RSA
01.Dec,29.597134
01.Mar,29.131605
01.Sep,24.999132
02.Mar,24.896654
02.Sep,24.260283
...,...
ZYG11A,-0.000000
ZYG11B,-0.000000
ZYX,-0.000000
ZZEF1,-0.000000
