In [15]:
import pandas as pd
# Open the datastore read-only. HDFStore defaults to append mode ('a'), which
# silently creates an empty file when the path is mistyped and holds a writable
# handle; the cells below only read from it.
# NOTE(review): if a later cell needs to write to this store, reopen it with mode='a'.
store = pd.HDFStore('all_data.h5', mode='r')

In [16]:
# An HDF5 file is a key-value store, and in our case, each value is a Pandas
# object (either a 2D DataFrame, or a 1D Series)

# Keys:
# /rpkm           This contains the data. Each row is a single cell. Each column is the expression of a gene in RPKM units.
#                 The "index" (the row names, primary key of the table) is a unique identifier for each cell. The column names are Entrez IDs,
#                 which are unique numerical identifiers for genes.

# /labels         Vector, same length as number of rows in 'rpkm', contains the correct label (cell type) for each cell.

# /accessions     This is a vector of the same length as number of rows in 'rpkm' and contains the experimentID (accession) for
#                 each cell. This is actually embedded in the unique identifier for the cell, but is included as a separate vector
#                 for convenience.

# /gene_symbols   Might not be useful. Vector, same length as number of columns in 'rpkm', contains the names (strings) for each gene. Just
#                 a different way of identifying the genes.

# /true_ids       Not relevant for you, ignore.
print(store)
print()
print()

# Get the feature matrix (samples and their features)
feature_matrix_dataframe = store['rpkm']
print(type(feature_matrix_dataframe))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
feature_matrix_dataframe.info()
print()
print()

# Get the labels corresponding to each of the samples in the feature matrix
labels_series = store['labels']
print(type(labels_series))
print(labels_series.shape)
print()
print()

# Get the accession numbers (experiment IDs) corresponding to each of the
# samples in the feature matrix
accessions_series = store['accessions']
print(type(accessions_series))
print(accessions_series.shape)


<class 'pandas.io.pytables.HDFStore'>
File path: all_data.h5



<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Index: 24244 entries, 22182_GSM552335_SRR057552 to 98969_GSM2629447_SRR5570923
Columns: 20499 entries, 100009600 to 99982
dtypes: float64(20499)
memory usage: 3.7+ GB
None


<class 'pandas.core.series.Series'>
(24244,)


<class 'pandas.core.series.Series'>
(24244,)


In [40]:
# PRINT INFO
# List every key in the store, then show two of the stored Series and the
# store summary itself.
stored_keys = store.keys()
print("---keys---\n", stored_keys, "\n")

accession_values = store['accessions'][:]  # full-range slice of the Series
print("---key values---\n", accession_values)

true_id_values = store['true_ids'][:]  # full-range slice of the Series
print("---key values---\n", true_id_values)

print(store)


---keys---
 ['/accessions', '/gene_symbols', '/labels', '/rpkm', '/true_ids'] 

---key values---
 22182_GSM552335_SRR057552      22182
22182_GSM552337_SRR057554      22182
22182_GSM552339_SRR057556      22182
22182_GSM552340_SRR057557      22182
22182_GSM552341_SRR057558      22182
                               ...  
98969_GSM2629438_SRR5570914    98969
98969_GSM2629442_SRR5570918    98969
98969_GSM2629443_SRR5570919    98969
98969_GSM2629446_SRR5570922    98969
98969_GSM2629447_SRR5570923    98969
Length: 24244, dtype: int64
---key values---
 22182_GSM552335_SRR057552        22182_GSM552335_SRR057552
22182_GSM552337_SRR057554        22182_GSM552337_SRR057554
22182_GSM552339_SRR057556        22182_GSM552339_SRR057556
22182_GSM552340_SRR057557        22182_GSM552340_SRR057557
22182_GSM552341_SRR057558        22182_GSM552341_SRR057558
                                          ...             
98969_GSM2629438_SRR5570914    98969_GSM2629438_SRR5570914
98969_GSM2629442_SRR5570918    98969

In [34]:
# Inspect the structure of the HDF5 file named below.
fname = 'train_data.h5'

import h5py
import numpy as np

def _print_attrs(prefix, attrs):
    """Print every ``name = value`` pair in *attrs*, one per line, after *prefix*.

    Reading or formatting an attribute value can fail; the failure is reported
    inline so the rest of the listing continues. A TypeError prints its message
    in place of the value; any other ordinary exception prints a generic
    marker. KeyboardInterrupt and SystemExit are deliberately NOT caught, so
    the listing can still be interrupted.
    """
    for attr_name in attrs.keys():
        print(prefix, end=' ')
        try:
            print('%s = %s' % (attr_name, attrs[attr_name]))
        except TypeError as e:
            print('%s = %s' % (attr_name, e))
        except Exception:
            print('%s = ?? Other ERR' % (attr_name,))


def h5list(f,tab):
    """Recursively print the structure of an HDF5 group as an indented tree.

    For the group itself this prints its name, member count and attributes.
    Each member is then visited: sub-groups recurse with a deeper prefix;
    datasets print their name, size, dtype, compound field names (if any)
    and attributes; anything else is reported as an unknown object.

    Parameters
    ----------
    f : group-like object
        Must provide ``name``, ``attrs``, ``keys()``, ``len()`` and item
        access, e.g. an open ``h5py.File`` or an ``h5py.Group``.
    tab : str
        Prefix printed before this group's header line; pass ``''`` for the
        root. Deeper levels derive their prefixes from it.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(tab,'Group:',f.name,'len:%d'%len(f))
    # Attributes of the group itself, marked with a trailing '*'.
    _print_attrs(tab[:-1] + '  |-*', f.attrs)
    mysp = tab[:-1] + '  |-'
    for k in f.keys():
        d = f[k]
        if isinstance(d, h5py.Group):
            h5list(d, mysp)
        elif isinstance(d, h5py.Dataset):
            print(mysp,'Dataset:',d.name,'(size:%d)'%d.size)
            mysp1 = mysp[:-1] + '  |-'
            print(mysp1,'(dtype=%s)'%d.dtype)
            if d.dtype.names is not None:
                # Compound dtype: list the field names on one line.
                print(mysp, end=' ')
                for vv in d.dtype.names:
                    print(vv, end=',')
                print()
            # Attributes of the dataset, one level deeper than its dtype line.
            _print_attrs(mysp1[:-1] + '  |-*', d.attrs)
        else:
            print('??->',d,'Unkown Object!')

# Open the file read-only and print its structure. The context manager closes
# the file even if h5list raises part-way through the listing.
with h5py.File(fname, 'r') as f:
    h5list(f, '')

 Group: / len:5
  |-* CLASS = b'GROUP'
  |-* PYTABLES_FORMAT_VERSION = b'2.1'
  |-* TITLE = Empty(dtype=dtype('S1'))
  |-* VERSION = b'1.0'
  |- Group: /accessions len:2
  |  |-* CLASS = b'GROUP'
  |  |-* TITLE = Empty(dtype=dtype('S1'))
  |  |-* VERSION = b'1.0'
  |  |-* encoding = b'UTF-8'
  |  |-* index_variety = b'regular'
  |  |-* name = b'N.'
  |  |-* pandas_type = b'series'
  |  |-* pandas_version = b'0.15.2'
  |  |- Dataset: /accessions/index (size:21389)
  |  |  |- (dtype=|S27)
  |  |  |  |-* CLASS = b'ARRAY'
  |  |  |  |-* FLAVOR = b'numpy'
  |  |  |  |-* TITLE = Empty(dtype=dtype('S1'))
  |  |  |  |-* VERSION = b'2.4'
  |  |  |  |-* kind = b'string'
  |  |  |  |-* name = b'N.'
  |  |  |  |-* transposed = No NumPy equivalent for TypeBitfieldID exists
  |  |- Dataset: /accessions/values (size:21389)
  |  |  |- (dtype=int64)
  |  |  |  |-* CLASS = b'ARRAY'
  |  |  |  |-* FLAVOR = b'numpy'
  |  |  |  |-* TITLE = Empty(dtype=dtype('S1'))
  |  |  |  |-* VERSION = b'2.4'
  |  |  | 