# h5/hdf5 to AnnData

In [2]:
# Import modules
import numpy as np
import scanpy as sc
import pandas as pd
import anndata as ad
import session_info
from scipy.sparse import csr_matrix
import dask.array as da
import h5py

In [None]:
# Session information: raise scanpy's logging verbosity, then print the
# versions of the loaded packages so the run can be reproduced later.

sc.settings.verbosity = 3  # scanpy levels: 0 errors, 1 warnings, 2 info, 3 hints

session_info.show()

### Use only if your data comes from 10x Genomics

In [3]:
# 10x Genomics: read the filtered gene-barcode matrix directly with scanpy

adata = sc.read_10x_h5('/Users/Downloads/GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5')

# Gene names are used as var_names and some of them occur more than once,
# which triggers the "names are not unique" warning. De-duplicate them
# (duplicates get a numeric suffix) so that indexing by gene name is unambiguous.
adata.var_names_make_unique()

print(adata)

reading /Users/rafaelsalgueroraigon/Downloads/GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5
 (0:00:00)
AnnData object with n_obs × n_vars = 2253 × 33694
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


### Alternative method: reading the .h5 file manually with h5py

In [4]:
# Identify the contents of the file: print the path of every group and dataset.
# (h5py is already imported in the imports cell at the top of the notebook.)

def print_object_name(name, obj):
    """Print the path of one HDF5 object.

    Used as the callback for ``visititems``; it returns None, so the
    traversal continues through every object in the file.
    """
    print(name)


with h5py.File('/Users/Downloads/GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5', 'r') as h5_file:
    h5_file.visititems(print_object_name)

GRCh38
GRCh38/barcodes
GRCh38/data
GRCh38/gene_names
GRCh38/genes
GRCh38/indices
GRCh38/indptr
GRCh38/shape


In [5]:
# Path to your HDF5 file (edit this to point at your own .h5 file);
# the cell that opens and reads the file uses this variable.

file_path = "/Users/Downloads/GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5"

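In the next step, you need to specify the paths to the four components of the sparse matrix: `data` (the non-zero values), `indices` (the column index of each non-zero value), `indptr` (the offsets marking where each row starts and ends), and `shape` (the dimensions of the matrix).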

You also need to specify the paths to the cell barcodes (cell names) and to the gene names.

In [6]:
# Open and read the HDF5 file
with h5py.File(file_path, "r") as f:
    # Every dataset lives inside the 'GRCh38' group, so resolve the group once
    genome = f["GRCh38"]

    # Components of the sparse matrix, copied into memory with [:]
    data = genome["data"][:]
    indices = genome["indices"][:]
    indptr = genome["indptr"][:]
    # Reverse the stored shape so the matrix is laid out as (cells, genes)
    shape = tuple(int(dim) for dim in genome["shape"][:])[::-1]

    # Load barcodes, gene names and gene IDs (decode bytes to strings)
    cell_names = [b.decode("utf-8") for b in genome["barcodes"][:]]
    gene_names = [g.decode("utf-8") for g in genome["gene_names"][:]]
    # 'genes' holds the gene identifiers that sc.read_10x_h5 exposes as 'gene_ids'
    gene_ids = [g.decode("utf-8") for g in genome["genes"][:]]

# Everything is in memory now, so the matrix can be built after the file is closed.
# Reconstruct the sparse matrix (cells x genes)
X = csr_matrix((data, indices, indptr), shape=shape)

# Fail early with a clear message if the matrix does not line up with the labels
if X.shape != (len(cell_names), len(gene_names)):
    raise ValueError(
        f"Matrix shape {X.shape} does not match "
        f"{len(cell_names)} barcodes x {len(gene_names)} genes"
    )

# Create metadata DataFrames; keep the gene IDs next to the gene names
obs = pd.DataFrame(index=cell_names)
var = pd.DataFrame({"gene_ids": gene_ids}, index=gene_names)

In [7]:
# Create AnnData

# Reuse the obs/var tables built in the previous cell instead of rebuilding them
adata = ad.AnnData(X=X, obs=obs, var=var)

# Some gene names occur more than once, which triggers the
# "names are not unique" warning. De-duplicate them (duplicates get a
# numeric suffix) so that indexing by gene name is unambiguous.
adata.var_names_make_unique()

  utils.warn_names_duplicates("var")


In [8]:
# Write the AnnData object to disk in .h5ad format.
# Optionally pass compression="gzip" to produce a smaller file.

adata.write("/Users/Desktop/New atlas format/adata.h5ad")

In [9]:
# Check adata: display the object's summary (n_obs × n_vars and any annotations)

adata

AnnData object with n_obs × n_vars = 2253 × 33694

## Reference data:

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3489183

Sample: GSM3489183

