# Notebook to convert raw data to h5ad

**Created by :** Srivalli Kolla

**Created on :** 16 April, 2025

**Modified on :** 16 April, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [1]:
import anndata as ad
import scanpy as sc
import os
import datetime

In [2]:
# Scanpy logging verbosity is set to level 3.
sc.settings.verbosity = 3

# Print the versions of the loaded packages so the run environment is recorded
# in the notebook output.
sc.logging.print_versions()

# Date stamp in day_month_year form (two digits each); it is appended to the
# names of the exported h5ad files.
timestamp = format(datetime.datetime.now(), "%d_%m_%y")

-----
anndata     0.11.3
scanpy      1.10.4
-----
Cython                      3.0.12
PIL                         11.1.0
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        25.1.0
attrs                       25.1.0
babel                       2.17.0
certifi                     2025.01.31
charset_normalizer          3.4.1
colorama                    0.4.6
comm                        0.2.2
cycler                      0.12.1
cython                      3.0.12
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.12
decorator                   5.2.1
defusedxml                  0.7.1
executing                   2.1.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.13.0
idna                        3.10
igraph                      0.11.8
ipykernel                   6.29.5
ipywidgets                  8.1.5
isoduration               

  mod_version = _find_version(mod.__version__)


# Data loading

In [3]:
# IPython `cd` magic: switch the working directory to the project folder.
# The path is relative to the current working directory, so every relative
# path used below (e.g. ./data/...) is resolved from that folder.
cd './Github/ACM_sn_2025'

/home/gruengroup/srivalli/Github/ACM_sn_2025


In [4]:
# Input root; each sub-directory found in it is treated as one sample.
base_dir = "./data/Library2/"

# Keep only the entries of base_dir that are directories (plain files are
# ignored). The order is whatever os.listdir returns.
samples = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(base_dir, entry)),
        os.listdir(base_dir),
    )
)
samples

['B10_Lib2',
 'B5_Lib2',
 'B7_Lib2',
 'B6_Lib2',
 'B8_Lib2',
 'B9_Lib2',
 'B11_Lib2',
 'B3_Lib2',
 'B4_Lib2']

# Data export

1. Load the matrix files of each sample
2. Add the sample name by splitting the sample ID at '_' and keeping only the part before the first underscore (e.g. B3_Lib2 → B3)
3. Write the h5ad file to the output path

In [5]:
# Convert every sample directory into one date-stamped h5ad file.
for sample in samples:
    sample_dir = os.path.join(base_dir, sample)
    matrix_dir = os.path.join(sample_dir, "filtered_feature_bc_matrix")
    print(f"Loading matrix from {matrix_dir}...")

    # Read the matrix folder with gene symbols as variable names; no cache
    # file is used.
    adata = sc.read_10x_mtx(matrix_dir, cache=False, var_names='gene_symbols')

    # The sample label is the text before the first underscore
    # (B3_Lib2 -> B3); it is stored in the 'sample' column of obs.
    sample_id = sample.partition("_")[0]
    adata.obs['sample'] = sample_id

    # The h5ad file is written into the sample's own directory, next to the
    # matrix folder.
    out_file = os.path.join(sample_dir, f"{sample_id}_{timestamp}.h5ad")
    adata.write(out_file)
    print(f"Saved to {out_file}")

Loading matrix from ./data/Library2/B10_Lib2/filtered_feature_bc_matrix...
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Saved to ./data/Library2/B10_Lib2/B10_16_04_25.h5ad
Loading matrix from ./data/Library2/B5_Lib2/filtered_feature_bc_matrix...
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Saved to ./data/Library2/B5_Lib2/B5_16_04_25.h5ad
Loading matrix from ./data/Library2/B7_Lib2/filtered_feature_bc_matrix...
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Saved to ./data/Library2/B7_Lib2/B7_16_04_25.h5ad
Loading matrix from ./data/Library2/B6_Lib2/filtered_feature_bc_matrix...
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Saved to ./data/Library2/B6_Lib2/B6_16_04_25.h5ad
Loading matrix from ./data/Library2/B8_Lib2/filte