# Notebook to convert raw data to h5ad

**Created by :** Srivalli Kolla

**Created on :** 13 May, 2025

**Modified on :** 13 May, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [50]:
import anndata as ad
import scanpy as sc
import pandas as pd
import os
import datetime

In [51]:
sc.settings.verbosity = 3
sc.logging.print_versions()

timestamp = datetime.datetime.now().strftime("%d_%m_%y")

-----
anndata     0.11.3
scanpy      1.10.4
-----
Cython                      3.0.12
PIL                         11.1.0
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        25.1.0
attrs                       25.1.0
babel                       2.17.0
certifi                     2025.01.31
charset_normalizer          3.4.1
colorama                    0.4.6
comm                        0.2.2
cycler                      0.12.1
cython                      3.0.12
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.12
decorator                   5.2.1
defusedxml                  0.7.1
executing                   2.1.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.13.0
idna                        3.10
igraph                      0.11.8
ipykernel                   6.29.5
ipywidgets                  8.1.5
isoduration               

  mod_version = _find_version(mod.__version__)


In [52]:
base_dir = "./Github/Matthias_sn_data_2025/data"

samples = [
    d
    for d in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, d)) and d.endswith("_count")
]

print(samples)

['SCC0203_1_Becker_GEX_D11_count', 'SCC0203_2_Becker_GEX_E5_count', 'SCC0203_2_Becker_GEX_E6_count', 'SCC0203_2_Becker_GEX_E7_count']


# Data export

1. Load matrix files
2. Add Sample name by splitting the id by '_' and considering only part before split(B3_Lib2 = B3)
3. Writing h5ad file in output path

In [53]:
for sample in samples:
    matrix_dir = os.path.join(base_dir, sample, "outs", "filtered_feature_bc_matrix")

    print(f"Loading matrix from {matrix_dir}...")
    adata = sc.read_10x_mtx(matrix_dir, var_names='gene_symbols', cache=True)
    sample_id = sample.split("_")[4]
    adata.obs['sample_name'] = sample_id
    
    out_file = os.path.join(base_dir, sample, f"{sample_id}_{timestamp}.h5ad")
    adata.write(out_file)

    print(f"Saved to {out_file}")

Loading matrix from ./Github/Matthias_sn_data_2025/data/SCC0203_1_Becker_GEX_D11_count/outs/filtered_feature_bc_matrix...
... reading from cache file cache/Github-Matthias_sn_data_2025-data-SCC0203_1_Becker_GEX_D11_count-outs-filtered_feature_bc_matrix-matrix.h5ad
Saved to ./Github/Matthias_sn_data_2025/data/SCC0203_1_Becker_GEX_D11_count/D11_13_05_25.h5ad
Loading matrix from ./Github/Matthias_sn_data_2025/data/SCC0203_2_Becker_GEX_E5_count/outs/filtered_feature_bc_matrix...
... reading from cache file cache/Github-Matthias_sn_data_2025-data-SCC0203_2_Becker_GEX_E5_count-outs-filtered_feature_bc_matrix-matrix.h5ad
Saved to ./Github/Matthias_sn_data_2025/data/SCC0203_2_Becker_GEX_E5_count/E5_13_05_25.h5ad
Loading matrix from ./Github/Matthias_sn_data_2025/data/SCC0203_2_Becker_GEX_E6_count/outs/filtered_feature_bc_matrix...
... reading from cache file cache/Github-Matthias_sn_data_2025-data-SCC0203_2_Becker_GEX_E6_count-outs-filtered_feature_bc_matrix-matrix.h5ad
Saved to ./Github/Matth

# Data merging

In [54]:
h5ad_paths = [
    os.path.join(root, file)
    for root, _, files in os.walk(base_dir,sample)
    for file in files
    if file.endswith(".h5ad")
]

In [55]:
adatas = [sc.read_h5ad(p) for p in h5ad_paths]

adata_combined = ad.concat(adatas, join='inner')
adata_combined

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 79592 × 32285
    obs: 'sample_name'

In [56]:
adata_combined.obs

Unnamed: 0,sample_name
0,D11
1,D11
2,D11
3,D11
4,D11
...,...
TTTGTGGCAGTTATGT-1,E7
TTTGTGTTCACAAGCT-1,E7
TTTGTGTTCACCTGTC-1,E7
TTTGTGTTCTTGTCTG-1,E7


In [57]:
adata_combined.var

Xkr4
Gm1992
Gm19938
Gm37381
Rp1
...
AC124606.1
AC133095.2
AC133095.1
AC234645.1
AC149090.1


# Add metadata

In [58]:
metadata = pd.read_csv("./Github/Matthias_sn_data_2025/data/metadata_Matthias_sn_data_2025.csv",sep= ',') 
metadata

Unnamed: 0,sample_name,Genotype,Sample_ID
0,D11,KDM6A_WT,D11_KDM6A_WT
1,E7,KDM6A_WT,E7_KDM6A_WT
2,E5,KDM6A_KO,E5_KDM6A_KO
3,E6,KDM6A_KO,E6_KDM6A_KO


In [59]:
adata_combined.obs = adata_combined.obs.merge(metadata, how='left', on='sample_name')
adata_combined

AnnData object with n_obs × n_vars = 79592 × 32285
    obs: 'sample_name', 'Genotype', 'Sample_ID'

In [60]:
adata_combined.obs

Unnamed: 0,sample_name,Genotype,Sample_ID
0,D11,KDM6A_WT,D11_KDM6A_WT
1,D11,KDM6A_WT,D11_KDM6A_WT
2,D11,KDM6A_WT,D11_KDM6A_WT
3,D11,KDM6A_WT,D11_KDM6A_WT
4,D11,KDM6A_WT,D11_KDM6A_WT
...,...,...,...
79587,E7,KDM6A_WT,E7_KDM6A_WT
79588,E7,KDM6A_WT,E7_KDM6A_WT
79589,E7,KDM6A_WT,E7_KDM6A_WT
79590,E7,KDM6A_WT,E7_KDM6A_WT


In [61]:
combined_path = os.path.join(base_dir, f"matthias_sn_concat_raw_{timestamp}.h5ad")
adata_combined.write(combined_path)