In [1]:
import scanpy as sc
import pandas as pd
import glob, os

# Read metadata as a flat DataFrame (we’ll merge on two columns)
meta = pd.read_csv("GSE212217_seurat_scRNAseq_metadata.txt", sep="\t")
print("Metadata:", meta.shape, "columns:", meta.columns.tolist())

# Find all your HDF5s
h5_files = sorted(glob.glob("GSM*_scRNA_filtered_feature_bc_matrix.h5"))
print("Found", len(h5_files), "files, e.g.:", h5_files[:2])

adatas = []

for f in h5_files:
    sample = os.path.basename(f).split("_")[0]
    print(f"→ loading {sample} from {f}")
    
    # load with Scanpy
    ad = sc.read_10x_h5(f)
    # join the metadata on BOTH barcode and orig.ident
    #    first—extract the sample‐ID that matches your meta.orig.ident
    ident = os.path.basename(f).split("_")[1]      # e.g. "PEM1C1"
    ad.obs["orig.ident"] = ident
    
    #    bring the obs index (the 10× barcode) into a column so we can merge on it
    ad.obs["cellbarcode"] = ad.obs.index.astype(str)
    
    #    reset_index so everything is in columns, merge, then restore the barcode index
    meta2 = meta.reset_index()  # so meta2 has a "cellbarcode" column + orig.ident
    meta2["cellbarcode"] = meta2["cellbarcode"].str.replace(r"(_\d+)+$", "", regex=True)
    
    obs = (
        ad.obs.reset_index(drop=True)
               .merge(meta2, on=["cellbarcode","orig.ident"], how="left")
               .set_index("cellbarcode")
    )


    mask = obs["nCount_RNA"].notna()

    obs = obs[mask].copy()
    ad = ad[mask.values, :]

    # rebuild the composite index
    composite = obs.index + "_" + obs["orig.ident"]
    obs.index = composite
    ad.obs = obs
    ad.obs_names = composite
    ad.obs["sample"] = ident
    
    #    put it back into your AnnData
    ad.obs = obs
    
    #  now add your sample label
    ad.obs["sample"] = sample
    ad.var_names_make_unique()
    print(f"   → {ad.n_obs} cells × {ad.n_vars} genes, now obs has {ad.obs.shape[1]} columns")
    adatas.append(ad)

  meta = pd.read_csv("GSE212217_seurat_scRNAseq_metadata.txt", sep="\t")


Metadata: (315843, 14) columns: ['cellbarcode', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'propMt', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.9', 'seurat_clusters', 'timepoint', 'timepointBinary', 'patient', 'clinical', 'finalIdent']
Found 52 files, e.g.: ['GSM6514096_PEM1C1_scRNA_filtered_feature_bc_matrix.h5', 'GSM6514097_PEM1C3_scRNA_filtered_feature_bc_matrix.h5']
→ loading GSM6514096 from GSM6514096_PEM1C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4831 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514097 from GSM6514097_PEM1C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 3734 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514098 from GSM6514098_PEM2C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5056 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514099 from GSM6514099_PEM2C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4029 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514100 from GSM6514100_PEM2C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4251 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514101 from GSM6514101_PEM3C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4406 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514102 from GSM6514102_PEM3C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 3665 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514103 from GSM6514103_PEM3C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 3008 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514104 from GSM6514104_PEM5C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")


   → 1463 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514105 from GSM6514105_PEM5C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5205 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514106 from GSM6514106_PEM5C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6986 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514107 from GSM6514107_PEM6C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 7720 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514108 from GSM6514108_PEM6C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 7401 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514109 from GSM6514109_PEM6C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5026 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514110 from GSM6514110_PEM7C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 9715 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514111 from GSM6514111_PEM7C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6929 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514112 from GSM6514112_PEM7C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 2933 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514113 from GSM6514113_PEM8C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4793 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514114 from GSM6514114_PEM8C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4546 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514115 from GSM6514115_PEM8C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 2890 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514116 from GSM6514116_PEM9C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5267 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514117 from GSM6514117_PEM9C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 2334 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514118 from GSM6514118_PEM9C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5202 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514119 from GSM6514119_PEM10C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 2469 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514120 from GSM6514120_PEM10C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4488 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514121 from GSM6514121_PEM10C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4507 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514122 from GSM6514122_PEM11C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5500 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514123 from GSM6514123_PEM11C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6143 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514124 from GSM6514124_PEM11C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 9101 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514125 from GSM6514125_PEM12C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 3019 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514126 from GSM6514126_PEM12C3_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6887 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514127 from GSM6514127_PEM12C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 9320 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514128 from GSM6514128_PEM13C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5532 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514129 from GSM6514129_PEM13C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5208 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514130 from GSM6514130_PEM14C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 7085 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514131 from GSM6514131_PEM14C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5602 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514132 from GSM6514132_PEM15C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5735 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514133 from GSM6514133_PEM15C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 7899 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514134 from GSM6514134_PEM16C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6140 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514135 from GSM6514135_PEM16C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5588 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514136 from GSM6514136_PEM17C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4130 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514137 from GSM6514137_PEM17C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5402 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514138 from GSM6514138_PEM18C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 3117 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514139 from GSM6514139_PEM18C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6282 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514140 from GSM6514140_PEM19C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 4456 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514141 from GSM6514141_PEM19C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5606 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514142 from GSM6514142_PEM20C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 5726 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514143 from GSM6514143_PEM20C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 324 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514144 from GSM6514144_PEM21C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")


   → 753 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514145 from GSM6514145_PEM22C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6116 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514146 from GSM6514146_PEM23C1_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")


   → 65 cells × 36601 genes, now obs has 15 columns
→ loading GSM6514147 from GSM6514147_PEM23C5_scRNA_filtered_feature_bc_matrix.h5


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


   → 6244 cells × 36601 genes, now obs has 15 columns


  utils.warn_names_duplicates("var")


In [2]:
adata_all = sc.concat(
    adatas,
    join="outer",
    label="sample",
    index_unique=None
)
print("Combined AnnData:", adata_all)

Combined AnnData: AnnData object with n_obs × n_vars = 259834 × 36601
    obs: 'orig.ident', 'index', 'nCount_RNA', 'nFeature_RNA', 'propMt', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.9', 'seurat_clusters', 'timepoint', 'timepointBinary', 'patient', 'clinical', 'finalIdent', 'sample'


In [3]:
print("Combined AnnData:", adata_all)
print("Combined obs columns:", adata_all.obs.columns.tolist())

Combined AnnData: AnnData object with n_obs × n_vars = 259834 × 36601
    obs: 'orig.ident', 'index', 'nCount_RNA', 'nFeature_RNA', 'propMt', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.9', 'seurat_clusters', 'timepoint', 'timepointBinary', 'patient', 'clinical', 'finalIdent', 'sample'
Combined obs columns: ['orig.ident', 'index', 'nCount_RNA', 'nFeature_RNA', 'propMt', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.9', 'seurat_clusters', 'timepoint', 'timepointBinary', 'patient', 'clinical', 'finalIdent', 'sample']


In [4]:
for col in adata_all.obs.columns:
    if adata_all.obs[col].dtype == object:
        adata_all.obs[col] = adata_all.obs[col].fillna("").astype(str)

In [5]:
#save out
adata_all.write_h5ad("GSE212217_all_samples_with_metadata.h5ad")
print("Wrote merged AnnData ➔ GSE212217_all_samples_with_metadata.h5ad")

Wrote merged AnnData ➔ GSE212217_all_samples_with_metadata.h5ad
