In [2]:
import scanpy as sc
import pandas as pd
import os
from pathlib import Path

# ========== CONFIGURE PATHS ==========
# Base directory where your mapped samples are
base_dir = Path("/rds/general/user/pr422/projects/puklandmarkproject/ephemeral/alex/mapping_outs")

# Output directory for saving converted h5ad files
output_dir = Path("/rds/general/user/pr422/projects/puklandmarkproject/ephemeral/Parisa_scdownstream/scdown_old_samples/h5ad_converted")
output_dir.mkdir(parents=True, exist_ok=True)

# Path to save the samplesheet CSV
samplesheet_path = Path("/rds/general/user/pr422/projects/puklandmarkproject/ephemeral/Parisa_scdownstream/scdown_old_samples/input/samplesheet_converted_remapped_h5ad_epilep.csv")
samplesheet_path.parent.mkdir(parents=True, exist_ok=True)

# ========== START PROCESSING ==========

samples = [d for d in base_dir.iterdir() if d.is_dir() and d.name.endswith("_mapped")]
samples = sorted(samples)

records = []

for sample_path in samples:
    sample_id = sample_path.name.replace("_mapped", "")
    outs_path = sample_path / "outs"
    
    filtered_matrix_dir = outs_path / "filtered_feature_bc_matrix"
    raw_matrix_dir = outs_path / "raw_feature_bc_matrix"

    if not filtered_matrix_dir.exists() or not raw_matrix_dir.exists():
        print(f"Warning: Missing filtered or raw matrix for sample {sample_id}")
        continue

    # --------- Convert filtered
    filtered_adata = sc.read_10x_mtx(
        str(filtered_matrix_dir),
        var_names="gene_symbols",
        cache=False
    )
    filtered_outfile = output_dir / f"{sample_id}_filtered.h5ad"
    filtered_adata.write_h5ad(str(filtered_outfile))

    # --------- Convert raw
    raw_adata = sc.read_10x_mtx(
        str(raw_matrix_dir),
        var_names="gene_symbols",
        cache=False
    )
    raw_outfile = output_dir / f"{sample_id}_raw.h5ad"
    raw_adata.write_h5ad(str(raw_outfile))

    # --------- Save to records
    records.append({
        "sample": sample_id,
        "filtered": str(filtered_outfile),
        "unfiltered": str(raw_outfile)
    })

# ========== WRITE SAMPLE SHEET ==========

samplesheet_df = pd.DataFrame(records, columns=["sample", "filtered", "unfiltered"])
samplesheet_df.to_csv(samplesheet_path, index=False)

print(f"\n✅ Done! Converted {len(records)} samples.")
print(f"✅ Samplesheet saved at: {samplesheet_path}")


✅ Done! Converted 32 samples.
✅ Samplesheet saved at: /rds/general/user/pr422/projects/puklandmarkproject/ephemeral/Parisa_scdownstream/scdown_old_samples/input/samplesheet_converted_remapped_h5ad_epilep.csv


In [3]:
import scanpy as sc

for sample in ["S2A", "S18B"]:
    print(f"Checking {sample}:")
    adata_raw = sc.read_10x_h5(f"{sample}_unfiltered.h5")
    adata_filt = sc.read_10x_h5(f"{sample}_filtered.h5")
    
    print(f"Raw cells: {adata_raw.n_obs}, Filtered cells: {adata_filt.n_obs}")
    print(f"Features match: {adata_raw.var_names.equals(adata_filt.var_names)}")


Checking S2A:


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'S2A_unfiltered.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)