# 10xのデータをraw_dataから変換

In [2]:
import h5py, gzip, csv
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm.auto import tqdm
from pathlib import Path

def read_10x_h5_to_sparse_with_progress(h5_path):
    """
    10x Genomics の raw_feature_bc_matrix.h5 を読み込み，CSC スパース行列と各種IDを返す（tqdmつき）．
    """
    with h5py.File(h5_path, "r") as f:
        g = f["matrix"]

        tqdm.write("Reading sparse matrix components...")
        data    = g["data"][:]
        indices = g["indices"][:]
        indptr  = g["indptr"][:]
        shape   = tuple(g["shape"][:])

        X = sp.csc_matrix((data, indices, indptr), shape=shape)  # shape: (genes × cells)

        tqdm.write("Reading features...")
        feats = g["features"]
        feature_ids = [x.decode() if isinstance(x, bytes) else str(x) for x in tqdm(feats["id"], desc="feature_id")]
        gene_names  = [x.decode() if isinstance(x, bytes) else str(x) for x in tqdm(feats["name"], desc="gene_name")]

        tqdm.write("Reading barcodes...")
        barcodes = [x.decode() if isinstance(x, bytes) else str(x) for x in tqdm(g["barcodes"], desc="barcodes")]

    return X, feature_ids, gene_names, barcodes

def to_pickle_like_legacy(X_csc, feature_ids, barcodes, out_path):
    """
    Geneformer風の pickle format に変換（列＝細胞，行＝遺伝子）．
    """
    tqdm.write("Converting to DataFrame (CSR)...")
    X_csr = X_csc.tocsr()

    tqdm.write("Building sparse DataFrame...")
    df = pd.DataFrame.sparse.from_spmatrix(X_csr)
    df.columns = barcodes
    df.insert(0, "feature_id", feature_ids)

    tqdm.write(f"Saving to: {out_path}")
    df.to_pickle(out_path, compression="gzip")

    return df.shape

# 例：使い方
in_path  = Path("/work/kyushu_univ_lecture/10x/neuron_1k_v2_raw_feature_bc_matrix.h5")
out_path = in_path.with_suffix(".pkl.gz")

X, feature_ids, gene_names, barcodes = read_10x_h5_to_sparse_with_progress(str(in_path))
shape = to_pickle_like_legacy(X, feature_ids, barcodes, str(out_path))

print("wrote:", out_path)
print("shape (genes, cells) =", shape)

Reading sparse matrix components...
Reading features...


feature_id:   0%|          | 0/31053 [00:00<?, ?it/s]

gene_name:   0%|          | 0/31053 [00:00<?, ?it/s]

Reading barcodes...


barcodes:   0%|          | 0/737280 [00:00<?, ?it/s]

Converting to DataFrame (CSR)...
Building sparse DataFrame...
Saving to: /work/kyushu_univ_lecture/10x/neuron_1k_v2_raw_feature_bc_matrix.pkl.gz
wrote: /work/kyushu_univ_lecture/10x/neuron_1k_v2_raw_feature_bc_matrix.pkl.gz
shape (genes, cells) = (31053, 737281)


In [5]:
import sys, time
import numpy as np, pandas as pd, scipy.sparse as sp, scanpy as sc
from pathlib import Path
import matplotlib.pyplot as plt
from typing import Union, Optional
from tqdm.notebook import tqdm

# ---- ログユーティリティ ----
_t0 = time.perf_counter()
def log(msg):
    now = time.perf_counter() - _t0
    print(f"[{now:8.2f}s] {msg}")
    sys.stdout.flush()

# ---------- 入力を AnnData に統一（.pkl は絶対に densify しない） ----------
def load_to_adata(input_obj: Union[str, Path, sc.AnnData, pd.DataFrame]):
    if isinstance(input_obj, sc.AnnData):
        ad = input_obj
        if "counts" not in ad.layers:
            ad.layers["counts"] = ad.X.copy()
        return ad

    if isinstance(input_obj, pd.DataFrame):
        log("DataFrame→AnnData 変換開始")
        df = input_obj
        feat = df.iloc[:, 0].astype(str).values
        # ★ SparseDataFrame を想定し，COO→CSR に変換（densify回避）
        mat_df = df.iloc[:, 1:]
        if hasattr(mat_df, "sparse"):
            X = mat_df.sparse.to_coo().tocsr()
        else:
            X = sp.csr_matrix(mat_df.values)  # 最悪ここで densify
        # 向きは (genes, cells) と仮定→転置
        X = X.T
        ad = sc.AnnData(X)
        ad.var_names = feat
        ad.obs_names = mat_df.columns.astype(str)
        ad.layers["counts"] = ad.X.copy()
        log(f"DataFrame→AnnData 完了：shape={ad.shape}")
        return ad

    path = str(input_obj)
    if path.endswith(".h5ad"):
        log(f"read_h5ad 開始: {path}")
        ad = sc.read_h5ad(path)
        if "counts" not in ad.layers:
            ad.layers["counts"] = ad.X.copy()
        log(f"read_h5ad 完了：shape={ad.shape}")
        return ad

    if path.endswith(".pkl") or path.endswith(".pkl.gz"):
        log(f"read_pickle 開始: {path}")
        df = pd.read_pickle(path)
        log("read_pickle 完了")
        # 先頭列（遺伝子）＋残り（細胞）
        feat = df.iloc[:, 0].astype(str).values
        barcodes = df.columns[1:].astype(str).values
        mat_df = df.iloc[:, 1:]

        # ★ ここが重要：pandas Sparse を COOrdinate で取り出して scipy.sparse に直す
        if hasattr(mat_df, "sparse"):
            log("DataFrame.sparse.to_coo() → CSR 変換開始")
            X = mat_df.sparse.to_coo().tocsr()  # shape: (genes, cells)
        else:
            log("警告: densify の可能性あり（.sparse が無い）")
            X = sp.csr_matrix(mat_df.values)

        # 向き自動判定
        if X.shape[0] == len(feat) and X.shape[1] == len(barcodes):
            Xt = X.T  # (cells, genes)
            obs_names, var_names = barcodes, feat
        elif X.shape[1] == len(feat) and X.shape[0] == len(barcodes):
            Xt = X    # 既に (cells, genes)
            obs_names, var_names = barcodes, feat
        else:
            raise ValueError(f"形状不一致：feat={len(feat):,}，mat={X.shape}，barcodes={len(barcodes):,}")

        # 重複名対策（必要ならユニーク化）
        var_idx = pd.Index(var_names).astype(str)
        if var_idx.has_duplicates:
            var_idx = (var_idx + pd.Series(range(len(var_idx))).astype(str).values)
        obs_idx = pd.Index(obs_names).astype(str)
        if obs_idx.has_duplicates:
            obs_idx = (obs_idx + pd.Series(range(len(obs_idx))).astype(str).values)

        ad = sc.AnnData(Xt)
        ad.obs_names = obs_idx
        ad.var_names = var_idx
        ad.layers["counts"] = ad.X.copy()
        log(f".pkl 読み込み→AnnData 完了：shape={ad.shape}，sparse={sp.issparse(ad.layers['counts'])}")
        return ad

    if path.endswith(".h5"):
        log(f"10x .h5 読み込み開始: {path}")
        ad = sc.read_10x_h5(path)
        ad.layers["counts"] = ad.X.copy()
        log(f"10x .h5 読み込み完了：shape={ad.shape}")
        return ad

    raise ValueError("Unsupported input: " + path)

# ---- チャンク生成 ----
def _chunk_ranges(n: int, chunk_size: int):
    for i in range(0, n, chunk_size):
        yield slice(i, min(i + chunk_size, n))

# ---- tqdm 付き compute_qc（心拍ログあり）----
def compute_qc(ad: sc.AnnData, force: bool = False, show_progress: bool = True, chunk_size: int = 50_000) -> sc.AnnData:
    log("compute_qc 開始")
    needed = ["n_counts", "n_genes_by_counts", "mt_counts"]
    if not force and all(col in ad.obs.columns for col in needed):
        log("既存 QC 列を流用（再計算なし）")
        return ad
    if "counts" not in ad.layers:
        raise KeyError("ad.layers['counts'] が見つかりません．")

    X = ad.layers["counts"]
    is_sparse = sp.issparse(X)
    X = X.tocsr() if is_sparse else np.asarray(X)
    n_cells, n_genes = X.shape
    log(f"counts 取得：sparse={is_sparse}，shape={n_cells:,}×{n_genes:,}")

    # ---- MT フラグ作成（NumPyベースで安定）----
    if "symbol" in ad.var.columns:
        var_names = ad.var["symbol"].astype(str).to_numpy()
    else:
        var_names = np.asarray(ad.var_names).astype(str)
    mt_flag = np.char.startswith(np.char.upper(var_names), "MT-")
    mt_any  = bool(mt_flag.any())
    log(f"MT フラグ作成：mt_any={mt_any}")

    n_counts = np.zeros(n_cells, dtype=np.float64)
    n_genes_by_counts = np.zeros(n_cells, dtype=np.int32)
    mt_counts = np.zeros(n_cells, dtype=np.float64)

    if is_sparse:
        it = list(_chunk_ranges(n_cells, chunk_size))
        pbar = tqdm(it, desc=f"QC（sparse）cells={n_cells:,}", unit="chunk") if show_progress else it
        last_hb = time.perf_counter()
        for k, sl in enumerate(pbar):
            Xs = X[sl]
            n_counts[sl] = np.asarray(Xs.sum(axis=1)).ravel()
            n_genes_by_counts[sl] = np.diff(Xs.indptr).astype(np.int32)
            if mt_any:
                mt_counts[sl] = np.asarray(Xs[:, mt_flag].sum(axis=1)).ravel()
            else:
                mt_counts[sl] = 0.0
            now = time.perf_counter()
            if now - last_hb > 5:
                log(f"進捗: chunk {k+1}/{len(it)}（cells ~{sl.stop:,}）")
                last_hb = now
    else:
        log("dense 経路で集計")
        n_counts[:] = X.sum(axis=1).astype(np.float64)
        n_genes_by_counts[:] = (X > 0).sum(axis=1).astype(np.int32)
        mt_counts[:] = (X[:, mt_flag].sum(axis=1).astype(np.float64) if mt_any else 0.0)

    ad.obs["n_counts"] = n_counts
    ad.obs["n_genes_by_counts"] = n_genes_by_counts
    ad.obs["mt_counts"] = mt_counts
    log("compute_qc 完了")
    return ad


# ---- フィルタ ----
def filter_by_original_spec(
    ad: sc.AnnData,
    min_genes: int = 7,
    total_cap: int = 20_000,
    write_loom: Optional[Union[str, Path]] = None,
    write_h5ad: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    log("filter_by_original_spec 開始")
    for col in ["n_counts", "n_genes_by_counts", "mt_counts"]:
        if col not in ad.obs.columns:
            raise KeyError(f"{col} がありません．compute_qc() を先に実行してください．")

    c = ad.obs["n_counts"].values
    g = ad.obs["n_genes_by_counts"].values
    m = ad.obs["mt_counts"].values

    mu_c = float(c.mean()); sd_c = float(c.std())
    lo_c = max(0.0, mu_c - 3.0*sd_c); hi_c = mu_c + 3.0*sd_c

    mu_m = float(m.mean()); sd_m = float(m.std())
    lo_m = max(0.0, mu_m - 3.0*sd_m); hi_m = mu_m + 3.0*sd_m

    mask = (
        (c >= lo_c) & (c <= hi_c) &
        (m >= lo_m) & (m <= hi_m) &
        (c > 0) & (c <= total_cap) &
        (g >= int(min_genes))
    )
    ad_f = ad[mask].copy()

    if verbose:
        before = int(ad.n_obs); after = int(ad_f.n_obs)
        removed = before - after; rate = 100.0 * removed / max(before, 1)
        print("=== thresholds (dataset-wise) ===")
        print(f"n_counts in [{lo_c:,.2f}, {hi_c:,.2f}]  (μ={mu_c:,.2f}, σ={sd_c:,.2f})")
        print(f"mt_counts in [{lo_m:,.2f}, {hi_m:,.2f}] (μ={mu_m:,.2f}, σ={sd_m:,.2f})")
        print(f"0 < total ≤ {total_cap:,}")
        print(f"n_genes_by_counts ≥ {min_genes}")
        print(f"cells: {before:,} → {after:,}  (removed {removed:,}, {rate:.2f}%)")

    if write_loom:
        Path(write_loom).parent.mkdir(parents=True, exist_ok=True)
        ad_f.write_loom(str(write_loom)); log(f"wrote loom: {write_loom}")
    if write_h5ad:
        Path(write_h5ad).parent.mkdir(parents=True, exist_ok=True)
        ad_f.write_h5ad(str(write_h5ad), compression="gzip"); log(f"wrote h5ad: {write_h5ad}")

    log("filter_by_original_spec 完了")
    # 可視化用もしきい値返したいならここで dict 返却にしてもOK
    thr = dict(lo_c=lo_c, hi_c=hi_c, lo_m=lo_m, hi_m=hi_m, total_cap=total_cap, min_genes=int(min_genes))
    return ad_f, thr, mask

# ---- 一括ラッパ（ログ込み）----
def run_qc_on_multiple_files(
    input_files: list[str],
    output_loom_dir: Union[str, Path],
    min_genes: int = 7,
    total_cap: int = 20_000,
    show_progress: bool = True,
    plot: bool = False,
    chunk_size: int = 50_000,
) -> list[sc.AnnData]:
    """
    複数ファイルを順にフィルタリングし，出力 loom を保存．
    戻り値：フィルタ後の AnnData オブジェクト一覧
    """
    results = []
    output_loom_dir = Path(output_loom_dir)
    output_loom_dir.mkdir(parents=True, exist_ok=True)

    for path in input_files:
        print("="*80)
        print(f"Processing: {path}")
        name = Path(path).stem  # ex: neuron_1k_v2_raw_feature_bc_matrix
        loom_path = output_loom_dir / f"{name}.filtered.loom"
        try:
            ad_f, ad, thr, mask = run_qc_filter_with_plots(
                path,
                min_genes=min_genes,
                total_cap=total_cap,
                show_progress=show_progress,
                plot=plot,
                write_loom=loom_path,
                chunk_size=chunk_size,
            )
            results.append(ad_f)
        except Exception as e:
            print(f"[ERROR] Skipping {path}: {e}")
    return results



In [None]:
import glob

input_files = sorted(glob.glob("/work/raw_data/10x/*.pkl.gz"))

# フィルタ＋loom出力
filtered_ads = run_qc_on_multiple_files(
    input_files,
    output_loom_dir="/work/raw_data/10x/loom",
    min_genes=7,
    total_cap=20_000,
    show_progress=True,
    plot=False,        # 多数処理時はFalse推奨
    chunk_size=10_000,
)

print("完了：", len(filtered_ads), "ファイル処理済み")


Processing: /work/raw_data/10x/10k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.gz
[    0.64s] === パイプライン開始 ===
[    0.64s] read_pickle 開始: /work/raw_data/10x/10k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.gz
[  604.13s] read_pickle 完了
[  814.17s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 1199.87s] .pkl 読み込み→AnnData 完了：shape=(6794880, 31053)，sparse=True
[ 1214.29s] QC 前 スパース判定: True
[ 1214.29s] compute_qc 開始
[ 1215.79s] counts 取得：sparse=True，shape=6,794,880×31,053
[ 1215.80s] MT フラグ作成：mt_any=False


QC（sparse）cells=6,794,880:   0%|          | 0/680 [00:00<?, ?chunk/s]

[ 1216.71s] compute_qc 完了
[ 1216.72s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,178.94]  (μ=14.84, σ=388.03)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 6,794,880 → 111,484  (removed 6,683,396, 98.36%)
[ 1504.17s] wrote loom: /work/raw_data/10x/loom/10k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.filtered.loom
[ 1504.17s] filter_by_original_spec 完了
[ 1504.17s] === パイプライン完了 ===
Processing: /work/raw_data/10x/10k_Heart_Cells_from_an_E18_mouse_v3_chemistry.pkl.gz
[ 1504.17s] === パイプライン開始 ===
[ 1504.17s] read_pickle 開始: /work/raw_data/10x/10k_Heart_Cells_from_an_E18_mouse_v3_chemistry.pkl.gz
[ 2255.77s] read_pickle 完了
[ 2488.18s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 2878.96s] .pkl 読み込み→AnnData 完了：shape=(6794880, 31053)，sparse=True
[ 2893.48s] QC 前 スパース判定: True
[ 2893.48s] compute_qc 開始
[ 2894.59s] counts 取得：sparse=True，shape=6,794,880×31,053
[ 2894.60s] MT フラグ作成：mt_any=False


QC（sparse）cells=6,794,880:   0%|          | 0/680 [00:00<?, ?chunk/s]

[ 2895.37s] compute_qc 完了
[ 2895.37s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,655.39]  (μ=13.81, σ=547.19)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 6,794,880 → 99,743  (removed 6,695,137, 98.53%)
[ 3109.49s] wrote loom: /work/raw_data/10x/loom/10k_Heart_Cells_from_an_E18_mouse_v3_chemistry.pkl.filtered.loom
[ 3109.49s] filter_by_original_spec 完了
[ 3109.49s] === パイプライン完了 ===
Processing: /work/raw_data/10x/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Dual_Indexed.pkl.gz
[ 3109.69s] === パイプライン開始 ===
[ 3109.69s] read_pickle 開始: /work/raw_data/10x/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Dual_Indexed.pkl.gz
[ 3918.34s] read_pickle 完了
[ 4188.20s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 4612.22s] .pkl 読み込み→AnnData 完了：shape=(6794880, 32285)，sparse=True
[ 4629.21s] QC 前 スパース判定: True
[ 4629.21s] compute_qc 開始
[ 4631.63s] counts 取得：sparse=True，shape=

QC（sparse）cells=6,794,880:   0%|          | 0/680 [00:00<?, ?chunk/s]

[ 4633.22s] compute_qc 完了
[ 4633.22s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,947.32]  (μ=23.69, σ=641.21)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 6,794,880 → 67,557  (removed 6,727,323, 99.01%)
[ 4758.10s] wrote loom: /work/raw_data/10x/loom/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Dual_Indexed.pkl.filtered.loom
[ 4758.10s] filter_by_original_spec 完了
[ 4758.10s] === パイプライン完了 ===
Processing: /work/raw_data/10x/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Single_Indexed.pkl.gz
[ 4758.29s] === パイプライン開始 ===
[ 4758.29s] read_pickle 開始: /work/raw_data/10x/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Single_Indexed.pkl.gz
[ 5594.50s] read_pickle 完了
[ 5835.70s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 6354.68s] .pkl 読み込み→AnnData 完了：shape=(6794880, 32285)，sparse=True
[ 6372.65s] QC 前 スパース判定: True
[ 6372.66s] compute_qc 開

QC（sparse）cells=6,794,880:   0%|          | 0/680 [00:00<?, ?chunk/s]

[ 6377.39s] compute_qc 完了
[ 6377.39s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,936.68]  (μ=24.20, σ=637.49)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 6,794,880 → 139,316  (removed 6,655,564, 97.95%)
[ 7010.90s] wrote loom: /work/raw_data/10x/loom/10k_Mouse_E18_Combined_Cortex_Hippocampus_and_Subventricular_Zone_Cells_Single_Indexed.pkl.filtered.loom
[ 7010.91s] filter_by_original_spec 完了
[ 7010.91s] === パイプライン完了 ===
Processing: /work/raw_data/10x/1k_Brain_Cells_from_an_E18_Mouse.pkl.gz
[ 7011.14s] === パイプライン開始 ===
[ 7011.14s] read_pickle 開始: /work/raw_data/10x/1k_Brain_Cells_from_an_E18_Mouse.pkl.gz
[ 7065.89s] read_pickle 完了
[ 7075.81s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 7102.27s] .pkl 読み込み→AnnData 完了：shape=(737280, 27998)，sparse=True
[ 7104.63s] QC 前 スパース判定: True
[ 7104.63s] compute_qc 開始
[ 7104.74s] counts 取得：sparse=True，shape=737,280×27,998
[ 7104.76s] MT フラグ作成：mt_any=False


QC（sparse）cells=737,280:   0%|          | 0/74 [00:00<?, ?chunk/s]

[ 7105.12s] compute_qc 完了
[ 7105.12s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,595.99]  (μ=18.76, σ=525.74)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 737,280 → 62,150  (removed 675,130, 91.57%)
[ 7187.84s] wrote loom: /work/raw_data/10x/loom/1k_Brain_Cells_from_an_E18_Mouse.pkl.filtered.loom
[ 7187.84s] filter_by_original_spec 完了
[ 7187.84s] === パイプライン完了 ===
Processing: /work/raw_data/10x/1k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.gz
[ 7188.04s] === パイプライン開始 ===
[ 7188.04s] read_pickle 開始: /work/raw_data/10x/1k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.gz
[ 8041.05s] read_pickle 完了
[ 8317.55s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 8749.18s] .pkl 読み込み→AnnData 完了：shape=(6794880, 31053)，sparse=True
[ 8767.41s] QC 前 スパース判定: True
[ 8767.41s] compute_qc 開始
[ 8767.65s] counts 取得：sparse=True，shape=6,794,880×31,053
[ 8767.66s] MT フラグ作成：mt_any=False


QC（sparse）cells=6,794,880:   0%|          | 0/680 [00:00<?, ?chunk/s]

[ 8769.15s] compute_qc 完了
[ 8769.15s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 616.82]  (μ=2.56, σ=204.75)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 6,794,880 → 95,400  (removed 6,699,480, 98.60%)
[ 8918.94s] wrote loom: /work/raw_data/10x/loom/1k_Brain_Cells_from_an_E18_Mouse_v3_chemistry.pkl.filtered.loom
[ 8918.95s] filter_by_original_spec 完了
[ 8918.95s] === パイプライン完了 ===
Processing: /work/raw_data/10x/1k_Brain_Nuclei_from_an_E18_Mouse.pkl.gz
[ 8918.98s] === パイプライン開始 ===
[ 8918.98s] read_pickle 開始: /work/raw_data/10x/1k_Brain_Nuclei_from_an_E18_Mouse.pkl.gz
[ 8972.90s] read_pickle 完了
[ 9162.43s] DataFrame.sparse.to_coo() → CSR 変換開始
[ 9218.45s] .pkl 読み込み→AnnData 完了：shape=(737280, 27998)，sparse=True
[ 9220.58s] QC 前 スパース判定: True
[ 9220.58s] compute_qc 開始
[ 9220.72s] counts 取得：sparse=True，shape=737,280×27,998
[ 9220.74s] MT フラグ作成：mt_any=False


QC（sparse）cells=737,280:   0%|          | 0/74 [00:00<?, ?chunk/s]

[ 9221.07s] compute_qc 完了
[ 9221.07s] filter_by_original_spec 開始
=== thresholds (dataset-wise) ===
n_counts in [0.00, 1,473.26]  (μ=20.10, σ=484.39)
mt_counts in [0.00, 0.00] (μ=0.00, σ=0.00)
0 < total ≤ 20,000
n_genes_by_counts ≥ 7
cells: 737,280 → 69,256  (removed 668,024, 90.61%)
