In [1]:
# Notebook 02: Dropout Simulation

import scanpy as sc
import numpy as np
import pandas as pd
import os



In [2]:
# -----------------------------
# Load Ground Truth
# -----------------------------
# Read the AnnData object stored at 'Data/adata_raw_qc.h5ad' (path is relative
# to the notebook's working directory) and bind it to `adata`.
# NOTE(review): judging by the file name this is presumably the QC-filtered raw
# count matrix used as the reference for the simulated dropout -- confirm.
# NOTE(review): the captured output of this cell points at anndata's
# duplicate-name warnings for obs and var; verify whether the names should be
# made unique before downstream use.
adata = sc.read_h5ad('Data/adata_raw_qc.h5ad')

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [4]:
# -----------------------------
# Function to Simulate Dropout
# -----------------------------
def simulate_dropout_sparse(X, missing_fraction, rng=None):
    """Zero out a random subset of ALL entries of ``X`` and return it as CSR.

    Every entry, zero or not, is independently set to 0 with probability
    ``missing_fraction``. Entries that are already 0 keep their value.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Input matrix. It is never modified; masking is applied to a float32
        copy. Sparse input is densified first (the random mask is dense
        anyway, so this function is only suitable for matrices that fit in
        memory as a dense array).
    missing_fraction : float
        Per-entry probability of being set to zero.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's legacy global state
        (``np.random.rand``) is used, so ``np.random.seed`` still controls
        the result exactly as before.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix with the same shape as ``X``.
    """
    # Imported locally: in the cell where this function is defined, `sparse`
    # has not been imported yet, so relying on the global name would raise a
    # NameError if the function were called before the later import cell ran.
    from scipy import sparse

    if sparse.issparse(X):
        X = X.toarray()
    # np.array(...) always copies, so the caller's data stays untouched.
    masked = np.array(X, dtype=np.float32)

    if rng is None:
        draws = np.random.rand(*masked.shape)
    else:
        draws = rng.random(masked.shape)

    masked[draws < missing_fraction] = 0
    return sparse.csr_matrix(masked)

# -----------------------------
# Simulation grid: number of replicate runs generated for each fraction,
# and the dropout fractions to simulate
# -----------------------------
n_runs = 10
missing_fractions = [0.1, 0.2, 0.3]



In [4]:
import scanpy as sc
import numpy as np
import os
from scipy import sparse

# --- Input ---------------------------------------------------------
# AnnData that every simulated dropout dataset is derived from
# (path is relative to the notebook's working directory).
adata = sc.read_h5ad('Data/adata_raw_qc.h5ad')

# --- Output --------------------------------------------------------
# Directory that receives the generated .h5ad files; reused if it exists.
os.makedirs(name='dropout_h5ad', exist_ok=True)

# --- Simulation settings -------------------------------------------
n_runs = 10                          # replicate datasets per fraction
missing_fractions = [0.1, 0.2, 0.3]  # dropout fractions to simulate
chunk_size = 10_000                  # cells (rows of X) processed per chunk


# ------------------------
# Dropout simulation (only on nonzeros)
# ------------------------
def simulate_dropout_chunked_sparse_nonzero(X, missing_fraction, chunk_size=10000, rng=None):
    """Simulate dropout by zeroing a random subset of the nonzero entries of ``X``.

    The matrix is processed ``chunk_size`` rows at a time: each block of rows
    is densified as float32, every nonzero entry in it is independently set
    to 0 with probability ``missing_fraction``, and the block is converted
    back to CSR. Only one dense block is held in memory at a time.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        2-D input matrix. It is not modified.
    missing_fraction : float
        Probability with which each nonzero entry is set to zero.
    chunk_size : int, default 10000
        Number of rows densified per block.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's legacy global state
        (``np.random.rand``) is used with the same draw order as before, so
        ``np.random.seed`` reproduces a run.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix with the same shape as ``X``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n_rows, n_cols = X.shape
    if n_rows == 0:
        # sparse.vstack() cannot stack an empty list of blocks.
        return sparse.csr_matrix((0, n_cols), dtype=np.float32)

    input_is_sparse = sparse.issparse(X)
    if input_is_sparse:
        # Guarantees row slicing works (COO, for instance, cannot be sliced);
        # returns the same object when X is already CSR.
        X = X.tocsr()

    masked_blocks = []
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        block = X[start:stop]
        if input_is_sparse:
            block = block.toarray()
        # np.array(...) always copies, so dense input is never mutated.
        block = np.array(block, dtype=np.float32)

        if rng is None:
            draws = np.random.rand(*block.shape)
        else:
            draws = rng.random(block.shape)

        # `!= 0` rather than `> 0`: negative values are nonzero too and must
        # be eligible for dropout, as the function's contract states.
        block[(block != 0) & (draws < missing_fraction)] = 0

        masked_blocks.append(sparse.csr_matrix(block))
        del block, draws  # release the dense buffers before the next chunk
    return sparse.vstack(masked_blocks, format='csr')



# ------------------------
# Compute true missing fraction introduced
# ------------------------
def compute_true_missing_fraction(X_dropout, X_original, chunk_size=10000):
    """Return the fraction of originally nonzero values that are zero after dropout.

    The two matrices are compared ``chunk_size`` rows at a time, so only two
    dense row blocks are held in memory at once instead of two fully
    densified matrices.

    Parameters
    ----------
    X_dropout : numpy.ndarray or scipy.sparse matrix
        Matrix after dropout was applied.
    X_original : numpy.ndarray or scipy.sparse matrix
        Matrix before dropout; must have the same shape as ``X_dropout``.
    chunk_size : int, default 10000
        Number of rows densified per comparison block.

    Returns
    -------
    float
        ``(# entries nonzero in X_original and zero in X_dropout) /
        (# entries nonzero in X_original)``. ``nan`` when ``X_original``
        contains no nonzero entry (the ratio is undefined).

    Raises
    ------
    ValueError
        If the shapes differ or ``chunk_size`` is smaller than 1.
    """
    if X_dropout.shape != X_original.shape:
        raise ValueError(
            f"shape mismatch: X_dropout {X_dropout.shape} vs X_original {X_original.shape}"
        )
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # CSR guarantees row slicing works for any sparse input format.
    if sparse.issparse(X_dropout):
        X_dropout = X_dropout.tocsr()
    if sparse.issparse(X_original):
        X_original = X_original.tocsr()

    def _dense_rows(matrix, start, stop):
        # Dense view of rows [start, stop) for either a sparse or dense matrix.
        block = matrix[start:stop]
        return block.toarray() if sparse.issparse(block) else np.asarray(block)

    n_originally_nonzero = 0
    n_newly_zeroed = 0
    n_rows = X_original.shape[0]
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        original_block = _dense_rows(X_original, start, stop)
        dropout_block = _dense_rows(X_dropout, start, stop)

        # `!= 0` rather than `> 0` so negative values count as nonzero.
        was_nonzero = original_block != 0
        n_originally_nonzero += int(np.count_nonzero(was_nonzero))
        n_newly_zeroed += int(np.count_nonzero((dropout_block == 0) & was_nonzero))

    if n_originally_nonzero == 0:
        # Nothing could have been dropped; avoid a 0/0 RuntimeWarning.
        return float("nan")
    return n_newly_zeroed / n_originally_nonzero


# ------------------------
# Generate dropout datasets
# ------------------------
# Base seed for the dropout masks. Every (missing fraction, run) pair gets its
# own seed derived from it, so any single dataset can be regenerated exactly.
base_seed = 0

for mf in missing_fractions:
    print(f"\nProcessing missing fraction: {mf}")
    # round() instead of int(): float products such as 0.29 * 100 evaluate to
    # 28.999999999999996, which int() would truncate to 28 and mislabel the file.
    mf_percent = int(round(mf * 100))
    for run in range(n_runs):
        # The simulation draws from NumPy's global RNG, so seed it per dataset.
        # Seeds are unique as long as n_runs stays below 100000.
        seed = base_seed + mf_percent * 100_000 + run
        np.random.seed(seed)

        # Simulate dropout
        X_dropout = simulate_dropout_chunked_sparse_nonzero(
            adata.X, missing_fraction=mf, chunk_size=chunk_size
        )

        # Create new AnnData with sparse X; obs/var are copied so the
        # ground-truth object is not shared with the written dataset.
        adata_dropout = sc.AnnData(
            X_dropout, obs=adata.obs.copy(), var=adata.var.copy()
        )

        # Save to disk
        filename = f'dropout_h5ad/adata_dropout_mf{mf_percent}_run{run+1}.h5ad'
        adata_dropout.write(filename)
        print(f"Saved: {filename}")

        # Sanity check: realised fraction of nonzero values that were zeroed
        true_mf = compute_true_missing_fraction(X_dropout, adata.X)
        print(f"Run {run+1} → True missing fraction applied: {true_mf:.4f}")

        # Cleanup: free the per-run matrices before the next iteration
        del X_dropout, adata_dropout


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")



Processing missing fraction: 0.1
Saved: dropout_h5ad/adata_dropout_mf10_run1.h5ad
Run 1 → True missing fraction applied: 0.1000


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run2.h5ad
Run 2 → True missing fraction applied: 0.0997


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run3.h5ad
Run 3 → True missing fraction applied: 0.1001


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run4.h5ad
Run 4 → True missing fraction applied: 0.1001


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run5.h5ad
Run 5 → True missing fraction applied: 0.0998


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run6.h5ad
Run 6 → True missing fraction applied: 0.0995


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run7.h5ad
Run 7 → True missing fraction applied: 0.1002


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run8.h5ad
Run 8 → True missing fraction applied: 0.1004


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run9.h5ad
Run 9 → True missing fraction applied: 0.0996


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf10_run10.h5ad
Run 10 → True missing fraction applied: 0.0997

Processing missing fraction: 0.2


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run1.h5ad
Run 1 → True missing fraction applied: 0.1998


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run2.h5ad
Run 2 → True missing fraction applied: 0.1996


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run3.h5ad
Run 3 → True missing fraction applied: 0.1995


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run4.h5ad
Run 4 → True missing fraction applied: 0.2006


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run5.h5ad
Run 5 → True missing fraction applied: 0.2003


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run6.h5ad
Run 6 → True missing fraction applied: 0.1998


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run7.h5ad
Run 7 → True missing fraction applied: 0.2002


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run8.h5ad
Run 8 → True missing fraction applied: 0.1999


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run9.h5ad
Run 9 → True missing fraction applied: 0.1993


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf20_run10.h5ad
Run 10 → True missing fraction applied: 0.2000

Processing missing fraction: 0.3


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run1.h5ad
Run 1 → True missing fraction applied: 0.2998


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run2.h5ad
Run 2 → True missing fraction applied: 0.2997


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run3.h5ad
Run 3 → True missing fraction applied: 0.2993


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run4.h5ad
Run 4 → True missing fraction applied: 0.2993


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run5.h5ad
Run 5 → True missing fraction applied: 0.2999


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run6.h5ad
Run 6 → True missing fraction applied: 0.2990


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run7.h5ad
Run 7 → True missing fraction applied: 0.2999


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run8.h5ad
Run 8 → True missing fraction applied: 0.3000


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run9.h5ad
Run 9 → True missing fraction applied: 0.3004


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Saved: dropout_h5ad/adata_dropout_mf30_run10.h5ad
Run 10 → True missing fraction applied: 0.2990
