In [None]:
import scipy
import matplotlib
import numpy as np
import skimage
import anndata 
import os
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.filters import threshold_minimum
import scrublet
import importlib.metadata

# Report the versions of the libraries used in this notebook (for reproducibility)
print("scipy:", scipy.__version__)
print("matplotlib:", matplotlib.__version__)
print("numpy:", np.__version__)
print("skimage:", skimage.__version__)
print("anndata:", anndata.__version__)
print("pandas:", pd.__version__)
print("scanpy:", sc.__version__)
print("seaborn:", sns.__version__)

# The scrublet version is looked up from the installed package metadata
scrublet_version = importlib.metadata.version("scrublet")
print("scrublet:", scrublet_version)

In [None]:
from anndata import read_h5ad

# Directory containing the input .h5ad files
folder_path = "Intermediate_Files/QC_07232025"

# Read the gene- and isoform-level AnnData objects for both samples
PBMC1_gene, PBMC2_gene, PBMC1_iso, PBMC2_iso = (
    read_h5ad(os.path.join(folder_path, file_name))
    for file_name in (
        "PBMC1_gene_AnnData_v2.h5ad",
        "PBMC2_gene_AnnData_v2.h5ad",
        "PBMC1_iso_AnnData_v2.h5ad",
        "PBMC2_iso_AnnData_v2.h5ad",
    )
)

# Print each object's summary, one after the other
print(PBMC1_gene, PBMC2_gene, PBMC1_iso, PBMC2_iso, sep="\n")

In [None]:
# Tag every cell with its sample of origin so plots can group by 'sample'
for dataset, sample_label in (
    (PBMC1_gene, 'PBMC1'),
    (PBMC2_gene, 'PBMC2'),
    (PBMC1_iso, 'PBMC1'),
    (PBMC2_iso, 'PBMC2'),
):
    dataset.obs['sample'] = sample_label

In [None]:
# --- 1. Cells (observations) per AnnData object before any filtering ---
init_cell_counts_1g, init_cell_counts_2g, init_cell_counts_1i, init_cell_counts_2i = (
    PBMC1_gene.n_obs,   # PBMC1, gene-level data
    PBMC2_gene.n_obs,   # PBMC2, gene-level data
    PBMC1_iso.n_obs,    # PBMC1, isoform-level data
    PBMC2_iso.n_obs,    # PBMC2, isoform-level data
)

# --- 2. One row per dataset: label + initial cell count ---
init_cell_counts_df = pd.DataFrame(
    [
        ('PBMC1 Gene', init_cell_counts_1g),
        ('PBMC2 Gene', init_cell_counts_2g),
        ('PBMC1 Iso', init_cell_counts_1i),
        ('PBMC2 Iso', init_cell_counts_2i),
    ],
    columns=['Sample', 'Initial Cell #'],
)

# --- 3. Show the summary table ---
print(init_cell_counts_df)

In [None]:
# Flag mitochondrial ('mt') and hemoglobin ('hb') features in .var of every dataset
for dataset in (PBMC1_gene, PBMC2_gene, PBMC1_iso, PBMC2_iso):
    feature_names = dataset.var_names
    # mitochondrial features: names that begin with "MT-" (e.g., MT-CO1)
    dataset.var['mt'] = feature_names.str.startswith("MT-")
    # hemoglobin features: names that begin with "HB" and whose third character
    # is not "(", "P" or ")" (e.g., HBA1, HBB; excludes HBP)
    # NOTE(review): this pattern also matches other HB-prefixed names such as
    # HBEGF or HBS1L — confirm that is acceptable for the hb percentage.
    dataset.var['hb'] = feature_names.str.contains("^HB[^(P)]")

In [None]:
# Calculate quality control (QC) metrics per-cell and store in adata.obs
# Later cells read n_genes_by_counts, total_counts, pct_counts_mt and pct_counts_hb from .obs
for dataset in [PBMC1_gene, PBMC2_gene, PBMC1_iso, PBMC2_iso]:
    sc.pp.calculate_qc_metrics(
        dataset,
        qc_vars=["mt", "hb"],   # use flagged mt/hb genes for percent calculations
        percent_top=None,       # skip top-gene summaries
        inplace=True,           # modify AnnData in place
        log1p=False              # do NOT add log1p-transformed versions of the metrics
    )

In [None]:
# Isoform-level objects: give the per-cell QC columns isoform-specific names
for dataset in (PBMC1_iso, PBMC2_iso):
    dataset.obs.rename(
        columns={
            "n_genes_by_counts": "n_isoforms_by_counts",
            "total_counts": "total_counts_isoforms",
        },
        inplace=True,
    )

In [None]:
# --- Count number of cells expressing each gene / isoform ---
# (X > 0) marks detected entries; summing over the cell axis gives, for every
# feature, the number of cells with a count above zero.
# np.asarray(...).ravel() always produces a flat ndarray (also when the sum is
# returned as an np.matrix for sparse input), so no extra np.matrix handling
# is required afterwards.
PBMC1_gene_counts = np.asarray((PBMC1_gene.X > 0).sum(axis=0)).ravel()
PBMC2_gene_counts = np.asarray((PBMC2_gene.X > 0).sum(axis=0)).ravel()

PBMC1_iso_counts = np.asarray((PBMC1_iso.X > 0).sum(axis=0)).ravel()
PBMC2_iso_counts = np.asarray((PBMC2_iso.X > 0).sum(axis=0)).ravel()

In [None]:
# --- Plot distributions of cell detection counts ---
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# One panel per dataset: gene-level on the top row, isoform-level on the bottom row
detection_panels = (
    (axs[0, 0], PBMC1_gene_counts, 'PBMC1 - Cell Counts per Gene', 'Number of Genes'),
    (axs[0, 1], PBMC2_gene_counts, 'PBMC2 - Cell Counts per Gene', 'Number of Genes'),
    (axs[1, 0], PBMC1_iso_counts, 'PBMC1 - Cell Counts per Isoform', 'Number of Isoforms'),
    (axs[1, 1], PBMC2_iso_counts, 'PBMC2 - Cell Counts per Isoform', 'Number of Isoforms'),
)

for panel_ax, detection_counts, panel_title, y_label in detection_panels:
    panel_ax.hist(detection_counts, bins=100)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of Cells')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_xlim(0, 6000)
    # dashed red line at the 10-cell detection cutoff
    panel_ax.axvline(10, color='r', linestyle='--', label='Lower Threshold')
    panel_ax.legend()

# --- Final formatting ---
plt.tight_layout()   # keep the subplots from overlapping
plt.show()

In [None]:
# Minimum number of expressing cells a gene needs in a dataset
lower_threshold = 10

# Per dataset: the genes detected in fewer than `lower_threshold` cells
PBMC1_genes_below_threshold = set(PBMC1_gene.var_names[PBMC1_gene_counts < lower_threshold])
PBMC2_genes_below_threshold = set(PBMC2_gene.var_names[PBMC2_gene_counts < lower_threshold])

# A gene is removed only if it falls below the threshold in *both* datasets
genes_to_remove = PBMC1_genes_below_threshold.intersection(PBMC2_genes_below_threshold)

# Show the genes that will be removed
print(f"Genes expressed in fewer than {lower_threshold} cells in both datasets:", genes_to_remove)

In [None]:
## Initial filtering removed genes and isoforms in less than 10 cells across both datasets

## --- Define filtering threshold ---
# A gene is removed only when it is detected in fewer than 10 cells in *both*
# PBMC1 and PBMC2 (i.e. it is kept if either dataset reaches the threshold)
lower_threshold = 10

## --- Count expressing cells per gene in each dataset ---
# (X > 0) creates a boolean matrix where True = detected; summing over the cell
# axis gives the number of cells with nonzero expression per gene.
# np.asarray(...).ravel() guarantees a flat 1-D array: for a sparse X the sum is
# returned as a 2-D np.matrix and .flatten() on that stays 2-D, which cannot be
# used to mask var_names below.
PBMC1_gene_counts = np.asarray((PBMC1_gene.X > 0).sum(axis=0)).ravel()
PBMC2_gene_counts = np.asarray((PBMC2_gene.X > 0).sum(axis=0)).ravel()

## --- Identify genes below threshold in each dataset ---
PBMC1_genes_below_threshold = set(PBMC1_gene.var_names[PBMC1_gene_counts < lower_threshold])
PBMC2_genes_below_threshold = set(PBMC2_gene.var_names[PBMC2_gene_counts < lower_threshold])

## --- Keep only those below threshold in *both* datasets ---
genes_to_remove = PBMC1_genes_below_threshold & PBMC2_genes_below_threshold

## --- Report summary ---
print(f"🔍 Number of genes to remove (below {lower_threshold} cells in both): {len(genes_to_remove)}")

In [None]:
# Minimum number of expressing cells an isoform needs in a dataset
lower_threshold = 10

# Per dataset: the isoforms detected in fewer than `lower_threshold` cells
PBMC1_isos_below_threshold = set(PBMC1_iso.var_names[PBMC1_iso_counts < lower_threshold])
PBMC2_isos_below_threshold = set(PBMC2_iso.var_names[PBMC2_iso_counts < lower_threshold])

# An isoform is removed only if it falls below the threshold in *both* datasets
isos_to_remove = PBMC1_isos_below_threshold.intersection(PBMC2_isos_below_threshold)

print(f"🔍 Number of isoforms to remove: {len(isos_to_remove)}")

In [None]:
# --- imports ---
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp

# --- parameters ---
# A feature is removed when it is detected in fewer than LOWER_THRESHOLD cells
# in both samples (see ids_below_threshold_both below).
LOWER_THRESHOLD = 10                    # "at least 10 cells" threshold
OUTDIR = "removed_feature_lists"        # where to save the ID lists
TREAT_MISSING_AS_ZERO = False           # False: only consider IDs present in BOTH datasets
                                        # True: union of IDs, treat missing in one dataset as 0 cells

# Create the output folder (relative to the working directory) if it does not exist yet
os.makedirs(OUTDIR, exist_ok=True)

# --- helpers ---
def cells_expressing_per_feature(adata) -> pd.Series:
    """
    Number of cells with expression > 0 per feature (column of ``adata.X``).

    The same ``X > 0`` rule is applied to dense arrays and scipy sparse
    matrices, so explicitly stored zeros (or negative values) in a sparse
    matrix are not counted as "expressing". For sparse input the comparison
    result is itself sparse, so X is never densified.

    Parameters:
        adata: object exposing ``.X`` (cells x features) and ``.var_names``

    Returns:
        pd.Series of integer counts indexed by ``adata.var_names``.
    """
    detected = adata.X > 0
    # .sum(axis=0) may return an np.matrix for sparse input; flatten to 1-D
    counts = np.asarray(detected.sum(axis=0)).ravel()
    return pd.Series(counts, index=adata.var_names)

def ids_below_threshold_both(ser1: pd.Series,
                             ser2: pd.Series,
                             threshold: float,
                             treat_missing_as_zero: bool = False) -> set:
    """
    IDs whose value is below ``threshold`` in both series.

    Parameters:
        ser1, ser2: per-feature counts indexed by feature ID
        threshold: strict upper bound (``value < threshold``)
        treat_missing_as_zero: if False, only IDs present in both indexes are
            considered; if True, the union of IDs is used and an ID missing
            from one series counts as 0 there.

    Returns:
        set of the qualifying IDs.
    """
    if treat_missing_as_zero:
        candidate_ids = ser1.index.union(ser2.index)
        first = ser1.reindex(candidate_ids).fillna(0.0)
        second = ser2.reindex(candidate_ids).fillna(0.0)
    else:
        candidate_ids = ser1.index.intersection(ser2.index)
        first = ser1.reindex(candidate_ids)
        second = ser2.reindex(candidate_ids)

    low_in_both = (first < threshold) & (second < threshold)
    return set(candidate_ids[low_in_both])

def save_id_list(ids: set, path: str):
    """Write the IDs to ``path`` in sorted order, one ID per line (overwrites the file)."""
    with open(path, "w") as handle:
        handle.writelines(f"{item}\n" for item in sorted(ids))

# --- ISOFORMS ---
# Per-isoform number of expressing cells in each sample
iso1_cells, iso2_cells = (
    cells_expressing_per_feature(adata) for adata in (PBMC1_iso, PBMC2_iso)
)
isos_to_remove = ids_below_threshold_both(
    iso1_cells,
    iso2_cells,
    threshold=LOWER_THRESHOLD,
    treat_missing_as_zero=TREAT_MISSING_AS_ZERO,
)
iso_out = os.path.join(OUTDIR, f"removed_isoforms_lt{LOWER_THRESHOLD}_cells_in_both.txt")
save_id_list(isos_to_remove, iso_out)
print(f"🔍 Number of isoforms to remove (below {LOWER_THRESHOLD} cells in both): {len(isos_to_remove)}")
print(f"    → {iso_out}")

# --- GENES ---
# Per-gene number of expressing cells in each sample
gene1_cells, gene2_cells = (
    cells_expressing_per_feature(adata) for adata in (PBMC1_gene, PBMC2_gene)
)
genes_to_remove = ids_below_threshold_both(
    gene1_cells,
    gene2_cells,
    threshold=LOWER_THRESHOLD,
    treat_missing_as_zero=TREAT_MISSING_AS_ZERO,
)
gene_out = os.path.join(OUTDIR, f"removed_genes_lt{LOWER_THRESHOLD}_cells_in_both.txt")
save_id_list(genes_to_remove, gene_out)
print(f"🔍 Number of genes to remove (below {LOWER_THRESHOLD} cells in both): {len(genes_to_remove)}")
print(f"    → {gene_out}")

In [None]:
def filter_gene_data(adata, genes_to_remove):
    """
    Return a copy of ``adata`` without the listed genes.

    Parameters:
        adata (AnnData): Input gene expression matrix
        genes_to_remove (list or set): Gene IDs (matched against
            ``adata.var_names``) to drop

    Returns:
        AnnData: New object holding only the genes that were not listed
    """
    excluded = set(genes_to_remove)

    # True for every gene that stays, False for every gene that is dropped
    keep = ~adata.var_names.isin(excluded)

    return adata[:, keep].copy()

In [None]:
# Drop the shared low-detection genes from both gene-level objects
PBMC1_gene, PBMC2_gene = (
    filter_gene_data(dataset, genes_to_remove) for dataset in (PBMC1_gene, PBMC2_gene)
)

# Show the filtered gene data shapes to check that the gene numbers match
print(f"Filtered PBMC1 gene data shape: {PBMC1_gene.shape}")
print(f"Filtered PBMC2 gene data shape: {PBMC2_gene.shape}")

In [None]:
def filter_iso_data(adata, isos_to_remove):
    """
    Return a copy of ``adata`` without the listed isoforms.

    IDs are compared to ``adata.var_names`` by exact match, the same way
    ``filter_gene_data`` treats genes. (Prefix matching would additionally
    drop every isoform whose ID merely begins with a listed ID, and scales
    with len(var_names) * len(isos_to_remove).)

    Parameters:
        adata (AnnData): Input isoform expression matrix
        isos_to_remove (list or set): Isoform IDs to drop

    Returns:
        AnnData: New object holding only the isoforms that were not listed
    """
    # Set membership keeps the lookup fast for long ID lists
    isos_to_remove = set(isos_to_remove)

    # True = keep, False = remove
    mask = ~adata.var_names.isin(isos_to_remove)

    # Subset the columns (features/isoforms) and materialize the result
    return adata[:, mask].copy()

In [None]:
# Drop the shared low-detection isoforms from both isoform-level objects
PBMC1_iso, PBMC2_iso = (
    filter_iso_data(dataset, isos_to_remove) for dataset in (PBMC1_iso, PBMC2_iso)
)

In [None]:
# Show the filtered shapes so feature numbers can be compared between the two samples
print(f"Filtered PBMC1 isoform data shape: {PBMC1_iso.shape}")
print(f"Filtered PBMC2 isoform data shape: {PBMC2_iso.shape}")
print(f"Filtered PBMC1 gene data shape: {PBMC1_gene.shape}")
print(f"Filtered PBMC2 gene data shape: {PBMC2_gene.shape}")

In [None]:
# --- Re-count the number of cells expressing each gene / isoform after feature filtering ---
# (X > 0) marks detected entries; summing over the cell axis gives, for every
# feature, the number of cells with a count above zero.
# np.asarray(...).ravel() always produces a flat ndarray (also when the sum is
# returned as an np.matrix for sparse input), so no extra np.matrix handling
# is required afterwards.
PBMC1_gene_counts = np.asarray((PBMC1_gene.X > 0).sum(axis=0)).ravel()
PBMC2_gene_counts = np.asarray((PBMC2_gene.X > 0).sum(axis=0)).ravel()

PBMC1_iso_counts = np.asarray((PBMC1_iso.X > 0).sum(axis=0)).ravel()
PBMC2_iso_counts = np.asarray((PBMC2_iso.X > 0).sum(axis=0)).ravel()

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# One panel per dataset: gene-level on the top row, isoform-level on the bottom row
filtered_panels = (
    (axs[0, 0], PBMC1_gene_counts, 'Number of Genes', 'PBMC1 - Cell Counts per Gene'),
    (axs[0, 1], PBMC2_gene_counts, 'Number of Genes', 'PBMC2 - Cell Counts per Gene'),
    (axs[1, 0], PBMC1_iso_counts, 'Number of Isoforms', 'PBMC1 - Cell Counts per Isoform'),
    (axs[1, 1], PBMC2_iso_counts, 'Number of Isoforms', 'PBMC2 - Cell Counts per Isoform'),
)

for panel_ax, detection_counts, y_label, panel_title in filtered_panels:
    # distribution of "number of expressing cells" across features
    panel_ax.hist(detection_counts, bins=100)
    panel_ax.set(
        xlabel='Number of Cells',
        ylabel=y_label,
        xlim=(0, 6000),
        title=panel_title,
    )
    # dashed red line at the 10-cell detection cutoff
    panel_ax.axvline(10, color='r', linestyle='--', label='Lower Threshold')
    panel_ax.legend()

plt.tight_layout()  # keep the subplots from overlapping
plt.show()

In [None]:
# Stack the two samples (cells of PBMC1 followed by PBMC2) for joint plotting
import anndata as ad  # alias used by the concat calls

# join="inner" keeps only the features present in both samples
combined_gene, combined_iso = (
    ad.concat(sample_pair, join="inner")
    for sample_pair in ([PBMC1_gene, PBMC2_gene], [PBMC1_iso, PBMC2_iso])
)

In [None]:
# --- Define per-cell quality control (QC) thresholds ---
# These thresholds are used to filter out low-quality or outlier cells 
# based on mitochondrial content, hemoglobin expression, total counts, 
# and detected gene/isoform diversity.

# Mitochondrial gene percentage (high values = stressed/dying cells)
# The filter cells keep a cell only when pct_counts_mt is strictly below this value.
mt_upper_threshold = 15       # max % mitochondrial reads per cell

# Hemoglobin gene percentage (used to flag potential erythrocyte contamination)
# NOTE: the filter cells keep a cell when
#   pct_counts_hb <= hb_lower_threshold  OR  pct_counts_hb >= hb_upper_threshold,
# i.e. these two values bound the band that is *removed*, not the band that is kept.
hb_lower_threshold = 5        # cells with % Hb at or below this value are kept
hb_upper_threshold = 100      # cells with % Hb at or above this value are also kept — TODO confirm this is intended

# Total read counts per cell (too low = dropout, too high = doublet)
# The same two values are applied to gene-level and isoform-level total counts.
total_counts_lower_threshold = 600     # min reads per cell
total_counts_upper_threshold = 11000   # max reads per cell

# Number of detected genes per cell (low = poor coverage, high = possible doublet)
n_genes_lower_threshold = 250          # min genes detected
n_genes_upper_threshold = 1800         # max genes detected

# Number of detected isoforms per cell (similar logic to genes)
n_isoforms_lower_threshold = 350       # min isoforms detected
n_isoforms_upper_threshold = 3000      # max isoforms detected

In [None]:
# Combined gene object should have (cells of PBMC1 + PBMC2, shared genes)
print(f"Shape of PBMC1_gene: {PBMC1_gene.shape}")
print(f"Shape of PBMC2_gene: {PBMC2_gene.shape}")
print(f"Shape of combined gene dataset: {combined_gene.shape}")
print()
# Combined isoform object should have (cells of PBMC1 + PBMC2, shared isoforms)
print(f"Shape of PBMC1_iso: {PBMC1_iso.shape}")
print(f"Shape of PBMC2_iso: {PBMC2_iso.shape}")
print(f"Shape of combined isoform dataset: {combined_iso.shape}")

In [None]:
# Function to plot QC metrics with thresholds
def plot_violin_with_thresholds(adata, metric, title, lower_thresh=None, upper_thresh=None, ax=None):
    """
    Plots a violin + scatter plot of a QC metric across samples,
    with optional horizontal lines showing filtering thresholds.

    Parameters:
        adata (AnnData): Object containing cell metadata in .obs; .obs must
            hold a 'sample' column (used for the x axis) and the `metric` column
        metric (str): Column name in adata.obs to plot (e.g., 'pct_counts_mt')
        title (str): Plot title
        lower_thresh, upper_thresh (float, optional): Threshold values to display
            as dashed red lines; a value of None draws no line
        ax (matplotlib.axes.Axes, optional): Axis to plot on (creates new one if None)

    Returns:
        None. The plot is drawn on `ax` (or on a newly created figure).
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(8, 6))  # Create a new figure if no axis is provided

    # Violin plot: distribution of the metric per sample
    sns.violinplot(x='sample', y=metric, data=adata.obs, inner=None, ax=ax)

    # Overlay one small dot per cell
    sns.stripplot(x='sample', y=metric, data=adata.obs, color='k', jitter=True, size=1.0, alpha=0.5, ax=ax)

    # Threshold lines (each is optional)
    if lower_thresh is not None:
        ax.axhline(y=lower_thresh, color='r', linestyle='--', label=f'Lower Threshold ({lower_thresh})')
    if upper_thresh is not None:
        ax.axhline(y=upper_thresh, color='r', linestyle='--', label=f'Upper Threshold ({upper_thresh})')

    ax.set_title(title)

    # Only draw a legend when there is something to list; calling legend() on an
    # axis without labelled artists just emits a warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()

In [None]:
# Gene-level QC metrics before cell filtering, one metric per panel of a 2x2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# (axis, .obs column, title, lower threshold, upper threshold)
gene_qc_panels = (
    (axs[0, 0], "n_genes_by_counts", "Unique Genes Per Cell",
     n_genes_lower_threshold, n_genes_upper_threshold),
    (axs[0, 1], "total_counts", "Total Gene Transcripts Per Cell",
     total_counts_lower_threshold, total_counts_upper_threshold),
    (axs[1, 0], "pct_counts_mt", "Percent Mitochondrial Genes per Cell",
     None, mt_upper_threshold),
    (axs[1, 1], "pct_counts_hb", "Percent Hemoglobin Genes per Cell",
     hb_lower_threshold, hb_upper_threshold),
)
for panel_ax, qc_metric, panel_title, lower_bound, upper_bound in gene_qc_panels:
    plot_violin_with_thresholds(combined_gene, qc_metric, panel_title, lower_bound, upper_bound, ax=panel_ax)

plt.tight_layout()  # keep the subplots from overlapping
plt.show()

In [None]:
# Plot QC metrics for isoforms (adjusted column names)
fig, axs = plt.subplots(2, 2, figsize=(12, 10))  # Create a 2x2 grid of subplots

plot_violin_with_thresholds(combined_iso, "n_isoforms_by_counts", "Unique Isoforms per Cell", n_isoforms_lower_threshold, n_isoforms_upper_threshold, ax=axs[0, 0])
plot_violin_with_thresholds(combined_iso, "total_counts_isoforms", "Total Isoform Transcripts per Cell", total_counts_lower_threshold, total_counts_upper_threshold, ax=axs[0, 1])
plot_violin_with_thresholds(combined_iso, "pct_counts_mt", "Percentage of Mitochondrial Isoforms", None, mt_upper_threshold, ax=axs[1, 0])
plot_violin_with_thresholds(combined_iso, "pct_counts_hb", "Percentage of Hemoglobin Isoforms", hb_lower_threshold, hb_upper_threshold, ax=axs[1, 1])

# Same finishing steps as the gene-level figure
plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()

In [None]:
# Gene level: total counts (x) against detected genes (y), coloured by % mitochondrial counts
sc.pl.scatter(
    combined_gene,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
    title="Total counts vs. detected genes (colored by % mitochondrial reads)",
)
plt.show()

In [None]:
# Isoform level: total counts (x) against detected isoforms (y), coloured by % mitochondrial counts
sc.pl.scatter(
    combined_iso,
    x="total_counts_isoforms",
    y="n_isoforms_by_counts",
    color="pct_counts_mt",
    title="Total counts vs. detected isoforms (colored by % mitochondrial reads)",
)
plt.show()

In [None]:
## Filter cells based on quality control metrics (gene level)

def _gene_qc_pass(obs):
    """Boolean Series over cells: True where every gene-level QC criterion is met."""
    counts_ok = obs['total_counts'].between(total_counts_lower_threshold, total_counts_upper_threshold)
    genes_ok = obs['n_genes_by_counts'].between(n_genes_lower_threshold, n_genes_upper_threshold)
    mt_ok = obs['pct_counts_mt'] < mt_upper_threshold
    # hemoglobin: keep cells at/below the lower value or at/above the upper value
    # NOTE(review): a cell sitting exactly at hb_upper_threshold passes — confirm intended
    hb_ok = (obs['pct_counts_hb'] <= hb_lower_threshold) | (obs['pct_counts_hb'] >= hb_upper_threshold)
    return counts_ok & genes_ok & mt_ok & hb_ok


# Row-subset each sample with its own mask (no .copy() here)
PBMC1_g_filtered = PBMC1_gene[_gene_qc_pass(PBMC1_gene.obs), :]
PBMC2_g_filtered = PBMC2_gene[_gene_qc_pass(PBMC2_gene.obs), :]

In [None]:
def _iso_qc_pass(obs):
    """Boolean Series over cells: True where every isoform-level QC criterion is met."""
    counts_ok = obs['total_counts_isoforms'].between(total_counts_lower_threshold, total_counts_upper_threshold)
    isoforms_ok = obs['n_isoforms_by_counts'].between(n_isoforms_lower_threshold, n_isoforms_upper_threshold)
    mt_ok = obs['pct_counts_mt'] < mt_upper_threshold
    # hemoglobin: keep cells at/below the lower value or at/above the upper value
    # NOTE(review): a cell sitting exactly at hb_upper_threshold passes — confirm intended
    hb_ok = (obs['pct_counts_hb'] <= hb_lower_threshold) | (obs['pct_counts_hb'] >= hb_upper_threshold)
    return counts_ok & isoforms_ok & mt_ok & hb_ok


# Row-subset each sample with its own mask (no .copy() here)
PBMC1_i_filtered = PBMC1_iso[_iso_qc_pass(PBMC1_iso.obs), :]
PBMC2_i_filtered = PBMC2_iso[_iso_qc_pass(PBMC2_iso.obs), :]

In [None]:
# Cells remaining in each dataset after QC filtering
post_qc_cell_counts_1g, post_qc_cell_counts_2g, post_qc_cell_counts_1i, post_qc_cell_counts_2i = (
    PBMC1_g_filtered.n_obs,
    PBMC2_g_filtered.n_obs,
    PBMC1_i_filtered.n_obs,
    PBMC2_i_filtered.n_obs,
)

# Same dataset order in both lists so they can be paired row by row
_initial_counts = [init_cell_counts_1g, init_cell_counts_2g, init_cell_counts_1i, init_cell_counts_2i]
_post_qc_counts = [post_qc_cell_counts_1g, post_qc_cell_counts_2g, post_qc_cell_counts_1i, post_qc_cell_counts_2i]

post_qc_cell_counts_df = pd.DataFrame({
    'Sample': ['PBMC1 Gene', 'PBMC2 Gene', 'PBMC1 Isoform', 'PBMC2 Isoform'],
    'Initial Cell #': _initial_counts,
    'Post QC Counts': _post_qc_counts,
    # share of the initial cells that passed QC, in percent
    'Percent Remaining': [
        remaining / initial * 100
        for remaining, initial in zip(_post_qc_counts, _initial_counts)
    ],
})

print(post_qc_cell_counts_df)

In [None]:
# Stack the QC-filtered samples for plotting: gene level first, then isoform level.
# join="inner" keeps only the features present in both samples.
combined_gene_filtered, combined_iso_filtered = (
    ad.concat(sample_pair, join="inner")
    for sample_pair in (
        [PBMC1_g_filtered, PBMC2_g_filtered],
        [PBMC1_i_filtered, PBMC2_i_filtered],
    )
)

In [None]:
# Gene-level QC metrics after cell filtering, one metric per panel of a 2x2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# (axis, .obs column, title, lower threshold, upper threshold)
gene_qc_panels_filtered = (
    (axs[0, 0], "n_genes_by_counts", "Unique Genes Per Cell",
     n_genes_lower_threshold, n_genes_upper_threshold),
    (axs[0, 1], "total_counts", "Total Gene Transcripts Per Cell",
     total_counts_lower_threshold, total_counts_upper_threshold),
    (axs[1, 0], "pct_counts_mt", "Percent Mitochondrial Genes per Cell",
     None, mt_upper_threshold),
    (axs[1, 1], "pct_counts_hb", "Percent Hemoglobin Genes per Cell",
     hb_lower_threshold, hb_upper_threshold),
)
for panel_ax, qc_metric, panel_title, lower_bound, upper_bound in gene_qc_panels_filtered:
    plot_violin_with_thresholds(combined_gene_filtered, qc_metric, panel_title, lower_bound, upper_bound, ax=panel_ax)

plt.tight_layout()  # keep the subplots from overlapping
plt.show()

In [None]:
# Plot QC metrics for isoforms after cell filtering (adjusted column names)
fig, axs = plt.subplots(2, 2, figsize=(12, 10))  # Create a 2x2 grid of subplots

# The isoform panel must show the isoform thresholds (the ones the isoform filter applies),
# not the gene-level lower bound.
plot_violin_with_thresholds(combined_iso_filtered, "n_isoforms_by_counts", "Unique Isoforms per Cell", n_isoforms_lower_threshold, n_isoforms_upper_threshold, ax=axs[0, 0])
plot_violin_with_thresholds(combined_iso_filtered, "total_counts_isoforms", "Total Isoform Transcripts per Cell", total_counts_lower_threshold, total_counts_upper_threshold, ax=axs[0, 1])
plot_violin_with_thresholds(combined_iso_filtered, "pct_counts_mt", "Percentage of Mitochondrial Isoforms", None, mt_upper_threshold, ax=axs[1, 0])
plot_violin_with_thresholds(combined_iso_filtered, "pct_counts_hb", "Percentage of Hemoglobin Isoforms", hb_lower_threshold, hb_upper_threshold, ax=axs[1, 1])

# Same finishing steps as the gene-level figure
plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()

In [None]:
import os, gc
import numpy as np
import scipy.sparse as sp

# Avoid HDF5/GPFS lock issues during write
# NOTE(review): anndata (and presumably the HDF5 library behind it) was already
# imported and used for reading earlier in this notebook — confirm the variable is
# still honoured when it is set this late.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

def _downcast_numeric_columns(df, downcast_ints):
    """Shrink numeric columns of ``df`` in place: floats -> float32, optionally ints."""
    float_columns = df.select_dtypes(include=["float64", "float32"]).columns
    if len(float_columns):
        df[float_columns] = df[float_columns].astype("float32")

    if not downcast_ints:
        return

    for column in df.select_dtypes(include=["int64", "int32", "Int64"]).columns:
        series = df[column]
        # integer columns that contain missing values are left untouched
        if pd.api.types.is_integer_dtype(series.dtype) and series.isna().any():
            continue
        df[column] = pd.to_numeric(series, downcast="integer")


def prepare_for_write(adata, *, downcast_ints=False):
    """
    Get an AnnData object ready for writing to disk.

    Steps, in order:
      - a view is replaced by a real copy
      - non-unique var/obs names are made unique
      - X is converted to float32 (sparse: only the stored values)
      - float columns of .obs/.var become float32; integer columns are
        downcast as well when ``downcast_ints`` is True

    The input is modified in place unless it was a view, in which case the
    copy is modified. The object that was modified is returned.
    """
    # Work on a concrete object rather than on a view
    if adata.is_view:
        adata = adata.copy()

    # Duplicate feature / cell names are de-duplicated
    if not adata.var_names.is_unique:
        adata.var_names_make_unique()
    if not adata.obs_names.is_unique:
        adata.obs_names_make_unique()

    # Expression matrix -> float32
    if sp.issparse(adata.X):
        # the sparsity structure is kept; only the stored values change dtype
        adata.X.data = adata.X.data.astype(np.float32, copy=False)
    else:
        adata.X = np.asarray(adata.X, dtype=np.float32)

    # Annotation tables -> smaller numeric dtypes
    for frame in (adata.obs, adata.var):
        _downcast_numeric_columns(frame, downcast_ints)

    gc.collect()
    return adata

In [None]:
# Run every filtered object through prepare_for_write and rebind the names to the results
PBMC1_g_filtered, PBMC2_g_filtered, PBMC1_i_filtered, PBMC2_i_filtered = (
    prepare_for_write(adata, downcast_ints=False)
    for adata in (PBMC1_g_filtered, PBMC2_g_filtered, PBMC1_i_filtered, PBMC2_i_filtered)
)
gc.collect()

In [None]:
# Bare expression: shows the summary of the prepared PBMC1 isoform object as the cell output
PBMC1_i_filtered

In [None]:
# Output folder for the filtered .h5ad files (same path the inputs were read from)
int_folder = "Intermediate_Files/QC_07232025"

# Atomic write to avoid partial files on failure; stays on the same filesystem
import tempfile, os

def atomic_write_h5ad(adata, final_path):
    """
    Write ``adata`` to ``final_path`` through a temporary file in the same folder.

    The data is first written to a hidden temp file next to the destination and
    then moved over ``final_path`` with os.replace, so a failed write never
    leaves a partial file under the final name. The destination folder is
    created if it does not exist.

    Parameters:
        adata: object exposing ``write(path, compression=...)``
        final_path (str): destination .h5ad path

    Raises:
        Whatever ``adata.write`` or the filesystem calls raise; the temp file
        is removed (best effort) before the exception propagates.
    """
    # A bare file name has an empty dirname -> use the current directory
    folder = os.path.dirname(final_path) or "."
    # mkstemp requires an existing directory
    os.makedirs(folder, exist_ok=True)

    fd, tmp_path = tempfile.mkstemp(dir=folder, prefix=".tmp_write_", suffix=".h5ad")
    os.close(fd)  # only the reserved path is needed; adata.write opens it itself
    try:
        adata.write(tmp_path, compression="gzip")  # anndata >= 0.9
        os.replace(tmp_path, final_path)
    finally:
        # After a successful replace the temp path is gone; otherwise clean up (best effort)
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

In [None]:
# Write each filtered object to its own .h5ad file inside int_folder
for _adata, _file_name in (
    (PBMC1_g_filtered, "PBMC1_gene_AnnData_filtered.h5ad"),
    (PBMC2_g_filtered, "PBMC2_gene_AnnData_filtered.h5ad"),
    (PBMC1_i_filtered, "PBMC1_iso_AnnData_filtered.h5ad"),
    (PBMC2_i_filtered, "PBMC2_iso_AnnData_filtered.h5ad"),
):
    atomic_write_h5ad(_adata, os.path.join(int_folder, _file_name))

In [None]:
### Doublet detection ###

In [None]:
from anndata import read_h5ad

# Directory containing the QC-filtered .h5ad files
folder_path = "Intermediate_Files/QC_07232025"

# Read the filtered gene- and isoform-level objects back from disk
PBMC1_g_filtered, PBMC2_g_filtered, PBMC1_i_filtered, PBMC2_i_filtered = (
    read_h5ad(os.path.join(folder_path, file_name))
    for file_name in (
        "PBMC1_gene_AnnData_filtered.h5ad",
        "PBMC2_gene_AnnData_filtered.h5ad",
        "PBMC1_iso_AnnData_filtered.h5ad",
        "PBMC2_iso_AnnData_filtered.h5ad",
    )
)

In [None]:
# Print each reloaded object's summary, one after the other
print(PBMC1_g_filtered, PBMC2_g_filtered, PBMC1_i_filtered, PBMC2_i_filtered, sep="\n")

In [None]:
import numpy as np
import scipy.sparse as sp


def count_nan_values(adata):
    """
    Number of NaN entries in ``adata.X``.

    Handles both dense arrays and scipy sparse matrices: np.isnan cannot be
    applied to a sparse matrix directly, so for sparse input only the stored
    values (``X.data``) are inspected — implicit zeros are never NaN.
    """
    X = adata.X
    values = X.data if sp.issparse(X) else X
    return int(np.isnan(values).sum())


# Report the NaN count of every filtered matrix (no dense conversion needed)
print("Initial NaN values in PBMC1 gene matrix:", count_nan_values(PBMC1_g_filtered))
print("Initial NaN values in PBMC2 gene matrix:", count_nan_values(PBMC2_g_filtered))
print("Initial NaN values in PBMC1 isoform matrix:", count_nan_values(PBMC1_i_filtered))
print("Initial NaN values in PBMC2 isoform matrix:", count_nan_values(PBMC2_i_filtered))

In [None]:
# Report whether each matrix is stored sparse; a dense matrix prints "False"
from scipy.sparse import issparse

for _label, _adata in (
    ("PBMC1_gene sparse:", PBMC1_g_filtered),
    ("PBMC2_gene sparse:", PBMC2_g_filtered),
    ("PBMC1_iso sparse:", PBMC1_i_filtered),
    ("PBMC2_iso sparse:", PBMC2_i_filtered),
):
    print(_label, issparse(_adata.X))

In [None]:
## Convert Matrices to Dense for Scrublet, if not already, and confirm that it worked properly
#PBMC1_g_filtered.X = PBMC1_g_filtered.X.toarray()
#PBMC2_g_filtered.X = PBMC2_g_filtered.X.toarray()
#PBMC1_i_filtered.X = PBMC1_i_filtered.X.toarray()
#PBMC2_i_filtered.X = PBMC2_i_filtered.X.toarray()

## If datasets are dense matrices, commands will return "False"
#from scipy.sparse import issparse

#print("PBMC_gene_1 sparse:", issparse(PBMC1_g_filtered.X))
#print("PBMC_gene_2 sparse:", issparse(PBMC2_g_filtered.X))
#print("PBMC_iso_1 sparse:", issparse(PBMC1_i_filtered.X))
#print("PBMC_iso_2 sparse:", issparse(PBMC2_i_filtered.X))

In [None]:
## Check For Doublets
# Run scanpy's Scrublet wrapper on the filtered PBMC1 gene-level object;
# later cells read the resulting .obs['doublet_score'] column.
sc.pp.scrublet(PBMC1_g_filtered)

In [None]:
# Same doublet scoring for the filtered PBMC2 gene-level object
sc.pp.scrublet(PBMC2_g_filtered)

In [None]:
# Same doublet scoring for the filtered PBMC1 isoform-level object
sc.pp.scrublet(PBMC1_i_filtered)

In [None]:
# Same doublet scoring for the filtered PBMC2 isoform-level object
sc.pp.scrublet(PBMC2_i_filtered)

In [None]:
from skimage.filters import threshold_minimum


def _minimum_threshold(adata):
    """Apply skimage's ``threshold_minimum`` to the doublet scores stored in ``adata.obs``."""
    return threshold_minimum(adata.obs['doublet_score'].values)


## One automatic doublet-score threshold per dataset
auto_gene1_threshold_PBMC = _minimum_threshold(PBMC1_g_filtered)
auto_gene2_threshold_PBMC = _minimum_threshold(PBMC2_g_filtered)
auto_iso1_threshold_PBMC = _minimum_threshold(PBMC1_i_filtered)
auto_iso2_threshold_PBMC = _minimum_threshold(PBMC2_i_filtered)

In [None]:
# PBMC1 gene-level doublet-score histogram with the automatic threshold marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC1_g_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
ax.axvline(
    auto_gene1_threshold_PBMC,
    color='b',
    linestyle='dashed',
    linewidth=2,
    label=f'Automatic Threshold ({auto_gene1_threshold_PBMC:.3f})',
)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC1 Gene Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC2 gene-level doublet-score histogram with the automatic threshold marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC2_g_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
ax.axvline(
    auto_gene2_threshold_PBMC,
    color='b',
    linestyle='dashed',
    linewidth=2,
    label=f'Automatic Threshold ({auto_gene2_threshold_PBMC:.3f})',
)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC2 Gene Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC1 isoform-level doublet-score histogram with the automatic threshold marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC1_i_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
ax.axvline(
    auto_iso1_threshold_PBMC,
    color='b',
    linestyle='dashed',
    linewidth=2,
    label=f'Automatic Threshold ({auto_iso1_threshold_PBMC:.3f})',
)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC1 Isoform Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC2 isoform-level doublet-score histogram with the automatic threshold marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC2_i_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
ax.axvline(
    auto_iso2_threshold_PBMC,
    color='b',
    linestyle='dashed',
    linewidth=2,
    label=f'Automatic Threshold ({auto_iso2_threshold_PBMC:.3f})',
)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC2 Isoform Dataset')
ax.legend()
plt.show()

In [None]:
# Final (manual) doublet-score cut-offs, one per data level and shared by both samples.
# Per the original note, they are set where the distribution starts to level out.
gene_threshold = 0.3
iso_threshold = 0.3

# For every dataset, store two boolean flags in .obs:
#   predicted_doublet_auto  -> score above that dataset's automatic threshold
#   predicted_doublet_final -> score above the manual threshold for its level
_flag_plan = (
    (PBMC1_g_filtered, auto_gene1_threshold_PBMC, gene_threshold),
    (PBMC2_g_filtered, auto_gene2_threshold_PBMC, gene_threshold),
    (PBMC1_i_filtered, auto_iso1_threshold_PBMC, iso_threshold),
    (PBMC2_i_filtered, auto_iso2_threshold_PBMC, iso_threshold),
)
for _adata, _auto_cutoff, _final_cutoff in _flag_plan:
    _scores = _adata.obs['doublet_score']
    _adata.obs['predicted_doublet_auto'] = _scores > _auto_cutoff
    _adata.obs['predicted_doublet_final'] = _scores > _final_cutoff

In [None]:
# Gene level: report both thresholds and how many cells each one flags as doublets
_g1_obs = PBMC1_g_filtered.obs
_g2_obs = PBMC2_g_filtered.obs

print(f"Automatic PBMC1 Gene threshold: {auto_gene1_threshold_PBMC}")
print(f"Automatic PBMC2 Gene threshold: {auto_gene2_threshold_PBMC}")
print(f"Final Gene threshold: {gene_threshold}")
print(f"Number of cells predicted as doublets in PBMC1 Gene with automatic threshold: {_g1_obs['predicted_doublet_auto'].sum()}")
print(f"Number of cells predicted as doublets in PBMC2 Gene with automatic threshold: {_g2_obs['predicted_doublet_auto'].sum()}")
print(f"Number of cells predicted as doublets in PBMC1 Gene with final threshold: {_g1_obs['predicted_doublet_final'].sum()}")
print(f"Number of cells predicted as doublets in PBMC2 Gene with final threshold: {_g2_obs['predicted_doublet_final'].sum()}")

In [None]:
# Isoform level: report both thresholds and how many cells each one flags as doublets
_i1_obs = PBMC1_i_filtered.obs
_i2_obs = PBMC2_i_filtered.obs

print(f"Automatic PBMC1 Iso threshold: {auto_iso1_threshold_PBMC}")
print(f"Automatic PBMC2 Iso threshold: {auto_iso2_threshold_PBMC}")
print(f"Final Iso threshold: {iso_threshold}")
print(f"Number of cells predicted as doublets in PBMC1 Iso with automatic threshold: {_i1_obs['predicted_doublet_auto'].sum()}")
print(f"Number of cells predicted as doublets in PBMC2 Iso with automatic threshold: {_i2_obs['predicted_doublet_auto'].sum()}")
print(f"Number of cells predicted as doublets in PBMC1 Iso with final threshold: {_i1_obs['predicted_doublet_final'].sum()}")
print(f"Number of cells predicted as doublets in PBMC2 Iso with final threshold: {_i2_obs['predicted_doublet_final'].sum()}")

In [None]:
# PBMC1 gene-level doublet scores with the final (red) and automatic (blue) thresholds
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC1_g_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
_dashed = dict(linestyle='dashed', linewidth=2)
ax.axvline(gene_threshold, color='r', label=f'Final Threshold ({gene_threshold:.3f})', **_dashed)
ax.axvline(auto_gene1_threshold_PBMC, color='b', label=f'Automatic Threshold ({auto_gene1_threshold_PBMC:.3f})', **_dashed)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC1 Gene Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC2 gene-level doublet scores with the final (red) and automatic (blue) thresholds
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC2_g_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
_dashed = dict(linestyle='dashed', linewidth=2)
ax.axvline(gene_threshold, color='r', label=f'Final Threshold ({gene_threshold:.3f})', **_dashed)
ax.axvline(auto_gene2_threshold_PBMC, color='b', label=f'Automatic Threshold ({auto_gene2_threshold_PBMC:.3f})', **_dashed)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC2 Gene Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC1 isoform-level doublet scores with the final (red) and automatic (blue) thresholds
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC1_i_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
_dashed = dict(linestyle='dashed', linewidth=2)
ax.axvline(iso_threshold, color='r', label=f'Final Threshold ({iso_threshold:.3f})', **_dashed)
ax.axvline(auto_iso1_threshold_PBMC, color='b', label=f'Automatic Threshold ({auto_iso1_threshold_PBMC:.3f})', **_dashed)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC1 Iso Dataset')
ax.legend()
plt.show()

In [None]:
# PBMC2 isoform-level doublet scores with the final (red) and automatic (blue) thresholds
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(PBMC2_i_filtered.obs['doublet_score'], bins=50, alpha=0.6, color='g')
_dashed = dict(linestyle='dashed', linewidth=2)
ax.axvline(iso_threshold, color='r', label=f'Final Threshold ({iso_threshold:.3f})', **_dashed)
ax.axvline(auto_iso2_threshold_PBMC, color='b', label=f'Automatic Threshold ({auto_iso2_threshold_PBMC:.3f})', **_dashed)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Number of Cells')
ax.set_title('Distribution of Doublet Scores in PBMC2 Iso Dataset')
ax.legend()
plt.show()

In [None]:
# Keep only the cells that the final threshold did NOT flag as doublets
_keep_1g = ~PBMC1_g_filtered.obs['predicted_doublet_final']
_keep_2g = ~PBMC2_g_filtered.obs['predicted_doublet_final']
_keep_1i = ~PBMC1_i_filtered.obs['predicted_doublet_final']
_keep_2i = ~PBMC2_i_filtered.obs['predicted_doublet_final']

PBMC1_g_std_filtered = PBMC1_g_filtered[_keep_1g]
PBMC2_g_std_filtered = PBMC2_g_filtered[_keep_2g]
PBMC1_i_std_filtered = PBMC1_i_filtered[_keep_1i]
PBMC2_i_std_filtered = PBMC2_i_filtered[_keep_2i]

# Before/after cell counts, one blank-line-terminated pair per dataset
print(f"Number of cells in PBMC1 Gene dataset before filtering: {PBMC1_g_filtered.n_obs}")
print(f"Number of cells in PBMC1 Gene dataset after filtering with final threshold: {PBMC1_g_std_filtered.n_obs}")
print("")
print(f"Number of cells in PBMC2 Gene dataset before filtering: {PBMC2_g_filtered.n_obs}")
print(f"Number of cells in PBMC2 Gene dataset after filtering with final threshold: {PBMC2_g_std_filtered.n_obs}")
print("")
print(f"Number of cells in PBMC1 Iso dataset before filtering: {PBMC1_i_filtered.n_obs}")
print(f"Number of cells in PBMC1 Iso dataset after filtering with final threshold: {PBMC1_i_std_filtered.n_obs}")
print("")
print(f"Number of cells in PBMC2 Iso dataset before filtering: {PBMC2_i_filtered.n_obs}")
print(f"Number of cells in PBMC2 Iso dataset after filtering with final threshold: {PBMC2_i_std_filtered.n_obs}")
print("")

In [None]:
# Optional: if the files are large, convert the matrices to sparse format for efficiency. AutoZI will convert them to dense matrices before modeling.

#from scipy.sparse import csr_matrix
#PBMC1_g_std_filtered.X = csr_matrix(PBMC1_g_std_filtered.X)  # Converts to sparse format
#PBMC2_g_std_filtered.X = csr_matrix(PBMC2_g_std_filtered.X)  # Converts to sparse format
#PBMC1_i_std_filtered.X = csr_matrix(PBMC1_i_std_filtered.X)  # Converts to sparse format
#PBMC2_i_std_filtered.X = csr_matrix(PBMC2_i_std_filtered.X)  # Converts to sparse format

In [None]:
# Output folder for intermediate files
Int_folder = "Intermediate_Files/QC_07232025"

# Write each doublet-filtered AnnData object to its own .h5ad file
_post_scrublet_outputs = (
    ("PBMC1_Gene_Data_Filtered_PostScrublet.h5ad", PBMC1_g_std_filtered),
    ("PBMC2_Gene_Data_Filtered_PostScrublet.h5ad", PBMC2_g_std_filtered),
    ("PBMC1_Iso_Data_Filtered_PostScrublet.h5ad", PBMC1_i_std_filtered),
    ("PBMC2_Iso_Data_Filtered_PostScrublet.h5ad", PBMC2_i_std_filtered),
)
for _file_name, _adata in _post_scrublet_outputs:
    _adata.write(os.path.join(Int_folder, _file_name))

In [None]:
import anndata

# Stack the two doublet-filtered gene-level samples; the "batch" column in .obs
# records which sample each cell came from, and only shared variables are kept.
_gene_samples = [PBMC1_g_std_filtered, PBMC2_g_std_filtered]
_gene_batch_names = ["PBMC1_gene", "PBMC2_gene"]
adata_g_filter = anndata.concat(
    _gene_samples,
    join="inner",
    label="batch",
    keys=_gene_batch_names,
)

# Inspect the combined object and the number of cells per batch
print(adata_g_filter)
print(adata_g_filter.obs["batch"].value_counts())

In [None]:
import anndata

# Stack the two doublet-filtered isoform-level samples, labelling each cell's origin in "batch"
_iso_samples = [PBMC1_i_std_filtered, PBMC2_i_std_filtered]
_iso_batch_names = ["PBMC1_iso", "PBMC2_iso"]
adata_i_filter = anndata.concat(
    _iso_samples,
    join="inner",
    label="batch",
    keys=_iso_batch_names,
)

# Inspect the combined object and the number of cells per batch
print(adata_i_filter)
print(adata_i_filter.obs["batch"].value_counts())

In [None]:
# Output folder for intermediate files
Int_folder = "Intermediate_Files/QC_07232025"

# Persist the concatenated gene- and isoform-level objects
for _file_name, _adata in (
    ("Concatenated_Gene_Data.h5ad", adata_g_filter),
    ("Concatenated_Iso_Data.h5ad", adata_i_filter),
):
    _adata.write(os.path.join(Int_folder, _file_name))

In [None]:
import numpy as np

# Overwrite the "sample" column with the stringified concat batch label
for _combined in (adata_g_filter, adata_i_filter):
    _combined.obs["sample"] = _combined.obs["batch"].astype(str)

# Summary function (plus two small private helpers)
def _print_stat_block(title, values):
    """Print a titled Min / Median / Mean / Max block for one per-cell metric."""
    print(f"\n--- {title} ---")
    print(f"Min: {values.min()}")
    print(f"Median: {np.median(values)}")
    print(f"Mean: {values.mean()}")
    print(f"Max: {values.max()}")


def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator`` as a 1-D float array.

    Entries whose denominator is not positive yield 0.0 instead of NaN/inf, so a
    cell with no detected features cannot distort the printed statistics.
    """
    num = np.asarray(numerator, dtype=float).ravel()
    den = np.asarray(denominator, dtype=float).ravel()
    return np.divide(num, den, out=np.zeros_like(num), where=den > 0)


def compute_summary_stats(adata, is_isoform=False):
    """Print per-cell QC summary statistics for a single-sample AnnData object.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` (with a ``'sample'`` column) and ``.n_obs``.
        Gene level reads ``total_counts``, ``n_genes_by_counts`` and
        ``pct_counts_mt`` from ``.obs``; isoform level reads
        ``total_counts_isoforms`` and ``n_isoforms_by_counts`` from ``.obs`` and
        sums the rows of ``.X``.
    is_isoform
        Selects the isoform-level report when True, the gene-level one otherwise.

    Returns
    -------
    None. Everything is written to stdout. The sample name shown in the header is
    the first unique value of ``.obs['sample']``.
    """
    # Nothing to summarise; min()/max() would raise on empty input
    if adata.n_obs == 0:
        print("\n### Summary Stats: no cells to summarize")
        return

    sample_name = adata.obs['sample'].unique()[0]
    print(f"\n### Summary Stats for Sample: {sample_name} ({'Isoform' if is_isoform else 'Gene'})")

    if not is_isoform:
        total_counts = adata.obs['total_counts']
        unique_features = adata.obs['n_genes_by_counts']
        mt_percentage = adata.obs['pct_counts_mt']

        _print_stat_block("Total Gene Counts", total_counts)
        _print_stat_block("Transcripts per Gene", _ratio_or_zero(total_counts, unique_features))
        _print_stat_block("Unique Genes per Cell", unique_features)
        _print_stat_block("Mitochondrial Percentage", mt_percentage)
    else:
        total_counts = adata.obs['total_counts_isoforms']
        unique_features = adata.obs['n_isoforms_by_counts']
        # Row sums are taken from X directly (not from total_counts_isoforms);
        # np.array(...).flatten() also handles the 2-D result a sparse matrix returns
        row_totals = np.array(adata.X.sum(axis=1)).flatten()

        _print_stat_block("Total Isoform Counts", total_counts)
        _print_stat_block("Reads per Isoform per Cell", _ratio_or_zero(row_totals, unique_features))
        _print_stat_block("Unique Isoforms per Cell", unique_features)

# Print the gene-level report once per batch (each subset is copied out of the combined object)
for batch in adata_g_filter.obs["batch"].unique():
    _in_batch = adata_g_filter.obs["batch"] == batch
    g_sample = adata_g_filter[_in_batch].copy()
    compute_summary_stats(g_sample, is_isoform=False)

# Same for the isoform-level data
for batch in adata_i_filter.obs["batch"].unique():
    _in_batch = adata_i_filter.obs["batch"] == batch
    i_sample = adata_i_filter[_in_batch].copy()
    compute_summary_stats(i_sample, is_isoform=True)

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

def _row_sums(X):
    return np.asarray(X.sum(axis=1)).ravel() if sp.issparse(X) else X.sum(axis=1)

def _vec(adata, key, fallback=None):
    """Get obs vector; if missing and fallback is callable, compute it."""
    if key in adata.obs:
        v = adata.obs[key].to_numpy()
    elif callable(fallback):
        v = fallback(adata)
    else:
        raise KeyError(f"Missing required obs field: {key}")
    return np.asarray(v).ravel()

def _nonzero_per_row(X):
    """Count the entries greater than zero in each row of ``X`` (sparse or dense) as a 1-D array."""
    return np.asarray((X > 0).sum(axis=1)).ravel()


def _per_feature_ratio(total, unique):
    """Element-wise ``total / unique`` as floats, with 0.0 wherever ``unique`` is not positive."""
    return np.divide(total, unique, out=np.zeros_like(total, dtype=float), where=unique > 0)


def _four_number_summary(label, values):
    """Return ``{"<label> (min|median|mean|max)": float}`` for one metric, in that key order."""
    return {
        f"{label} (min)": float(values.min()),
        f"{label} (median)": float(np.median(values)),
        f"{label} (mean)": float(values.mean()),
        f"{label} (max)": float(values.max()),
    }


def _one_group_summary(adata, is_isoform):
    """Build a flat dict of QC summary statistics for one group of cells.

    Parameters
    ----------
    adata
        AnnData-like object; reads ``.obs``, ``.X``, ``.n_obs`` and ``.n_vars``.
    is_isoform
        True for the isoform-level metrics, False for the gene-level metrics.

    Returns
    -------
    dict
        Min/median/mean/max for every metric, followed by ``"Cells (n_obs)"`` and
        ``"Features (n_vars)"``. Key order determines the column order of the
        DataFrame built from these dicts.

    Notes
    -----
    Missing ``.obs`` columns are derived from ``.X`` where possible: totals as
    row sums, unique-feature counts as the number of entries > 0 per row. The
    gene-level unique-feature count now has this fallback too (it previously
    raised ``KeyError``), matching the isoform branch. A missing
    ``pct_counts_mt`` is reported as all zeros.
    """
    if is_isoform:
        total = _vec(adata, "total_counts_isoforms", fallback=lambda ad: _row_sums(ad.X))
        unique = _vec(adata, "n_isoforms_by_counts", fallback=lambda ad: _nonzero_per_row(ad.X))
        metrics = [
            ("Total isoform counts", total),
            ("Reads per isoform per cell", _per_feature_ratio(total, unique)),
            ("Unique isoforms per cell", unique),
        ]
    else:
        total = _vec(adata, "total_counts", fallback=lambda ad: _row_sums(ad.X))
        unique = _vec(adata, "n_genes_by_counts", fallback=lambda ad: _nonzero_per_row(ad.X))
        mt_pct = _vec(adata, "pct_counts_mt", fallback=lambda ad: np.zeros(ad.n_obs, dtype=float))
        metrics = [
            ("Total counts", total),
            ("Transcripts per gene", _per_feature_ratio(total, unique)),
            ("Unique genes per cell", unique),
            ("Mito %", mt_pct),
        ]

    out = {}
    for label, values in metrics:
        out.update(_four_number_summary(label, values))
    out["Cells (n_obs)"] = int(adata.n_obs)
    out["Features (n_vars)"] = int(adata.n_vars)
    return out

def summarize_by_batch_and_overall(adata, batch_key="batch", is_isoform=False, sample_key="sample"):
    """Tabulate QC summary statistics per batch and for all cells together.

    One row is produced for every unique value of ``adata.obs[batch_key]``
    (Level ``"batch"``) plus one row over the whole object (Level ``"overall"``,
    Group ``"ALL"``). The result is indexed by ``("Level", "Group")`` and sorted
    on that index.

    Side effect: when ``sample_key`` is missing from ``adata.obs`` and
    ``batch_key`` is present, a ``sample_key`` column holding the stringified
    batch labels is added to ``adata.obs``.
    """
    if batch_key in adata.obs and sample_key not in adata.obs:
        adata.obs[sample_key] = adata.obs[batch_key].astype(str)

    records = []

    # One record per batch
    for group in adata.obs[batch_key].unique():
        subset = adata[adata.obs[batch_key] == group]
        record = _one_group_summary(subset, is_isoform=is_isoform)
        record["Group"] = str(group)
        record["Level"] = "batch"
        records.append(record)

    # One record over every cell
    combined = _one_group_summary(adata, is_isoform=is_isoform)
    combined["Group"] = "ALL"
    combined["Level"] = "overall"
    records.append(combined)

    table = pd.DataFrame(records)
    return table.set_index(["Level", "Group"]).sort_index()

# ---- Build the summary tables ----
gene_stats = summarize_by_batch_and_overall(adata_g_filter, is_isoform=False, batch_key="batch")
iso_stats = summarize_by_batch_and_overall(adata_i_filter, is_isoform=True, batch_key="batch")

# Print both tables in full (no row/column truncation, 140-character width)
_full_display = ("display.max_rows", None, "display.max_columns", None, "display.width", 140)
with pd.option_context(*_full_display):
    for _heading, _table in (
        ("\n=== Gene summary ===", gene_stats),
        ("\n=== Isoform summary ===", iso_stats),
    ):
        print(_heading)
        print(_table)

# Write the tables as tab-separated files in the current working directory
for _table, _tsv_path in (
    (gene_stats, "QC_gene_summary_by_batch_and_overall.tsv"),
    (iso_stats, "QC_isoform_summary_by_batch_and_overall.tsv"),
):
    _table.to_csv(_tsv_path, sep="\t")