# Quality Control and Filtering of scRNA-seq Data

## Import Required Libraries

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np

## Configure Environment


In [13]:
# --- Reproducibility -------------------------------------------------------
# Fix NumPy's global random seed so reruns of the notebook are repeatable.
np.random.seed(42)

# --- Scanpy logging and plotting defaults ----------------------------------
sc.settings.verbosity = 3  # most chatty of the commonly used levels
sc.settings.set_figure_params(figsize=(8, 8), dpi=100)

# --- Project identity ------------------------------------------------------
# Both parts are placeholders; fill them in before running. They are joined
# with an underscore to form the prefix of the input/output file names.
PROJ_NAME = ""
PROJ_DESCRIPTION = ""
FULL_PROJ_NAME = f"{PROJ_NAME}_{PROJ_DESCRIPTION}"

# --- Filesystem layout -----------------------------------------------------
# The raw input and the filtered output both live under OUTPUT_DIR.
PROJECT_DIR = Path("/path/to/project")
OUTPUT_DIR = PROJECT_DIR.joinpath("output")

## Data Loading

In [None]:
# Read the raw AnnData object for this project from OUTPUT_DIR and report
# its dimensions (observations = cells, variables = genes).
print("Loading data...")
adata = sc.read_h5ad(OUTPUT_DIR.joinpath(f"{FULL_PROJ_NAME}_raw.h5ad"))
print(f"Initial data shape: {adata.n_obs} cells and {adata.n_vars} genes")

## Quality Control (QC)

In [None]:
# Compute per-cell and per-gene QC metrics in place on `adata`.
print("\nCalculating QC metrics...")

# Flag mitochondrial genes by name. Only the lowercase 'mt-' prefix (mouse
# gene naming) is matched here, so this flag must be adapted for datasets
# whose mitochondrial genes use a different prefix.
adata.var["mt"] = adata.var_names.str.startswith('mt-')

# qc_vars=["mt"] requests the metrics derived from the flag above (the later
# cells read `pct_counts_mt`); the top-N percentage and log1p variants are
# switched off.
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)

In [None]:
# Pre-filtering overview: one violin panel per QC metric, laid out in a row.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, (metric, title) in zip(
    axs,
    [
        ('n_genes_by_counts', 'Genes per Cell'),
        ('total_counts', 'UMI Counts per Cell'),
        ('pct_counts_mt', 'Mitochondrial Content'),
    ],
):
    # show=False keeps scanpy from rendering early so all panels share `fig`.
    sc.pl.violin(adata, metric, ax=ax, show=False)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Pairwise QC relationships: total UMI counts on the x-axis against, in turn,
# the number of detected genes and the mitochondrial percentage.
for y_metric in ('n_genes_by_counts', 'pct_counts_mt'):
    sc.pl.scatter(adata, x='total_counts', y=y_metric)
    plt.show()

In [18]:
# Cell-level QC thresholds consumed by the filtering step.
# Lower bounds are inclusive (>=); upper bounds are exclusive (<).
qc_params = dict(
    min_genes=200,     # lower bound on n_genes_by_counts
    max_genes=5000,    # upper bound on n_genes_by_counts
    min_counts=1000,   # lower bound on total_counts
    max_counts=30000,  # upper bound on total_counts
    max_mt=5,          # upper bound on pct_counts_mt
)

In [None]:
# Apply QC filters
print("\nApplying QC filters...")
print(f"Initial cells: {adata.shape[0]}")

# Filter cells.
# All thresholds are combined into a single boolean mask (lower bounds
# inclusive, upper bounds exclusive) and applied in one subsetting step.
# This keeps exactly the cells that five successive slices would keep, but
# avoids stacking a view on top of a view for every threshold.
cell_qc = adata.obs
keep_cells = (
    (cell_qc['n_genes_by_counts'] >= qc_params['min_genes'])
    & (cell_qc['n_genes_by_counts'] < qc_params['max_genes'])
    & (cell_qc['total_counts'] >= qc_params['min_counts'])
    & (cell_qc['total_counts'] < qc_params['max_counts'])
    & (cell_qc['pct_counts_mt'] < qc_params['max_mt'])
)

# Slicing an AnnData returns a view of the parent object. `.copy()` turns it
# into an independent object so the in-place gene filter below (and the later
# write to disk) operate on real data rather than triggering an implicit
# copy-on-modify of a view.
adata = adata[keep_cells.to_numpy(), :].copy()

# Filter genes: drop genes detected in fewer than 3 of the remaining cells.
sc.pp.filter_genes(adata, min_cells=3)

print(f"Remaining cells: {adata.shape[0]}")
print(f"Remaining genes: {adata.shape[1]}")

In [None]:
# Post-filtering overview: the same three QC metrics, one violin panel each,
# drawn on the filtered `adata` for comparison with the pre-filter plots.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, (metric, title) in zip(
    axs,
    [
        ('n_genes_by_counts', 'Genes per Cell (After QC)'),
        ('total_counts', 'UMI Counts per Cell (After QC)'),
        ('pct_counts_mt', 'Mitochondrial Content (After QC)'),
    ],
):
    # show=False keeps scanpy from rendering early so all panels share `fig`.
    sc.pl.violin(adata, metric, ax=ax, show=False)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Saving Data

In [None]:

# Persist the QC-filtered AnnData under OUTPUT_DIR with a `_qc` suffix.
output_file = OUTPUT_DIR.joinpath(f"{FULL_PROJ_NAME}_qc.h5ad")
print(f"\nSaving filtered data to: {output_file}")
adata.write(output_file)
print("Quality control and filtering complete!")