# Normalization, Dimensionality Reduction, and Batch Correction

## Import required libraries

In [22]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from scanpy.external.pp import harmony_integrate

## Configure Environment


In [59]:
# Scanpy runtime settings: raise the logging verbosity and set the default
# figure resolution and size used by the plots in this notebook.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, figsize=(8, 8))

# Seed NumPy's global random number generator.
np.random.seed(42)

# Project identifiers. Both are placeholders to fill in; they are joined with
# an underscore to form the prefix of the input/output file names.
PROJ_NAME = ""
PROJ_DESCRIPTION = ""
FULL_PROJ_NAME = f"{PROJ_NAME}_{PROJ_DESCRIPTION}"

# Project locations: the project root and its "output" subdirectory.
PROJECT_DIR = Path("/path/to/project")
OUTPUT_DIR = PROJECT_DIR.joinpath("output")

## Load QC data

In [None]:
# Read the QC-stage AnnData file from OUTPUT_DIR and report its dimensions.
qc_path = OUTPUT_DIR / f"{FULL_PROJ_NAME}_qc.h5ad"
print("Loading filtered data...")
adata = sc.read_h5ad(qc_path)

# AnnData.shape is (number of observations, number of variables).
n_cells, n_genes = adata.shape
print(f"Data shape: {n_cells} cells and {n_genes} genes")

## Normalization, Highly Variable Genes, Scaling, and PCA

In [None]:
# Normalization: bring every cell to the same total (TARGET_SUM) and then
# apply a log(1 + x) transform. Both calls modify adata.X in place.
TARGET_SUM = 1e4
print("\nPerforming normalization...")
sc.pp.normalize_total(adata, target_sum=TARGET_SUM)
sc.pp.log1p(adata)

In [None]:
# Find highly variable genes.
# Only n_top_genes is passed: when n_top_genes is set, scanpy ignores the
# min_mean / max_mean / min_disp cutoffs, so supplying them as well had no
# effect and only suggested a filter that was never applied.
print("\nIdentifying highly variable genes...")
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# adata.var['highly_variable'] is a boolean column; summing it counts the
# genes that were flagged.
n_hvg = int(adata.var['highly_variable'].sum())
print(f"Number of highly variable genes: {n_hvg}")

# Plot highly variable genes
sc.pl.highly_variable_genes(adata)
plt.show()

In [None]:
# Keep a copy of the log-normalized matrix before scaling: sc.pp.scale
# overwrites adata.X with z-scored values clipped at max_value, and the
# unscaled expression values cannot be recovered from those afterwards.
adata.layers['log1p_norm'] = adata.X.copy()

# Scale data (per-gene scaling, values clipped at 10)
print("\nScaling data...")
sc.pp.scale(adata, max_value=10)

# Run PCA
print("\nRunning PCA...")
sc.tl.pca(adata, svd_solver='arpack')

# Plot PCA variance ratio for the first 50 components (log scale)
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
plt.show()

# Plot PCA by batch.
# Cast batch to str first so the plot treats it as discrete groups; a numeric
# batch column would otherwise be drawn with a continuous colormap. The cast
# is idempotent, so repeating it in later cells is harmless.
adata.obs['batch'] = adata.obs['batch'].astype(str)
print("\nPlotting PCA colored by batch...")
sc.pl.pca(adata, color='batch')
plt.show()

## Batch correction

In [None]:
# Store the batch labels as strings, then list the distinct values present.
adata.obs['batch'] = adata.obs['batch'].astype(str)
unique_batches = adata.obs['batch'].unique()
print("\nUnique batch values:")
print(unique_batches)


In [None]:
# Make sure the batch labels are strings before integration.
adata.obs['batch'] = adata.obs['batch'].astype(str)

# Run Harmony on the PCA embedding. Nothing is returned or captured here:
# the call works on adata in place and the corrected embedding is stored
# under the key given as adjusted_basis ('X_pca_harmony').
print("\nPerforming batch correction with Harmony...")
harmony_integrate(
    adata,
    'batch',
    basis='X_pca',
    adjusted_basis='X_pca_harmony',
    max_iter_harmony=20,
)

# Side-by-side comparison of the embedding before and after correction.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: original PCA, colored by batch.
sc.pl.pca(adata, color='batch', ax=ax_before, show=False)
ax_before.set_title('Before batch correction (PCA)')

# Right panel: Harmony-corrected embedding, colored by batch.
sc.pl.embedding(
    adata,
    basis='X_pca_harmony',
    color='batch',
    ax=ax_after,
    show=False,
)
ax_after.set_title('After batch correction (Harmony)')

plt.tight_layout()
plt.show()

## Clustering and UMAP Visualization

In [None]:
# Clustering on the Harmony-corrected embedding.
print("\nPerforming clustering...")

# Build the neighborhood graph from 'X_pca_harmony' rather than the raw PCA.
sc.pp.neighbors(adata, use_rep='X_pca_harmony')

# Leiden resolutions to sweep; each result is stored in its own obs column
# named 'leiden_res<resolution>'.
resolutions = [0.05, 0.1, 0.3, 0.5, 0.75, 1, 1.25, 1.5]

for res in resolutions:
    cluster_key = f'leiden_res{res}'
    print(f"\nFinding clusters at resolution {res}...")
    sc.tl.leiden(adata, resolution=res, key_added=cluster_key)

In [None]:
# Run UMAP
# sc.tl.umap works from the neighborhood graph already stored on adata
# (built in the clustering cell with sc.pp.neighbors on 'X_pca_harmony').
print("\nRunning UMAP...")
sc.tl.umap(adata)



In [None]:
# Visualization
print("\nGenerating visualizations...")

# Plot UMAP with different clustering resolutions, one panel per resolution.
# The grid is sized from len(resolutions) instead of a fixed 2x4 layout, so
# changing the resolution list neither overruns the axes array nor leaves
# blank panels on screen. With the 8 resolutions tested by this notebook the
# result is the same 2x4 grid at figsize (20, 10).
n_panels = len(resolutions)
n_cols = 4
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(5 * n_cols, 5 * n_rows),
    squeeze=False,  # always return a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, res in zip(axes, resolutions):
    sc.pl.umap(adata, color=f'leiden_res{res}', ax=ax, show=False)
    ax.set_title(f'Resolution: {res}')

# Hide any trailing axes that have no resolution assigned to them.
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# UMAP panels colored by treatment, by batch, and by the Leiden clusters
# computed at resolution 0.5.
umap_colors = ['treatment', 'batch', f'leiden_res{0.5}']
sc.pl.umap(adata, color=umap_colors)
plt.show()

## Data Saving

In [None]:



# Write the processed AnnData object (normalization, PCA, Harmony embedding,
# cluster labels and UMAP computed above) to OUTPUT_DIR.
output_file = OUTPUT_DIR.joinpath(f"{FULL_PROJ_NAME}_normalized.h5ad")
print(f"\nSaving processed data to: {output_file}")
adata.write(output_file)

print("Normalization and Clustering complete!")
