In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad  # Make sure to import anndata
import matplotlib.pyplot as pl
import seaborn as sns
import bbknn
import scvelo as scv
import anndata
import leidenalg
import loompy
from scipy import io
from scipy.sparse import coo_matrix, csr_matrix
from matplotlib.pyplot import rc_context
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import os
import random
import mellon

In [None]:
import os
import scanpy as sc
import numpy as np

# All relative paths below (e.g. the combined output file) resolve against this folder.
os.chdir("P:/Tolulope/Manuscript/Yuan Analysis")

# Input files paired with the sample label each one should carry.
# Insertion order is preserved, so downstream loops see the samples in this order.
samples = dict([
    ('YSham_GFP_adata_filtered_with_SOLO.h5ad', 'YSham_GFP'),
    ('YSham_noGFP_adata_filtered_with_SOLO.h5ad', 'YSham_noGFP'),
    ('YOV_GFP_adata_filtered_with_SOLO.h5ad', 'YOV_GFP'),
    ('YOV_noGFP_adata_filtered_with_SOLO.h5ad', 'YOV_noGFP'),
    ('ASham_GFP_adata_filtered_with_SOLO.h5ad', 'ASham_GFP'),
    ('ASham_noGFP_adata_filtered_with_SOLO.h5ad', 'ASham_noGFP'),
    ('AOV_GFP_adata_filtered_with_SOLO.h5ad', 'AOV_GFP'),
    ('AOV_noGFP_adata_filtered_with_SOLO.h5ad', 'AOV_noGFP'),
])

# Define preprocessing function
def preprocess_adata(adata):
    """Filter one sample's AnnData object before concatenation.

    Steps, in order:
      1. make gene names unique, then snapshot the object into ``adata.raw``;
      2. drop cells with fewer than 200 detected genes and genes detected
         in fewer than 3 cells;
      3. flag mitochondrial genes (``MT-`` prefix) and hemoglobin genes
         (``HB`` prefix not followed by ``P``) in ``adata.var``, both
         matched case-insensitively;
      4. remove the hemoglobin genes;
      5. compute QC metrics and keep only cells whose ``pct_counts_mt``
         is below 5.

    Parameters
    ----------
    adata : AnnData
        The sample to filter. It is modified in place (gene names, ``raw``,
        cell/gene filtering, ``var`` annotations) before being subset.

    Returns
    -------
    AnnData
        An independent copy (not a view) holding the retained cells and
        genes, with the QC columns added to ``obs`` and ``var``.
    """
    # Make names unique BEFORE taking the raw snapshot so that raw and the
    # working matrix agree on var_names; duplicated names left in raw would
    # break a later outer-join concatenation.
    adata.var_names_make_unique()
    adata.raw = adata.copy()

    sc.pp.filter_cells(adata, min_genes=200)  # Remove cells with <200 genes
    sc.pp.filter_genes(adata, min_cells=3)  # Remove genes found in <3 cells

    # Match on upper-cased symbols so both "MT-ND1" and "mt-Nd1" style names
    # are recognised, and so the hemoglobin rule treats case the same way.
    gene_upper = adata.var_names.str.upper()
    adata.var["mt"] = gene_upper.str.startswith("MT-")

    # Hemoglobin genes: start with "HB" but not "HBP". The previous pattern
    # '^Hb(?!P)' was case-sensitive with mixed case, so "Hbp1" was flagged
    # while upper-case symbols were never matched.
    adata.var["hb"] = gene_upper.str.contains(r"^HB(?!P)", regex=True)

    # Remove hemoglobin genes. Copy explicitly: slicing returns a view, and
    # the in-place QC call below would otherwise write into that view.
    adata = adata[:, ~adata.var["hb"].to_numpy()].copy()

    # QC metrics are computed after hemoglobin removal, so total counts and
    # pct_counts_mt exclude those genes.
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

    # Keep cells with <5% mitochondrial counts. Return a real copy so callers
    # can assign new obs columns without touching a view.
    keep_cells = (adata.obs["pct_counts_mt"] < 5).to_numpy()
    return adata[keep_cells].copy()

# Load and preprocess every sample file that is present on disk.
data_dir = "P:/Tolulope/Manuscript/Yuan Analysis"
out = []
loaded_samples = []  # names of the samples actually loaded; stays parallel to `out`
for file, sample_name in samples.items():
    file_path = os.path.join(data_dir, file)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    adata = sc.read_h5ad(file_path)  # Load .h5ad file
    adata = preprocess_adata(adata)  # Preprocess the data
    adata.obs["Sample"] = sample_name  # Assign sample name
    out.append(adata)
    loaded_samples.append(sample_name)

# Concatenate the list of preprocessed AnnData objects
if out:
    # `keys` must match `out` one-to-one. Passing every configured sample name
    # would shift the labels whenever a file was skipped above, silently
    # attributing cells to the wrong sample.
    # NOTE(review): obs_names are left as-is (no index_unique); if barcodes
    # repeat across samples they will be duplicated in the combined object.
    adata_combined = sc.concat(out, join='outer', label="batch", keys=loaded_samples)

    # Mirror the categorical batch label into 'Sample'
    adata_combined.obs["Sample"] = adata_combined.obs["batch"]

    # Display the number of cells in each sample
    print(adata_combined.obs["Sample"].value_counts(sort=False))

    # Keep the un-normalised matrix so count-based methods can use it later
    adata_combined.layers['counts'] = adata_combined.X.copy()

    # Save the combined AnnData object (relative to the current working directory)
    adata_combined.write_h5ad('combined.h5ad')

    # Reload the combined data
    adata = sc.read_h5ad('combined.h5ad')

    # Normalize and log-transform data
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Freeze the log-normalised expression of ALL genes in .raw before the
    # matrix is subset to highly variable genes and scaled. These are not raw
    # counts; the counts live in layers['counts'].
    adata.raw = adata

    # Identify highly variable genes (seurat_v3 works on counts, hence the layer)
    sc.pp.highly_variable_genes(adata, n_top_genes=3000, subset=True, layer='counts', flavor="seurat_v3", batch_key="Sample")

    # Scale the data
    sc.pp.scale(adata, max_value=10)

    # Perform PCA
    sc.tl.pca(adata)
else:
    print("No valid AnnData files to process.")
