In [None]:
import os
import urllib.request
import gzip
import shutil
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from scipy import sparse, io
import warnings
import re

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# ---------------------------
# 1. Constants and file lists
# ---------------------------
# GEO series accession; it prefixes every supplementary file name below.
GEO_ACCESSION = "GSE267070"

# Supplementary files the pipeline expects to find (or download) in the data
# directory: barcodes, features, the count matrix and the feature reference.
FILES = [
    f"{GEO_ACCESSION}_{suffix}"
    for suffix in (
        "barcodes.tsv.gz",
        "features.tsv.gz",
        "matrix.mtx.gz",
        "feature_reference.csv.gz",
    )
]


# ---------------------------
# 2. Helper functions
# ---------------------------

def download_files(data_dir):
    """Download every file listed in ``FILES`` into ``data_dir`` if it is missing.

    Files that already exist at their final path are skipped. Each download is
    streamed to a temporary ``<name>.part`` file and only moved to its final
    name once the transfer finished, so an interrupted transfer can never be
    mistaken for a complete file on the next run.

    Args:
        data_dir: Target directory; created if it does not exist.

    Network failures (``urllib.error.URLError``, which includes
    ``HTTPError``) are reported on stdout and the loop continues with the next
    file. Any other exception propagates after the partial file is removed.
    """
    os.makedirs(data_dir, exist_ok=True)

    # Simple HTTP request headers
    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={GEO_ACCESSION}&format=file"

    for file in FILES:
        file_path = os.path.join(data_dir, file)
        if os.path.exists(file_path):
            print(f"File {file} already exists, skipping download.")
            continue

        print(f"Downloading {file}...")
        req = urllib.request.Request(url=f"{base_url}&file={file}", headers=headers)
        partial_path = file_path + ".part"
        try:
            with urllib.request.urlopen(req) as response, open(partial_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
            # Promote the finished download to its final name in one step.
            os.replace(partial_path, file_path)
            print(f"Downloaded {file}")
        except urllib.error.URLError as e:
            # HTTPError is a subclass of URLError, so HTTP status errors and
            # connection-level failures are both handled here.
            print(f"Error downloading {file}: {e}")
            print("Consider using GEOquery in R or manual download from the GEO website.")
        finally:
            # After a successful os.replace the .part file no longer exists;
            # after any failure this removes the truncated leftover.
            if os.path.exists(partial_path):
                os.remove(partial_path)


def rename_protein(raw_name: str) -> str:
    """
    Reduce an antibody feature name to its leading CD marker.

    Examples:
      - 'CD279-(PD-1)-RMP1-30'  -> 'CD279'
      - 'CD200-(OX2)-OX-90'     -> 'CD200'
      - 'CD8a-53-6.7'           -> 'CD8a'
      - 'CD45.2-104'            -> 'CD45.2'

    Names beginning with 'HTO' are returned untouched, and so is any name
    that does not start with a 'CD...' token ending at a dash, an opening
    parenthesis, or the end of the string.
    """
    if not raw_name.startswith("HTO"):
        # "CD" plus letters/digits/dots, which must be followed by '-', '('
        # or the end of the string (checked by the lookahead, not consumed).
        cd_prefix = re.match(r'^(CD[0-9a-zA-Z\.]+)(?=[\(-]|$)', raw_name)
        if cd_prefix is not None:
            return cd_prefix.group(1)

    # Hashtags and names without a recognisable CD prefix pass through as-is.
    return raw_name


def rename_proteins(adata_protein):
    """
    Replace every entry of ``adata_protein.var_names`` with its
    ``rename_protein`` result. The object is modified in place and nothing
    is returned.
    """
    # NOTE(review): two different raw names could simplify to the same marker;
    # confirm var_names remain unique for this dataset.
    adata_protein.var_names = list(map(rename_protein, adata_protein.var_names))


def load_data(data_dir):
    """Read the features/barcodes/matrix files and split them by feature type.

    Args:
        data_dir: Directory containing the ``{GEO_ACCESSION}_*.gz`` files.

    Returns:
        A tuple ``(adata_gene, adata_protein, features)``: one AnnData for the
        'Gene Expression' features, one for the 'Antibody Capture' features
        (with var_names passed through ``rename_proteins``), and the full
        features table with columns ``id``, ``name`` and ``feature_type``.
    """
    print("Loading data...")

    # --- Feature table (read without a header, then labelled) ---
    features = pd.read_csv(
        os.path.join(data_dir, f"{GEO_ACCESSION}_features.tsv.gz"),
        sep='\t',
        header=None,
    )
    features.columns = ['id', 'name', 'feature_type']

    is_gene = features['feature_type'] == 'Gene Expression'
    is_protein = features['feature_type'] == 'Antibody Capture'
    gene_features = features[is_gene]
    protein_features = features[is_protein]

    print(f"Number of gene features: {len(gene_features)}")
    print(f"Number of protein features: {len(protein_features)}")

    # --- Cell barcodes: first column of the barcodes file ---
    barcode_table = pd.read_csv(
        os.path.join(data_dir, f"{GEO_ACCESSION}_barcodes.tsv.gz"),
        header=None,
        sep='\t',
    )
    barcodes = barcode_table[0].values

    # --- Count matrix: read as features x cells, then flipped ---
    matrix = io.mmread(os.path.join(data_dir, f"{GEO_ACCESSION}_matrix.mtx.gz")).tocsr()
    print(f"Matrix shape (features x cells): {matrix.shape}")

    matrix = matrix.transpose()
    print(f"Transposed matrix shape (cells x features): {matrix.shape}")

    def build_modality(feature_subset):
        # The subset keeps the row labels of the full features table, which
        # are used here as column positions in the transposed matrix.
        return ad.AnnData(
            X=matrix[:, feature_subset.index.tolist()],
            obs=pd.DataFrame(index=barcodes),
            var=pd.DataFrame(index=feature_subset['name'].values),
        )

    adata_gene = build_modality(gene_features)
    adata_protein = build_modality(protein_features)

    # --- Simplify antibody names in place ---
    rename_proteins(adata_protein)

    print(f"Gene expression data shape: {adata_gene.shape}")
    print(f"Protein data shape: {adata_protein.shape}")

    return adata_gene, adata_protein, features


def _derive_conditions_from_hto(adata_protein, hto_names, condition_map):
    """Return a per-cell condition Series based on each cell's highest-count HTO."""
    hto_values = adata_protein[:, hto_names].X
    # .X may be a scipy sparse matrix or already dense; only densify if needed.
    if sparse.issparse(hto_values):
        hto_values = hto_values.toarray()
    hto_counts = pd.DataFrame(
        np.asarray(hto_values),
        index=adata_protein.obs.index,
        columns=hto_names,
    )

    dominant_hto = hto_counts.idxmax(axis=1)
    # idxmax returns the first column on ties, so a cell with no HTO counts at
    # all would otherwise be attributed to the first hashtag.
    has_signal = hto_counts.max(axis=1) > 0

    condition = dominant_hto.map(lambda hto: condition_map.get(hto, 'Unknown'))
    return condition.where(has_signal, 'Unknown')


def harmonize_metadata(adata_gene, adata_protein, features, data_dir):
    """Add constant study metadata to both objects and derive ``condition`` from HTOs.

    The columns ``organism``, ``cell_type``, ``crispr_type``, ``cancer_type``,
    ``condition`` and ``perturbation_name`` are written to ``.obs`` of both
    AnnData objects. If the protein object has features whose names start with
    'HTO', each cell's ``condition`` is set from its highest-count HTO via a
    fixed HTO -> condition table. Cells with no HTO counts, or whose dominant
    HTO is not in the table, keep the value 'Unknown'.

    Args:
        adata_gene: Gene-expression AnnData; modified in place.
        adata_protein: Protein AnnData; modified in place.
        features: Unused; kept so existing callers keep working.
        data_dir: Unused; kept so existing callers keep working.

    Returns:
        The same ``(adata_gene, adata_protein)`` objects.
    """
    print("Harmonizing metadata...")

    # Study-wide constants; 'condition' may be overwritten from the HTO data.
    study_metadata = {
        'organism': 'Mus musculus',
        'cell_type': 'CD45+ Immune Cells',
        'crispr_type': 'None',
        'cancer_type': 'Melanoma',
        'condition': 'Unknown',
        'perturbation_name': 'None',
    }
    for adata in (adata_gene, adata_protein):
        for col, value in study_metadata.items():
            adata.obs[col] = value

    # For example, 'HTO1'..'HTO4' = 'Dietary Restriction',
    # and 'HTO5'..'HTO8' = 'Ad Libitum'.
    condition_map = {
        'HTO1': 'Dietary Restriction', 'HTO2': 'Dietary Restriction',
        'HTO3': 'Dietary Restriction', 'HTO4': 'Dietary Restriction',
        'HTO5': 'Ad Libitum', 'HTO6': 'Ad Libitum',
        'HTO7': 'Ad Libitum', 'HTO8': 'Ad Libitum',
    }

    # --- Attempt to derive condition from HTO data (if present) ---
    # Deliberately best-effort: a failure here is reported and 'condition'
    # stays at its default instead of aborting the whole pipeline.
    try:
        hto_names = [name for name in adata_protein.var_names if name.startswith('HTO')]
        if hto_names:
            condition = _derive_conditions_from_hto(adata_protein, hto_names, condition_map)

            adata_gene.obs['condition'] = condition
            adata_protein.obs['condition'] = condition

            print("Successfully derived condition from HTO data.")
            n_unassigned = int((condition == 'Unknown').sum())
            if n_unassigned:
                print(f"{n_unassigned} cell(s) had no usable HTO signal; condition left as 'Unknown'.")
        else:
            print("No HTO features found in protein data; condition remains 'Unknown'.")
    except Exception as e:
        print(f"Could not parse HTO condition: {e}")

    return adata_gene, adata_protein


def filter_paired_data(adata_gene, adata_protein):
    """Subset both objects to the barcodes they share.

    The shared barcodes are sorted, so both returned copies list their cells
    in the same sorted order.

    Returns:
        A tuple ``(gene_copy, protein_copy)`` restricted to the common barcodes.
    """
    print("Filtering to keep only paired data (common barcodes)...")

    shared = set(adata_gene.obs_names).intersection(adata_protein.obs_names)
    common_barcodes = sorted(shared)
    print(f"Common cells: {len(common_barcodes)}")

    gene_subset, protein_subset = (
        modality[common_barcodes].copy()
        for modality in (adata_gene, adata_protein)
    )
    return gene_subset, protein_subset


def check_gene_symbols(adata):
    """Make ``adata.var_names`` unique by suffixing repeated symbols.

    The first occurrence of a symbol keeps its name; each later occurrence
    becomes ``<symbol>_<n>`` with n counting up from 1. If that candidate is
    already taken (for example a real symbol named ``X_1`` exists next to a
    duplicated ``X``), n is increased until the name is free, so the result
    is always unique.

    Args:
        adata: Object exposing an assignable ``var_names``; modified in place.

    Returns:
        The same ``adata`` object.
    """
    print("Checking for duplicate gene symbols...")
    names = list(adata.var_names)
    taken = set(names)
    dup_count = len(names) - len(taken)

    if dup_count == 0:
        print("No duplicate gene symbols found.")
        return adata

    print(f"Found {dup_count} duplicated gene name(s). Making unique...")

    first_seen = set()
    last_suffix = {}  # symbol -> highest numeric suffix handed out so far
    unique_names = []
    for name in names:
        if name not in first_seen:
            first_seen.add(name)
            unique_names.append(name)
            continue

        suffix = last_suffix.get(name, 0) + 1
        candidate = f"{name}_{suffix}"
        # Skip over suffixes that would clash with an existing or
        # already-generated name.
        while candidate in taken:
            suffix += 1
            candidate = f"{name}_{suffix}"
        last_suffix[name] = suffix
        taken.add(candidate)
        unique_names.append(candidate)

    adata.var_names = unique_names
    print("Duplicate gene symbols have been made unique.")
    return adata


def save_data(adata_gene, adata_protein, data_dir):
    """Write both AnnData objects to gzip-compressed ``.h5ad`` files in ``data_dir``.

    The gene object goes to ``{GEO_ACCESSION}_gene_expression_harmonized.h5ad``
    and the protein object to
    ``{GEO_ACCESSION}_protein_expression_harmonized.h5ad``; both paths are
    printed once the writes are done.
    """
    print("Saving final AnnData objects...")

    outputs = (
        (adata_gene, os.path.join(data_dir, f"{GEO_ACCESSION}_gene_expression_harmonized.h5ad")),
        (adata_protein, os.path.join(data_dir, f"{GEO_ACCESSION}_protein_expression_harmonized.h5ad")),
    )

    # Write everything first, then report, so the paths are only printed after
    # both files have been written.
    for adata, target_path in outputs:
        adata.write(target_path, compression='gzip')

    for _, target_path in outputs:
        print(f"  -> {target_path}")


# ---------------------------
# 3. Master pipeline function
# ---------------------------
def run_pipeline(data_dir=None):
    """Execute download, load, harmonize, pair, de-duplicate and save in order.

    Args:
        data_dir: Working directory for inputs and outputs. Defaults to the
            current working directory when ``None``.

    Returns:
        The final ``(gene, protein)`` AnnData pair that was written to disk.
    """
    data_dir = os.getcwd() if data_dir is None else data_dir
    print(f"Using data directory: {data_dir}")

    # Step 1: fetch any missing input files.
    download_files(data_dir)

    # Step 2: read the raw files into one AnnData per modality.
    gene, protein, features = load_data(data_dir)

    # Step 3: attach study metadata and HTO-derived condition.
    gene, protein = harmonize_metadata(gene, protein, features, data_dir)

    # Step 4: keep only barcodes present in both modalities.
    gene, protein = filter_paired_data(gene, protein)

    # Step 5: resolve repeated gene symbols.
    gene = check_gene_symbols(gene)

    # Step 6: persist both objects.
    save_data(gene, protein, data_dir)

    print("Data harmonization complete!")
    return gene, protein


# ---------------------------
# 4. Example of actually running
# ---------------------------
# Run the whole pipeline against a fixed directory and keep both results
# available as module-level names.
processed_adata_gene, processed_adata_protein = run_pipeline(
    data_dir="/home/ubuntu/GSE267070"
)


In [None]:
import scanpy as sc

# Directory holding the harmonized output. It must be the same directory that
# was passed to run_pipeline(), because that is where save_data() writes the
# .h5ad files.
DATA_DIR = "/home/ubuntu/GSE267070"

# Load the gene expression AnnData object from the saved h5ad file.
adata_gene = sc.read_h5ad(f"{DATA_DIR}/GSE267070_gene_expression_harmonized.h5ad")

# Print the number of cells before QC filtering.
print("Number of cells before QC filtering:", adata_gene.n_obs)

# Compute QC metrics for each cell, which adds metrics like 'n_genes_by_counts' to adata_gene.obs.
sc.pp.calculate_qc_metrics(adata_gene, inplace=True)

# Define a QC threshold: keep cells with at least 200 genes detected.
qc_threshold = 200
adata_gene_qc = adata_gene[adata_gene.obs['n_genes_by_counts'] >= qc_threshold].copy()

# Print the number of cells after QC filtering.
print("Number of cells after QC filtering:", adata_gene_qc.n_obs)
