In [None]:
import os
import gzip
import urllib.request
from pathlib import Path
import numpy as np
import pandas as pd
import anndata as ad
from scipy import sparse
import time

# Supplementary files of GEO series GSE261157, keyed by a short label that is
# only used in progress messages. The local file name of each download is the
# last path component of its URL.
DATASET_FILES = {
    "metadata_CER": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE261nnn/GSE261157/suppl/GSE261157_metadata_CER.csv.gz",
    "metadata_Ctx": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE261nnn/GSE261157/suppl/GSE261157_metadata_Ctx.csv.gz",
    "preprocessed_data": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE261nnn/GSE261157/suppl/GSE261157_preprocessed_data.txt.gz"
}

def download_files(data_dir, files=None):
    """Download dataset files into ``data_dir`` unless they already exist.

    Each file is first written to ``<final name>.part`` and renamed to its
    final name only after the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file under the final name,
    which the "already exists" check would otherwise accept on the next run.

    Args:
        data_dir: Target directory; created (including parents) if missing.
        files: Optional mapping of label -> URL. The label is used only in
            progress messages; the local file name is the URL's base name.
            Defaults to the module-level ``DATASET_FILES``.

    Raises:
        OSError: Propagated from ``urllib.request.urlretrieve`` (this includes
            ``urllib.error.URLError``) or from the file-system calls.
    """
    os.makedirs(data_dir, exist_ok=True)
    if files is None:
        files = DATASET_FILES

    for file_name, url in files.items():
        file_path = os.path.join(data_dir, os.path.basename(url))
        if os.path.exists(file_path):
            print(f"File {file_path} already exists, skipping download.")
            continue

        print(f"Downloading {file_name}...")
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
        except BaseException:
            # Also covers KeyboardInterrupt: never keep a half-written file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        # Rename within the same directory so the final name appears only once
        # the download is complete.
        os.replace(partial_path, file_path)
        print(f"Downloaded {file_path}")

def load_metadata(data_dir):
    """Read the CER and Ctx metadata tables and stack them into one DataFrame.

    Both gzip-compressed CSV files are expected inside ``data_dir`` under
    their GEO file names. The first CSV column becomes the index of each
    table, and the CER rows come before the Ctx rows in the result.

    Args:
        data_dir: Directory containing the two metadata files.

    Returns:
        The row-wise concatenation of both tables.
    """
    sources = (
        ("Loading CER metadata...",
         os.path.join(data_dir, "GSE261157_metadata_CER.csv.gz")),
        ("Loading Ctx metadata...",
         os.path.join(data_dir, "GSE261157_metadata_Ctx.csv.gz")),
    )

    tables = []
    for message, csv_path in sources:
        print(message)
        tables.append(pd.read_csv(csv_path, index_col=0))

    # Stack CER on top of Ctx
    return pd.concat(tables)

def process_expression_data(data_path, metadata_df):
    """Build a sparse cells x genes matrix from a gzipped, comma-separated table.

    The file is read as text: the first line is a header whose first field is
    skipped and whose remaining fields are cell barcodes; every following line
    holds a gene name followed by one value per cell. Double quotes are
    stripped from all fields. Only cells that also occur in
    ``metadata_df.index`` are kept, and only values greater than zero are
    stored (empty fields count as zero).

    Cells are kept in the column order of the expression file, so the result
    is reproducible between runs. If a barcode occurs more than once in the
    header, its first column is used. If ``metadata_df`` has duplicated index
    labels, the first row per label is kept so that the returned metadata has
    exactly one row per matrix row.

    Args:
        data_path: Path to the gzip-compressed expression table.
        metadata_df: Per-cell metadata indexed by cell barcode.

    Returns:
        Tuple ``(expression_matrix, gene_names, filtered_metadata)`` where
        ``expression_matrix`` is a ``scipy.sparse.csr_matrix`` with one row
        per common cell and one column per gene, ``gene_names`` is the list of
        gene names in file order, and ``filtered_metadata`` is a copy of the
        metadata rows for the common cells, in matrix row order.

    Raises:
        ValueError: If a non-empty value field cannot be parsed by ``int``.
        IndexError: If a data line has fewer fields than the header implies.
    """
    print(f"Processing expression data from {data_path}...")
    start_time = time.time()

    # First pass: get cell barcodes and gene names
    print("First pass: reading gene names and cell barcodes...")
    with gzip.open(data_path, 'rt') as f:
        header = f.readline().strip().replace('"', '').split(',')
        cell_barcodes = header[1:]  # Skip the first column (gene names)

        # Only the first field is needed here, so stop splitting after it.
        gene_names = [
            line.strip().split(',', 1)[0].replace('"', '') for line in f
        ]

    print(f"Found {len(cell_barcodes)} cells in expression data")

    # Keep one metadata row per barcode; otherwise .loc below would return
    # more rows than the matrix has.
    duplicated_rows = metadata_df.index.duplicated(keep='first')
    if duplicated_rows.any():
        print(f"Warning: dropping {int(duplicated_rows.sum())} duplicated "
              "metadata rows (keeping the first occurrence)")
        metadata_df = metadata_df[~duplicated_rows]

    # Map each common barcode to its first column position. The dict keeps
    # file order (deterministic) and replaces a quadratic list.index() scan.
    metadata_cells = set(metadata_df.index)
    first_position = {}
    for position, barcode in enumerate(cell_barcodes):
        if barcode in metadata_cells:
            first_position.setdefault(barcode, position)
    common_cells = list(first_position)
    print(f"Found {len(common_cells)} cells in both metadata and expression data")

    # Field positions within a data line (+1 skips the gene-name field)
    common_columns = [position + 1 for position in first_position.values()]

    # Filter metadata to include only the common cells, in matrix row order
    filtered_metadata = metadata_df.loc[common_cells].copy()

    # Second pass: collect the non-zero entries gene by gene
    print("Second pass: processing gene expression data in chunks...")
    data_values = []
    row_indices = []
    col_indices = []

    with gzip.open(data_path, 'rt') as f:
        f.readline()  # skip header
        for gene_idx, line in enumerate(f):
            if gene_idx % 1000 == 0:
                print(f"Processed {gene_idx} genes...")

            parts = line.strip().replace('"', '').split(',')

            for row_idx, column in enumerate(common_columns):
                field = parts[column]
                if not field:  # empty field counts as zero
                    continue
                expr_value = int(field)
                if expr_value > 0:  # only store non-zero values
                    row_indices.append(row_idx)
                    col_indices.append(gene_idx)
                    data_values.append(expr_value)

    print(f"Processed {len(gene_names)} genes total")

    # Create sparse matrix (cells x genes)
    print("Creating sparse matrix...")
    expression_matrix = sparse.csr_matrix(
        (data_values, (row_indices, col_indices)),
        shape=(len(common_cells), len(gene_names))
    )

    elapsed_time = time.time() - start_time
    print(f"Expression data processing completed in {elapsed_time:.2f} seconds")
    return expression_matrix, gene_names, filtered_metadata

def create_harmonized_anndata(expression_matrix, gene_names, metadata_df):
    """Create an AnnData object and add the standardized metadata columns.

    Args:
        expression_matrix: Matrix passed to AnnData as ``X``.
        gene_names: Gene names used as the index of ``var``.
        metadata_df: Per-cell metadata passed to AnnData as ``obs``. It must
            contain the columns ``clusters_annot``, ``Genotype`` and
            ``Cultivation``.

    Returns:
        The AnnData object with the added ``obs`` columns ``organism``,
        ``cell_type``, ``perturbation_name``, ``condition``, ``crispr_type``
        and ``cancer_type``, and with ``var_names`` made unique.

    Raises:
        KeyError: If one of the required metadata columns is missing.
    """
    print("Creating AnnData object...")
    adata = ad.AnnData(
        X=expression_matrix,
        obs=metadata_df,
        var=pd.DataFrame(index=gene_names)
    )

    print("Adding standardized metadata fields...")
    # Organism
    adata.obs['organism'] = 'Homo sapiens'

    # Cell type
    adata.obs['cell_type'] = adata.obs['clusters_annot']

    # Perturbation name: "Non-targeting" by default, 'GFAP' for cells whose
    # Genotype is "AxD".
    # NOTE(review): an earlier comment named this value "GFAP_R239C" while the
    # code assigned 'GFAP'; the assigned value is kept - confirm which label
    # is intended.
    is_axd = adata.obs['Genotype'] == 'AxD'
    adata.obs['perturbation_name'] = 'Non-targeting'
    adata.obs.loc[is_axd, 'perturbation_name'] = 'GFAP'

    # Condition: "Control" for non-targeting cells, otherwise
    # "<Genotype>_<Cultivation>". A mask-based assignment is used instead of a
    # row-wise apply(): apply(axis=1) returns a DataFrame rather than a Series
    # for an empty obs table and is slow for many cells.
    adata.obs['condition'] = 'Control'
    perturbed = adata.obs.loc[is_axd, ['Genotype', 'Cultivation']]
    # to_numpy() assigns by position, so no index alignment is involved.
    adata.obs.loc[is_axd, 'condition'] = (
        perturbed['Genotype'] + '_' + perturbed['Cultivation']
    ).to_numpy()

    # CRISPR type (not applicable for this dataset)
    adata.obs['crispr_type'] = 'None'

    # Cancer type (not applicable for this dataset)
    adata.obs['cancer_type'] = 'Non-Cancer'

    # Make duplicated gene names unique
    adata.var_names_make_unique()

    return adata

def process_dataset(data_dir):
    """Process the GSE261157 dataset and create the harmonized h5ad file.

    Steps: download missing files, load and combine the metadata, build the
    sparse expression matrix, create the harmonized AnnData object, write it
    to ``<data_dir>/GSE261157_harmonized.h5ad`` and print a short summary.

    Args:
        data_dir: Directory holding the downloaded files and the output file.

    Returns:
        The harmonized AnnData object.
    """
    start_time = time.time()

    download_files(data_dir)

    print("Loading metadata...")
    metadata = load_metadata(data_dir)
    print(f"Metadata loaded: {metadata.shape[0]} cells x {metadata.shape[1]} features")

    data_path = os.path.join(data_dir, "GSE261157_preprocessed_data.txt.gz")
    expression_matrix, gene_names, filtered_metadata = process_expression_data(data_path, metadata)

    adata = create_harmonized_anndata(expression_matrix, gene_names, filtered_metadata)

    output_file = os.path.join(data_dir, "GSE261157_harmonized.h5ad")
    print(f"Saving harmonized dataset to {output_file}...")
    adata.write(output_file)

    total_time = time.time() - start_time
    print(f"Harmonization complete in {total_time:.2f} seconds. Output file: {output_file}")
    print(f"AnnData object shape: {adata.shape}")

    def _labels(column):
        # str() keeps the summary from raising on NaN or non-string labels,
        # and joining (instead of indexing [0]) also works for an empty table.
        return ', '.join(map(str, adata.obs[column].unique()))

    print("\nHarmonized dataset summary:")
    print(f"Number of cells: {adata.n_obs}")
    print(f"Number of genes: {adata.n_vars}")
    print(f"Organism: {_labels('organism')}")
    print(f"Cell types: {_labels('cell_type')}")
    print(f"Conditions: {_labels('condition')}")
    print(f"Perturbations: {_labels('perturbation_name')}")

    return adata

def main(data_dir=None):
    """Run the GSE261157 pipeline and return the resulting AnnData object.

    Args:
        data_dir: Working directory for downloads and output. When ``None``,
            a ``GSE261157`` folder inside the current working directory is
            used.

    Returns:
        Whatever ``process_dataset`` returns for the chosen directory.
    """
    if data_dir is None:
        working_dir = os.getcwd()
        data_dir = os.path.join(working_dir, "GSE261157")

    print(f"Processing GSE261157 dataset in directory: {data_dir}")
    return process_dataset(data_dir)

# Run the pipeline as soon as this cell/module is executed. With no argument,
# main() works in a "GSE261157" folder inside the current working directory;
# the returned AnnData object is kept in the module-level name `adata`.
adata = main()  # You can pass a custom directory by calling main('your_directory_path')
