In [None]:
import os
import urllib.request
import gzip
import shutil
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
import anndata

def _discard_partial(path):
    """Delete the partial file at *path*; do nothing if it does not exist."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def download_data(data_dir):
    """
    Download and decompress the dataset if it is not already present.

    Both the download and the decompression write to a temporary ``.part``
    file that is renamed to its final name only after the step has finished.
    An interrupted or failed step therefore never leaves a truncated file
    that a later call would mistake for a complete one.

    Args:
        data_dir: Directory where data will be stored (created if missing)

    Returns:
        Path to the decompressed h5ad file

    Raises:
        Any exception raised by the download or the decompression is
        propagated after the corresponding ``.part`` file has been removed.
    """
    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Define file paths
    h5ad_path = os.path.join(data_dir, "GSE195510_ORGANOIDSEQ_COUNTS_FILTERED.h5ad")
    h5ad_gz_path = h5ad_path + ".gz"

    # Nothing to do when the decompressed file is already in place
    if os.path.exists(h5ad_path):
        print(f"File already exists at {h5ad_path}")
        return h5ad_path

    # Download the archive unless a previous run already fetched it
    if not os.path.exists(h5ad_gz_path):
        print(f"Downloading data to {h5ad_gz_path}...")
        url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE195nnn/GSE195510/suppl/GSE195510_ORGANOIDSEQ_COUNTS_FILTERED.h5ad.gz"
        partial_gz_path = h5ad_gz_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_gz_path)
            # Promote the archive only once it has been fully written
            os.replace(partial_gz_path, h5ad_gz_path)
        except BaseException:
            # Also covers KeyboardInterrupt, so Ctrl-C leaves no debris
            _discard_partial(partial_gz_path)
            raise
        print("Download complete.")

    # Decompress the file
    print(f"Decompressing {h5ad_gz_path}...")
    partial_h5ad_path = h5ad_path + ".part"
    try:
        with gzip.open(h5ad_gz_path, 'rb') as f_in, open(partial_h5ad_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Both files are closed here, so the rename is safe on every platform
        os.replace(partial_h5ad_path, h5ad_path)
    except BaseException:
        _discard_partial(partial_h5ad_path)
        raise
    print("Decompression complete.")

    return h5ad_path

def process_data(file_path):
    """
    Load the sparse count matrix and its labels from an HDF5 file.

    The ``data``/``indices``/``indptr``/``shape`` datasets under the
    ``unknown`` group are assembled into a SciPy CSC matrix, which is then
    transposed. The stored matrix is presumably genes x cells, so the
    transpose is cells x genes -- TODO confirm against the file.

    Args:
        file_path: Path to the h5ad file

    Returns:
        Dictionary with keys ``'X'`` (the transposed sparse matrix),
        ``'barcodes'`` and ``'gene_names'`` (lists of decoded strings)

    Raises:
        Any exception raised while reading or assembling the data is
        reported on stdout and then re-raised unchanged.
    """
    print(f"Processing data from {file_path}...")

    def _decode_all(raw_labels):
        # Labels are stored as byte strings; convert them to text.
        return [label.decode('utf-8') for label in raw_labels]

    try:
        with h5py.File(file_path, 'r') as h5_file:
            # Pull the sparse components fully into memory
            values = h5_file['unknown/data'][:]
            row_indices = h5_file['unknown/indices'][:]
            col_pointers = h5_file['unknown/indptr'][:]
            stored_shape = h5_file['unknown/shape'][:]

            # Row/column labels
            barcodes = _decode_all(h5_file['unknown/barcodes'][:])
            gene_names = _decode_all(h5_file['unknown/gene_names'][:])

            print(f"Shape from file: {stored_shape}")
            print(f"Number of cells: {len(barcodes)}")
            print(f"Number of genes: {len(gene_names)}")

            # Assemble the matrix in the layout it was stored in
            print("Creating CSC matrix...")
            stored_matrix = sparse.csc_matrix(
                (values, row_indices, col_pointers),
                shape=tuple(stored_shape),
            )

            # Flip the axes so rows correspond to cells
            print("Transposing matrix to get [cells, genes] format...")
            cell_matrix = stored_matrix.transpose()

            print(f"Final matrix shape: {cell_matrix.shape}")

            result = {
                'X': cell_matrix,
                'barcodes': barcodes,
                'gene_names': gene_names,
            }
            return result
    except Exception as exc:
        print(f"Error processing data: {exc}")
        raise

def create_metadata(barcodes):
    """
    Build the standardized per-cell metadata table.

    Each barcode is expected to look like ``EP_WT_1_AAACCCAAGACGGTCA-1_1``
    or ``EP_KO_1_AAACCCAAGACGGTCA-1_1``. The second and third
    underscore-separated fields form the sample label, and the presence of
    ``WT`` anywhere in the barcode marks the cell as a control.

    Args:
        barcodes: List of cell barcodes

    Returns:
        Pandas DataFrame indexed by barcode with the columns ``sample``,
        ``condition``, ``perturbation_name`` and the constant study-level
        annotations
    """
    print("Creating metadata...")

    # One row per barcode
    metadata = pd.DataFrame(index=barcodes)

    perturbation_for = {'Control': 'Non-targeting', 'Test': 'FOXP1'}

    # Derive the per-cell columns in a single pass over the barcodes
    sample_labels = []
    conditions = []
    for barcode in barcodes:
        fields = barcode.split('_')
        sample_labels.append(f"{fields[1]}_{fields[2]}")
        conditions.append('Control' if 'WT' in barcode else 'Test')

    metadata['sample'] = sample_labels
    metadata['condition'] = conditions
    metadata['perturbation_name'] = [perturbation_for[cond] for cond in conditions]

    # Study-level annotations shared by every cell, in output column order
    constant_columns = (
        ('organism', 'Homo sapiens'),
        ('cell_type', 'iPSC-derived forebrain organoid cells'),
        ('crispr_type', 'CRISPR KO'),
        ('cancer_type', 'Non-Cancer'),
        ('study_accession', 'GSE195510'),
        ('study_title', 'FOXP1 Orchestrates Neurogenesis in Human Cortical Basal Progenitors'),
    )
    for column, value in constant_columns:
        metadata[column] = value

    return metadata

def create_harmonized_anndata(data_dict, metadata):
    """
    Assemble the expression matrix and annotations into an AnnData object.

    Args:
        data_dict: Dictionary providing the matrix under ``'X'`` and the
            gene names under ``'gene_names'``
        metadata: Pandas DataFrame used as the observation (``obs``) table

    Returns:
        AnnData object whose variable (``var``) table is an otherwise empty
        DataFrame indexed by the gene names
    """
    print("Creating AnnData object...")

    expression = data_dict['X']
    # No per-gene annotations are available, so var carries only the index
    gene_table = pd.DataFrame(index=data_dict['gene_names'])

    return anndata.AnnData(X=expression, obs=metadata, var=gene_table)

def save_harmonized_data(adata, output_path):
    """
    Write the harmonized dataset to disk via ``adata.write_h5ad``.

    Args:
        adata: AnnData object
        output_path: Path to save the h5ad file

    Raises:
        Any exception raised while writing is reported on stdout and then
        re-raised unchanged.
    """
    try:
        print(f"Saving harmonized data to {output_path}...")
        adata.write_h5ad(output_path)
    except Exception as exc:
        print(f"Error saving h5ad file: {exc}")
        raise
    else:
        try:
            print("Saved successfully.")
        except Exception as exc:
            # Keeps the success message under the same handler as the write
            print(f"Error saving h5ad file: {exc}")
            raise

def main(data_dir=None):
    """
    Run the full harmonization pipeline: download, process, annotate, save.

    Errors raised by the download step propagate to the caller. Errors
    raised by any later step are printed and not re-raised.

    Args:
        data_dir: Optional directory path for data. Defaults to current working directory.
    """
    data_dir = os.getcwd() if data_dir is None else data_dir

    # Fetch the raw file first; this step is outside the error handler below
    h5ad_path = download_data(data_dir)

    try:
        # Matrix plus labels
        data_dict = process_data(h5ad_path)

        # Per-cell annotations derived from the barcodes
        metadata = create_metadata(data_dict['barcodes'])

        # Combine and write next to the input data
        adata = create_harmonized_anndata(data_dict, metadata)
        save_harmonized_data(
            adata,
            os.path.join(data_dir, "GSE195510_harmonized.h5ad"),
        )

        print("Processing complete!")
    except Exception as exc:
        print(f"Failed to harmonize data: {exc}")

# Run the pipeline at module level. Called without an argument, main() uses
# the current working directory as data_dir; pass a path instead, e.g.
# main("/path/to/data"), to download and write the files elsewhere.
main()
