In [4]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import gzip
import shutil
import urllib.request
from pathlib import Path

def download_dataset(accession_id, output_dir):
    """
    Download the supplementary files of a GEO series if they don't exist yet.

    Two files are fetched: "<accession>_cell_annotation.csv.gz" and
    "<accession>_normalized_matrix.csv.gz".

    Parameters:
    -----------
    accession_id : str
        GEO series accession ID (e.g. "GSE251715")
    output_dir : str
        Directory to save downloaded files (created if missing)

    Notes:
    ------
    Download errors are printed and not raised; a file that failed to
    download is simply absent from output_dir afterwards.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # GEO groups series into directories named after the accession with its
    # last three digits replaced by "nnn":
    #   GSE251715 -> GSE251nnn, GSE12345 -> GSE12nnn, GSE1 -> GSEnnn
    prefix, digits = accession_id[:3], accession_id[3:]
    series_dir = f"{prefix}{digits[:-3]}nnn"
    base_url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{series_dir}/{accession_id}/suppl/"
    files = [
        f"{accession_id}_cell_annotation.csv.gz",
        f"{accession_id}_normalized_matrix.csv.gz"
    ]

    # Download files if they don't exist
    for file in files:
        output_file = os.path.join(output_dir, file)
        if os.path.exists(output_file):
            print(f"File {file} already exists")
            continue

        print(f"Downloading {file}...")
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer can never be mistaken for a complete file by
        # the "already exists" check on a later run.
        partial_file = output_file + ".part"
        try:
            urllib.request.urlretrieve(base_url + file, partial_file)
            os.replace(partial_file, output_file)
            print(f"Downloaded {file}")
        except Exception as e:
            print(f"Error downloading {file}: {e}")
            if os.path.exists(partial_file):
                os.remove(partial_file)

def _load_expression_adata(data_dir, matrix_file):
    """
    Try to build an AnnData object from the gzipped expression matrix.

    The matrix is decompressed next to the archive (once) and parsed as a CSV
    whose first column holds the row labels (used as var names) and whose
    remaining columns are cells (used as obs names).

    Returns the AnnData object, or None when the matrix could not be
    decompressed or parsed. Errors are printed, not raised.
    """
    print("Attempting to read expression matrix...")
    decompressed_file = os.path.join(data_dir, "GSE251715_normalized_matrix.csv")
    if not os.path.exists(decompressed_file):
        # Decompress under a temporary name and rename on success: a run that
        # is interrupted half-way must not leave a truncated CSV that a later
        # run would silently accept as the full matrix.
        partial_file = decompressed_file + ".part"
        try:
            with gzip.open(matrix_file, 'rb') as f_in, open(partial_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial_file, decompressed_file)
            print("Successfully decompressed expression matrix file")
        except Exception as e:
            print(f"Error decompressing expression matrix file: {e}")
            if os.path.exists(partial_file):
                os.remove(partial_file)

    # Check if decompression was successful
    if not (os.path.exists(decompressed_file) and os.path.getsize(decompressed_file) > 0):
        return None

    try:
        print("Reading expression matrix from decompressed file...")
        # First column becomes the row index
        expr_matrix = pd.read_csv(decompressed_file, index_col=0)

        # Convert barcodes to match annotation file format
        expr_matrix.columns = [col.replace('.', '-') for col in expr_matrix.columns]

        # The CSV is rows x cells, AnnData expects cells x variables
        print("Creating AnnData object with expression data...")
        adata = ad.AnnData(
            X=expr_matrix.T.values,
            obs=pd.DataFrame(index=expr_matrix.columns),
            var=pd.DataFrame(index=expr_matrix.index)
        )
        print(f"Successfully created AnnData object with expression data: {adata.n_obs} cells and {adata.n_vars} genes")
        return adata
    except Exception as e:
        print(f"Error reading expression matrix: {e}")
        return None


def process_GSE251715(data_dir):
    """
    Process GSE251715 dataset and harmonize it into h5ad format

    Reads the cell annotation table (columns 'barcode', 'annotation' and
    'sample' are required) and, if possible, the expression matrix. When the
    matrix cannot be read, a placeholder AnnData with a single all-zero
    'placeholder_gene' column is built from the annotated barcodes instead
    and flagged in ``uns['data_status']``.

    Only cells present in the annotation table are kept, in the order in
    which they appear in the expression matrix.

    Parameters:
    -----------
    data_dir : str
        Directory containing the dataset files

    Returns:
    --------
    anndata.AnnData
        Harmonized dataset
    """
    # Define file paths
    annotation_file = os.path.join(data_dir, "GSE251715_cell_annotation.csv.gz")
    matrix_file = os.path.join(data_dir, "GSE251715_normalized_matrix.csv.gz")

    # Read annotation file
    print("Reading annotation file...")
    with gzip.open(annotation_file, 'rt') as f:
        annotations = pd.read_csv(f)

    # Try to read the expression matrix; any failure falls back to the
    # placeholder object below instead of aborting the run
    try:
        adata = _load_expression_adata(data_dir, matrix_file)
    except Exception as e:
        print(f"Error processing expression matrix: {e}")
        adata = None
    expression_data_available = adata is not None

    # If expression data is not available, create a minimal AnnData object
    if not expression_data_available:
        print("Creating minimal AnnData object without expression data...")

        # Create a list of cell barcodes from the annotation file
        cell_barcodes = annotations['barcode'].values

        # Create a minimal AnnData object with empty expression matrix
        adata = ad.AnnData(
            X=np.zeros((len(cell_barcodes), 1), dtype=np.float32),
            obs=pd.DataFrame(index=cell_barcodes),
            var=pd.DataFrame(index=['placeholder_gene'])
        )

        # Add a note about the missing expression data
        adata.uns['data_status'] = 'Expression data not available due to corrupted file'
        print(f"Created minimal AnnData object: {adata.n_obs} cells and {adata.n_vars} genes")

    # Note that the dataset provides normalized data
    adata.uns['data_is_normalized'] = True

    # Add gene information to var if expression data is available
    if expression_data_available:
        adata.var['gene_id'] = adata.var.index
        adata.var['gene_source'] = 'FlyBase'

    # Index the annotations by barcode so they can be aligned with obs_names
    annotations = annotations.set_index('barcode')

    # Keep only annotated cells. A boolean mask preserves the matrix order
    # (a set intersection would reorder cells differently on every run), and
    # .copy() turns the subset view into a real object before obs is modified.
    annotated = adata.obs_names.isin(annotations.index)
    adata = adata[annotated].copy()

    # Add annotations to obs
    adata.obs['original_annotation'] = annotations.loc[adata.obs_names, 'annotation'].values
    adata.obs['original_sample'] = annotations.loc[adata.obs_names, 'sample'].values

    # Harmonize metadata
    print("Harmonizing metadata...")

    # Set organism
    adata.obs['organism'] = 'Drosophila melanogaster'

    # Cell type is the annotation with everything up to the first space
    # removed; annotations without a space are used unchanged
    adata.obs['cell_type'] = adata.obs['original_annotation'].apply(
        lambda x: x.split(' ', 1)[1] if ' ' in x else x
    )

    # Samples whose label mentions w1118 (either capitalisation) are controls
    adata.obs['condition'] = adata.obs['original_sample'].apply(
        lambda x: 'Control' if 'w1118' in x or 'W1118' in x else 'Test'
    )

    # Set perturbation_name based on sample information
    def get_perturbation(sample):
        if 'Pvr RNAi' in sample or 'Pvr Activation' in sample:
            return 'Pvr'
        if 'Yki' in sample:
            return 'Yki'
        return 'None'

    adata.obs['perturbation_name'] = adata.obs['original_sample'].apply(get_perturbation)

    # Set CRISPR type; the checks are ordered, so a label containing both
    # 'RNAi' and 'Yki' is reported as 'RNAi'
    def get_crispr_type(sample):
        if 'RNAi' in sample:
            return 'RNAi'
        if 'Activation' in sample:
            return 'Activation'
        if 'Yki' in sample:
            return 'Overexpression'
        return 'None'

    adata.obs['crispr_type'] = adata.obs['original_sample'].apply(get_crispr_type)

    # Set cancer_type (this is a gut tumor model)
    adata.obs['cancer_type'] = adata.obs['original_sample'].apply(
        lambda x: 'Gut Tumor' if 'Yki' in x else 'Non-Cancer'
    )

    # Add additional metadata
    adata.uns['dataset_id'] = 'GSE251715'
    adata.uns['dataset_title'] = 'Mechanistic characterization of a Drosophila model of paraneoplastic nephrotic syndrome'
    adata.uns['dataset_description'] = 'snRNA-seq analysis of Drosophila Malpighian tubules (MTs) with and without Yki gut tumors'

    print("Harmonization complete")
    return adata

def main(root_path=None):
    """
    Download GSE251715, harmonize it and write the result as an h5ad file.

    Parameters:
    -----------
    root_path : str, optional
        Directory under which the "GSE251715" dataset folder is created.
        Defaults to the current working directory when None.

    Returns:
    --------
    The harmonized object returned by process_GSE251715, after it has been
    written to "<root_path>/GSE251715/GSE251715_harmonized.h5ad".
    """
    accession_id = "GSE251715"

    # Resolve the dataset folder and make sure it exists
    base_dir = os.getcwd() if root_path is None else root_path
    data_dir = os.path.join(base_dir, accession_id)
    os.makedirs(data_dir, exist_ok=True)

    # Fetch missing input files, then build the harmonized object
    download_dataset(accession_id, data_dir)
    harmonized = process_GSE251715(data_dir)

    # Persist the result next to the input files
    output_file = os.path.join(data_dir, f"{accession_id}_harmonized.h5ad")
    print(f"Saving harmonized dataset to {output_file}")
    harmonized.write(output_file)
    print(f"Dataset saved to {output_file}")

    return harmonized

# Run the full pipeline (download, harmonize, save) in the current working
# directory; the trailing bare expression makes the notebook display the result
adata = main()
adata


In [None]:
import anndata
import pandas as pd

def update_adata_obs(adata):
    """
    Overwrite perturbation metadata in ``adata.obs``.

    Sets ``crispr_type`` to "CRISPRa" for every cell and sets
    ``perturbation_name`` to "Non-targeting" for cells whose ``condition``
    is "Control". The function can be applied repeatedly with the same result.

    Parameters:
    -----------
    adata : object with an ``obs`` DataFrame
        Must contain the columns 'perturbation_name' and 'condition'.

    Returns:
    --------
    The same object that was passed in (it is modified in place).
    """
    # Create or overwrite the crispr_type column with a constant value
    adata.obs["crispr_type"] = "CRISPRa"

    # A categorical column only accepts values from its category list, so
    # "Non-targeting" has to be registered first. Adding a category that
    # already exists raises ValueError, hence the membership check - without
    # it, running this cell a second time would fail.
    perturbation = adata.obs["perturbation_name"]
    if isinstance(perturbation.dtype, pd.CategoricalDtype):
        if "Non-targeting" not in perturbation.cat.categories:
            adata.obs["perturbation_name"] = perturbation.cat.add_categories(["Non-targeting"])

    # Update perturbation_name to "Non-targeting" where condition is "Control"
    adata.obs.loc[adata.obs["condition"] == "Control", "perturbation_name"] = "Non-targeting"

    return adata

# Example usage (relies on the `adata` object created by the previous cell)
# adata = anndata.read_h5ad("your_file.h5ad")  # Alternatively, load a saved AnnData file first
adata = update_adata_obs(adata)  # Apply modifications
# adata.write("updated_file.h5ad")  # Optionally save the modified AnnData object
