# In [None]:
import os
import sys
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
import urllib.request
from pathlib import Path

# anndata is required by the rest of this script. If the import fails, install
# it with pip into the interpreter that is running this code (sys.executable)
# and retry the import. subprocess.check_call raises CalledProcessError when
# pip exits with a non-zero status, so a failed install stops the script here.
try:
    import anndata as ad
except ImportError:
    print("Installing anndata package...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "anndata"])
    import anndata as ad

# Accession of the series handled by this script; also the prefix of every
# supplementary file name and of the output file.
ACCESSION = "GSE213921"

# Directory on ftp.ncbi.nlm.nih.gov that holds the series' supplementary files.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/" + ACCESSION + "/suppl/"

# Supplementary files to fetch: the accession followed by a fixed suffix.
FILES = [
    ACCESSION + suffix
    for suffix in (
        "_barcodes.tsv.gz",
        "_features.tsv.gz",
        "_filtered_feature_bc_matrix_crispr.h5",
        "_matrix.mtx.gz",
    )
]

def download_files(data_dir):
    """Download every file listed in FILES into *data_dir* unless already present.

    Each file is first written to ``<name>.part`` and only renamed to its final
    name once the transfer has finished. A download that is interrupted or
    fails therefore never leaves a truncated file under the final name, which
    a later run would otherwise skip as "already exists".

    Args:
        data_dir: Directory to download into; created if it does not exist.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for a failed transfer
        (e.g. ``urllib.error.URLError``); the partial file is removed first.
    """
    os.makedirs(data_dir, exist_ok=True)

    for file_name in FILES:
        file_path = os.path.join(data_dir, file_name)
        if os.path.exists(file_path):
            print(f"File {file_name} already exists")
            continue

        print(f"Downloading {file_name}...")
        url = f"{BASE_URL}{file_name}"
        partial_path = f"{file_path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Same-directory rename: the final name only ever appears complete.
            os.replace(partial_path, file_path)
        finally:
            # After a successful rename the partial path no longer exists, so
            # this only fires when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded {file_name}")

def _decode_utf8(values):
    """Decode a sequence of byte strings to ``str``; empty entries become ""."""
    return [value.decode('utf-8') if value else "" for value in values]


def _call_top_guides(crispr_matrix, crispr_var):
    """Return per-cell (guide name, target name) arrays for the top-count guide.

    For every row of *crispr_matrix* the column with the highest count is
    selected (first column wins ties). Rows whose highest count is not
    positive get "Unknown" for both values; a selected guide whose target
    gene name is empty or whitespace is reported as "Non-targeting".
    """
    # Densifies the cells x guides block, as the row-wise argmax needs it.
    counts = crispr_matrix.toarray()
    top_idx = counts.argmax(axis=1)
    top_count = counts[np.arange(counts.shape[0]), top_idx]
    detected = top_count > 0

    guide_names = crispr_var['feature_name'].to_numpy(dtype=object)
    target_names = np.array(
        [
            target if target and target.strip() else "Non-targeting"
            for target in crispr_var['target_gene_name']
        ],
        dtype=object,
    )

    # Fancy indexing replaces a per-cell DataFrame.iloc lookup.
    guides = np.where(detected, guide_names[top_idx], "Unknown")
    targets = np.where(detected, target_names[top_idx], "Unknown")
    return guides, targets


def process_data(data_dir):
    """Read the dataset's HDF5 matrix and build a gene-expression AnnData.

    Features are split by ``feature_type``: 'Gene Expression' columns become
    ``X``/``var``; 'CRISPR Guide Capture' columns are used to annotate each
    cell with its highest-count guide and are stored under
    ``uns['CRISPR_guides']``. Other feature types are not carried over.

    Args:
        data_dir: Directory containing
            ``{ACCESSION}_filtered_feature_bc_matrix_crispr.h5``.

    Returns:
        AnnData with one row per barcode and one column per gene-expression
        feature. ``var`` is indexed by feature name and keeps ``feature_id``,
        ``feature_type`` and ``target_gene_name`` as columns. When guide
        features exist, ``obs`` has ``guide`` and ``perturbation_name``.
    """
    h5_file = os.path.join(data_dir, f"{ACCESSION}_filtered_feature_bc_matrix_crispr.h5")

    print("Reading H5 file...")
    with h5py.File(h5_file, 'r') as f:
        matrix_group = f['matrix']
        features = matrix_group['features']

        # Feature annotations
        feature_ids = _decode_utf8(features['id'][:])
        feature_names = _decode_utf8(features['name'][:])
        feature_types = _decode_utf8(features['feature_type'][:])

        # Target gene names are optional in the file
        if 'target_gene_name' in features:
            target_gene_names = _decode_utf8(features['target_gene_name'][:])
        else:
            target_gene_names = [""] * len(feature_ids)

        # Cell barcodes
        barcodes = _decode_utf8(matrix_group['barcodes'][:])

        # Compressed sparse buffers, read fully into memory
        data = matrix_group['data'][:]
        indices = matrix_group['indices'][:]
        indptr = matrix_group['indptr'][:]

    # The shape comes from the barcode and feature counts, and the buffers are
    # interpreted as a row-compressed cells x features matrix.
    # NOTE(review): this assumes the file stores the matrix column-compressed
    # as features x cells (the 10x Genomics HDF5 layout), which uses the same
    # buffers as CSR cells x features — confirm for this file.
    shape = (len(barcodes), len(feature_ids))
    matrix = sparse.csr_matrix((data, indices, indptr), shape=shape)

    print("Creating feature dataframe...")
    var_df = pd.DataFrame({
        'feature_id': feature_ids,
        'feature_name': feature_names,
        'feature_type': feature_types,
        'target_gene_name': target_gene_names
    })

    print("Separating gene expression and CRISPR guide data...")
    feature_type_arr = np.array(feature_types)
    gene_expr_mask = feature_type_arr == 'Gene Expression'
    crispr_mask = feature_type_arr == 'CRISPR Guide Capture'

    gene_expr_var = var_df[gene_expr_mask].copy()
    gene_expr_matrix = matrix[:, gene_expr_mask]

    crispr_var = var_df[crispr_mask].copy()
    crispr_matrix = matrix[:, crispr_mask]

    print(f"Gene expression matrix: {gene_expr_matrix.shape}")
    print(f"CRISPR guide matrix: {crispr_matrix.shape}")

    # Create AnnData object for gene expression
    adata = ad.AnnData(
        X=gene_expr_matrix,
        obs=pd.DataFrame(index=barcodes),
        var=gene_expr_var.set_index('feature_id')
    )

    if crispr_matrix.shape[1] > 0:
        print("Processing CRISPR guide information...")
        cell_guides, cell_targets = _call_top_guides(crispr_matrix, crispr_var)
        adata.obs['guide'] = cell_guides
        adata.obs['perturbation_name'] = cell_targets

        # Store CRISPR data in uns (dimensions don't match for layers)
        adata.uns['CRISPR_guides'] = {
            'matrix': crispr_matrix,
            'var': crispr_var.reset_index().to_dict('list')
        }

    # The index currently holds the feature ids; copy them into a column
    # before replacing the index, otherwise they would be lost.
    adata.var['feature_id'] = adata.var_names.to_numpy()
    # Set gene symbols as var_names
    adata.var_names = adata.var['feature_name'].values
    # Remove the feature_name column to avoid conflict with the index
    adata.var = adata.var.drop(columns=['feature_name'])

    return adata

def harmonize_data(adata):
    """Return a harmonized copy of *adata* without cells of unknown guide.

    Cells whose ``obs['guide']`` equals "Unknown" are dropped. The remaining
    cells get constant metadata columns (``organism``, ``cell_type``,
    ``crispr_type``, ``cancer_type``, ``condition``). Cells whose guide name
    contains "NonTargetingControl" are relabelled with
    ``perturbation_name = "Non-targeting"`` and ``condition = "Control"``.

    The input object is left untouched: filtering happens first and all
    annotations are written to the filtered copy only.

    Args:
        adata: AnnData-like object whose ``obs`` has a ``guide`` column.

    Returns:
        A new, filtered and annotated object.

    Raises:
        KeyError: If ``adata.obs`` has no ``guide`` column.
    """
    print("Harmonizing data...")

    # Drop cells with an unknown guide before annotating, so the caller's
    # object is never modified and only surviving cells are annotated.
    initial_cells = adata.shape[0]
    has_known_guide = adata.obs['guide'] != "Unknown"
    harmonized = adata[has_known_guide].copy()
    dropped = initial_cells - harmonized.shape[0]

    # Set metadata fields
    harmonized.obs['organism'] = 'Mus musculus'
    harmonized.obs['cell_type'] = 'CD8+ T Cells'
    harmonized.obs['crispr_type'] = 'CRISPR KO'
    harmonized.obs['cancer_type'] = 'EG.7-OVA tumor'
    harmonized.obs['condition'] = 'test'

    # Non-targeting control guides are identified by a literal substring of
    # the guide name (regex=False: the pattern is not a regular expression).
    mask_ntc = harmonized.obs['guide'].str.contains(
        "NonTargetingControl", na=False, regex=False
    )
    harmonized.obs.loc[mask_ntc, "perturbation_name"] = "Non-targeting"
    harmonized.obs.loc[mask_ntc, "condition"] = "Control"

    print(f"Dropped {dropped} cells with unknown guide.")

    return harmonized

def run_pipeline(data_dir=None):
    """
    Run the full processing pipeline:
      1. Download data files (if needed).
      2. Process the downloaded data.
      3. Harmonize the AnnData object (including guide annotation updates).
      4. Save the harmonized data to an h5ad file.

    Args:
        data_dir: Base directory. Downloads go to ``<data_dir>/<ACCESSION>``
            and the output file is written to ``<data_dir>``. Defaults to the
            current working directory at the time of the call.

    Returns the AnnData object and the output file path.
    """
    # Resolve the default at call time; a default of os.getcwd() in the
    # signature would be evaluated only once, when the function is defined.
    if data_dir is None:
        data_dir = os.getcwd()

    accession_dir = os.path.join(data_dir, ACCESSION)
    os.makedirs(accession_dir, exist_ok=True)

    download_files(accession_dir)
    adata = process_data(accession_dir)
    adata = harmonize_data(adata)

    output_file = os.path.join(data_dir, f"{ACCESSION}_harmonized.h5ad")
    print(f"Saving harmonized data to {output_file}")
    adata.write_h5ad(output_file)

    print("Done!")
    print(f"Final AnnData object: {adata.shape[0]} cells × {adata.shape[1]} genes")
    print(f"Harmonized metadata columns: {list(adata.obs.columns)}")

    return adata, output_file

# Run the pipeline (specify data_dir if needed). This executes at module level:
# with no argument run_pipeline uses its default data_dir, and the returned
# AnnData object and output path are bound to module-level names.
adata, output_file = run_pipeline()

