# In [None]:
import os
import sys
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
import urllib.request
import tarfile
from pathlib import Path

# Import anndata (bound to the name `ad` used throughout this file).
# If the import fails, install the package with pip and import it again.
try:
    import anndata as ad
except ImportError:
    # Run pip through sys.executable so the package is installed for the same
    # interpreter that is executing this code. check_call raises
    # subprocess.CalledProcessError if the pip command exits with a non-zero
    # status, in which case the second import below is never reached.
    import subprocess
    print("Installing anndata package...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "anndata"])
    import anndata as ad

# Dataset constants
# GEO series accession; also used to name the archive and the output file.
DATASET_ID = "GSE221882"

# Location of the series' supplementary RAW archive on the NCBI FTP server.
DATASET_URL = (
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE221nnn/"
    f"{DATASET_ID}/suppl/{DATASET_ID}_RAW.tar"
)

# Exact names of the h5 files expected in the data directory after extraction
# (despite the name, these are literal file names, not glob patterns).
FILE_PATTERNS = [
    "GSM6908651_Clay_ALI_w10_filtered_feature_bc_matrix.h5",
    "GSM6908652_Arch_correction_ALI_w10_filtered_feature_bc_matrix.h5",
]

def download_dataset(data_dir):
    """Download and unpack the dataset archive if needed.

    The archive ``<DATASET_ID>_RAW.tar`` is fetched from ``DATASET_URL`` only
    when it is not already present in ``data_dir``. It is extracted only when
    at least one of the files named in ``FILE_PATTERNS`` is missing from
    ``data_dir``.

    Args:
        data_dir: Existing directory that holds (or will hold) the tar archive
            and the extracted h5 files.

    Returns:
        A list of paths, one per entry of ``FILE_PATTERNS`` and in the same
        order, each joined onto ``data_dir``.

    Raises:
        FileNotFoundError: If any expected file is still missing after the
            extraction step.
    """
    tar_path = os.path.join(data_dir, f"{DATASET_ID}_RAW.tar")
    expected_paths = [os.path.join(data_dir, name) for name in FILE_PATTERNS]

    # Check if the tar file already exists
    if not os.path.exists(tar_path):
        print(f"Downloading {DATASET_ID} dataset...")
        # Download to a side file and rename only on success. Writing directly
        # to tar_path would leave a truncated archive behind if the transfer
        # is interrupted, and the next run would mistake it for a complete one.
        partial_path = tar_path + ".part"
        try:
            urllib.request.urlretrieve(DATASET_URL, partial_path)
            os.replace(partial_path, tar_path)
        finally:
            # After a successful rename the side file no longer exists, so
            # this only cleans up after a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")
    else:
        print(f"Found existing tar file: {tar_path}")

    # Extract the tar file if the h5 files don't exist
    if not all(os.path.exists(path) for path in expected_paths):
        print("Extracting tar file...")
        with tarfile.open(tar_path, 'r') as tar:
            # The archive comes from the network: where the interpreter
            # supports extraction filters, use the 'data' filter so members
            # cannot be written outside data_dir. Older interpreters without
            # filter support fall back to the plain call.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=data_dir, filter='data')
            else:
                tar.extractall(path=data_dir)
        print("Extraction complete.")
    else:
        print("All required files already extracted.")

    # Verify files exist
    missing_files = [name for name, path in zip(FILE_PATTERNS, expected_paths)
                     if not os.path.exists(path)]
    if missing_files:
        raise FileNotFoundError(f"Missing files after extraction: {missing_files}")

    return expected_paths

def load_10x_h5(file_path):
    """Read a 10X h5 file and return its contents as an AnnData object.

    The file is expected to contain a ``matrix`` group with ``data``,
    ``indices``, ``indptr`` and ``shape`` datasets (read here as a CSC sparse
    matrix), a ``barcodes`` dataset, and a ``features`` sub-group with ``id``,
    ``name`` and ``feature_type`` datasets holding UTF-8 encoded byte strings.

    Args:
        file_path: Path of the h5 file to read.

    Returns:
        An AnnData object whose ``X`` is the transposed matrix, whose ``obs``
        index is the barcodes, and whose ``var`` index is the gene names made
        unique by suffixing repeats with ``_1``, ``_2``, ... The ``var`` table
        carries ``gene_ids``, ``original_gene_names`` and ``feature_types``
        columns, and ``uns['source_file']`` holds the file's base name.
    """
    source_name = os.path.basename(file_path)
    print(f"Loading {source_name}...")

    def _read_strings(handle, key):
        # Load a whole dataset of byte strings and decode each entry.
        return [raw.decode('utf-8') for raw in handle[key][:]]

    # Everything is copied into memory inside the context manager, so the
    # file can be closed before the AnnData object is assembled.
    with h5py.File(file_path, 'r') as f:
        data = f['matrix/data'][:]
        indices = f['matrix/indices'][:]
        indptr = f['matrix/indptr'][:]
        shape = f['matrix/shape'][:]
        matrix = sparse.csc_matrix((data, indices, indptr), shape=shape)

        barcodes = _read_strings(f, 'matrix/barcodes')
        gene_ids = _read_strings(f, 'matrix/features/id')
        gene_names = _read_strings(f, 'matrix/features/name')
        feature_types = _read_strings(f, 'matrix/features/feature_type')

    # De-duplicate gene names: the first occurrence keeps its name, the k-th
    # repeat (k >= 1) becomes "<name>_<k>".
    repeats_seen = {}
    unique_gene_names = []
    for symbol in gene_names:
        occurrence = repeats_seen.get(symbol, -1) + 1
        repeats_seen[symbol] = occurrence
        unique_gene_names.append(symbol if occurrence == 0 else f"{symbol}_{occurrence}")

    obs = pd.DataFrame(index=barcodes)
    var = pd.DataFrame(
        {
            'gene_ids': gene_ids,
            'original_gene_names': gene_names,
            'feature_types': feature_types,
        },
        index=unique_gene_names,
    )

    # The stored matrix is transposed so rows line up with `obs` (barcodes)
    # and columns with `var` (genes).
    adata = ad.AnnData(X=matrix.transpose(), obs=obs, var=var)

    # Record which file the data came from.
    adata.uns['source_file'] = source_name

    print(f"  Loaded {adata.n_obs} cells and {adata.n_vars} genes.")
    return adata

def add_metadata(adata, file_name):
    """Annotate ``adata.obs`` with standardized, per-sample metadata columns.

    The sample name is the base name of ``file_name`` up to (not including)
    the first ``'_filtered'``. The condition, perturbation name and CRISPR
    type are chosen by the first marker substring found in the sample name;
    when no marker matches, all three are set to ``'Unknown'``.

    Args:
        adata: Object whose ``obs`` attribute supports item assignment.
        file_name: Path or name of the source file the sample came from.

    Returns:
        The same ``adata`` object, modified in place.
    """
    sample_name = os.path.basename(file_name).split('_filtered')[0]

    # Ordered (marker, (condition, perturbation_name, crispr_type)) rules;
    # the first marker contained in the sample name wins.
    rules = (
        ('Clay', ('Control', 'Non-targeting', 'None')),
        ('Arch_correction', ('Test', 'CTNNB1', 'CRISPR_correction')),
    )
    fallback = ('Unknown', 'Unknown', 'Unknown')
    condition, perturbation_name, crispr_type = next(
        (labels for marker, labels in rules if marker in sample_name),
        fallback,
    )

    # Insertion order here fixes the order in which the columns are added.
    annotations = {
        'sample': sample_name,
        'organism': 'Homo sapiens',
        'cell_type': 'iPSC-derived cortical organoid',
        'condition': condition,
        'perturbation_name': perturbation_name,
        'crispr_type': crispr_type,
        'cancer_type': 'Non-Cancer',
    }
    for column, value in annotations.items():
        adata.obs[column] = value

    return adata

def harmonize_dataset(file_paths):
    """Load every input file, annotate it, and combine all into one AnnData.

    Each input is loaded with ``load_10x_h5``, annotated with ``add_metadata``
    plus ``batch`` and ``source_file`` columns, and written to
    ``temp_processed_<i>.h5ad`` in the current working directory. A temporary
    file that already exists is reused instead of being regenerated. The
    temporary files are then read back and concatenated one after another,
    and removed once the combined dataset has been built.

    Args:
        file_paths: Paths of the 10X h5 files to combine, in batch order.
            Must contain at least one path.

    Returns:
        The combined AnnData object.
    """
    # Process each file separately and save to individual temporary h5ad files
    processed_files = []

    for i, file_path in enumerate(file_paths):
        temp_file = f"temp_processed_{i}.h5ad"
        if not os.path.exists(temp_file):
            print(f"Processing {os.path.basename(file_path)}...")
            adata = load_10x_h5(file_path)
            adata = add_metadata(adata, file_path)

            # Add batch information
            adata.obs['batch'] = f'batch{i}'
            adata.obs['source_file'] = os.path.basename(file_path)

            # Save to temporary file
            adata.write_h5ad(temp_file)
            print(f"  Saved to {temp_file}")
        else:
            print(f"Found existing processed file: {temp_file}")

        processed_files.append(temp_file)

    # Combine the processed files
    print("Combining datasets...")

    combined = ad.read_h5ad(processed_files[0])
    print(f"  Loaded first dataset: {combined.n_obs} cells, {combined.n_vars} genes")

    # `position` is the 1-based number of the dataset being added.
    for position, processed_file in enumerate(processed_files[1:], start=2):
        print(f"  Adding dataset {position}...")
        adata = ad.read_h5ad(processed_file)
        print(f"    Dataset {position}: {adata.n_obs} cells, {adata.n_vars} genes")

        # Check if gene sets are the same; if not, find common genes.
        # Index.equals is used rather than an element-wise `==`, because the
        # element-wise comparison raises when the two indexes differ in
        # length, which is precisely the case this branch has to handle.
        if not combined.var_names.equals(adata.var_names):
            print("    Gene sets differ. Finding common genes...")
            other_genes = set(adata.var_names)
            # Keep the genes in the order of the accumulated dataset so the
            # resulting column order is the same on every run.
            common_genes = [gene for gene in combined.var_names if gene in other_genes]
            print(f"    Found {len(common_genes)} common genes.")

            combined = combined[:, common_genes].copy()
            adata = adata[:, common_genes].copy()

        combined = ad.concat([combined, adata], join='outer', merge='same')
        print(f"    Combined dataset now: {combined.n_obs} cells, {combined.n_vars} genes")

    # Ensure var_names are gene symbols
    if 'gene_ids' in combined.var:
        print("Ensuring var_names are based on gene symbols...")
        # Only remap when the first var name is itself one of the gene IDs,
        # i.e. the index appears to hold IDs rather than symbols.
        if combined.var_names[0] in combined.var['gene_ids'].values:
            # Build an ID -> symbol lookup from the original input files;
            # for an ID seen more than once, the last file read wins.
            gene_id_to_symbol = {}
            for file_path in file_paths:
                with h5py.File(file_path, 'r') as f:
                    gene_ids = [g.decode('utf-8') for g in f['matrix/features/id'][:]]
                    gene_names = [g.decode('utf-8') for g in f['matrix/features/name'][:]]
                gene_id_to_symbol.update(zip(gene_ids, gene_names))

            new_var = combined.var.copy()
            # IDs without a known symbol are kept unchanged.
            new_var.index = [gene_id_to_symbol.get(gene_id, gene_id)
                             for gene_id in combined.var_names]

            combined = ad.AnnData(X=combined.X, obs=combined.obs, var=new_var)

    print(f"Final combined dataset: {combined.n_obs} cells, {combined.n_vars} genes")

    # Clean up temporary files
    for temp_file in processed_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return combined

# Notebook driver: fetch the data, harmonize it, save the result and print a
# short report. Everything is read from and written to the current working
# directory.
data_dir = os.getcwd()
os.makedirs(data_dir, exist_ok=True)

# Fetch/unpack the inputs (skipped when already present), then combine them.
file_paths = download_dataset(data_dir)
combined = harmonize_dataset(file_paths)

# Write the combined dataset next to the inputs.
output_file = os.path.join(data_dir, f"{DATASET_ID}_harmonized.h5ad")
print(f"Saving harmonized dataset to {output_file}...")
combined.write_h5ad(output_file)
print("Processing complete.")

# Report the size of the result and the metadata columns it carries.
print("\nHarmonized Dataset Summary:")
print(f"  Number of cells: {combined.n_obs}")
print(f"  Number of genes: {combined.n_vars}")
print("  Metadata fields:")
for column in combined.obs.columns:
    print(f"    - {column}")

# Show the distinct values of each standardized column that is present.
print("\nUnique values in categorical fields:")
categorical_fields = [
    'organism',
    'cell_type',
    'crispr_type',
    'cancer_type',
    'condition',
    'perturbation_name',
]
for column in categorical_fields:
    if column not in combined.obs.columns:
        continue
    distinct_values = combined.obs[column].unique()
    print(f"    - {column}: {distinct_values}")
