# In [None]:
# If you don't already have these installed in your environment, uncomment the following:
# %pip install h5py anndata scipy pandas

import os
import urllib.request
import h5py
import numpy as np
import pandas as pd
import scipy.sparse as sp
import anndata as ad
from pathlib import Path

# Sample ID -> download URL for the eleven GSE234610 supplementary matrices
# (TALL1 .. TALL11). All URLs share one directory and differ only in the
# sample number embedded in the file name.
GSE234610_FILES = {
    f'TALL{n}': (
        'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE234nnn/GSE234610/suppl/'
        f'GSE234610_TALL{n}_filtered_feature_bc_matrix.h5'
    )
    for n in range(1, 12)
}

def download_file(url, destination):
    """Download a file from a URL to a destination path.

    If ``destination`` already exists nothing is downloaded. Otherwise the
    payload is first written to a sibling ``<destination>.part`` file and only
    moved into place once the transfer finished, so an interrupted download
    can never leave a truncated file that a later run would mistake for a
    complete one.

    Args:
        url: Source URL, passed unchanged to ``urllib.request.urlretrieve``.
        destination: Local path the downloaded file should end up at.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or by the final
        rename is propagated; the partial file is removed first.
    """
    if os.path.exists(destination):
        print(f"File already exists: {destination}")
        return

    print(f"Downloading {url} to {destination}")
    partial_path = f"{os.fspath(destination)}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # os.replace is atomic on the same filesystem: the destination either
        # does not exist or holds the complete file.
        os.replace(partial_path, destination)
    finally:
        # After a successful replace the partial file is gone; after a
        # failure this removes whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {destination}")

def download_dataset(data_dir):
    """Fetch every file listed in ``GSE234610_FILES`` into ``data_dir``.

    The directory is created when missing. Each file keeps the base name of
    its URL and is handed to ``download_file`` for the actual transfer.

    Args:
        data_dir: Directory that receives the downloaded files.
    """
    os.makedirs(data_dir, exist_ok=True)

    # Only the URLs are needed here; the sample IDs play no role in downloading.
    for source_url in GSE234610_FILES.values():
        target_path = os.path.join(data_dir, os.path.basename(source_url))
        download_file(source_url, target_path)

def _decode_strings(values):
    """Return ``values`` as a list of ``str``, decoding any ``bytes`` entries."""
    return [v.decode() if isinstance(v, bytes) else str(v) for v in values]


def _strip_prefix(name, prefix):
    """Return ``name`` without a leading ``prefix``; other occurrences are kept."""
    return name[len(prefix):] if name.startswith(prefix) else name


def _make_unique_names(names):
    """Return ``names`` as a list in which every entry is unique.

    The first occurrence of a name is kept unchanged; later occurrences get a
    running numeric suffix (``name_1``, ``name_2``, ...). A suffixed candidate
    that is already taken (e.g. the input itself contains a literal
    ``name_1``) is skipped, so the result never contains duplicates.
    """
    suffix_counter = {}
    taken = set()
    unique = []
    for name in names:
        candidate = name
        while candidate in taken:
            suffix_counter[name] = suffix_counter.get(name, 0) + 1
            candidate = f"{name}_{suffix_counter[name]}"
        taken.add(candidate)
        unique.append(candidate)
    return unique


def process_10x_h5(file_path, sample_id):
    """
    Process a 10X h5 file and create AnnData objects for gene and protein expression.

    The file is read from the ``matrix/*`` datasets, the features are split by
    their ``feature_type`` ('Gene Expression' vs 'Antibody Capture'), and one
    cells x features AnnData is built for each group. Both objects carry the
    same per-cell metadata columns and a small ``uns`` description.

    Args:
        file_path: Path to the h5 file
        sample_id: Sample ID (e.g., 'TALL1'), stored in ``obs['sample_id']``

    Returns:
        gene_adata: AnnData object for gene expression
        protein_adata: AnnData object for protein expression
    """
    print(f"Processing {file_path}")

    # Read everything needed into memory, then release the file handle before
    # doing any of the heavier matrix work.
    with h5py.File(file_path, 'r') as f:
        data = f['matrix/data'][:]
        indices = f['matrix/indices'][:]
        indptr = f['matrix/indptr'][:]
        shape = tuple(int(dim) for dim in f['matrix/shape'][:])

        # Features (genes/proteins)
        feature_ids = _decode_strings(f['matrix/features/id'][:])
        feature_names = _decode_strings(f['matrix/features/name'][:])
        feature_types = _decode_strings(f['matrix/features/feature_type'][:])

        # Barcodes (cell IDs)
        barcodes = _decode_strings(f['matrix/barcodes'][:])

    # The raw matrix is features x cells in CSC; CSR makes the row (feature)
    # selection below cheap.
    matrix = sp.csc_matrix((data, indices, indptr), shape=shape).tocsr()

    # dtype=object keeps plain Python strings and makes the comparisons below
    # element-wise even when the file has no features at all.
    feature_ids = np.asarray(feature_ids, dtype=object)
    feature_names = np.asarray(feature_names, dtype=object)
    feature_types = np.asarray(feature_types, dtype=object)

    # Row positions of gene vs protein features
    gene_rows = np.flatnonzero(feature_types == 'Gene Expression')
    protein_rows = np.flatnonzero(feature_types == 'Antibody Capture')

    # ---- Gene annotations ----
    gene_names = feature_names[gene_rows]
    gene_var = pd.DataFrame({
        'gene_ids': feature_ids[gene_rows],
        'original_name': gene_names,
        'feature_type': 'Gene Expression'
    }, index=_make_unique_names(list(gene_names)))

    # ---- Protein annotations ----
    protein_names = feature_names[protein_rows]
    # Only a leading "adt_" is dropped; the same text inside a name is kept.
    clean_protein_names = [_strip_prefix(name, "adt_") for name in protein_names]
    protein_var = pd.DataFrame({
        'protein_ids': feature_ids[protein_rows],
        'original_name': protein_names,
        'feature_type': 'Antibody Capture'
    }, index=_make_unique_names(clean_protein_names))

    # ---- Per-cell metadata shared by both modalities ----
    obs = pd.DataFrame(index=barcodes)
    obs['sample_id'] = sample_id
    obs['organism'] = 'Homo sapiens'
    obs['cell_type'] = 'T-ALL'
    obs['crispr_type'] = 'None'
    obs['cancer_type'] = 'T-ALL'
    obs['condition'] = 'Primary'
    obs['perturbation_name'] = 'None'

    # ---- Build the AnnData objects ----
    # AnnData expects cells x features. Transposing a CSR matrix yields CSC,
    # so convert back to CSR to keep X stored row-wise, one row per cell.
    # Each object gets its own copy of obs so that renaming cells on one of
    # them cannot affect the other.
    gene_adata = ad.AnnData(
        X=matrix[gene_rows, :].T.tocsr(), var=gene_var, obs=obs.copy()
    )
    protein_adata = ad.AnnData(
        X=matrix[protein_rows, :].T.tocsr(), var=protein_var, obs=obs.copy()
    )

    # Add some metadata
    gene_adata.uns['dataset_id'] = 'GSE234610'
    gene_adata.uns['data_type'] = 'Gene Expression'

    protein_adata.uns['dataset_id'] = 'GSE234610'
    protein_adata.uns['data_type'] = 'Protein Expression'

    return gene_adata, protein_adata

def process_dataset(data_dir):
    """
    Process all files in the dataset and create harmonized AnnData objects.

    Every sample listed in ``GSE234610_FILES`` whose file is present in
    ``data_dir`` is loaded with ``process_10x_h5``; missing files are reported
    and skipped. Cell barcodes are prefixed with the sample ID so they stay
    unique across samples, then all samples are concatenated per modality.

    Args:
        data_dir: Path to the directory containing the dataset files

    Returns:
        gene_adata_combined: Combined AnnData object for gene expression
        protein_adata_combined: Combined AnnData object for protein expression
        Both are ``None`` when no sample file was found.
    """
    gene_adatas = []
    protein_adatas = []

    for sample_id, url in GSE234610_FILES.items():
        file_path = os.path.join(data_dir, os.path.basename(url))

        if not os.path.exists(file_path):
            print(f"File not found: {file_path}. Skipping.")
            continue

        gene_adata, protein_adata = process_10x_h5(file_path, sample_id)

        # Make cell barcodes unique across samples
        for adata in (gene_adata, protein_adata):
            adata.obs_names = [f"{sample_id}_{bc}" for bc in adata.obs_names]

        gene_adatas.append(gene_adata)
        protein_adatas.append(protein_adata)

    if not gene_adatas:
        print("No data processed.")
        return None, None

    # Combine all samples. uns_merge='same' keeps the `uns` entries that are
    # identical in every sample (dataset_id, data_type); with the default
    # uns_merge=None the combined objects would have an empty `uns`.
    gene_adata_combined = ad.concat(
        gene_adatas, join='outer', merge='same', uns_merge='same'
    )
    protein_adata_combined = ad.concat(
        protein_adatas, join='outer', merge='same', uns_merge='same'
    )

    # Ensure no duplicate gene/protein names remain
    if gene_adata_combined.var_names.duplicated().any():
        print("Warning: Duplicate gene names found. Making them unique...")
        gene_adata_combined.var_names_make_unique()

    if protein_adata_combined.var_names.duplicated().any():
        print("Warning: Duplicate protein names found. Making them unique...")
        protein_adata_combined.var_names_make_unique()

    return gene_adata_combined, protein_adata_combined

# --------------------------------
# Run everything directly in Jupyter
# --------------------------------

# Step 1: resolve the local data directory to an absolute path
data_dir = os.path.abspath("./GSE234610")
print(f"Data directory: {data_dir}")

# Step 2: fetch the dataset files into that directory
download_dataset(data_dir)

# Step 3: build the combined gene / protein AnnData objects
gene_adata, protein_adata = process_dataset(data_dir)

# Step 4: write both objects to disk, unless processing produced nothing
if gene_adata is None or protein_adata is None:
    print("No data processed; nothing to save.")
else:
    output_dir = os.path.join(data_dir, "harmonized")
    os.makedirs(output_dir, exist_ok=True)

    gene_output_path = os.path.join(output_dir, "GSE234610_gene_expression.h5ad")
    protein_output_path = os.path.join(output_dir, "GSE234610_protein_expression.h5ad")

    print(f"Saving gene expression AnnData to: {gene_output_path}")
    gene_adata.write(gene_output_path)

    print(f"Saving protein expression AnnData to: {protein_output_path}")
    protein_adata.write(protein_output_path)

    # n_obs / n_vars are the two entries of .shape (cells, features)
    print("\nProcessing complete. Summary:")
    print(f"  Gene expression AnnData: {gene_adata.n_obs} cells x {gene_adata.n_vars} genes")
    print(f"  Protein expression AnnData: {protein_adata.n_obs} cells x {protein_adata.n_vars} proteins")
