# In [None]:
import os
import sys
import gzip
import urllib.request
import h5py
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csc_matrix
import re
import logging
from tqdm import tqdm
import anndata  # for concatenating AnnData objects

# Configure logging: INFO and above go to both the console and a log file.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() is a silent no-op whenever the root logger
# is already configured -- which may be the case inside notebook kernels -- and
# neither the INFO level nor the log file would take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('harmonize_GSE213511.log'),
    ],
    force=True,
)
logger = logging.getLogger(__name__)

# Download locations of the dataset files, keyed by local file name.
# Every file sits in the same GEO supplementary directory, so each URL is that
# directory joined with the file name.
_GEO_SUPPL_DIR = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE213nnn/GSE213511/suppl'

# H5 files
_H5_FILE_NAMES = (
    'GSE213511_DM_CITEseq-1_NA_NM_1.h5',
    'GSE213511_DM_CITEseq-2_NA_NM_1.h5',
    'GSE213511_DM_OP0_NM_6d_1.h5',
    'GSE213511_DM_OP1_NM_6d_1.h5',
    'GSE213511_DM_OP1_NM_6d_2.h5',
    'GSE213511_DM_OP2_NM_6d_1.h5',
    'GSE213511_DM_OP2_NM_6d_2.h5',
    'GSE213511_DM_OP2_NM_6d_3.h5',
    'GSE213511_DM_OP3_NM_6d_1.h5',
    'GSE213511_DM_OP4_NM_6d_1.h5',
    'GSE213511_DM_OP5_NM_6d_1.h5',
    'GSE213511_DM_Test1_NM_6d_1.h5',
    'GSE213511_DM_Test2_NM_6d_1.h5',
    'GSE213511_LSK_OP0_NM_7d_1.h5',
    'GSE213511_LSK_OP1_NM_7d_1.h5',
    'GSE213511_LSK_OP1_NM_9d_1.h5',
    'GSE213511_LSK_OP2_NM_7d_1.h5',
    'GSE213511_LSK_OP2_NM_9d_1.h5',
    'GSE213511_LSK_OP3_NM_7d_1.h5',
    'GSE213511_LSK_OP3_NM_9d_1.h5',
    'GSE213511_LSK_OP4_NM_7d_1.h5',
    'GSE213511_LSK_OP4_NM_9d_1.h5',
    'GSE213511_inVivo_NTC_lin-andckit_14d_1.h5',
    'GSE213511_inVivo_OP1_ckit_14d_1.h5',
    'GSE213511_inVivo_OP1_lin-_14d_1.h5',
    'GSE213511_inVivo_OP1_lin-_14d_2.h5',
    'GSE213511_inVivo_OP1_lin-_28d_1.h5',
    'GSE213511_inVivo_OP1_lin-_28d_2.h5',
    'GSE213511_inVivo_OP2_ckit_14d_1.h5',
    'GSE213511_inVivo_OP2_lin-_14d_1.h5',
    'GSE213511_inVivo_OP3_ckit_14d_1.h5',
    'GSE213511_inVivo_OP3_lin-_14d_1.h5',
    'GSE213511_inVivo_OP4_ckit_14d_1.h5',
    'GSE213511_inVivo_OP4_lin-_14d_1.h5',
)

# Annotation files
_ANNOTATION_FILE_NAMES = (
    'GSE213511_CellAnnotation_exvivo.tsv.gz',
    'GSE213511_CellAnnotation_invivo.tsv.gz',
    'GSE213511_CellAnnotation_leukemia.tsv.gz',
)

FILE_URLS = {
    name: f'{_GEO_SUPPL_DIR}/{name}'
    for name in _H5_FILE_NAMES + _ANNOTATION_FILE_NAMES
}

def download_file(url, destination):
    """Download a file from a URL to a destination path.

    The payload is first written to ``<destination>.part`` and only renamed to
    ``destination`` once the transfer has finished. A failed download
    therefore never leaves a truncated file at ``destination`` that a later
    existence check would mistake for a complete one.

    Args:
        url: Remote location of the file.
        destination: Local path the file is stored at.

    Raises:
        Exception: Whatever the download or rename raised; it is logged and
            re-raised unchanged.
    """
    logger.info(f"Downloading {url} to {destination}")
    partial_path = f"{destination}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same-directory rename: the destination appears only when complete.
        os.replace(partial_path, destination)
        logger.info(f"Downloaded {destination}")
    except Exception as e:
        logger.error(f"Failed to download {url}: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise

def ensure_files_exist(data_dir):
    """Make sure every file listed in FILE_URLS is present in ``data_dir``.

    The directory is created when missing. Files that are already on disk are
    left untouched; every other file is fetched with ``download_file``.
    """
    os.makedirs(data_dir, exist_ok=True)

    for filename in FILE_URLS:
        local_path = os.path.join(data_dir, filename)
        if os.path.exists(local_path):
            logger.info(f"File {filename} already exists, skipping download")
            continue
        download_file(FILE_URLS[filename], local_path)

def read_gzipped_tsv(file_path):
    """Load a gzip-compressed, tab-separated file as a pandas DataFrame.

    Any failure while opening or parsing the file is logged and re-raised.
    """
    try:
        with gzip.open(file_path, mode='rt') as handle:
            table = pd.read_csv(handle, sep='\t')
    except Exception as e:
        logger.error(f"Failed to read {file_path}: {e}")
        raise
    else:
        return table

def h5_to_anndata(h5_file_path):
    """Convert an h5 file to an AnnData object.

    The file is expected to contain a ``matrix`` group with the datasets
    ``shape``, ``data``, ``indices``, ``indptr``, ``barcodes`` and the
    sub-group ``features`` (``id`` and ``name``). This looks like the
    10x Genomics HDF5 layout -- TODO confirm for all inputs.

    Args:
        h5_file_path: Path of the h5 file to read.

    Returns:
        AnnData with cells as rows (``obs_names`` = barcodes) and features as
        columns (``var_names`` = feature names, ``var['gene_ids']`` = feature
        ids). ``X`` is a sparse matrix in CSR format.

    Raises:
        Exception: Any read/conversion error is logged and re-raised.
    """
    try:
        with h5py.File(h5_file_path, 'r') as f:
            matrix_group = f['matrix']

            # Matrix dimensions are stored as (features, cells)
            n_genes, n_cells = matrix_group['shape'][:]

            # Feature and barcode labels are read as bytes and decoded here
            features = matrix_group['features']
            feature_ids = [raw.decode('utf-8') for raw in features['id'][:]]
            feature_names = [raw.decode('utf-8') for raw in features['name'][:]]
            barcodes = [raw.decode('utf-8') for raw in matrix_group['barcodes'][:]]

            # The stored arrays describe a CSC matrix of shape (genes, cells)
            genes_by_cells = csc_matrix(
                (
                    matrix_group['data'][:],
                    matrix_group['indices'][:],
                    matrix_group['indptr'][:],
                ),
                shape=(n_genes, n_cells),
            )

        # Transposing gives cells as rows, genes as columns. The transpose of
        # a CSC matrix is already CSR, so tocsr() only makes the format
        # explicit; no intermediate conversion of the data is needed.
        matrix = genes_by_cells.T.tocsr()

        # Create AnnData object
        adata = sc.AnnData(X=matrix)
        adata.var_names = pd.Index(feature_names)
        adata.var['gene_ids'] = feature_ids
        adata.obs_names = pd.Index(barcodes)

        return adata
    except Exception as e:
        logger.error(f"Failed to convert {h5_file_path} to AnnData: {e}")
        raise

def determine_dataset_type(file_name):
    """Classify a file by the sample-group marker contained in its name.

    Markers are tested in a fixed order and the first hit wins; names without
    any known marker are reported as 'unknown'.
    """
    marker_to_type = (
        ('DM_', 'leukemia'),
        ('LSK_', 'exvivo'),
        ('inVivo_', 'invivo'),
    )
    return next(
        (kind for marker, kind in marker_to_type if marker in file_name),
        'unknown',
    )

def extract_perturbation_name(guide_name):
    """Extract the perturbation (target gene) name from a guide name.

    Args:
        guide_name: Guide identifier such as "Rcor1_AS_21752"; may be missing
            (NaN/None), the string 'NaN', or empty.

    Returns:
        'Unknown' for missing/empty input, 'Non-targeting' when the name
        contains 'NTC', otherwise everything before the first underscore.
        A name without such a prefix is returned unchanged.
    """
    if pd.isna(guide_name) or guide_name == 'NaN' or guide_name == '':
        return 'Unknown'

    # Check if it's a non-targeting control
    if 'NTC' in guide_name:
        return 'Non-targeting'

    # Take everything up to the first underscore. Matching "not an
    # underscore" rather than only letters/digits keeps gene symbols that
    # contain other characters (e.g. "H2-K1_AS_1") from falling through and
    # being returned as the whole guide name.
    match = re.match(r'([^_]+)_', guide_name)
    if match:
        return match.group(1)

    return guide_name

def determine_condition(mixscape_value):
    """Translate a mixscape label into the harmonized condition.

    Missing values give 'Unknown', the label 'NTC' gives 'Control', and every
    other label gives 'Test'.
    """
    if pd.isna(mixscape_value):
        return 'Unknown'
    return 'Control' if mixscape_value == 'NTC' else 'Test'

def harmonize_dataset(adata, annotation_df, dataset_type):
    """Harmonize the dataset by adding standardized metadata.

    Annotation rows are aligned to the cells by barcode in whole-column
    operations, instead of writing one cell and one column at a time.

    Args:
        adata: AnnData object whose ``obs_names`` are cell barcodes; it is
            modified in place.
        annotation_df: Annotation table with a 'Barcode' column. Optional
            columns 'Clusters', 'Guide' and 'mixscape'/'Mixscape' feed the
            standardized fields. If a barcode occurs more than once, its last
            row is used.
        dataset_type: Dataset label; 'leukemia' sets cancer_type to
            'Leukemia', anything else to 'Non-Cancer'.

    Returns:
        The same ``adata`` with the standardized ``obs`` columns, one
        ``original_<column>`` column per annotation column (only when at
        least one cell has an annotation), and unique ``var_names``.

    Raises:
        KeyError: If ``annotation_df`` has no 'Barcode' column.
    """
    # One annotation row per barcode, indexed for alignment (last row wins)
    annotations = (
        annotation_df
        .drop_duplicates(subset='Barcode', keep='last')
        .set_index('Barcode')
    )
    barcodes = adata.obs_names
    has_annotation = barcodes.isin(annotations.index)

    # Cell type from the Clusters column; cells without an annotation row
    # stay 'Unknown', annotated cells take the value as-is (even if missing).
    if 'Clusters' in annotations.columns:
        # object dtype keeps integer labels from being upcast to float when
        # the reindex introduces NaN for unannotated cells
        clusters = annotations['Clusters'].astype(object).reindex(barcodes)
        cell_type = [
            value if annotated else 'Unknown'
            for value, annotated in zip(clusters, has_annotation)
        ]
    else:
        cell_type = 'Unknown'

    # Perturbation name from the Guide column; missing guides stay 'Unknown'
    if 'Guide' in annotations.columns:
        guides = annotations['Guide'].reindex(barcodes)
        perturbation_name = [
            'Unknown' if pd.isna(guide) else extract_perturbation_name(guide)
            for guide in guides
        ]
    else:
        perturbation_name = 'Unknown'

    # Condition from the mixscape column (handles either 'mixscape' or 'Mixscape')
    mixscape_col = next(
        (col for col in ('mixscape', 'Mixscape') if col in annotations.columns),
        None,
    )
    if mixscape_col is not None:
        mixscape = annotations[mixscape_col].reindex(barcodes)
        condition = [
            determine_condition(value) if annotated else 'Unknown'
            for value, annotated in zip(mixscape, has_annotation)
        ]
    else:
        condition = 'Unknown'

    # Standardized metadata columns
    adata.obs['organism'] = 'Mice (Mus musculus)'
    adata.obs['cell_type'] = cell_type
    adata.obs['crispr_type'] = 'CRISPR KO'
    adata.obs['cancer_type'] = 'Leukemia' if dataset_type == 'leukemia' else 'Non-Cancer'
    adata.obs['condition'] = condition
    adata.obs['perturbation_name'] = perturbation_name

    # Dataset-specific metadata
    adata.obs['dataset_type'] = dataset_type

    # Original annotation columns, prefixed with 'original_'. They are only
    # added when at least one cell matched, so a file without any annotated
    # cell does not gain all-missing columns.
    if has_annotation.any():
        for col in annotations.columns:
            adata.obs[f'original_{col}'] = annotations[col].reindex(barcodes).to_numpy()

    # Make var_names unique
    adata.var_names_make_unique()

    return adata

def process_file(h5_file_path, annotation_dfs, output_dir):
    """
    Convert one h5 file into a harmonized h5ad file inside ``output_dir``.

    The annotation table is picked from ``annotation_dfs`` by the dataset type
    derived from the file name. Files of unknown type are skipped with a
    warning. Conversion, harmonization and writing errors are logged rather
    than propagated.

    Returns the harmonized AnnData object, or None when the file was skipped
    or processing failed.
    """
    file_name = os.path.basename(h5_file_path)
    logger.info(f"Processing {file_name}")

    # Only the three known dataset types have an annotation table
    dataset_type = determine_dataset_type(file_name)
    if dataset_type not in ('leukemia', 'exvivo', 'invivo'):
        logger.warning(f"Unknown dataset type for {file_name}, skipping")
        return None
    annotation_df = annotation_dfs[dataset_type]

    try:
        # Read the matrix and attach the standardized metadata
        harmonized = harmonize_dataset(
            h5_to_anndata(h5_file_path), annotation_df, dataset_type
        )

        # Write next to the other outputs, swapping the extension for .h5ad
        stem, _ = os.path.splitext(file_name)
        output_file = os.path.join(output_dir, f"{stem}.h5ad")
        harmonized.write_h5ad(output_file)
        logger.info(f"Saved harmonized dataset to {output_file}")
    except Exception as e:
        logger.error(f"Failed to process {file_name}: {e}")
        return None

    return harmonized

def main(data_dir):
    """Main function to process all files and save individual as well as a combined h5ad.

    Steps: make sure the input files are present in ``data_dir``, read the
    three annotation tables, harmonize every ``.h5`` file found in
    ``data_dir`` (one ``.h5ad`` each under ``<data_dir>/harmonized``), then
    concatenate the successfully processed files, drop cells whose
    perturbation_name is "Unknown", and write ``combined.h5ad``.

    Args:
        data_dir: Directory the input files are downloaded to / read from.
    """
    logger.info(f"Starting harmonization of GSE213511 dataset in {data_dir}")

    # Ensure all necessary files exist
    ensure_files_exist(data_dir)

    # Create output directory
    output_dir = os.path.join(data_dir, 'harmonized')
    os.makedirs(output_dir, exist_ok=True)

    # Read annotation files
    logger.info("Reading annotation files")
    annotation_dfs = {
        'exvivo': read_gzipped_tsv(os.path.join(data_dir, 'GSE213511_CellAnnotation_exvivo.tsv.gz')),
        'invivo': read_gzipped_tsv(os.path.join(data_dir, 'GSE213511_CellAnnotation_invivo.tsv.gz')),
        'leukemia': read_gzipped_tsv(os.path.join(data_dir, 'GSE213511_CellAnnotation_leukemia.tsv.gz'))
    }

    # Sorted so the processing (and therefore concatenation) order does not
    # depend on the arbitrary order os.listdir() returns entries in.
    h5_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.h5'))
    logger.info(f"Found {len(h5_files)} h5 files to process")

    # Process each h5 file; keep the result and its sample name side by side
    adata_list = []
    sample_names = []
    for file_name in tqdm(h5_files, desc="Processing h5 files"):
        h5_file_path = os.path.join(data_dir, file_name)
        adata = process_file(h5_file_path, annotation_dfs, output_dir)
        if adata is not None:
            adata_list.append(adata)
            sample_names.append(os.path.splitext(file_name)[0])

    logger.info(f"Harmonization complete. Successfully processed {len(adata_list)}/{len(h5_files)} files.")

    if not adata_list:
        logger.warning("No AnnData objects were processed; combined file not created.")
        return

    # Combine all processed AnnData objects.
    # - merge='first' keeps per-gene annotations such as 'gene_ids'; with the
    #   default (merge=None) var columns are dropped from the result.
    # - label/keys record the source file of every cell in obs['sample'],
    #   since the same barcode may occur in more than one file.
    combined_adata = anndata.concat(
        adata_list,
        join='outer',
        merge='first',
        label='sample',
        keys=sample_names,
    )

    # Exclude cells with "Unknown" in their perturbation_name
    keep = (combined_adata.obs['perturbation_name'] != "Unknown").to_numpy()
    combined_adata = combined_adata[keep, :]

    # Save combined AnnData
    combined_file = os.path.join(output_dir, "combined.h5ad")
    combined_adata.write_h5ad(combined_file)
    logger.info(f"Saved combined harmonized dataset to {combined_file}")

# ----- Run in Jupyter Notebook -----
# Set the data directory path where your files will be downloaded/stored.
data_dir = "/content/GSE213511"  # <-- Change this to your actual directory

# Run the main function. The guard still fires in a notebook cell or when the
# file is executed as a script (both run as "__main__"), but importing this
# module from other code no longer triggers the download and processing.
if __name__ == "__main__":
    main(data_dir)
