# In [None]:
# Harmonization script for GSE274751 dataset (adapted for Jupyter Notebook)
#
# This notebook cell processes the GSE274751 dataset (CRISPR-Cas9 perturb-seq data of CD4+ T cells)
# and harmonizes it into a standardized h5ad format with specific metadata.
#
# It also downloads the required dataset files when they are not already present, and sets the
# "condition" column in .obs to "Test" for every cell.
#
# Instructions:
# 1. Update the `data_path` variable with the path to your dataset folder.
# 2. Run the cell.
#
# The script will:
#   - Download dataset files (if needed)
#   - Load the dataset files
#   - Extract and standardize metadata (with condition always set to "Test")
#   - Create a harmonized h5ad file with standardized .obs attributes

import os
import pandas as pd
import numpy as np
import h5py
import anndata
import re
from pathlib import Path

def download_dataset(data_path):
    """Ensure the GSE274751 supplementary files are present in *data_path*.

    Files that already exist are left untouched. Missing files are fetched
    from the GEO FTP mirror into a temporary ``<name>.part`` file and renamed
    to their final name only after the transfer has finished, so an
    interrupted download can never be mistaken for a complete file on the
    next run.

    Download failures are reported and skipped (best effort); only the main
    h5ad file is required for the function to report success.

    Parameters:
    -----------
    data_path : str
        Directory holding (or receiving) the dataset files. It is created
        if it does not exist.

    Returns:
    --------
    bool
        True if the main dataset file exists after the download attempt,
        False otherwise.
    """
    import urllib.request

    # Define the files to download
    files = [
        "GSE274751_tfko.sng.guides.full.ct.h5ad",
        "GSE274751_feature_reference.csv.gz",
        "GSE274751_HTEC_counts.h5ad"
    ]

    # Base URL for the GEO dataset
    base_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE274nnn/GSE274751/suppl/"

    # Create the data directory if it doesn't exist
    os.makedirs(data_path, exist_ok=True)

    # Download each file
    for file in files:
        file_path = os.path.join(data_path, file)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
            print(f"File {file} already exists ({file_size:.1f} MB). Skipping download.")
            continue

        url = f"{base_url}{file}"
        print(f"Downloading {file} from {url}...")

        # Write to a side file first: the final name must only ever refer to
        # a fully transferred file, because existing files are skipped above.
        partial_path = f"{file_path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
            print(f"Downloaded {file} ({file_size:.1f} MB)")
        except Exception as e:
            # Best effort: report and continue with the remaining files.
            print(f"Error downloading {file}: {e}")
        finally:
            # Also runs on KeyboardInterrupt, so no partial file is left behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Check if the main file exists
    main_file = os.path.join(data_path, "GSE274751_tfko.sng.guides.full.ct.h5ad")
    if not os.path.exists(main_file):
        print(f"Error: Main dataset file {main_file} not found. Cannot proceed with harmonization.")
        return False

    return True

def extract_categories(file_path, category_name):
    """Read the category labels stored for one obs field of an h5ad file.

    The labels are looked up at ``uns/<category_name>_categories``. Byte
    strings are decoded as UTF-8; other values are passed through unchanged.

    Returns a list of labels, or None when the file has no such entry.
    """
    dataset_path = f'uns/{category_name}_categories'
    with h5py.File(file_path, 'r') as handle:
        if dataset_path not in handle:
            return None
        # [:] reads the whole dataset into memory before the file is closed
        raw_labels = handle[dataset_path][:]

    labels = []
    for raw in raw_labels:
        labels.append(raw.decode('utf-8') if isinstance(raw, bytes) else raw)
    return labels

def extract_guide_info(file_path):
    """Collect the stored category labels for guide slots 1-4 of an h5ad file.

    For each i in 1..4 the dataset ``uns/guide<i>_cov_categories`` is read if
    present. Byte strings are decoded as UTF-8.

    Returns a dict mapping ``'guide<i>'`` to its list of labels; slots that
    have no entry in the file are omitted.
    """
    found = {}
    with h5py.File(file_path, 'r') as handle:
        for slot in range(1, 5):  # Guide 1-4
            dataset_path = f'uns/guide{slot}_cov_categories'
            if dataset_path not in handle:
                continue
            labels = []
            for raw in handle[dataset_path][:]:
                labels.append(raw.decode('utf-8') if isinstance(raw, bytes) else raw)
            found[f'guide{slot}'] = labels
    return found

def parse_guide_name(guide_name):
    """Split a guide identifier into its target gene and the full guide name.

    Guide names are in the format ``GENE.POSITION.SEQUENCE``, for example
    ``ARID5A.96550280.CCCCGCCGTACCTCTCGTAG``. A name without any dot is
    treated as the gene itself.

    Parameters:
    -----------
    guide_name : str or None
        Guide identifier. Missing values (None, empty string, or any
        non-string such as NaN) are accepted.

    Returns:
    --------
    tuple
        ``(gene, guide_name)``. ``(None, None)`` is returned for missing
        values and for the literal ``'280'``; ``(None, guide_name)`` is
        returned when the part before the first dot is empty.
    """
    # '280' is special-cased as in the original script; the reason is not
    # documented here -- presumably a stray value in the source data.
    if not isinstance(guide_name, str) or not guide_name or guide_name == '280':
        return None, None

    # Everything before the first dot is the gene symbol.
    gene, _, _ = guide_name.partition('.')
    if not gene:
        return None, guide_name
    return gene, guide_name

def _collect_perturbation_names(obs):
    """Return one perturbation label per row of *obs*, in row order.

    The columns ``guide1_cov`` .. ``guide4_cov`` (those that exist) are
    scanned for string values containing a dot; the gene part of each such
    guide name is collected, de-duplicated per cell, sorted and joined with
    ``' + '``. Cells without any usable guide get ``'Non-targeting'``.
    """
    # Pull every guide column out once. Per-cell ``.iloc`` lookups on the
    # DataFrame are loop-invariant work and very slow on large tables.
    guide_columns = [
        obs[f'guide{guide_num}_cov'].tolist()
        for guide_num in range(1, 5)  # Check guides 1-4
        if f'guide{guide_num}_cov' in obs.columns
    ]

    perturbation_genes = []
    # Indexed loop on purpose: it still yields one label per cell when no
    # guide column is present at all.
    for row in range(len(obs)):
        cell_guides = []
        for values in guide_columns:
            guide_value = values[row]
            if isinstance(guide_value, str) and guide_value != 'nan' and '.' in guide_value:
                gene, _ = parse_guide_name(guide_value)
                if gene and gene not in cell_guides:
                    cell_guides.append(gene)
        if cell_guides:
            perturbation_genes.append(' + '.join(sorted(cell_guides)))
        else:
            perturbation_genes.append('Non-targeting')
    return perturbation_genes


def create_harmonized_dataset(data_path):
    """
    Create a harmonized AnnData object from the GSE274751 dataset.

    The main h5ad file is loaded with anndata; if that fails, the matrix and
    the obs/var tables are read manually with h5py. A new ``.obs`` table is
    then built with fixed values for organism, CRISPR type, cancer type and
    condition (always "Test"), a cell type, a perturbation name derived from
    the guide columns, and -- when present -- donor and QC metric columns.
    The original ``.obs`` is kept under ``uns['original_obs']``, and the
    categories of every categorical column are stored under
    ``uns['<column>_categories']``.

    Parameters:
    -----------
    data_path : str
        Path to the folder containing the dataset files

    Returns:
    --------
    anndata.AnnData
        Harmonized dataset

    Raises:
    -------
    ValueError
        If anndata cannot read the file and the manual fallback does not
        find a sparse ``X`` group (data/indices/indptr) in it.
    """
    # Define file paths
    tfko_file = os.path.join(data_path, "GSE274751_tfko.sng.guides.full.ct.h5ad")

    # Load the dataset using h5py for metadata extraction
    print(f"Loading dataset from {tfko_file}...")

    # Extract metadata (category labels stored under uns/, if any)
    ct_categories = extract_categories(tfko_file, 'ct')
    donor_categories = extract_categories(tfko_file, 'donor')

    # Load the dataset using anndata
    try:
        adata = anndata.read_h5ad(tfko_file)
        print(f"Successfully loaded dataset with shape {adata.shape}")
    except Exception as e:
        # Deliberately broad: any read failure triggers the manual fallback.
        print(f"Error loading dataset with anndata: {e}")
        print("Attempting to create anndata object manually...")

        # Create anndata object manually
        with h5py.File(tfko_file, 'r') as f:
            if 'X' in f and 'data' in f['X'] and 'indices' in f['X'] and 'indptr' in f['X']:
                from scipy import sparse
                data = f['X/data'][:]
                indices = f['X/indices'][:]
                indptr = f['X/indptr'][:]
                shape = f['X'].attrs['h5sparse_shape']
                X = sparse.csr_matrix((data, indices, indptr), shape=shape)

                # Load obs and var data
                obs_data = f['obs'][:]
                var_data = f['var'][:]

                # Convert to pandas DataFrames
                obs_df = pd.DataFrame(obs_data)
                var_df = pd.DataFrame(var_data)

                adata = anndata.AnnData(X=X, obs=obs_df, var=var_df)
                print(f"Successfully created anndata object with shape {adata.shape}")
            else:
                raise ValueError("Could not find required data in the h5ad file") from e

    # Process and harmonize metadata
    print("Harmonizing metadata...")

    # Create new standardized metadata DataFrame
    harmonized_obs = pd.DataFrame(index=adata.obs.index)

    # 1. Organism
    harmonized_obs['organism'] = 'Homo sapiens'

    # 2. Cell Type
    if 'ct' in adata.obs.columns and ct_categories:
        harmonized_obs['cell_type'] = adata.obs['ct'].astype('category').cat.rename_categories(ct_categories)
    else:
        harmonized_obs['cell_type'] = 'CD4+ T cell'  # Default based on dataset description

    # 3. CRISPR Type
    harmonized_obs['crispr_type'] = 'CRISPR KO'

    # 4. Cancer Type
    harmonized_obs['cancer_type'] = 'Non-Cancer'

    # 5. Condition: Force condition to "Test" for all cells
    harmonized_obs['condition'] = 'Test'

    # 6. Perturbation Name: Extract perturbation information from guide columns
    perturbation_genes = _collect_perturbation_names(adata.obs)

    unique_perturbations = set(perturbation_genes)
    print(f"Found {len(unique_perturbations)} unique perturbations")
    # Warn only when no real target was found (nothing at all, or nothing but
    # 'Non-targeting'); a single real gene label is not a reason to warn.
    if unique_perturbations <= {'Non-targeting'}:
        print("Warning: Only found 'Non-targeting' perturbations. Check the guide data.")
    else:
        print(f"Sample of perturbations: {sorted(unique_perturbations)[:10]}")

    harmonized_obs['perturbation_name'] = perturbation_genes

    # Add donor information if available
    if 'donor' in adata.obs.columns and donor_categories:
        harmonized_obs['donor'] = adata.obs['donor'].astype('category').cat.rename_categories(donor_categories)

    # Add additional metrics if available
    for col in ['numi', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hist']:
        if col in adata.obs.columns:
            harmonized_obs[col] = adata.obs[col]

    # Create the final harmonized AnnData object
    harmonized_adata = anndata.AnnData(
        X=adata.X,
        obs=harmonized_obs,
        var=adata.var,
        uns=adata.uns
    )

    # Optionally, store original obs metadata
    harmonized_adata.uns['original_obs'] = adata.obs

    # Convert string columns to categorical for compatibility
    for col in harmonized_obs.select_dtypes(include=['object']).columns:
        if col != 'index':  # Skip index column
            harmonized_obs[col] = harmonized_obs[col].astype('category')

    # Re-assign so the AnnData object carries the categorical columns
    harmonized_adata.obs = harmonized_obs

    # Add categories to uns for categorical fields
    for col in harmonized_obs.select_dtypes(include=['category']).columns:
        harmonized_adata.uns[f'{col}_categories'] = np.array(harmonized_obs[col].cat.categories)

    print(f"Harmonization complete. Final dataset shape: {harmonized_adata.shape}")
    return harmonized_adata

# ---------------------------------------------------------------------------
# Notebook entry point: configure the data folder, then fetch, harmonize, save.
# ---------------------------------------------------------------------------
data_path = "/content/GSE274751"  # <-- point this at your dataset folder

# Step 1: make sure the input files are available locally.
print(f"Checking for dataset files in {data_path}...")
dataset_ready = download_dataset(data_path)
if not dataset_ready:
    raise ValueError("Failed to download or locate the required dataset files.")

# Step 2: build the harmonized AnnData object.
harmonized_adata = create_harmonized_dataset(data_path)

# Step 3: write the result next to the input files.
output_file = os.path.join(data_path, "GSE274751_harmonized.h5ad")
print(f"Saving harmonized dataset to {output_file}...")
harmonized_adata.write_h5ad(output_file)
print("Harmonized dataset saved successfully!")
