In [None]:
#!/usr/bin/env Rscript
# Fixed GSE236519 Harmonization Script for Jupyter Environment
# This script downloads, harmonizes, and converts the dataset to h5ad format

# Load required packages
suppressPackageStartupMessages({
  library(Seurat)
  library(tidyverse)
  library(R.utils)
  library(Matrix)
})

# Dataset accession; every directory and output name below is derived from it
accession <- "GSE236519"

# Jupyter kernels may start in a working directory whose path contains
# "kernel-" or ".json"; in that case store data under /tmp instead.
current_dir <- getwd()
in_jupyter_path <- grepl("kernel-", current_dir) || grepl("\\.json", current_dir)
if (in_jupyter_path) {
  message("Detected Jupyter environment. Using /tmp directory for data storage.")
}
root_dir <- if (in_jupyter_path) "/tmp" else current_dir

# Dataset folder and the path of the harmonized RDS written at the end
dataset_dir <- file.path(root_dir, accession)
output_rds <- file.path(dataset_dir, paste0(accession, "_harmonized.rds"))

# Make sure the dataset folder exists (no warning if it already does)
dir.create(dataset_dir, showWarnings = FALSE, recursive = TRUE)
message("Working with dataset directory: ", dataset_dir)

# Supplementary files to fetch, expressed as GEO base URL + file name
geo_suppl_url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE236nnn/GSE236519/suppl"
file_urls <- paste0(
  geo_suppl_url, "/",
  c(
    "GSE236519_arrayed_CRISPRi_hSyn.rds.gz",
    "GSE236519_arrayed_screen_hSyn.rds.gz",
    "GSE236519_lgdel_model.rds.gz",
    "GSE236519_pooled_screen_CBh.rds.gz"
  )
)

# Function to safely download and extract files
#' Download a gzipped file and decompress it next to the download.
#'
#' The extracted file is reused when it already exists and is non-empty, so
#' repeated runs do not hit the network. Download falls back from
#' `download.file()` to `wget` and then `curl`; extraction falls back from
#' `R.utils::gunzip()` to the system `gunzip`.
#'
#' @param file_url URL of the `.gz` file to fetch.
#' @param dest_dir Existing directory that receives the download and the
#'   extracted file.
#' @return Path of the extracted file, or `NULL` when download or extraction
#'   failed.
download_and_extract <- function(file_url, dest_dir) {
  file_name <- basename(file_url)
  download_path <- file.path(dest_dir, file_name)
  extracted_path <- sub("\\.gz$", "", download_path)

  # A zero-byte file is what an interrupted extraction leaves behind, so it
  # must not count as a usable result.
  has_content <- function(path) {
    file.exists(path) && isTRUE(file.info(path)$size > 0)
  }

  if (has_content(extracted_path)) {
    message("Found existing extracted file: ", extracted_path)
    return(extracted_path)
  }

  # download.file() aborts after getOption("timeout") seconds (60 by default),
  # which is too short for large files; raise it for this call only.
  old_opts <- options(timeout = max(3600, getOption("timeout", 60)))
  on.exit(options(old_opts), add = TRUE)

  message("Downloading ", file_name, "...")

  # First try direct download; a non-zero status is a failure even when no
  # error condition is raised.
  download_success <- tryCatch({
    status <- download.file(file_url, download_path, mode = "wb", quiet = FALSE)
    identical(as.integer(status), 0L)
  }, error = function(e) {
    message("Direct download failed: ", conditionMessage(e))
    FALSE
  })

  # If direct download fails, try system commands as fallback
  if (!download_success) {
    message("Trying system commands for download...")

    if (nzchar(Sys.which("wget"))) {
      message("Using wget...")
      wget_cmd <- paste("wget -q", shQuote(file_url), "-O", shQuote(download_path))
      download_success <- system(wget_cmd) == 0
    }

    if (!download_success && nzchar(Sys.which("curl"))) {
      message("Using curl...")
      # -f makes curl exit non-zero on HTTP errors instead of saving the
      # server's error page as if it were the data file.
      curl_cmd <- paste("curl -fL", shQuote(file_url), "-o", shQuote(download_path))
      download_success <- system(curl_cmd) == 0
    }
  }

  if (!download_success || !has_content(download_path)) {
    message("Download failed for ", file_name)
    unlink(download_path)  # do not leave a partial archive behind
    return(NULL)
  }

  message("Download complete.")
  message("Extracting ", file_name, "...")

  # overwrite = TRUE replaces an empty or partial file from an earlier attempt
  extract_success <- tryCatch({
    R.utils::gunzip(download_path, extracted_path, remove = FALSE, overwrite = TRUE)
    TRUE
  }, error = function(e) {
    message("R.utils::gunzip failed: ", conditionMessage(e))
    FALSE
  })

  if (!extract_success) {
    message("Trying system gunzip...")
    gunzip_cmd <- paste("gunzip -c", shQuote(download_path), ">", shQuote(extracted_path))
    extract_success <- system(gunzip_cmd) == 0
  }

  if (!extract_success || !has_content(extracted_path)) {
    message("Extraction failed for ", file_name)
    unlink(extracted_path)  # a partial file would be mistaken for a cached result
    return(NULL)
  }

  message("Extraction complete.")
  extracted_path
}

# Download and extract all files. lapply() always returns a list, so failed
# downloads (NULL results) can be dropped element by element; the previous
# `!is.null(all_files)` was a single TRUE/FALSE and filtered nothing.
all_files <- lapply(file_urls, download_and_extract, dest_dir = dataset_dir)
all_files <- as.character(unlist(Filter(Negate(is.null), all_files), use.names = FALSE))

# Function to harmonize metadata for a Seurat object
#' Build the harmonized per-cell metadata table for one Seurat object.
#'
#' Adds the standard fields (organism, cell_type, crispr_type, cancer_type,
#' condition, perturbation_name), copies the quality metrics that are present,
#' and keeps every remaining original column under an `orig_` prefix.
#'
#' @param seu_obj Object with a `meta.data` slot holding a data frame whose
#'   row names are cell barcodes.
#' @param dataset_name Dataset label; a name containing "CRISPRi" selects the
#'   CRISPRi perturbation type, anything else is labelled "CRISPR KO".
#' @return A data frame with one row per cell, row-named by barcode.
harmonize_metadata <- function(seu_obj, dataset_name) {
  message("Harmonizing metadata for: ", dataset_name)

  meta_data <- seu_obj@meta.data
  meta_cols <- colnames(meta_data)

  # Initialize harmonized metadata with cell barcodes
  harmonized_meta <- data.frame(
    cell_barcode = rownames(meta_data),
    row.names = rownames(meta_data)
  )

  harmonized_meta$organism <- "Mus musculus"

  # Prefer the dataset's own annotation, falling back to a fixed label
  if ("cell_types" %in% meta_cols) {
    harmonized_meta$cell_type <- meta_data$cell_types
  } else if ("cell_types_broad" %in% meta_cols) {
    harmonized_meta$cell_type <- meta_data$cell_types_broad
  } else {
    harmonized_meta$cell_type <- "Prefrontal cortex neurons"
  }

  harmonized_meta$crispr_type <-
    if (grepl("CRISPRi", dataset_name)) "CRISPRi" else "CRISPR KO"

  harmonized_meta$cancer_type <- "Non-Cancer"

  # Short control tags ("nt", "neg") must match only as standalone tokens;
  # as plain substrings they also hit gene names such as Cntnap2 or Negr1.
  as_token <- function(word) {
    paste0("(^|[^[:alpha:]])", word, "($|[^[:alpha:]])")
  }
  grna_control_pattern <- paste(
    c("control", "ctrl", "non-target", "scramble",
      as_token("nt"), as_token("neg(ative)?")),
    collapse = "|"
  )

  # Stays NULL when no perturbation column exists, so the orig_ loop below can
  # reference it safely in every branch.
  perturbation_col <- NULL

  if ("gRNAs" %in% meta_cols) {
    # Pooled screen: guide names identify the perturbation
    perturbation_col <- "gRNAs"
    # as.character(): assigning a new label into a factor would produce NA
    perturbations <- as.character(meta_data[[perturbation_col]])
    is_control <- grepl(grna_control_pattern, perturbations, ignore.case = TRUE)

    harmonized_meta$condition <- "test"
    harmonized_meta$condition[is_control] <- "control"
    harmonized_meta$perturbation_name <- perturbations
    harmonized_meta$perturbation_name[is_control] <- "non-targeting"

  } else if ("per_gene" %in% meta_cols) {
    # Arrayed screens: per_gene carries the perturbation
    perturbation_col <- "per_gene"
    perturbations <- as.character(meta_data[[perturbation_col]])
    is_control <- grepl("Safe|Control|Ctrl", perturbations, ignore.case = TRUE)

    harmonized_meta$condition <- "test"
    harmonized_meta$condition[is_control] <- "control"
    harmonized_meta$perturbation_name <- perturbations
    harmonized_meta$perturbation_name[is_control] <- "non-targeting"

  } else if ("condition" %in% meta_cols) {
    # lgdel model: WT (control) vs. deletion model
    perturbation_col <- "condition"
    perturbations <- as.character(meta_data[[perturbation_col]])
    is_control <- grepl("wt|WT|control", perturbations, ignore.case = TRUE)

    harmonized_meta$condition <- "test"
    harmonized_meta$condition[is_control] <- "control"
    harmonized_meta$perturbation_name <-
      ifelse(is_control, "non-targeting", "22q11.2-deletion")

  } else {
    # No perturbation information found
    harmonized_meta$condition <- "unknown"
    harmonized_meta$perturbation_name <- "unknown"
  }

  # Preserve original quality metrics under their own names
  quality_cols <- c("nCount_RNA", "nFeature_RNA", "percent.mt", "orig.ident")
  for (col in intersect(quality_cols, meta_cols)) {
    harmonized_meta[[col]] <- meta_data[[col]]
  }

  # Preserve all other metadata with orig_ prefix
  already_handled <- c(quality_cols, perturbation_col)
  for (col in meta_cols) {
    if (!col %in% colnames(harmonized_meta) && !col %in% already_handled) {
      harmonized_meta[[paste0("orig_", col)]] <- meta_data[[col]]
    }
  }

  harmonized_meta
}

# Function to merge all datasets into a single Seurat object
#' Load each RDS file, harmonize its metadata and merge the results.
#'
#' Datasets that fail to load or harmonize are reported and skipped; only the
#' successfully processed ones are merged, and the cell-ID prefixes passed to
#' `merge()` are taken from exactly those datasets.
#'
#' @param file_paths Character vector (or list) of paths to `.rds` files that
#'   each contain a Seurat object. `NULL`/`NA` entries are ignored.
#' @return The merged Seurat object, the single object when only one dataset
#'   succeeded, or `NULL` when nothing could be processed.
merge_and_harmonize <- function(file_paths) {
  if (length(file_paths) == 0) {
    message("No files found for merging.")
    return(NULL)
  }

  # Named by dataset so objects and their cell-ID prefixes cannot drift apart
  seurat_list <- list()

  for (i in seq_along(file_paths)) {
    # [[ works for both character vectors and lists of paths
    file_path <- file_paths[[i]]
    if (is.null(file_path) || is.na(file_path)) {
      next
    }
    dataset_name <- sub("\\.rds$", "", basename(file_path))

    message("Processing dataset: ", dataset_name)

    harmonized_obj <- tryCatch({
      seu_obj <- readRDS(file_path)

      harmonized_meta <- harmonize_metadata(seu_obj, dataset_name)
      harmonized_meta$dataset <- dataset_name

      # New object with raw counts and harmonized metadata only
      CreateSeuratObject(
        counts = seu_obj@assays$RNA@counts,
        meta.data = harmonized_meta,
        project = accession
      )
    }, error = function(e) {
      message("Error processing ", dataset_name, ": ", conditionMessage(e))
      NULL
    })

    if (is.null(harmonized_obj)) {
      next
    }

    # Appending by name avoids the NULL gaps that positional assignment
    # leaves when an earlier dataset failed.
    seurat_list[[dataset_name]] <- harmonized_obj

    message("Dataset: ", dataset_name)
    message("Cells: ", ncol(harmonized_obj))
    message("Genes: ", nrow(harmonized_obj))
    message("")
  }

  if (length(seurat_list) == 0) {
    message("No datasets were successfully processed.")
    return(NULL)
  }

  message("Merging all datasets...")
  if (length(seurat_list) > 1) {
    merged_obj <- merge(
      seurat_list[[1]],
      y = unname(seurat_list[-1]),
      add.cell.ids = names(seurat_list)
    )
  } else {
    merged_obj <- seurat_list[[1]]
  }

  message("Merged object created with ", ncol(merged_obj), " cells and ", nrow(merged_obj), " genes.")

  merged_obj
}

# Function to write conversion script
#' Write a standalone Python script that converts the harmonized RDS to h5ad.
#'
#' The generated script loads the Seurat object through rpy2, exports the RNA
#' counts as a Matrix Market file plus gene/cell name lists, rebuilds them as
#' an AnnData object together with the Seurat metadata, and saves it next to
#' the input with an `.h5ad` extension.
#'
#' @param dataset_dir Directory in which `convert_to_h5ad.py` is written.
#' @param input_rds Path of the RDS file the script should convert.
#' @return Path of the generated Python script.
write_conversion_script <- function(dataset_dir, input_rds) {
  python_script <- file.path(dataset_dir, "convert_to_h5ad.py")
  output_h5ad <- sub("\\.rds$", ".h5ad", input_rds)

  script_lines <- c(
    "#!/usr/bin/env python3",
    "# Convert Seurat RDS to h5ad format for GSE236519",
    "",
    "import os",
    "import sys",
    "import numpy as np",
    "import pandas as pd",
    "import scipy.sparse",
    "import scipy.io",
    "import subprocess",
    "",
    "# First install required packages if needed",
    "def install_package(package):",
    "    try:",
    "        __import__(package)",
    "    except ImportError:",
    "        print(f'Installing {package}...')",
    "        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])",
    "",
    "install_package('rpy2')",
    "install_package('anndata')",
    "",
    # anndata is imported only after the install step above; importing it at
    # the top of the script would fail before it could be installed.
    "# Import packages that may have just been installed",
    "import anndata",
    "from rpy2.robjects import pandas2ri, r, numpy2ri",
    "from rpy2.robjects.conversion import localconverter",
    "import rpy2.robjects as ro",
    "",
    "# Initialize R converters",
    "pandas2ri.activate()",
    "numpy2ri.activate()",
    "",
    "# File paths",
    paste0("input_rds = '", input_rds, "'"),
    paste0("output_h5ad = '", output_h5ad, "'"),
    "dataset_dir = os.path.dirname(input_rds)",
    "",
    "print(f'Converting {input_rds} to {output_h5ad}')",
    "",
    "# Load required R packages",
    "print('Loading R packages...')",
    "r(\"library(Seurat)\")",
    "r(\"library(Matrix)\")",
    "",
    "# Load the Seurat object",
    "print('Loading Seurat object...')",
    # Interpolate the Python variable instead of pasting the path into the
    # f-string literal, where a '{' or '}' in the path would break the script.
    "r(f\"seu <- readRDS('{input_rds}')\")",
    "",
    "# Use intermediate file approach to avoid conversion issues",
    "print('Converting via temporary sparse matrix file...')",
    "temp_mtx = os.path.join(dataset_dir, 'temp_counts.mtx')",
    "r(f\"\"\"",
    "# Get counts matrix",
    "counts <- GetAssayData(seu, slot='counts', assay='RNA')",
    "# Write sparse matrix to file",
    "writeMM(counts, '{temp_mtx}')",
    "# Save gene and cell names",
    "write.table(rownames(counts), '{dataset_dir}/temp_genes.txt', row.names=FALSE, col.names=FALSE, quote=FALSE)",
    "write.table(colnames(counts), '{dataset_dir}/temp_cells.txt', row.names=FALSE, col.names=FALSE, quote=FALSE)",
    "\"\"\")",
    "",
    "# Read the sparse matrix",
    "print('Reading sparse matrix from file...')",
    "counts_sparse = scipy.io.mmread(temp_mtx).tocsr()",
    "",
    "# Read gene and cell names",
    "genes = pd.read_csv(f\"{dataset_dir}/temp_genes.txt\", header=None)[0].values",
    "cells = pd.read_csv(f\"{dataset_dir}/temp_cells.txt\", header=None)[0].values",
    "",
    "# Get metadata",
    "print('Extracting metadata...')",
    "with localconverter(ro.default_converter + pandas2ri.converter):",
    "    meta_df = r('seu@meta.data')",
    "    meta_df = pd.DataFrame(meta_df)",
    "",
    "# Create AnnData object - note we transpose the matrix to get cells x genes",
    "print('Creating AnnData object...')",
    "adata = anndata.AnnData(",
    "    X=counts_sparse.T,  # Transpose to cells x genes",
    "    obs=meta_df,",
    "    var=pd.DataFrame(index=genes)",
    ")",
    "",
    "# Ensure required fields are present",
    "required_fields = ['organism', 'cell_type', 'crispr_type', 'cancer_type', 'condition', 'perturbation_name']",
    "for field in required_fields:",
    "    if field not in adata.obs_keys():",
    "        print(f'WARNING: Required field {field} missing from metadata')",
    "",
    "# Save as h5ad",
    "print('Saving h5ad file...')",
    "adata.write(output_h5ad)",
    "",
    "# Clean up temporary files",
    "print('Cleaning up temporary files...')",
    "os.remove(temp_mtx)",
    "os.remove(f\"{dataset_dir}/temp_genes.txt\")",
    "os.remove(f\"{dataset_dir}/temp_cells.txt\")",
    "",
    "print(f'Conversion complete! File saved to {output_h5ad}')"
  )

  # One element per line; writeLines() terminates each with a newline
  writeLines(script_lines, python_script)

  # Make the script executable
  Sys.chmod(python_script, mode = "0755")

  message("Python conversion script created: ", python_script)
  python_script
}

# Entry point: merge the downloaded datasets, persist the result, and emit the
# Python converter together with a short run summary.
message("=== Starting GSE236519 harmonization process ===")

harmonized_seurat <- merge_and_harmonize(all_files)

if (is.null(harmonized_seurat)) {
  message("Failed to create harmonized Seurat object.")
} else {
  # Persist the merged object
  message("Saving harmonized Seurat object to RDS...")
  saveRDS(harmonized_seurat, output_rds)
  message("Saved to: ", output_rds)

  # Generate the RDS -> h5ad converter next to the data
  python_script <- write_conversion_script(dataset_dir, output_rds)

  # Tell the user how to run the converter, then summarize the dataset
  report_lines <- c(
    "\n=== Conversion Instructions ===",
    "To convert the RDS file to h5ad format, run the following command:",
    paste0("python ", python_script, "\n"),
    "\n=== Dataset Summary ===",
    paste0("Accession: ", accession),
    paste0("Number of cells: ", ncol(harmonized_seurat)),
    paste0("Number of genes: ", nrow(harmonized_seurat))
  )
  for (report_line in report_lines) {
    message(report_line)
  }
}

message("\nHarmonization process complete!")


In [4]:
#!/usr/bin/env python3
# Data Type Fixed Harmonized Converter for GSE236519
# This script handles mixed data types in the metadata

import os
import sys
import numpy as np
import pandas as pd
import scipy.sparse
import scipy.io
import subprocess
import glob
from typing import Dict, List, Tuple, Optional

# First install required packages if needed
def install_package(package):
    """Import `package`; if that raises ImportError, pip-install it.

    The install runs `pip` through the current interpreter (`sys.executable`)
    and raises `subprocess.CalledProcessError` when pip exits non-zero.
    """
    try:
        __import__(package)
        return
    except ImportError:
        pass

    print(f'Installing {package}...')
    pip_command = [sys.executable, '-m', 'pip', 'install', package]
    subprocess.check_call(pip_command)

# Make sure the third-party packages imported below are available; each call
# pip-installs the package only when importing it fails.
install_package('rpy2')
install_package('anndata')
install_package('scanpy')

# Import packages after installation (these imports must stay below the
# install_package calls above)
from rpy2.robjects import pandas2ri, r, numpy2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
import anndata
import scanpy as sc

# Initialize R converters (pandas and numpy conversion for rpy2)
pandas2ri.activate()
numpy2ri.activate()

# File paths: the harmonized RDS is the input read later in this cell, the
# h5ad is the combined output written at the end.
# NOTE(review): data_dir is hard-coded; confirm it matches where the RDS was
# actually written.
data_dir = '/content/GSE236519'
harmonized_rds = os.path.join(data_dir, 'GSE236519_harmonized.rds')
output_h5ad = os.path.join(data_dir, 'GSE236519_harmonized.h5ad')

# Load required R packages inside the embedded R session
print('Loading R packages...')
r("suppressPackageStartupMessages(library(Seurat))")
r("suppressPackageStartupMessages(library(Matrix))")
r("suppressPackageStartupMessages(library(dplyr))")

# Check Seurat version: stored as an R variable, then fetched as a character
# vector whose first element is printed
r("seurat_version <- packageVersion('Seurat')")
seurat_version = r("as.character(seurat_version)")
print(f"Detected Seurat version: {seurat_version[0]}")

# Function to clean and standardize metadata
def clean_metadata(meta_df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize metadata to ensure all data types are compatible with h5ad format.

    The harmonized string fields are cast to ``str``, missing values in
    numeric columns become 0, and every other column is converted to strings
    with missing values written as ``'NA'``. A column is dropped only when it
    genuinely cannot be converted.

    Args:
        meta_df: Input metadata DataFrame (left unmodified)

    Returns:
        Cleaned metadata DataFrame
    """
    print("Cleaning and standardizing metadata...")

    # Make a copy to avoid modifying the original
    meta = meta_df.copy()

    # Harmonized fields are always plain strings
    str_columns = ['cell_barcode', 'organism', 'cell_type', 'crispr_type',
                   'cancer_type', 'condition', 'perturbation_name', 'dataset']

    for col in str_columns:
        if col in meta.columns:
            meta[col] = meta[col].astype(str)

    # Iterate over a snapshot of the column names because columns may be
    # dropped inside the loop.
    for col in list(meta.columns):
        if pd.api.types.is_numeric_dtype(meta[col]):
            # Replace NaN in numeric columns with 0
            meta[col] = meta[col].fillna(0)
            continue

        try:
            # Cast to object first: fillna('NA') on a categorical column raises
            # because 'NA' is not one of its categories, which previously caused
            # such columns to be dropped.
            meta[col] = meta[col].astype(object).fillna('NA').astype(str)
        except Exception:
            print(f"Warning: Could not convert column {col} to string. Dropping column.")
            meta = meta.drop(columns=[col])

    # Check for any remaining NaN values
    if meta.isna().any().any():
        print("Warning: NaN values remain in metadata. Converting to string 'NA'")
        meta = meta.fillna('NA')

    return meta

# First extract the harmonized metadata directly using R.
# The embedded R code below runs in the shared R session and leaves these
# objects behind for later R calls in this cell:
#   harm_obj        - the harmonized Seurat object read from harmonized_rds
#   harm_meta       - its meta.data with the harmonized fields as character
#   dataset_indices - row indices of harm_meta grouped by the dataset column
# Doubled braces ({{ }}) are literal braces for R; single braces are
# Python f-string substitutions.
print("Extracting harmonized metadata directly from R...")
r(f"""
# Load the harmonized Seurat object
harm_obj <- readRDS('{harmonized_rds}')

# Function to safely convert to character
safe_char <- function(x) {{
  tryCatch({{
    as.character(x)
  }}, error = function(e) {{
    rep("NA", length(x))
  }})
}}

# Extract the harmonized metadata and convert problematic columns to character
harm_meta <- harm_obj@meta.data
harm_meta$cell_barcode <- row.names(harm_meta)
harm_meta$organism <- safe_char(harm_meta$organism)
harm_meta$cell_type <- safe_char(harm_meta$cell_type)
harm_meta$crispr_type <- safe_char(harm_meta$crispr_type)
harm_meta$cancer_type <- safe_char(harm_meta$cancer_type)
harm_meta$condition <- safe_char(harm_meta$condition)
harm_meta$perturbation_name <- safe_char(harm_meta$perturbation_name)
harm_meta$dataset <- safe_char(harm_meta$dataset)

# Get indices per dataset
dataset_indices <- list()
for (ds in unique(harm_meta$dataset)) {{
  dataset_indices[[ds]] <- which(harm_meta$dataset == ds)
}}

# Store info for reference
datasets <- names(dataset_indices)
dataset_sizes <- sapply(dataset_indices, length)
print(paste("Found datasets:", paste(datasets, collapse=", ")))
print(paste("Dataset sizes:", paste(dataset_sizes, collapse=", ")))
""")

# Function to process an individual RDS file with harmonized metadata
def process_rds_file(rds_path: str, dataset_name: str) -> Tuple[str, Optional[anndata.AnnData]]:
    """
    Process a single RDS file and create an h5ad file with harmonized metadata.

    The counts matrix is exported from R through temporary files in
    ``data_dir``, paired with the rows of the R-side ``harm_meta`` table that
    belong to ``dataset_name``, and written to
    ``<data_dir>/<dataset_name>_harmonized.h5ad``. Metadata rows are put into
    the same order as the matrix columns before the AnnData object is built.
    Temporary files are removed whether or not processing succeeds.

    Args:
        rds_path: Path to the RDS file
        dataset_name: Name of the dataset

    Returns:
        Tuple of (output file path, AnnData object if successful, else None)
    """
    print(f"Processing file: {rds_path} as {dataset_name}")
    output_file = os.path.join(data_dir, f"{dataset_name}_harmonized.h5ad")

    # Intermediate files used to hand data from R to Python
    temp_mtx = os.path.join(data_dir, f'temp_{dataset_name}.mtx')
    temp_genes = os.path.join(data_dir, f'temp_{dataset_name}_genes.txt')
    temp_cells = os.path.join(data_dir, f'temp_{dataset_name}_cells.txt')
    temp_meta = os.path.join(data_dir, f'temp_{dataset_name}_meta.csv')

    try:
        # Load the Seurat object
        print(f'Loading {dataset_name}...')
        r(f"seu <- readRDS('{rds_path}')")

        # Use intermediate file approach to avoid conversion issues
        print('Converting to sparse matrix...')

        # Extract data based on Seurat version
        r(f"""
        # Different approach for v5 vs older versions
        seurat_v5 <- as.numeric(substr(packageVersion('Seurat'), 1, 1)) >= 5

        if (seurat_v5) {{
          # Seurat v5 approach
          print("Using Seurat v5 approach with layers...")

          # Get the first assay name
          assay_name <- DefaultAssay(seu)

          # Get available layers
          available_layers <- Layers(seu[[assay_name]])
          print(paste("Available layers:", paste(available_layers, collapse=", ")))

          # If 'counts' exists as a layer, use it, otherwise use the first layer
          layer_to_use <- if ("counts" %in% available_layers) "counts" else available_layers[1]
          print(paste("Using layer:", layer_to_use))

          # Access the layer
          counts <- seu[[assay_name]][layer_to_use]
        }} else {{
          # Seurat v4 and older approach
          print("Using traditional GetAssayData approach...")
          counts <- GetAssayData(seu, slot='counts', assay='RNA')
        }}

        # Write sparse matrix to file
        print("Writing matrix to file...")
        writeMM(counts, '{temp_mtx}')

        # Save gene and cell names
        write.table(rownames(counts), '{temp_genes}', row.names=FALSE, col.names=FALSE, quote=FALSE)
        write.table(colnames(counts), '{temp_cells}', row.names=FALSE, col.names=FALSE, quote=FALSE)
        """)

        # Read the sparse matrix (genes x cells as written by R)
        print('Reading sparse matrix from file...')
        counts_sparse = scipy.io.mmread(temp_mtx).tocsr()

        # Read names line by line rather than through a CSV parser, which would
        # turn names such as "NA" into missing values.
        with open(temp_genes) as fh:
            genes = [line.rstrip('\n') for line in fh]
        with open(temp_cells) as fh:
            cells = [line.rstrip('\n') for line in fh]

        print(f'Extracting harmonized metadata for {dataset_name}...')
        # Get dataset-specific metadata directly from R
        with localconverter(ro.default_converter + pandas2ri.converter):
            r(f"""
            # Create a metadata subset for just this dataset
            dataset_cells <- which(harm_meta$dataset == '{dataset_name}')
            dataset_meta <- harm_meta[dataset_cells, ]

            # Check for cell name alignment
            original_cell_names <- readLines('{temp_cells}')
            meta_cells <- rownames(dataset_meta)

            # If cell names don't match exactly, attempt to realign
            if (!all(original_cell_names %in% meta_cells)) {{
                # If we have same number of cells, assume same order
                if (length(original_cell_names) == nrow(dataset_meta)) {{
                    rownames(dataset_meta) <- original_cell_names
                    print("Used positional matching for metadata")
                }} else {{
                    print("WARNING: Cell count mismatch, using basic metadata")
                    dataset_meta <- data.frame(
                        dataset = rep('{dataset_name}', length(original_cell_names)),
                        cell_barcode = original_cell_names,
                        row.names = original_cell_names
                    )
                    # Add required harmonized fields
                    dataset_meta$organism <- "Mus musculus"
                    dataset_meta$cell_type <- "Prefrontal cortex neurons"
                    dataset_meta$crispr_type <- if (grepl("CRISPRi", '{dataset_name}')) "CRISPRi" else "CRISPR KO"
                    dataset_meta$cancer_type <- "Non-Cancer"
                    dataset_meta$condition <- "unknown"
                    dataset_meta$perturbation_name <- "unknown"
                }}
            }}

            # Write to CSV for Python to read
            write.csv(dataset_meta, '{temp_meta}', row.names=TRUE)
            """)

        # Read the metadata from CSV - this avoids serialization issues from R to Python
        dataset_metadata = pd.read_csv(temp_meta, index_col=0, low_memory=False)
        # Barcodes are compared as strings below
        dataset_metadata.index = dataset_metadata.index.astype(str)

        # Clean and standardize the metadata
        dataset_metadata = clean_metadata(dataset_metadata)

        # Create AnnData object
        print('Creating AnnData object...')
        try:
            # Every R branch above row-names the metadata with the matrix's cell
            # names, but not necessarily in matrix column order; select rows by
            # cell name so each metadata row describes the right column.
            if list(dataset_metadata.index) != cells:
                dataset_metadata = dataset_metadata.loc[cells]

            adata = anndata.AnnData(
                X=counts_sparse.T,  # Transpose to cells x genes
                obs=dataset_metadata,
                var=pd.DataFrame(index=genes)
            )

            # Ensure required fields are present
            required_fields = ['organism', 'cell_type', 'crispr_type', 'cancer_type', 'condition', 'perturbation_name']
            for field in required_fields:
                if field not in adata.obs.columns:
                    print(f'WARNING: Required field {field} missing from metadata')

            # Save as h5ad
            print(f'Saving {dataset_name} to h5ad with harmonized metadata...')
            adata.write(output_file)

            print(f'Completed processing {dataset_name}')
            return output_file, adata

        except Exception as e:
            print(f"Error creating AnnData object: {e}")
            return output_file, None

    finally:
        # Clean up temporary files on success and on failure alike
        for file in [temp_mtx, temp_genes, temp_cells, temp_meta]:
            if not os.path.exists(file):
                continue
            try:
                os.remove(file)
            except OSError:
                print(f"Warning: Could not remove temporary file {file}")

# Process original RDS files with harmonized metadata
original_rds_files = [
    (os.path.join(data_dir, 'GSE236519_arrayed_CRISPRi_hSyn.rds'), 'GSE236519_arrayed_CRISPRi_hSyn'),
    (os.path.join(data_dir, 'GSE236519_arrayed_screen_hSyn.rds'), 'GSE236519_arrayed_screen_hSyn'),
    (os.path.join(data_dir, 'GSE236519_lgdel_model.rds'), 'GSE236519_lgdel_model'),
    (os.path.join(data_dir, 'GSE236519_pooled_screen_CBh.rds'), 'GSE236519_pooled_screen_CBh')
]

# Process each file individually. The three lists stay index-aligned: entry i
# of each describes the same successfully converted dataset.
h5ad_files = []
adatas = []
dataset_labels = []

for rds_file, dataset_name in original_rds_files:
    if os.path.exists(rds_file):
        h5ad_file, adata = process_rds_file(rds_file, dataset_name)
        if adata is not None:
            h5ad_files.append(h5ad_file)
            adatas.append(adata)
            dataset_labels.append(dataset_name)
    else:
        print(f"Warning: File {rds_file} not found")

# Now combine the datasets
if len(adatas) > 0:
    print(f"\nCombining {len(adatas)} datasets...")

    try:
        # scanpy re-exports anndata's concat
        print("Using scanpy to combine datasets...")
        combined = sc.concat(
            adatas,
            join='outer',  # Use outer join to keep all genes
            label='dataset_batch',  # New obs column identifying the source dataset
            keys=dataset_labels  # Without keys the labels would be "0", "1", ...
        )

        # Ensure all required fields are present in final object
        required_fields = ['organism', 'cell_type', 'crispr_type', 'cancer_type', 'condition', 'perturbation_name']
        for field in required_fields:
            if field not in combined.obs.columns:
                print(f'WARNING: Required field {field} missing from combined metadata')
                # Add a default value if missing
                combined.obs[field] = "unknown"

        # Save as h5ad
        print(f'Saving final harmonized h5ad to {output_h5ad}...')
        combined.write(output_h5ad)

        print(f"\nConversion complete! Final harmonized file saved to {output_h5ad}")
        print(f"Individual harmonized dataset files: {', '.join(h5ad_files)}")

        # Print metadata stats for verification
        print("\nHarmonized metadata field summary:")
        for field in required_fields:
            nunique = combined.obs[field].nunique()
            print(f"  {field}: {nunique} unique values")

            # Print top values for categorical fields
            if nunique < 20:  # Only for fields with few unique values
                value_counts = combined.obs[field].value_counts().head(5)
                print(f"    Top values: {', '.join([f'{k} ({v})' for k, v in value_counts.items()])}")

    except Exception as e:
        print(f"Error combining datasets: {e}")
        print("Trying alternative approach...")

        # Alternative approach: re-read the per-dataset h5ad files in a fresh
        # Python process and concatenate them there.
        print("Creating combined h5ad by concatenating files manually...")

        concatenation_script = os.path.join(data_dir, "concatenate_h5ad.py")

        # The helper labels batches in 'dataset_batch' (same column as the main
        # path) so the harmonized 'dataset' column is not overwritten.
        with open(concatenation_script, "w") as f:
            f.write("""
import scanpy as sc
import sys
import os

# Get the list of files to concatenate
files = sys.argv[1:-1]
output_file = sys.argv[-1]

print(f"Concatenating {len(files)} h5ad files to {output_file}")

# Load the files
adatas = []
for file in files:
    print(f"Loading {file}...")
    try:
        adata = sc.read_h5ad(file)
        adatas.append(adata)
    except Exception as e:
        print(f"Error loading {file}: {e}")

if len(adatas) > 0:
    # Concatenate the files
    print("Concatenating datasets...")
    combined = sc.concat(
        adatas, 
        join='outer',
        label='dataset_batch'
    )
    
    # Save the combined file
    print(f"Saving to {output_file}...")
    combined.write(output_file)
    print("Done!")
else:
    print("No valid files to concatenate.")
""")

        # Make the script executable
        os.chmod(concatenation_script, 0o755)

        # Run with the interpreter executing this notebook so the helper sees
        # the same installed packages; a bare "python" may resolve elsewhere.
        cmd = [sys.executable, concatenation_script] + h5ad_files + [output_h5ad]
        print("Running command:", " ".join(cmd))
        return_code = subprocess.call(cmd)
        if return_code != 0:
            print(f"Warning: concatenation script exited with status {return_code}")
else:
    print("No datasets were successfully processed.")

# Clean up temporary files (best effort: report, but never fail, on leftovers)
for temp_file in glob.glob(f"{data_dir}/temp_*"):
    try:
        os.remove(temp_file)
    except OSError as e:
        print(f"Warning: Could not remove temporary file {temp_file}: {e}")

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

# Read the combined h5ad written by the conversion step
adata = sc.read("/content/GSE236519/GSE236519_harmonized.h5ad")


def _has_obs_column(column):
    """Return True when `column` is currently a column of adata.obs."""
    return column in adata.obs.columns


# '_index' is treated as a reserved column name: keep its values under
# 'orig_index' instead
if _has_obs_column('_index'):
    adata.obs['orig_index'] = adata.obs.pop('_index')

# Fill in each required harmonization field only when it is absent
if not _has_obs_column('organism'):
    adata.obs['organism'] = "Mus musculus"

if not _has_obs_column('cell_type'):
    # First available annotation column wins; otherwise use the fixed label
    cell_type_source = next(
        (c for c in ('cell_types', 'cell_types_broad') if _has_obs_column(c)),
        None,
    )
    if cell_type_source is None:
        adata.obs['cell_type'] = "Prefrontal cortex neurons"
    else:
        adata.obs['cell_type'] = adata.obs[cell_type_source]

if not _has_obs_column('crispr_type'):
    # Default to knockout, then mark cells whose dataset label mentions CRISPRi
    adata.obs['crispr_type'] = "CRISPR KO"
    label_source = next(
        (c for c in ('dataset', 'dataset_name') if _has_obs_column(c)),
        None,
    )
    if label_source is not None:
        is_crispri = adata.obs[label_source].astype(str).str.contains('CRISPRi')
        adata.obs.loc[is_crispri, 'crispr_type'] = "CRISPRi"

if not _has_obs_column('cancer_type'):
    adata.obs['cancer_type'] = "Non-Cancer"

# perturbation_name: always plain strings (avoids categorical assignment
# issues); derived from gRNAs, then per_gene, when the column is missing
if _has_obs_column('perturbation_name'):
    adata.obs['perturbation_name'] = adata.obs['perturbation_name'].astype(str)
else:
    perturbation_source = next(
        (c for c in ('gRNAs', 'per_gene') if _has_obs_column(c)),
        None,
    )
    if perturbation_source is None:
        adata.obs['perturbation_name'] = "unknown"
    else:
        adata.obs['perturbation_name'] = adata.obs[perturbation_source].astype(str)

# Clean up perturbation names
# 1. The string "nan" becomes a real missing value so it can be filtered later
is_nan_text = adata.obs['perturbation_name'] == 'nan'
adata.obs.loc[is_nan_text, 'perturbation_name'] = np.nan

# 2. Any capitalization of "non-targeting" is normalized to "Non-targeting"
is_non_targeting = adata.obs['perturbation_name'].str.lower() == 'non-targeting'
adata.obs.loc[is_non_targeting, 'perturbation_name'] = "Non-targeting"

# 3. Remove numeric suffixes from gene names (e.g., Txnrd2_2 → Txnrd2)
# First, create a helper function to clean gene names
def clean_gene_name(name):
    """Strip one trailing ``_<digits>`` suffix from a perturbation name.

    Missing values are returned unchanged; any other input is converted to a
    string. Only the final underscore-separated part is inspected, so
    ``"Txnrd2_2"`` becomes ``"Txnrd2"`` while ``"Safe_H"`` is kept as is.
    """
    if pd.isna(name):
        return name
    text = str(name)
    # rpartition splits at the LAST underscore; `separator` is empty when the
    # name contains no underscore at all.
    stem, separator, suffix = text.rpartition('_')
    if separator and suffix.isdigit():
        return stem
    return text

# Apply the cleaning function to all perturbation names
adata.obs['perturbation_name'] = adata.obs['perturbation_name'].apply(clean_gene_name)

# Set controls consistently.
# Longer tags are matched anywhere in the name, case-insensitively. The short
# tag "NT" must stand alone (no letter directly before or after it): matched as
# a plain substring it would also relabel every gene whose name merely
# contains "nt" (e.g. Cntnap2) as a control.
control_regex = r'safe|control|ctrl|non-target|(?<![A-Za-z])NT(?![A-Za-z])'
is_control_name = adata.obs['perturbation_name'].str.contains(
    control_regex, case=False, na=False, regex=True
)
adata.obs.loc[is_control_name, 'perturbation_name'] = "Non-targeting"

# Ensure condition is present and consistent with perturbation names
adata.obs['condition'] = "test"
mask = adata.obs['perturbation_name'] == "Non-targeting"
adata.obs.loc[mask, 'condition'] = "control"

# Handle wt_ prefixes separately
wt_mask = adata.obs['perturbation_name'].str.startswith('wt_', na=False)
if wt_mask.any():
    adata.obs.loc[wt_mask, 'condition'] = "control"

# Remove cells with NaN perturbation names
print(f"Cell count before filtering: {adata.shape[0]}")
adata = adata[~adata.obs['perturbation_name'].isna()].copy()
print(f"Cell count after filtering: {adata.shape[0]}")

# Save the updated file
adata.write("/content/GSE236519/GSE236519_harmonized_fixed.h5ad")

# Display summary
print("\n======== Harmonized Metadata Fields ========")
required_fields = ['organism', 'cell_type', 'crispr_type', 'cancer_type', 'condition', 'perturbation_name']
for field in required_fields:
    n_unique = adata.obs[field].nunique()
    print(f"{field}: {n_unique} unique values")
    # Show the most common values
    if n_unique < 20:
        top_values = adata.obs[field].value_counts().head(5)
        print(f"  Top values: {', '.join([f'{k} ({v})' for k, v in top_values.items()])}")
    else:
        print(f"  (Many values)")

# Show updated perturbation name counts
print("\n======== Top Perturbation Targets ========")
perturbation_counts = adata.obs["perturbation_name"].value_counts()
print(perturbation_counts[:20])

print("\nFixed harmonized file saved to: /content/GSE236519/GSE236519_harmonized_fixed.h5ad")