In [None]:
# Install R.utils if not already installed.
# An explicit CRAN mirror is passed (the same one used for the other package
# installs in this notebook) because install.packages() fails in
# non-interactive sessions when no mirror has been configured.
if (!requireNamespace("R.utils", quietly = TRUE)) {
  install.packages("R.utils", repos = "https://cran.rstudio.com/")
}
library(R.utils)

#' Download a gzip-compressed file (with retries) and decompress it
#'
#' @param url URL of the file to download.
#' @param destfile Local path for the downloaded archive. Its parent directory
#'   is created when missing.
#' @param max_retries Maximum number of download attempts. Attempts are
#'   separated by an exponentially growing pause (1s, 2s, 4s, ...).
#' @return Path of the decompressed file, i.e. `destfile` with a trailing
#'   `.gz` removed. Signals an error when every download attempt fails.
download_and_unzip <- function(url, destfile, max_retries = 3) {
  # Create destination directory if needed
  dest_dir <- dirname(destfile)
  if (!dir.exists(dest_dir)) {
    dir.create(dest_dir, recursive = TRUE)
  }

  # gunzip() removes the archive after extraction by default, so the extracted
  # file must be checked first; otherwise every re-run would download the
  # archive again only to discover that the result is already there.
  unzipped_file <- sub("\\.gz$", "", destfile)
  if (file.exists(unzipped_file)) {
    message("Unzipped file already exists: ", unzipped_file)
    return(unzipped_file)
  }

  # Download the file if it doesn't exist
  if (!file.exists(destfile)) {
    downloaded <- FALSE
    # seq_len() yields no iterations for max_retries = 0 (1:0 would yield two)
    for (attempt in seq_len(max_retries)) {
      message(sprintf(
        "Downloading %s to %s (Attempt %d/%d)",
        url, destfile, attempt, max_retries
      ))
      failure <- tryCatch(
        {
          status <- download.file(url, destfile, mode = "wb")
          # Some download methods report failure through a non-zero status
          # instead of signalling an error.
          if (!identical(as.integer(status), 0L)) {
            stop(sprintf(
              "download.file() returned status %d", as.integer(status)
            ))
          }
          NULL
        },
        error = function(e) e
      )

      if (is.null(failure)) {
        message("Download successful!")
        downloaded <- TRUE
        break
      }

      # Drop any partial file so that a later run does not mistake it for a
      # complete download (destfile did not exist before this loop started).
      unlink(destfile)
      message(sprintf(
        "Error on attempt %d: %s", attempt, conditionMessage(failure)
      ))
      if (attempt < max_retries) {
        wait_time <- 2^(attempt - 1)
        message(sprintf("Retrying in %d seconds...", wait_time))
        Sys.sleep(wait_time)
      }
    }

    if (!downloaded) {
      stop(
        sprintf("Failed to download %s after %d attempts.", url, max_retries),
        call. = FALSE
      )
    }
  } else {
    message("File already exists: ", destfile)
  }

  # Unzip the downloaded file (remove .gz extension). When destfile has no
  # .gz suffix both paths are identical and there is nothing to decompress.
  if (!file.exists(unzipped_file)) {
    message("Unzipping file: ", destfile)
    R.utils::gunzip(destfile, destname = unzipped_file, overwrite = TRUE)
    message("Unzipping completed!")
  } else {
    message("Unzipped file already exists: ", unzipped_file)
  }

  unzipped_file
}

# Example usage:
# Fetch the TNFA Perturb-seq Seurat object from the GEO supplementary files of
# series GSE281048 and decompress it.
base_url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE281nnn/GSE281048/suppl/"
file_name <- "GSE281048_Seurat_object_TNFA_Perturb_seq.rds.gz"
url <- paste0(base_url, file_name)
# Relative path: the archive lands in `data/` under the current working
# directory.
destfile <- file.path("data", file_name)

# Holds the path of the decompressed file (destfile without the `.gz` suffix).
downloaded_file <- download_and_unzip(url, destfile)


In [4]:
#' Harmonize Seurat objects to h5ad format
#'
#' This script processes Seurat perturb-seq datasets and converts them to h5ad format
#' with standardized metadata according to specified requirements.
#'
#' Usage: In Jupyter with R kernel, set data_dir variable and run the code

# Load required libraries
# Warnings are silenced only while packages are installed and attached; the
# previous `warn` setting is restored afterwards (even if an install fails) so
# that warnings raised by the pipeline itself are not hidden for the rest of
# the session.
old_warn_opts <- options(warn = -1)
invisible(tryCatch(
  {
    required_packages <- c(
      "Seurat", "dplyr", "purrr", "stringr", "SeuratDisk",
      "BiocManager"
    )

    # Install missing packages
    for (pkg in required_packages) {
      if (!requireNamespace(pkg, quietly = TRUE)) {
        if (pkg %in% c("SeuratDisk")) {
          # SeuratDisk needs to be installed from GitHub
          if (!requireNamespace("remotes", quietly = TRUE)) {
            install.packages("remotes", repos = "https://cran.rstudio.com/")
          }
          remotes::install_github("mojaveazure/seurat-disk")
        } else if (pkg %in% c("SingleCellExperiment")) {
          # Bioconductor route. No entry of `required_packages` currently
          # takes this branch; it only applies if such a package is added.
          BiocManager::install(pkg)
        } else {
          install.packages(pkg, repos = "https://cran.rstudio.com/")
        }
      }
    }

    # Load libraries
    suppressPackageStartupMessages({
      library(Seurat)
      library(dplyr)
      library(purrr)
      library(stringr)
      library(SeuratDisk)
    })
  },
  finally = options(old_warn_opts)
))

#' Locate the Perturb-seq Seurat RDS files inside a directory
#'
#' Only files whose name ends in `_Perturb_seq.rds` are considered. The
#' number of matches is reported via `message()`.
#'
#' @param data_dir Directory to scan for RDS files
#' @return Character vector with the full paths of the matching files; an
#'   error is raised when nothing matches
find_rds_files <- function(data_dir) {
  rds_paths <- dir(
    data_dir,
    pattern = ".*_Perturb_seq\\.rds$",
    full.names = TRUE
  )
  n_found <- length(rds_paths)

  # Nothing to harmonize: abort instead of returning an empty vector
  if (n_found < 1) {
    stop("No Seurat RDS files found in ", data_dir)
  }

  message("Found ", n_found, " RDS files in ", data_dir)
  rds_paths
}

#' Process a Seurat object to standardize metadata
#'
#' Adds the columns `organism`, `crispr_type`, `cancer_type`, `condition` and
#' `perturbation_name` to the object's `meta.data` (overwriting columns of the
#' same name). The existing `cell_type` column is left untouched.
#'
#' @param seurat_obj Seurat object to process; its `meta.data` must contain
#'   the columns `cell_type` and `gene`
#' @param file_name Name of the source file; the stimulation pathway
#'   (IFNB, IFNG, INS, TGFB or TNFA) is extracted from it
#' @return Processed Seurat object with standardized metadata
process_seurat_object <- function(seurat_obj, file_name) {
  # Extract pathway/condition from file name (NA when no pathway is present)
  pathway <- str_extract(file_name, "(IFNB|IFNG|INS|TGFB|TNFA)")

  # Resolve the condition label before entering mutate(): inside mutate() a
  # metadata column that happened to be called `pathway` would shadow the
  # value extracted from the file name.
  condition_labels <- c(
    IFNB = "IFNB stimulation",
    IFNG = "IFNG stimulation",
    INS = "Insulin stimulation",
    TGFB = "TGFB stimulation",
    TNFA = "TNFA stimulation"
  )
  condition_label <- unname(condition_labels[pathway])
  condition_label[is.na(condition_label)] <- "Unknown"

  metadata <- seurat_obj@meta.data

  # Fail early, and with a clear message, when the expected columns are absent
  missing_cols <- setdiff(c("cell_type", "gene"), colnames(metadata))
  if (length(missing_cols) > 0) {
    stop(
      "Metadata of ", file_name, " is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Standardize metadata as required
  metadata <- metadata %>%
    mutate(
      # Set standard organism
      organism = "Homo sapiens",

      # Determine CRISPR type (based on files, this appears to be CRISPR KO)
      crispr_type = "CRISPR KO",

      # Map cell lines to cancer types
      cancer_type = case_when(
        cell_type == "BXPC3" ~ "Pancreatic Cancer",
        cell_type == "MCF7" ~ "Breast Cancer",
        cell_type == "HT29" ~ "Colorectal Cancer",
        cell_type == "A549" ~ "Lung Cancer",
        cell_type == "K562" ~ "Leukemia",
        cell_type == "HAP1" ~ "Chronic Myeloid Leukemia",
        TRUE ~ "Unknown"
      ),

      # Condition derived from the file name; `.env` forces the lookup of the
      # local variable rather than a metadata column
      condition = .env$condition_label,

      # Format perturbation_name; as.character() keeps every branch a
      # character vector even when `gene` is stored as a factor
      perturbation_name = case_when(
        gene == "NT" ~ "Non-targeting",
        !is.na(gene) ~ as.character(gene),
        TRUE ~ "Unknown"
      )
    )

  # Update the metadata in the Seurat object
  seurat_obj@meta.data <- metadata

  seurat_obj
}

#' Convert Seurat object to h5ad format
#'
#' The object is first written to a temporary h5Seurat file, which is then
#' converted to h5ad. The temporary file is removed when the function exits,
#' whether the conversion succeeded or failed.
#'
#' @param seurat_obj Processed Seurat object
#' @param output_file Output file path for h5ad
#' @return Path to the created h5ad file
convert_to_h5ad <- function(seurat_obj, output_file) {
  # Create temporary h5Seurat file
  temp_h5seurat <- tempfile(fileext = ".h5Seurat")
  # Registered up front so the intermediate is cleaned up even when
  # SaveH5Seurat() or Convert() signals an error.
  on.exit(unlink(temp_h5seurat), add = TRUE)
  message(paste0("Converting to h5Seurat: ", temp_h5seurat))

  # Save as h5Seurat first
  SaveH5Seurat(seurat_obj, filename = temp_h5seurat, overwrite = TRUE)

  # Convert to h5ad
  message(paste0("Converting to h5ad: ", output_file))
  Convert(temp_h5seurat, dest = output_file, overwrite = TRUE)

  output_file
}

#' Harmonize one Seurat RDS file and write it out as h5ad
#'
#' Reads the RDS file, standardizes its metadata via
#' `process_seurat_object()`, and converts the result with
#' `convert_to_h5ad()`. The output keeps the input file name with `.rds`
#' replaced by `.h5ad`.
#'
#' @param file_path Path to the Seurat RDS file
#' @param output_dir Directory to save output h5ad files (created if missing)
#' @return Path to the output h5ad file
process_file <- function(file_path, output_dir) {
  file_name <- basename(file_path)
  message("Processing ", file_name, "...")

  # Read the serialized Seurat object from disk
  message("Loading Seurat object from ", file_path)
  raw_obj <- readRDS(file_path)

  # Harmonize the metadata columns
  message("Standardizing metadata...")
  harmonized_obj <- process_seurat_object(raw_obj, file_name)

  # Make sure the destination directory is there before writing
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  # Same base name as the input, with the extension swapped
  h5ad_name <- sub("\\.rds$", ".h5ad", file_name)
  h5ad_path <- file.path(output_dir, h5ad_name)

  # Write the h5ad file
  message("Converting to h5ad: ", h5ad_path)
  convert_to_h5ad(harmonized_obj, h5ad_path)

  # Release the large objects before the next file is processed
  rm(raw_obj, harmonized_obj)
  gc()

  h5ad_path
}

#' Run the harmonization pipeline over every RDS file in a directory
#'
#' @param data_dir Directory containing the data files
#' @param output_dir Directory to save output h5ad files; when `NULL`, a
#'   `harmonized` sub-directory of `data_dir` is used
#' @return Character vector of paths to the output h5ad files
main <- function(data_dir, output_dir = NULL) {
  # Fall back to <data_dir>/harmonized when no output directory is given
  target_dir <- if (is.null(output_dir)) {
    file.path(data_dir, "harmonized")
  } else {
    output_dir
  }

  # Collect the input files, then convert them one by one
  rds_paths <- find_rds_files(data_dir)
  h5ad_paths <- map_chr(rds_paths, function(rds_path) {
    process_file(rds_path, target_dir)
  })

  message("Harmonization completed!")

  h5ad_paths
}

# For Jupyter notebook usage:
# Set your data directory and run the main function.
# NOTE(review): these are absolute paths, while the download cell writes to a
# relative `data/` folder; the two only coincide when the working directory is
# /content. TODO confirm for your environment.
data_dir <- "/content/data"
output_dir <- "/content/data/harmonized"
# Runs the whole pipeline; output_files holds the paths of the written h5ad
# files.
output_files <- main(data_dir, output_dir)