In [None]:
#!/usr/bin/env Rscript
# GSE203592 Harmonization Script - Fixed Perturbation Names
# This script downloads, processes, and harmonizes GSE203592 dataset

# Load required packages
suppressPackageStartupMessages({
  library(GEOquery)
  library(Seurat)
  library(tidyverse)
  library(R.utils)
  library(Matrix)
})

# Define directories and files
# Every input and output of this script lives under one per-accession folder.
accession <- "GSE203592"
dataset_dir <- file.path("/content/data3", accession)
# Compressed supplementary file and the path it is decompressed to
integrated_file <- file.path(dataset_dir, "GSE203592_integrated_v2.rds.gz")
integrated_unzipped <- file.path(dataset_dir, "GSE203592_integrated_v2.rds")
# Destination of the harmonized Seurat object
output_rds <- file.path(dataset_dir, paste0(accession, "_harmonized.rds"))

# Create the dataset directory if it doesn't exist
# The dir.exists() guard keeps reruns from emitting an "already exists"
# warning; a genuine failure to create the directory stops the script here
# instead of surfacing later as a confusing download or write error.
if (!dir.exists(dataset_dir) &&
    !dir.create(dataset_dir, showWarnings = TRUE, recursive = TRUE)) {
  stop("Could not create dataset directory: ", dataset_dir, call. = FALSE)
}
cat("Working with dataset directory:", dataset_dir, "\n")

# Decompress the gzip archive `src` into `dest`.
# The output is written to a temporary sibling file and only renamed to
# `dest` when gunzip exits with status 0 and produced a non-empty file, so a
# truncated archive never leaves a corrupt `dest` behind that a later run
# would mistake for valid data. Returns TRUE on success, FALSE otherwise.
decompress_rds <- function(src, dest) {
  partial <- paste0(dest, ".partial")
  status <- system(paste("gunzip -c", shQuote(src), ">", shQuote(partial)))
  ok <- isTRUE(status == 0) &&
    file.exists(partial) &&
    isTRUE(file.info(partial)$size > 0)
  if (ok) {
    ok <- file.rename(partial, dest)
  }
  if (!ok) {
    unlink(partial)
  }
  ok
}

# Write a tiny synthetic Seurat object (50 genes x 10 cells) to `path`.
# Used only as a stand-in when the real data cannot be obtained. The matrix
# carries explicit gene and cell names so the object has usable dimnames.
write_test_object <- function(path) {
  n_genes <- 50
  n_cells <- 10
  counts <- matrix(
    rpois(n_genes * n_cells, 5),
    nrow = n_genes,
    ncol = n_cells,
    dimnames = list(
      paste0("Gene", seq_len(n_genes)),
      paste0("Cell", seq_len(n_cells))
    )
  )
  mini_seurat <- CreateSeuratObject(counts = counts)
  mini_seurat$guide <- sample(c("Gene1", "Gene2", "control"), n_cells, replace = TRUE)
  saveRDS(mini_seurat, path)
  warning("Using a synthetic test object instead of the real dataset",
          call. = FALSE)
  invisible(path)
}

# Check if we already have a Seurat object
if (file.exists(integrated_unzipped)) {
  cat("Found existing RDS file, using it...\n")
} else if (file.exists(integrated_file)) {
  # Unzip existing file
  cat("Found compressed RDS file, unzipping...\n")
  if (!decompress_rds(integrated_file, integrated_unzipped)) {
    cat("Decompression of", integrated_file, "failed.\n")
  }
} else {
  # Need to download the file
  cat("No data found. Downloading from GEO...\n")

  # Define the file URL
  file_url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE203nnn/GSE203592/suppl/GSE203592_integrated_v2.rds.gz"

  # Try downloading with wget (best for large files in Colab)
  if (Sys.which("wget") != "") {
    cat("Using wget for download...\n")
    download_cmd <- paste("wget --tries=10 --continue --timeout=600",
                          shQuote(file_url),
                          "-O", shQuote(integrated_file))
    cat("Command:", download_cmd, "\n")
    wget_status <- system(download_cmd)
    if (!isTRUE(wget_status == 0)) {
      cat("wget exited with status", wget_status, "\n")
    }
  } else {
    # Fallback to R's download.file
    cat("Using R's download.file for download...\n")
    # Raise the timeout only for this download and restore it afterwards
    old_opts <- options(timeout = 600)
    tryCatch(
      download.file(file_url, integrated_file, mode = "wb", method = "auto"),
      # A failed download must reach the fallback below, not abort the script
      error = function(e) {
        cat("download.file failed:", conditionMessage(e), "\n")
      }
    )
    options(old_opts)
  }

  # Check if download was successful
  downloaded <- file.exists(integrated_file) &&
    isTRUE(file.info(integrated_file)$size > 0)
  if (downloaded) {
    cat("Download successful. Unzipping file...\n")
    if (!decompress_rds(integrated_file, integrated_unzipped)) {
      cat("Decompression of", integrated_file, "failed.\n")
    }
  } else {
    cat("Download failed or file is empty. Creating test object instead.\n")
    cat("For real analysis, manually download the file from:", file_url, "\n")

    # Create a small test dataset
    write_test_object(integrated_unzipped)
  }
}

# Verify the unzipped file exists
if (!file.exists(integrated_unzipped)) {
  cat("Failed to extract RDS file. Creating test object.\n")
  write_test_object(integrated_unzipped)
}

# Read the (decompressed) integrated Seurat object from disk
cat("Loading Seurat object...\n")
seu_obj <- readRDS(integrated_unzipped)

# Start harmonizing: inspect which per-cell metadata columns are available
cat("Harmonizing metadata fields...\n")

# Names of all per-cell metadata columns; only the first ten are echoed
meta_cols <- names(seu_obj@meta.data)
shown_cols <- paste(head(meta_cols, 10), collapse = ", ")
cat("Available metadata columns:", shown_cols, "...\n")

# Seed the harmonized table with one row per cell, keyed by cell barcode
cell_ids <- rownames(seu_obj@meta.data)
harmonized_metadata <- data.frame(cell_barcode = cell_ids, row.names = cell_ids)

# Dataset-wide constants, recycled to every cell.
# cell_type comes from the sample characteristics; MC-38 is a colon cancer
# model, hence cancer_type.
standard_fields <- list(
  organism = "Mus musculus",
  cell_type = "CD8+ tumor infiltrating T cells",
  crispr_type = "CRISPR KO",
  cancer_type = "Colon Cancer"
)
for (field_name in names(standard_fields)) {
  harmonized_metadata[[field_name]] <- standard_fields[[field_name]]
}

# Pick the metadata column that identifies each cell's perturbation.
# Gene-level columns are preferred, in this fixed priority order.
gene_candidates <- c("gene", "gene2", "orig_gene", "orig_gene2")
gene_hits <- gene_candidates[gene_candidates %in% meta_cols]

if (length(gene_hits) > 0) {
  perturbation_col <- gene_hits[1]
  cat(sprintf("Found '%s' column for perturbation mapping\n", perturbation_col))
} else {
  # No gene column: fall back to the first guide-like column, if any
  guide_like <- grep("sgRNA|guide|perturbation|target", meta_cols,
                     ignore.case = TRUE, value = TRUE)
  if (length(guide_like) == 0) {
    perturbation_col <- NULL
    cat("Could not find perturbation column\n")
  } else {
    perturbation_col <- guide_like[1]
    cat("Using", perturbation_col, "for perturbation mapping\n")
  }
}

# Handle the perturbation information
# Fills harmonized_metadata$perturbation_name and $condition from the column
# chosen in perturbation_col, or marks both as "unknown" when none was found.
if (!is.null(perturbation_col)) {
  # Get perturbation values.
  # as.character() matters: if the column is a factor, assigning the new
  # labels "non-targeting" / "multiple-targeting" below would produce NA
  # (invalid factor level) instead of the intended label.
  perturbations <- as.character(seu_obj@meta.data[[perturbation_col]])

  # Standardize the perturbation names
  harmonized_metadata$perturbation_name <- perturbations

  # Cells carrying multiple guides ("zMulti" and similar labels)
  is_multi <- grepl("multi", perturbations, ignore.case = TRUE)

  # Identify non-targeting controls using common control designations.
  # "NT" is only accepted as a standalone token (optionally "NTC", a numeric
  # suffix, or an "sg" prefix). A bare case-insensitive "NT" would match any
  # gene symbol containing "nt" (e.g. Entpd1, Cntn1) and silently relabel
  # those perturbed cells as controls.
  control_pattern <- paste(
    "control", "ctrl", "non[-_ ]?target", "negative", "scramble", "none",
    "(^|[^[:alnum:]]|sg)NTC?[0-9]*([^[:alnum:]]|$)",
    sep = "|"
  )
  # Multi-guide cells are never counted as controls, so condition and
  # perturbation_name always agree for them.
  is_control <- grepl(control_pattern, perturbations, ignore.case = TRUE) &
    !is_multi

  # Update condition based on control status
  harmonized_metadata$condition <- "test"
  harmonized_metadata$condition[is_control] <- "control"

  # Clean up perturbation names - replace control designations with "non-targeting"
  harmonized_metadata$perturbation_name[is_control] <- "non-targeting"

  # Handle "zMulti" or similar cases that indicate multiple guides
  if (any(is_multi)) {
    cat("Found cells with multiple guides, labeling as 'multiple-targeting'\n")
    harmonized_metadata$perturbation_name[is_multi] <- "multiple-targeting"
  }

  cat("Perturbation mapping complete. Found", sum(is_control), "control cells and",
      sum(is_multi), "multiple-targeting cells\n")
} else {
  # If no perturbation column found, use "unknown"
  harmonized_metadata$perturbation_name <- "unknown"
  harmonized_metadata$condition <- "unknown"
}

# Preserve other important metadata columns
# QC and clustering columns are copied under their original names when present.
important_cols <- c("nCount_RNA", "nFeature_RNA", "percent.mt", "seurat_clusters")
for (col in intersect(important_cols, meta_cols)) {
  harmonized_metadata[[col]] <- seu_obj@meta.data[[col]]
}

# Preserve experiment-specific metadata
if ("orig.ident" %in% meta_cols) {
  harmonized_metadata$experiment_id <- seu_obj@meta.data[["orig.ident"]]
}

# Preserve any mouse ID or tumor location information
# Only the FIRST matching column is mapped to the harmonized field.
mouse_cols <- grep("mouse|individual", meta_cols, ignore.case = TRUE, value = TRUE)
if (length(mouse_cols) > 0) {
  harmonized_metadata$mouse_id <- seu_obj@meta.data[[mouse_cols[1]]]
}

tumor_cols <- grep("tumor|location|site", meta_cols, ignore.case = TRUE, value = TRUE)
if (length(tumor_cols) > 0) {
  harmonized_metadata$tumor_location <- seu_obj@meta.data[[tumor_cols[1]]]
}

# Add all additional metadata columns (not already included)
# Only columns that were actually copied above are excluded. Second and later
# mouse/tumor matches were never copied, so they are carried over here instead
# of being dropped.
already_mapped <- c(
  important_cols,
  perturbation_col,
  head(mouse_cols, 1),
  head(tumor_cols, 1)
)
# Snapshot the harmonized names BEFORE adding anything: checking against the
# growing table made the result depend on column order (e.g. "gene2" becoming
# "orig_gene2" and then hiding the real "orig_gene2" column).
reserved_names <- colnames(harmonized_metadata)
extra_cols <- meta_cols[!meta_cols %in% c(reserved_names, already_mapped)]

# Add with prefix to avoid name conflicts; make.unique() resolves any
# remaining clash with existing or other prefixed names.
prefixed <- make.unique(c(reserved_names, paste0("orig_", extra_cols)))
new_col_names <- tail(prefixed, length(extra_cols))
for (i in seq_along(extra_cols)) {
  harmonized_metadata[[new_col_names[i]]] <- seu_obj@meta.data[[extra_cols[i]]]
}

# Verify and report on gene names (symbols vs IDs)
cat("Checking gene identifiers...\n")
# Pull the raw RNA counts once through the accessor (the same call the Python
# converter in this project uses) and reuse them below.
# NOTE(review): newer SeuratObject releases prefer `layer =` over `slot =`;
# `slot` is kept here for compatibility with older installs - confirm against
# the installed version.
raw_counts <- GetAssayData(seu_obj, assay = "RNA", slot = "counts")
gene_ids <- rownames(raw_counts)
sample_genes <- head(gene_ids, 10)
# head() instead of [1:5] so objects with fewer than five genes do not print NA
example_genes <- paste(head(sample_genes, 5), collapse = ", ")

# Check if gene IDs look like Ensembl IDs (starting with ENS) or gene symbols
if (any(grepl("^ENS", sample_genes))) {
  cat("WARNING: Genes appear to be Ensembl IDs, not symbols. Example:", example_genes, "\n")
  cat("Converting Ensembl IDs to gene symbols would require additional resources.\n")
} else {
  cat("Gene identifiers appear to be symbols. Example:", example_genes, "\n")
}

# Create a new Seurat object with raw counts and harmonized metadata
# harmonized_metadata is keyed by cell barcode (row names), which is how it is
# matched to the columns of the count matrix.
cat("Creating harmonized Seurat object...\n")
harmonized_seurat <- CreateSeuratObject(
  counts = raw_counts,
  meta.data = harmonized_metadata,
  project = accession
)

# Export to RDS format
cat("Exporting to RDS format...\n")
saveRDS(harmonized_seurat, file = output_rds)
cat("Harmonized Seurat object saved to:", output_rds, "\n")

# Create Python conversion script
# Writes a standalone Python script next to the harmonized RDS that converts
# it to h5ad via rpy2. Paths in the generated script are derived from the
# variables used by this R script so both sides always point at the same files.
cat("Creating Python conversion script...\n")
python_script <- file.path(dataset_dir, "convert_to_h5ad.py")

# Render an R string as a single-quoted Python string literal
# (embedded quotes and backslashes are escaped).
py_quote <- function(x) encodeString(x, quote = "'")

# Use paste with newline separators to create a multi-line string
script_content <- paste(
  "#!/usr/bin/env python3",
  "# Convert Seurat RDS to h5ad format",
  "",
  "import os",
  "import sys",
  "import numpy as np",
  "import pandas as pd",
  "import scipy.sparse",
  "import scipy.io",
  "import anndata",
  "",
  "# First install rpy2 if needed",
  "try:",
  "    import rpy2",
  "except ImportError:",
  "    print('Installing rpy2...')",
  "    import subprocess",
  "    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'rpy2'])",
  "",
  "# Import rpy2 modules",
  "from rpy2.robjects import pandas2ri, r, numpy2ri",
  "from rpy2.robjects.conversion import localconverter",
  "import rpy2.robjects as ro",
  "",
  "# Initialize R converters",
  "pandas2ri.activate()",
  "numpy2ri.activate()",
  "",
  "# File paths",
  # Previously hard-coded to '/content/GSE203592', which is not where the
  # harmonized RDS is written.
  paste0("dataset_dir = ", py_quote(dataset_dir)),
  paste0("input_rds = os.path.join(dataset_dir, ",
         py_quote(basename(output_rds)), ")"),
  paste0("output_h5ad = os.path.join(dataset_dir, ",
         py_quote(paste0(accession, "_harmonized.h5ad")), ")"),
  "",
  "print(f'Converting {input_rds} to {output_h5ad}')",
  "",
  "# Load required R packages",
  "print('Loading R packages...')",
  "r(\"library(Seurat)\")",
  "r(\"library(Matrix)\")",
  "",
  "# Load the Seurat object",
  "print('Loading Seurat object...')",
  "r(f\"seu <- readRDS('{input_rds}')\")",
  "",
  "# Use intermediate file approach to avoid conversion issues",
  "print('Converting via temporary sparse matrix file...')",
  "temp_mtx = os.path.join(dataset_dir, 'temp_counts.mtx')",
  # Triple quotes are required: a single-quoted Python f-string cannot span
  # several lines, so the previous output was not valid Python.
  "r(f\"\"\"",
  "# Get counts matrix",
  "counts <- GetAssayData(seu, slot='counts', assay='RNA')",
  "# Write sparse matrix to file",
  "writeMM(counts, '{temp_mtx}')",
  "# Save gene and cell names",
  "write.table(rownames(counts), '{dataset_dir}/temp_genes.txt', row.names=FALSE, col.names=FALSE, quote=FALSE)",
  "write.table(colnames(counts), '{dataset_dir}/temp_cells.txt', row.names=FALSE, col.names=FALSE, quote=FALSE)",
  "\"\"\")",
  "",
  "# Read the sparse matrix",
  "print('Reading sparse matrix from file...')",
  "counts_sparse = scipy.io.mmread(temp_mtx).tocsr()",
  "",
  "# Read gene and cell names",
  "genes = pd.read_csv(f\"{dataset_dir}/temp_genes.txt\", header=None)[0].values",
  "cells = pd.read_csv(f\"{dataset_dir}/temp_cells.txt\", header=None)[0].values",
  "",
  "# Get metadata",
  "print('Extracting metadata...')",
  "with localconverter(ro.default_converter + pandas2ri.converter):",
  "    meta_df = pd.DataFrame(r('seu@meta.data'))",
  "",
  "# Create AnnData object - note we transpose the matrix to get cells x genes",
  "print('Creating AnnData object...')",
  "adata = anndata.AnnData(",
  "    X=counts_sparse.T,  # Transpose to cells x genes",
  "    obs=meta_df,",
  "    var=pd.DataFrame(index=genes)",
  ")",
  "",
  "# Save as h5ad",
  "print('Saving h5ad file...')",
  "adata.write(output_h5ad)",
  "",
  "# Clean up temporary files",
  "print('Cleaning up temporary files...')",
  "os.remove(temp_mtx)",
  "os.remove(f\"{dataset_dir}/temp_genes.txt\")",
  "os.remove(f\"{dataset_dir}/temp_cells.txt\")",
  "",
  "print(f'Conversion complete! File saved to {output_h5ad}')",
  sep = "\n"
)

# Write the Python script to file
writeLines(script_content, python_script)

# Make the script executable
Sys.chmod(python_script, mode = "0755")

# Print a summary of perturbation counts to check for issues
# Shows the 20 most frequent harmonized perturbation labels.
if (!is.null(perturbation_col)) {
  cat("\n=== Perturbation Distribution ===\n")
  # useNA = "ifany": cells with a missing perturbation label are exactly the
  # kind of issue this table is meant to reveal, so they get their own row.
  pert_table <- table(harmonized_metadata$perturbation_name, useNA = "ifany")
  pert_counts <- data.frame(
    perturbation = names(pert_table),
    count = as.numeric(pert_table)
  )
  pert_counts <- pert_counts[order(pert_counts$count, decreasing = TRUE), ]
  print(head(pert_counts, 20))
}

# Print instructions for conversion
cat("\n=== Conversion Instructions ===\n")
cat("To convert the RDS file to h5ad format, run the following command:\n")
cat("!python", python_script, "\n\n")

# Print dataset summary
cat("\n=== Dataset Summary ===\n")
cat("Accession:", accession, "\n")
cat("Number of cells:", ncol(harmonized_seurat), "\n")
cat("Number of genes:", nrow(harmonized_seurat), "\n")
cat("Harmonized metadata fields:", paste(colnames(harmonized_metadata), collapse=", "), "\n")
cat("Perturbation targets:", length(unique(harmonized_metadata$perturbation_name)), "unique targets\n")

# Create a Colab cell to run the conversion
# Uses the actual script location; the previous hard-coded
# /content/GSE203592/... path did not match where the script is written.
cat("\n=== Copy this to a new code cell to run the conversion ===\n")
cat("!python ", python_script, "\n", sep = "")

cat("\nDone!\n")


In [4]:
#!/usr/bin/env python3
# Convert Seurat RDS to h5ad format

import os
import sys
import numpy as np
import pandas as pd
import scipy.sparse
import scipy.io
import anndata

# First install rpy2 if needed
try:
    import rpy2
except ImportError:
    print('Installing rpy2...')
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'rpy2'])

# Import rpy2 modules
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro

# NOTE: no global pandas2ri/numpy2ri activate() here. The only R -> Python
# conversion this script needs (the metadata data frame) runs inside an
# explicit localconverter context below, and global activation is deprecated
# in recent rpy2 releases.


def _read_names(path):
    """Return the lines of a one-name-per-line text file as a list of str.

    Reading the file directly (rather than through pd.read_csv) keeps names
    that look like missing values ("NA", "null", ...) or that contain commas
    or quotes exactly as R wrote them.
    """
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip('\r\n') for line in handle]


# File paths
dataset_dir = '/content/data3/GSE203592'
input_rds = os.path.join(dataset_dir, 'GSE203592_harmonized.rds')
output_h5ad = os.path.join(dataset_dir, 'GSE203592_harmonized.h5ad')

# Intermediate files exchanged between R and Python
temp_mtx = os.path.join(dataset_dir, 'temp_counts.mtx')
temp_genes = os.path.join(dataset_dir, 'temp_genes.txt')
temp_cells = os.path.join(dataset_dir, 'temp_cells.txt')

print(f'Converting {input_rds} to {output_h5ad}')

# Load required R packages
print('Loading R packages...')
r("library(Seurat)")
r("library(Matrix)")

# Load the Seurat object
print('Loading Seurat object...')
r(f"seu <- readRDS('{input_rds}')")

try:
    # Use intermediate file approach to avoid conversion issues
    print('Converting via temporary sparse matrix file...')
    r(f"""
# Get counts matrix
counts <- GetAssayData(seu, slot='counts', assay='RNA')
# Write sparse matrix to file
writeMM(counts, '{temp_mtx}')
# Save gene and cell names
write.table(rownames(counts), '{temp_genes}', row.names=FALSE, col.names=FALSE, quote=FALSE)
write.table(colnames(counts), '{temp_cells}', row.names=FALSE, col.names=FALSE, quote=FALSE)
""")

    # Read the sparse matrix (genes x cells, as written by R).
    # CSC here means the transpose used for AnnData below is CSR at no
    # extra copy.
    print('Reading sparse matrix from file...')
    counts_sparse = scipy.io.mmread(temp_mtx).tocsc()

    # Read gene and cell names
    genes = _read_names(temp_genes)
    cells = _read_names(temp_cells)

    # Get metadata
    print('Extracting metadata...')
    with localconverter(ro.default_converter + pandas2ri.converter):
        meta_df = pd.DataFrame(r('seu@meta.data'))

    # The metadata rows must line up with the matrix columns written above;
    # fail loudly rather than attach annotations to the wrong cells.
    if [str(name) for name in meta_df.index] != cells:
        raise ValueError(
            'Cell barcodes in seu@meta.data do not match the columns of the '
            'exported count matrix'
        )

    # Create AnnData object - note we transpose the matrix to get cells x genes
    print('Creating AnnData object...')
    adata = anndata.AnnData(
        X=counts_sparse.T,  # Transpose to cells x genes
        obs=meta_df,
        var=pd.DataFrame(index=pd.Index(genes))
    )

    # Save as h5ad
    print('Saving h5ad file...')
    adata.write(output_h5ad)
finally:
    # Clean up temporary files, even when a step above failed
    print('Cleaning up temporary files...')
    for temp_path in (temp_mtx, temp_genes, temp_cells):
        if os.path.exists(temp_path):
            os.remove(temp_path)

print(f'Conversion complete! File saved to {output_h5ad}')
