In [1]:
# Load required libraries
library(DropletUtils)
library(Matrix)
library(scater)

# Load the raw (unfiltered) 10x feature-barcode matrix as a
# SingleCellExperiment and report its dimensions (features x barcodes).
raw_counts <- read10xCounts("data/raw_feature_bc_matrix.h5")
cat("Raw matrix dimensions:", dim(raw_counts), "\n")

# Why the Gene Expression filter below is needed:
# read10xCounts() loads every feature type stored in the file, not only the
# Gene Expression (GEX) features that the Python workflow loaded (22040 genes
# there), so the unfiltered matrix has far more rows (about 74000 according to
# the original note). With the extra features included, more barcodes have a
# total UMI count above `lower`, so more barcodes get tested. That is why the
# unfiltered R run reported over 33k non-empty cells while Python reported
# only about 15k.

# Keep only Gene Expression rows. Fail loudly if the feature-type annotation
# is missing or matches nothing, instead of silently producing an empty matrix.
feature_type <- rowData(raw_counts)$Type
if (is.null(feature_type)) {
    stop("rowData(raw_counts) has no 'Type' column; cannot select Gene Expression features",
         call. = FALSE)
}
# Treat NA feature types as "not GEX" so no NA ends up in the row index.
gex_mask <- !is.na(feature_type) & feature_type == "Gene Expression"
if (!any(gex_mask)) {
    stop("No features of Type 'Gene Expression' found in the raw matrix",
         call. = FALSE)
}
# drop = FALSE keeps a 2-D matrix even if only a single feature matches.
raw_matrix <- counts(raw_counts)[gex_mask, , drop = FALSE]

# Check that GEX filtering worked
cat("Number of genes after GEX filtering:", nrow(raw_matrix), "\n")

# Basic statistic: number of barcodes whose total UMI count exceeds 100
# (the same threshold later passed to emptyDrops() as `lower`).
total_counts <- colSums(raw_matrix)
cat("Total cells with more than 100 UMI:", sum(total_counts > 100), "\n")

# Convert DelayedMatrix to an in-memory sparse dgCMatrix for compatibility
cat("Converting DelayedMatrix to dgCMatrix...\n")
raw_matrix <- as(raw_matrix, "dgCMatrix")
cat("Type of raw_matrix after conversion:", class(raw_matrix), "\n")
cat("Matrix dimensions:", dim(raw_matrix), "\n")

Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    

Raw matrix dimensions: 61313 722431 
Number of genes after GEX filtering: 22040 
Total cells with more than 100 UMI: 15000 
Converting DelayedMatrix to dgCMatrix...
Type of raw_matrix after conversion: dgCMatrix 
Matrix dimensions: 22040 722431 


In [3]:
# Benchmark the R implementation of EmptyDrops on `raw_matrix`:
#   * run emptyDrops() X times,
#   * write each run's per-barcode statistics to runs_R/run_<i>_results.csv,
#   * write a one-row-per-run summary to runs_R/run_log_r.csv.
# No RNG seed is set, so the Monte Carlo p-values (and therefore the FDR
# counts) differ slightly from run to run.

# Create runs_R directory if it doesn't exist
out_dir <- "runs_R"
if (!dir.exists(out_dir)) {
    dir.create(out_dir)
}

# Empty, typed template for the run log; it fixes the column set and is what
# gets written out if no run is executed at all.
log_data <- data.frame(
    timestamp = character(),
    niters = numeric(),
    fdr_0_001 = numeric(),
    fdr_0_01 = numeric(),
    fdr_0_05 = numeric(),
    calculated_retain = numeric(),
    lower = numeric(),
    runtime_seconds = numeric(),
    total_cells = numeric(),
    tested_cells = numeric(),
    data_shape = character(),
    stringsAsFactors = FALSE
)

# Run EmptyDrops X times
X <- 4
# Preallocate one slot per run instead of growing the log with rbind() in the loop.
log_entries <- vector("list", X)
# seq_len() yields an empty sequence for X == 0 (1:X would iterate over 1, 0).
for (run_i in seq_len(X)) {
    cat("Running EmptyDrops R implementation - Run", run_i, "of", X, "...\n")

    # Record start time (used only for the log timestamp)
    start_time <- Sys.time()

    # Run EmptyDrops with every argument spelled out explicitly.
    # NOTE(review): alpha = Inf selects plain multinomial sampling, whereas the
    # package default (alpha = NULL) estimates the Dirichlet-multinomial
    # scaling from the data -- confirm Inf is intended here.
    runtime <- system.time({
        r_results <- emptyDrops(
            m = raw_matrix,
            lower = 100,           # Lower bound for empty droplets
            retain = NULL,         # Auto-determine from barcodeRanks
            niters = 10000,        # Monte Carlo iterations
            test.ambient = FALSE,  # Don't test ambient droplets
            ignore = NULL,         # No additional filtering
            alpha = Inf,           # Multinomial distribution
            round = TRUE,          # Round non-integer counts
            by.rank = NULL         # Use count threshold, not rank
        )
    })
    elapsed_seconds <- runtime[["elapsed"]]

    # Parameters actually used by emptyDrops (niters, retain, lower, ...)
    r_metadata <- metadata(r_results)

    # Count significant barcodes at several FDR thresholds, restricted to
    # barcodes that were actually tested (non-NA p-value).
    valid_pvals <- !is.na(r_results$PValue)
    tested_fdr <- r_results$FDR[valid_pvals]
    fdr_0_001 <- sum(tested_fdr < 0.001, na.rm = TRUE)
    fdr_0_01 <- sum(tested_fdr < 0.01, na.rm = TRUE)
    fdr_0_05 <- sum(tested_fdr < 0.05, na.rm = TRUE)

    # Count tested cells (those with p-values) and all barcodes
    tested_cells <- sum(valid_pvals)
    total_cells <- nrow(r_results)

    # Per-barcode results, one row per column of raw_matrix (same order)
    results_df <- data.frame(
        Total = r_results$Total,
        LogProb = r_results$LogProb,
        PValue = r_results$PValue,
        Limited = r_results$Limited,
        FDR = r_results$FDR
    )

    # Write results CSV
    results_path <- file.path(out_dir, paste0("run_", run_i, "_results.csv"))
    write.csv(results_df, file = results_path, row.names = FALSE)

    # Summary row for this run
    log_entries[[run_i]] <- data.frame(
        timestamp = format(start_time, "%Y-%m-%d %H:%M:%S"),
        niters = r_metadata$niters,
        fdr_0_001 = fdr_0_001,
        fdr_0_01 = fdr_0_01,
        fdr_0_05 = fdr_0_05,
        calculated_retain = r_metadata$retain,
        lower = r_metadata$lower,
        runtime_seconds = elapsed_seconds,
        total_cells = total_cells,
        tested_cells = tested_cells,
        data_shape = paste(dim(raw_matrix), collapse = "x"),
        stringsAsFactors = FALSE
    )

    # Display run summary
    cat("Run", run_i, "completed in", elapsed_seconds, "seconds\n")
    cat("FDR < 0.05:", fdr_0_05, "cells\n")
    cat("Tested cells:", tested_cells, "/", total_cells, "\n\n")
}

# Bind all run summaries onto the empty template in a single rbind() call.
log_data <- do.call(rbind, c(list(log_data), log_entries))

# Save log file
write.csv(log_data, file = file.path(out_dir, "run_log_r.csv"), row.names = FALSE)

# Display final summary
cat("All ", X, " runs completed!\n")
cat("Results saved to runs_R/ directory\n")
cat("Log saved to runs_R/run_log_r.csv\n")
print(summary(log_data))

Running EmptyDrops R implementation - Run 1 of 4 ...
Run 1 completed in 65.52 seconds
FDR < 0.05: 8193 cells
Tested cells: 15000 / 722431 

Running EmptyDrops R implementation - Run 2 of 4 ...
Run 2 completed in 39.124 seconds
FDR < 0.05: 8227 cells
Tested cells: 15000 / 722431 

Running EmptyDrops R implementation - Run 3 of 4 ...
Run 3 completed in 82.835 seconds
FDR < 0.05: 8166 cells
Tested cells: 15000 / 722431 

Running EmptyDrops R implementation - Run 4 of 4 ...
Run 4 completed in 51.755 seconds
FDR < 0.05: 8203 cells
Tested cells: 15000 / 722431 

All  4  runs completed!
Results saved to runs_R/ directory
Log saved to runs_R/run_log_r.csv
  timestamp             niters        fdr_0_001       fdr_0_01   
 Length:4           Min.   :10000   Min.   :6885   Min.   :7492  
 Class :character   1st Qu.:10000   1st Qu.:6890   1st Qu.:7518  
 Mode  :character   Median :10000   Median :6896   Median :7530  
                    Mean   :10000   Mean   :6914   Mean   :7528  
              