# Notebook for DE Analysis from a single-cell map

## Load Packages

In [1]:
message("Loading packages...")

suppressMessages(library(Seurat))
suppressPackageStartupMessages(library(DESeq2))
suppressMessages(library(dplyr))
suppressPackageStartupMessages(library(stringr))
suppressMessages(library(data.table))
suppressMessages(library(Matrix))

Loading packages...



## Define Inputs

In [2]:
# Inputs: single-cell map and/or metadata
seurat_obj_path <- '/nfs/lab/sara/projects/PanKbase/singlecell/input/2023_01_27_hpap_res0p3.rds'
metadata_path <- '/nfs/lab/sara/projects/PanKbase/singlecell/input/Donor_Summary_127.csv'

# Load the object and/or metadata
message("Loading data...")
data <- readRDS(seurat_obj_path)
meta <- read.csv(metadata_path)

Loading data...



#### Format Inputs - Optional

In [3]:
# Ensure that the metadata has only the donors present in the seurat object - no more no less. Modify or comment out this chunk according to your needs
meta <- read.csv(metadata_path)
meta <- meta[meta$donor_ID %in% unique(data@meta.data$library), ]

dib_stat  <- data@meta.data[,c('library', 'Diabetes_Status_w_AAB', 'chemistry', 'tissue_source')]
dib_stat <- unique(dib_stat)
rownames(dib_stat) <- NULL

meta <- merge(meta, dib_stat, by.x = 'donor_ID', by.y='library')

meta <- rename(meta, diabetes_status = Diabetes_Status_w_AAB)
meta <- rename(meta, sex = gender)

rownames(meta) <- meta$donor_ID

In [4]:
# Set cell identities
Idents(data) <- data@meta.data$cell_type

# Define the samples based on metadata or the single-cell map
samples <- unique(meta$donor_ID)
unique_cell_types <- unique(data$cell_type)

# Default assay should be 'RNA'
DefaultAssay(data) <- 'RNA'
gex.counts <- GetAssayData(data, layer = 'counts')

# Create the output directory if it doesn't exist
output_dir <- '/nfs/lab/sara/projects/PanKbase/singlecell/analytical_library/output'

if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

In [5]:
print(paste("Unique donors:", length(samples)))
print(paste("Unique cell types:", length(unique_cell_types)))

[1] "Unique donors: 65"
[1] "Unique cell types: 14"


## Make Pseudobulk Matrices for each cell type

In [6]:
# Output directory for cell counts
outdir_cell_counts <- file.path(output_dir, "cell_counts")
dir.create(outdir_cell_counts, showWarnings = FALSE)

# Loop over each unique cell type
for (cell_type in unique_cell_types) {
  # Get barcodes for the current cell type
  bcs <- names(Idents(data)[Idents(data) == cell_type])
  
  # Subset the gene expression counts matrix for the cell type
  counts <- gex.counts[, colnames(gex.counts) %in% bcs]
  
  # Initialize a matrix to store pseudobulk counts for each sample
  pseudobulk_counts <- matrix(0, nrow = nrow(gex.counts), ncol = length(samples))
  row.names(pseudobulk_counts) <- row.names(gex.counts)
  
  # Loop over each sample
  for (i in seq_along(samples)) {
    sample <- samples[i]
    
    # Get barcodes for the current sample
    sample_bcs <- row.names(data@meta.data[data@meta.data$library == sample,])
    
    # Subset counts for the current sample's barcodes
    counts_cut <- counts[, colnames(counts) %in% sample_bcs]
    
    # Sum the counts across the barcodes for the current sample
    if (is.null(ncol(counts_cut)) || ncol(counts_cut) == 0) {
      sum_counts <- rep(0, nrow(counts))
    } else {
      sum_counts <- rowSums(counts_cut)
    }
      
    # Add the summed counts to the pseudobulk matrix
    pseudobulk_counts[, i] <- sum_counts
  }
  
  # Convert the matrix to a dataframe and assign sample names as column names
  pseudobulk_df <- as.data.frame(pseudobulk_counts)
  colnames(pseudobulk_df) <- samples
  
  # Save the pseudobulk counts
  output_file <- file.path(outdir_cell_counts, paste0(cell_type, "_sample_gex_total_counts.txt"))
  write.table(pseudobulk_df, output_file, sep = '\t', quote = FALSE, col.names = TRUE, row.names = TRUE)
}

## Define helper functions for DESeq

In [7]:
# Function to read the paths of the count matrices from the dir
readCounts <- function(counts_dir) {
  # Get all files matching the pattern
  files <- list.files(counts_dir, pattern = "_sample_gex_total_counts.txt$", full.names = TRUE)
  
  # Initialize an empty dict to store the cell type and the respective counts matrices' file path
  counts_dict <- list()
  
  # Loop over each file and extract the cell type from the filename
  for (file_path in files) {
    cell_type <- gsub("_sample_gex_total_counts.txt", "", basename(file_path))
    
    # Read the count matrix and store it in dict
    counts_dict[[cell_type]] <- read.csv(paste0(file_path), header = TRUE, row.names=1, sep='\t', check.names = FALSE)
  }
  
  return(counts_dict)
}

In [8]:
# Scale covariates function
scaleCovariates <- function(coldata_subset, covariates) {
  if (!is.null(covariates)) {
    coldata_subset[covariates] <- lapply(coldata_subset[covariates], function(x) {
      if (is.numeric(x)) {
        return(scale(x))  # Scale continuous covariates
      }
      return(x)  # Return unchanged for categorical covariates
    })
  }
  return(coldata_subset)
}

## Perform DESeq

In [9]:
# Define the function to perform DESeq2 analysis
runDESeq2 <- function(counts_matrix, coldata, condition_col, condition_levels, covariates = NULL, outdir_deseq, cell_type) {
  # Ensure the condition column exists in the coldata
  if (!(condition_col %in% colnames(coldata))) {
    stop(paste("Condition column", condition_col, "not found in coldata."))
  }

  # Subset the coldata for the specified condition levels (e.g., T1D vs ND)
  coldata_subset <- coldata[coldata[[condition_col]] %in% condition_levels, ]

  # Match the subset of coldata with the counts matrix
  counts_subset <- counts_matrix[, rownames(coldata_subset)]
  
  # Calculate half the sample size for each condition_levels - used as the minimum number of samples to meet gene count thresholds
  n_levels <- sapply(condition_levels, function(level) {
    sum(coldata_subset[[condition_col]] == level)
  })
  n_levels_half <- floor(n_levels / 2)
  
  # Filter genes based on the counts criteria - at least half the samples per condition have greater than 5 counts
  genes_to_keep <- rownames(counts_subset)[
    rowSums(counts_subset[, coldata_subset[[condition_col]] == condition_levels[1]] >= 5) >= n_levels_half[1] &
    rowSums(counts_subset[, coldata_subset[[condition_col]] == condition_levels[2]] >= 5) >= n_levels_half[2]
  ]

  # Subset counts matrix to keep only the genes that passed the filter
  counts_subset <- counts_subset[genes_to_keep, ]

  # Re-level the factor so the first condition is the reference level
  coldata_subset[[condition_col]] <- factor(coldata_subset[[condition_col]], levels = condition_levels)

  # Build the design formula with covariates
  if (!is.null(covariates) && all(covariates %in% colnames(coldata_subset))) {
    design_formula <- as.formula(paste("~", paste(c(covariates, condition_col), collapse = " + ")))
  } else {
    design_formula <- as.formula(paste("~", condition_col))
  }
    
  # Create DESeq2 dataset
  dds <- DESeqDataSetFromMatrix(countData = counts_subset, colData = coldata_subset, design = design_formula)

  # Comment this out for the default type = "ratio" - general purpose with balanced counts
  # Estimate size factors using the "poscounts" method - handles sparse datasets with many zeros 
  # Estimate size factors using the "iterate" method - handles outliers or widely varying library sizes
  #dds <- estimateSizeFactors(dds, type = "poscounts")
    
  # Run DESeq2
  dds <- DESeq(dds)

  # Extract results for the specified condition comparison
  res <- results(dds, contrast = c(condition_col, condition_levels[1], condition_levels[2]))
  res <- res[order(res$padj), ]

  # Save results
  condition_comparison <- paste(condition_levels[1], "vs", condition_levels[2], sep = "_")
  output_filename <- paste0(outdir_deseq, "/", cell_type, "_", condition_comparison, "_deseq.tsv")

  write.table(res, output_filename, sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE)

  message(paste("Complete! DESeq analysis for", condition_levels[1], "vs", condition_levels[2], "in", cell_type, "cells."))

  return(dds)
}

In [10]:
run_DESeq2_for_all_cell_types <- function(cell_types_list = NULL, counts_dir, coldata, condition_col, condition_levels, covariates = NULL, output_dir) {
  # Ensure the base output directory exists or create it
  outdir_deseq <- file.path(output_dir, "deseq")
  if (!dir.exists(outdir_deseq)) {
    dir.create(outdir_deseq, recursive = TRUE)
  }
  
  # If cell_types_list is not provided, read all count matrices from the directory
  if (is.null(cell_types_list)) {
    counts_dict <- readCounts(counts_dir)
    cell_types_list <- names(counts_dict)  # Use names of the counts_dict as cell types
  } else {
    counts_dict <- readCounts(counts_dir)[cell_types_list]  # Only read specified cell types
  }
  
  # Loop over each cell type and run DESeq2
  for (cell_type in cell_types_list) {
    # Retrieve the counts matrix for the current cell type
    counts_matrix <- counts_dict[[cell_type]]

    # Skip if the counts data for the cell type is missing
    if (is.null(counts_matrix)) {
      message(paste("Skipping DESeq2 for", cell_type, "due to missing counts data."))
      next
    }
    
    # Run DESeq2 for the current cell type and save results
    runDESeq2(counts_matrix = counts_matrix, coldata = coldata, condition_col = condition_col, 
               condition_levels = condition_levels, covariates = covariates, 
               outdir_deseq = outdir_deseq, cell_type = cell_type)
  }
  
  message("DESeq2 analysis completed for specified cell types.")
}

## Custom DESeq Parameters

#### Required Arguments
- **coldata**: metadata file that includes sample information, conditions and covariates for DESeq analysis
- **condition_col**: The column name in coldata that represents the experimental condition to test (e.g., "diabetes_status")
- **condition_levels**: A vector specifying the two condition levels to compare in DESeq (e.g., c("T1D", "ND"))
- **counts_dir**: The directory where the pseudobulk matrices for each cell type will be saved
- **output_dir**: The directory where DESeq output will be saved

#### Optional Arguments
- **cell_types_list**: (default = NULL) list of specific cell types of interest to run DESeq on. If not provided, all cell types in counts_dir are used.
- **covariates**: (default = NULL) A list of additional covariates from coldata to include in the DESeq model (e.g., age, sex)

In [11]:
### Required
coldata <- meta  # Sample metadata with condition and covariates
condition_col <- "diabetes_status"
condition_levels <- c("T1D", "ND")
counts_dir <- "/nfs/lab/sara/projects/PanKbase/singlecell/analytical_library/output/cell_counts"
output_dir <- "/nfs/lab/sara/projects/PanKbase/singlecell/analytical_library/output" 

### Optional
cell_types_list <- c("Acinar", "Endothelial")
covariates <- c("age_years", "sex", "bmi")

# To run DESeq2 for a specific cell type
run_DESeq2_for_all_cell_types(cell_types_list = cell_types_list, 
                              coldata = coldata,
                              condition_col = condition_col, 
                              condition_levels = condition_levels, 
                              counts_dir = counts_dir, 
                              output_dir = output_dir,                              
                              covariates = covariates)

# # To run DESeq2 for all cell types - entire cell_counts dir
# run_DESeq2_for_all_cell_types(coldata = coldata,
#                               condition_col = condition_col, 
#                               condition_levels = condition_levels, 
#                               counts_dir = counts_dir, 
#                               output_dir = output_dir,                              
#                               covariates = covariates))

“some variables in design formula are characters, converting to factors”
  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  Including numeric variables with large mean can induce collinearity with the intercept.
  Users should center and scale numeric variables in the design to improve GLM convergence.

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

Complete! DESeq analysis for T1D vs ND in Acinar cells.

“some variables in design formula are characters, converting to factors”
  the design formu