# 2024-026 Peak Set Overrepresentation Analysis

This notebook describes how to run over-representation analysis (ORA) on the differential peaks obtained from experiment 2024-026.

**Expected Outputs**
 - CSV file of ORA results
 - RDS file containing the raw clusterProfiler result objects

## Initialize Environment

Import the necessary packages and output package versions:

In [None]:
# Attach the packages used throughout the notebook
library(ggplot2)
library(clusterProfiler)
library(scales)
library(viridis)
library(dplyr)

# Experiment layout: root directory, sub-folders and output file prefix
exp_dir     <- "/home/dalbao/2024-026-Tcf7ATAC"
nfcore_dir  <- "01_nfcore_241228"
nb_dir      <- "02_bp_notebooks"
prefix      <- "DA-ORA"

# Differential peak table (CSV), resolved relative to <nb_dir>/daps
input <- "250611-bp-DA-DAPS_noShrink_reduced-Unannotated.csv"
input <- file.path(nb_dir, "daps", input)

# Regular expression selecting the peak set files to load
ps_pattern <- "noShrink.+peakSets.csv$"

# The nf-core ATAC-seq pipeline calls peaks from two different libraries;
# the short tag is mLb for merged_library and mRp otherwise
library     <- "merged_library"
lib_sh      <- if (library == "merged_library") "mLb" else "mRp"

# The pipeline calls two peak types; the short tag is bp for broad_peak
# and np otherwise
peak_type   <- "broad_peak"
peak_sh     <- if (peak_type == "broad_peak") "bp" else "np"

# Final prefix has the form <yymmdd>-<peak tag>-<prefix>
prefix <- paste(format(Sys.Date(), "%y%m%d"), peak_sh, prefix, sep = "-")

# All relative paths in this notebook are resolved from the experiment root
setwd(exp_dir)

# When TRUE, results are written to disk by the save cell
savedata <- TRUE

**Load peak sets and examine:**

In [1]:
# Find every CSV in <nb_dir>/peak_sets whose file name matches `ps_pattern`,
# read each one, and stack them into a single data frame (`sigs`).
peak_dir <- file.path(nb_dir, "peak_sets")
peak_files <- list.files(path = peak_dir, pattern = ps_pattern, full.names = TRUE)

print(peak_files)

# Fail here with a clear message rather than letting an empty file list
# turn into a NULL `sigs` that only breaks in a later cell.
if (length(peak_files) == 0) {
    stop("No peak set files matching '", ps_pattern, "' found in '",
         peak_dir, "'", call. = FALSE)
}

# Read each file; the list is named by file path, so the row names of the
# combined table are prefixed with the file each row came from
sigs_list <- lapply(peak_files, read.csv)
names(sigs_list) <- peak_files

# Combine all data frames in the list into one data frame
sigs <- do.call(rbind, sigs_list)

# Number of rows per peak set
table(sigs$ps_name)
# Preview the first rows (head() shows 6 by default)
head(sigs)

ERROR: Error: object 'nb_dir' not found


**Load dataset and examine:**

In [None]:
# Read the differential peak table
degs <- read.csv(input)

# Keep columns 1, 4, 7, 6, 8 and 9 (in that order) and give them the names
# used downstream; per the original note this mirrors the scDatasets layout
degs <- setNames(
    degs[, c(1, 4, 7, 6, 8, 9)],
    c("group", "names", "statistic", "scores", "pvals", "pvals_adj")
)

# Preview the first rows (head() shows 6 by default)
head(degs)

## Overrepresentation Analysis

Perform ORA for each comparison and store the results in a list. For each analysis, keep only the peak-set members that are also present in the tested peaks. First, define a helper function:

In [None]:
# Rename the peak set columns to the gene set terminology used below
names(sigs) <- c("gs_name", "gene_symbol")

# Run an over-representation analysis for one table of differential results.
#
# Arguments:
#   degs          data frame with columns `names`, `scores` and `pvals_adj`
#   sigs          two-column data frame (`gs_name`, `gene_symbol`) mapping
#                 each set to its members
#   pvalueCutoff  adjusted p-value threshold used to select the query
#                 features from `degs` (it is NOT forwarded to enricher())
#
# Returns the value of clusterProfiler::enricher() unchanged.
performORA <- function(degs, sigs, pvalueCutoff = 0.05) {

    # Keep only set members that appear in the tested table
    tested_sets <- sigs[sigs$gene_symbol %in% degs$names, ]

    # Query = rows below the adjusted p-value cutoff with a positive score
    hits <- filter(degs, pvals_adj < pvalueCutoff, scores > 0)

    # Report the size of the query
    print(nrow(hits))

    # enricher() is called with pvalueCutoff = 1, i.e. its own p-value
    # threshold is set to the maximum
    enricher(hits$names,
             TERM2GENE = tested_sets,
             minGSSize = 5,
             maxGSSize = 40000,
             pvalueCutoff = 1)
}

Now that a function is defined, loop through all clusters within degs df:

In [None]:
# Run ORA for every comparison in `degs`, once per direction, and collect:
#   enrichment_results  named list of the objects returned by performORA()
#   results_df          one data frame stacking every @result table, with
#                       a leading `cluster` column ("<comparison>_<up|down>")
enrichment_results <- list()

# Per-analysis result tables; bound together once after the loop
result_frames <- list()

# Multiplying scores by -1 turns the "scores > 0" filter inside performORA()
# into a selection of the negative-score rows
direction <- c(up = 1, down = -1)

# Loop over all comparisons in degs
for (cluster in unique(degs$group)) {
# for (cluster in "shRunx3_vs_shCD19"){

    for (directionality in names(direction)) {

        degs_subset <- degs %>%
            filter(group == cluster) %>%                           # current comparison only
            mutate(scores = scores * direction[[directionality]])  # flip sign for "down"

        cluster_name <- paste0(cluster, "_", directionality)

        ora <- performORA(degs = degs_subset, sigs = sigs, pvalueCutoff = 0.1)

        # Without a result object there is no @result slot to read;
        # skip this analysis instead of aborting the whole loop
        if (is.null(ora)) {
            message("No ORA result for ", cluster_name, "; skipping")
            next
        }

        # `[[<-` stores the object as a single list element
        enrichment_results[[cluster_name]] <- ora

        # Extract the result table and label it with the analysis name
        df <- ora@result
        df$cluster <- cluster_name
        # Move `cluster` to the front by name, so this does not depend on
        # how many columns the result table has
        df <- df[, c("cluster", setdiff(colnames(df), "cluster")), drop = FALSE]

        result_frames[[cluster_name]] <- df

        # Show progress
        print(cluster_name)

        gc(full = TRUE)
    }
}
# End of loop

# Bind all tables in one step; unname() keeps the list names out of the
# row names. An empty data frame is kept when nothing was produced.
results_df <- if (length(result_frames) > 0) {
    do.call(rbind, unname(result_frames))
} else {
    data.frame()
}

In [None]:
# Preview the combined results without the long geneID column
results_df %>%
    select(-"geneID") %>%
    head()

**Save CSV of ORA results and the raw clusterProfiler objects:**

In [None]:
# Benjamini-Hochberg adjustment over the p-values of ALL rows in results_df
results_df$BH <- p.adjust(results_df$pvalue, method = "BH")


if (savedata) {
    save_dir <- file.path(nb_dir, "psea")

    # Create the output folder if needed so the writes below cannot fail
    # on a missing directory; does nothing when it already exists
    dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)

    # Raw result objects, one list element per analysis
    saveRDS(enrichment_results, file.path(save_dir, paste0(prefix, ".rds")))

    # Complete results table, including the geneID column
    write.csv(results_df,
              file.path(save_dir, paste0(prefix, ".complete.csv")),
              row.names = FALSE)

    # Same table without the geneID column
    write.csv(results_df %>% select(-"geneID"),
              file.path(save_dir, paste0(prefix, ".noGeneID.csv")),
              row.names = FALSE)
}

### Visualize ORA Results

**Pre-process data to order clusters accordingly and filter out non-significant results:**

In [None]:
# Prepare adjusted p-values for plotting on a -log10 scale: replace exact
# zeros with a small pseudocount, compute -log10(BH), keep BH < 0.05.

# Strictly positive BH values; which() also drops NA entries
positive_bh <- results_df$BH[which(results_df$BH > 0)]

if (length(positive_bh) > 0) {
    # Pseudocount sits two orders of magnitude below the smallest
    # non-zero BH value
    min_nonzero <- min(positive_bh)
    min_exp <- floor(log10(min_nonzero))
    pseudocount <- 10^(min_exp - 2)
} else {
    # No positive value to anchor on: min() of an empty vector would be
    # Inf and propagate into -Inf log values, so use a fixed floor
    pseudocount <- 1e-300
}

# Replace 0s with the pseudocount
results_df$BH[which(results_df$BH == 0)] <- pseudocount


# Negative base-10 logarithm of the adjusted p-value
results_df$log_qvalue <- -log10(results_df$BH)

# Keep rows significant at BH < 0.05
significant_df <- subset(results_df, BH < 0.05)

**Define a function to plot data:**

In [None]:
# Draw a bubble plot of ORA results.
#
# Arguments:
#   df    data frame with columns `cluster`, `ID`, `FoldEnrichment` and
#         `log_qvalue`
#   name  text appended to the plot title
#
# Returns a ggplot object with one point per row: x = cluster, y = ID,
# colour = FoldEnrichment, size = log_qvalue.
plotORAbubble <- function(df, name){
    # Three-colour magma palette for the enrichment gradient
    palette <- viridis::magma(3)

    # Colour scale runs from 1 up to the largest absolute FoldEnrichment
    fe_max <- max(abs(range(df$FoldEnrichment, na.rm = TRUE)))
    colour_limits <- c(1, fe_max)

    bubble <- ggplot(df, aes(x = cluster, y = ID))
    bubble <- bubble +
      geom_point(aes(color = FoldEnrichment, size = log_qvalue)) +
      scale_color_gradientn(
        colors = palette,
        limits = colour_limits,
        guide = guide_colorbar(title = "Enrichment")
      ) +
      # Optional explicit size legend, left disabled:
      # scale_size_continuous(
      #   breaks = seq(0, max(df$log_qvalue), by = 0.5),
      #   labels = seq(0, max(df$log_qvalue), by = 0.5),
      #   guide = guide_legend(title = "-log10(Q-value)")
      # ) +
      labs(
        title = paste("Peak Overrepresentation", name),
        x = "Cluster",
        y = "ID"
      ) +
      theme_minimal() +
      # Rotate the x-axis labels by 45 degrees, right-aligned
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

    bubble
}

**Plot ORA results by comparison:**

In [None]:
# Analyses whose name does not contain "_vs_"
subset_vs_naive <- subset(
    significant_df,
    !grepl("_vs_", significant_df$cluster)
)
print(plotORAbubble(subset_vs_naive, "Subset vs Naive"))

# Analyses whose name contains "shRunx3_vs" but not "D5"
shRunx3_vs_shCD19 <- subset(
    significant_df,
    grepl("shRunx3_vs", significant_df$cluster) &
        !grepl("D5", significant_df$cluster)
)
print(plotORAbubble(shRunx3_vs_shCD19, "shRunx3 vs shCD19"))

# Same selection, additionally dropping sets whose ID contains "ST"
shRunx3_vs_shCD19 <- subset(
    significant_df,
    grepl("shRunx3_vs", significant_df$cluster) &
        !grepl("D5", significant_df$cluster) &
        !grepl("ST", significant_df$ID)
)
print(plotORAbubble(shRunx3_vs_shCD19, "shRunx3 vs shCD19 (no ST Sigs)"))

# Remaining "_vs_" analyses: name matches neither "shRunx3_vs_" nor "D5"
intersubset <- subset(
    significant_df,
    grepl("_vs_", significant_df$cluster) &
        !grepl("shRunx3_vs_|D5", significant_df$cluster)
)
print(plotORAbubble(intersubset, "Intersubset Comparisons"))