# 2024-026 Peak Set Enrichment Analysis

This notebook documents the steps for running peak set enrichment analysis (PSEA) on the differential peaks obtained from experiment 2024-026.

**Expected Outputs**
 - CSV file of GSEA results
 - RDS file of the raw clusterProfiler result objects

### Initialize Environment

Import the necessary packages and output package versions:

In [None]:
# Import necessary packages
library(ggplot2)
library(clusterProfiler)
library(DOSE)
library(scales)
library(viridis)
library(dplyr)

# # Load data
# working_dir <- "/home/dalbao/2024-026-Tcf7ATAC/01_nfcore_241228"

# # Move to working directory
# setwd(working_dir)

# ---- Project layout ----
exp_dir     <- "/home/dalbao/2024-026-Tcf7ATAC"
nfcore_dir  <- "01_nfcore_241228"
nb_dir      <- "02_bp_notebooks"
prefix      <- "DAnST-PSEA_noShrink_lfc"

# Differential peaks table (CSV), stored under <nb_dir>/daps
input <- file.path(nb_dir, "daps", "250610-bp-DAnST-DAPS_noShrink-Unannotated.csv")

# Regular expression matched against file names when listing peak set files
ps_pattern <- "noShrink.+peakSets.csv$"

# nf-core ATAC-seq pipeline calls peaks from two different libraries.
# Short code: "mLb" for merged_library, "mRp" otherwise (merged_replicates).
library     <- "merged_library"
lib_sh      <- if (library == "merged_library") "mLb" else "mRp"

# nf-core ATAC-seq pipeline calls peaks in two types, broad_peak and narrow_peak.
# Short code: "bp" for broad_peak, "np" otherwise.
peak_type   <- "broad_peak"
peak_sh     <- if (peak_type == "broad_peak") "bp" else "np"

# Final output prefix: <yymmdd>-<peak short code>-<base prefix>
prefix <- paste(format(Sys.Date(), "%y%m%d"), peak_sh, prefix, sep = "-")

# All relative paths used in this notebook are resolved from exp_dir
setwd(exp_dir)

# When TRUE, results are written to disk by the saving cell
savedata <- TRUE

**Load gene sets and examine:**

In [None]:
# List every CSV in <nb_dir>/peak_sets whose file name matches ps_pattern
peak_files <- list.files(path = paste(nb_dir, "peak_sets", sep = "/"), pattern = ps_pattern, full.names = TRUE)

print(peak_files)

# Fail early with a clear message: without any file, `sigs` would be NULL and
# the error would only surface later, far from its cause.
if (length(peak_files) == 0) {
    stop("No peak set files matching '", ps_pattern, "' found in ",
         paste(nb_dir, "peak_sets", sep = "/"), call. = FALSE)
}

# Read each file into a list keyed by its path
sigs_list <- lapply(peak_files, read.csv)
names(sigs_list) <- peak_files

# Combine all data frames in the list into one data frame
sigs <- do.call(rbind, sigs_list)


# Number of rows per peak set
table(sigs$ps_name)
# Show the first rows (head() returns 6 by default):
head(sigs)

**Load dataset and examine:**

In [None]:
# Read the differential results table, keep six columns (selected by position,
# in the order 1, 4, 7, 6, 8, 9) and give them the names used by the rest of
# the notebook (same layout as scDatasets).
# NOTE(review): the positions depend on the column order of the input CSV,
# which is not visible here -- confirm if the input file changes.
degs <- setNames(
    read.csv(input)[, c(1, 4, 7, 6, 8, 9)],
    c("group", "names", "statistic", "scores", "pvals", "pvals_adj")
)

# Show the first rows (head() returns 6 by default)
head(degs)

### Gene Set Enrichment Analysis

Run GSEA for each cluster and store the results in a list. For each cluster, build the ranked input that clusterProfiler expects: a named vector of scores sorted from highest to lowest. First, define a helper function:

In [None]:
# Rename the two peak set columns (by position) to the gene set terminology
# used for the TERM2GENE table: set name first, member second.
names(sigs) <- c("gs_name", "gene_symbol")

# Run GSEA for a single cluster/comparison of a differential results table.
#
# Args:
#   degs:     data frame with at least the columns `group`, `names` and `scores`.
#   cluster:  value of `degs$group` selecting the rows to analyse.
#   sigs:     TERM2GENE data frame (set name, member) passed on to GSEA().
#   seed:     if TRUE, the RNG is seeded before the run and `seed = TRUE` is
#             forwarded to GSEA().
#   set_seed: RNG seed applied when `seed` is TRUE (default 42).
#
# Returns:
#   The object returned by GSEA().
performGSEA <- function(degs, cluster, sigs, seed = TRUE, set_seed = 42){
    # Rows belonging to the cluster of interest
    cluster_rows <- degs[degs$group == cluster, ]

    # Ranking vector: scores named by feature
    ranked <- cluster_rows$scores
    names(ranked) <- cluster_rows$names
    # clusterProfiler needs the vector sorted by decreasing score
    ranked <- sort(ranked, decreasing = TRUE)

    # Seed the RNG with the caller-supplied value for reproducible permutations
    if (seed) {
        set.seed(set_seed)
    }

    gsea <- GSEA(   ranked,
                    TERM2GENE = sigs,
                    nPerm = 10000,
                    minGSSize = 5,
                    maxGSSize = 20000,
                    pvalueCutoff = 1,   # Keep every set; filtering happens downstream
                    by = "DOSE",
                    seed = seed)

    return(gsea)
} # End of function

Now that a function is defined, loop through all clusters within degs df:

In [None]:
# List holding the object returned by performGSEA() for each cluster
enrichment_results <- list()

# Per-cluster result tables; bound into one data frame after the loop instead
# of growing a data frame on every iteration
result_frames <- list()

# Loop over all clusters in degs df
for (cluster in unique(degs$group)){
    key <- as.character(cluster)

    # `[[<-` stores the returned object itself as one named list element
    enrichment_results[[key]] <- performGSEA(degs = degs, cluster = cluster, sigs = sigs, seed = TRUE, set_seed = 42)

    # Extract dataframe of results
    df <- enrichment_results[[key]]@result
    # Label cluster for the extracted dataframe (rep() keeps this valid for 0-row results)
    df$cluster <- rep(cluster, nrow(df))
    # Move the cluster column to the front, by name rather than by position
    df <- df[, c("cluster", setdiff(colnames(df), "cluster")), drop = FALSE]

    result_frames[[key]] <- df

    # Show progress
    print(cluster)

    rm(df, key) # Cleanup
    gc(full = TRUE)
}
# End of loop

# Combine all clusters into one data frame (empty data frame if there were none)
results_df <- if (length(result_frames) > 0) {
    do.call(rbind, unname(result_frames))
} else {
    data.frame()
}
rm(result_frames)

**Save CSV of GSEA results and the raw ClusterProfiler objects:**

In [None]:
# Benjamini-Hochberg adjustment computed over every row of results_df,
# i.e. across all clusters at once
results_df$BH <- p.adjust(results_df$pvalue, method = "BH")


if(savedata){
    save_dir <- file.path(nb_dir, "psea")

    # Make sure the output folder exists before writing into it
    dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)

    # Save list containing the raw enrichment result objects to RDS
    saveRDS(enrichment_results, file.path(save_dir, paste0(prefix, ".rds")))

    # Save the combined results table to CSV
    write.csv(results_df, file.path(save_dir, paste0(prefix, ".csv")), row.names = FALSE)
}

**Save the results of the differential expression analysis in a CSV file and the modified adata file:**

### Visualize GSEA Results

**Pre-process data to order clusters accordingly and filter out non-significant results:**

In [None]:
# -log10 of the BH-adjusted p-value, used as the bubble size when plotting
results_df[["log_qvalue"]] <- -log10(results_df[["BH"]])

# Keep rows whose BH-adjusted p-value is below 0.05 (rows with NA are dropped)
significant_df <- results_df[!is.na(results_df[["BH"]]) & results_df[["BH"]] < 0.05, , drop = FALSE]

**Define a function to plot data:**

In [None]:
# Draw a bubble plot of enrichment results.
#
# Args:
#   df:   data frame with the columns `cluster`, `ID`, `NES` and `log_qvalue`.
#   name: string appended to the plot title.
#
# Returns:
#   A ggplot object: clusters on the x-axis, set IDs on the y-axis, colour
#   mapped to NES and point size mapped to log_qvalue. When `df` has no rows,
#   an empty titled plot is returned and a message is emitted.
plotGSEAbubble <- function(df, name){
    plot_title <- paste("Gene Set Enrichment", name)

    # Nothing to draw: max()/seq() below would fail on an empty table
    if (nrow(df) == 0) {
        message("plotGSEAbubble: no rows to plot for '", name, "'; returning an empty plot.")
        return(ggplot() + labs(title = plot_title) + theme_minimal())
    }

    # Three-colour cividis palette
    color_scale <- viridis::cividis(3)

    # Ensure colour limits are symmetric around 0
    max_abs_NES <- max(abs(df$NES), na.rm = TRUE)
    limits <- c(-max_abs_NES, max_abs_NES)

    # Size legend breaks every 0.5 up to the largest finite value; fall back to
    # ggplot2's defaults if no finite value is available (e.g. adjusted p = 0)
    finite_q <- df$log_qvalue[is.finite(df$log_qvalue)]
    size_breaks <- if (length(finite_q) > 0) {
        seq(0, max(finite_q), by = 0.5)
    } else {
        waiver()
    }

    # Create bubble plot
    p <- ggplot(df, aes(x = cluster, y = ID)) +
      geom_point(aes(color = NES, size = log_qvalue)) +
      scale_color_gradientn(
        colors = color_scale,
        limits = limits,
        guide = guide_colorbar(title = "NES")
      ) +
      scale_size_continuous(
        breaks = size_breaks,
        labels = size_breaks,
        guide = guide_legend(title = "-log10(Q-value)")
      ) +
      labs(
        title = plot_title,  # Set the plot title
        x = "Cluster",  # Label for the X-axis
        y = "ID"  # Label for the Y-axis
      ) +
      theme_minimal() +  # Set the plot theme to minimal
      theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate and align X-axis labels
    return(p)
}

**Plot GSEAs by comparison:**

In [None]:
# Clusters whose label contains no "_vs_"
subset_vs_naive <- subset(significant_df, !grepl("_vs_", cluster))
print(plotGSEAbubble(subset_vs_naive, "Subset vs Naive"))

# Clusters containing "shRunx3_vs", excluding any with "D5" in the label
shRunx3_vs_shCD19 <- subset(significant_df, grepl("shRunx3_vs", cluster) & !grepl("D5", cluster))
print(plotGSEAbubble(shRunx3_vs_shCD19, "shRunx3 vs shCD19"))

# Remaining "_vs_" clusters: neither shRunx3 comparisons nor D5
intersubset <- subset(significant_df, grepl("_vs_", cluster) & !grepl("shRunx3_vs_|D5", cluster))
print(plotGSEAbubble(intersubset, "Intersubset Comparisons"))

# Clusters with "D5" in the label.
# NOTE: `intersubset` is reassigned here, so after this cell it holds the
# Day 5 rows rather than the intersubset comparisons plotted just before.
intersubset <- subset(significant_df, grepl("D5", cluster))
print(plotGSEAbubble(intersubset, "Day 5 Comparisons"))