In [1]:
require(dplyr)
require(ggpubr)
require(data.table)
library(tidyverse)

library(dplyr)
library(broom)
library(tidyr)
library(purrr)
options(warn = -1)

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggpubr

Loading required package: ggplot2

Loading required package: data.table


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
✔ readr     2.1.4     
── Conflicts ───────────────────────────────────

**Functions**

In [2]:
#' One-vs-rest Wilcoxon tests between Level2 subtypes within each Level1
#'
#' For every Level1 category, and for every (Level2, TF) pair observed inside
#' it, the activity of that Level2 subtype is compared against the pooled
#' activity of all other Level2 subtypes of the same Level1 for the same TF,
#' using a two-sided Wilcoxon rank-sum test.
#'
#' @param data Data frame with columns `Level1`, `Level2`, `TF` and
#'   `activity`. Rows with missing `activity` are ignored.
#' @return A tibble with one row per test that was run: the `broom::tidy()`
#'   columns of the test plus `comparison` (the list key), `Level1`, `Level2`,
#'   `TF`, `mean_one` and `mean_all`. A comparison is skipped when either
#'   group has fewer than three observations.
compare_one_vs_all <- function(data) {
  results <- list()

  for (level1 in unique(data$Level1)) {
    # `.env$` pins the loop variable so that a data column that happens to
    # share its name cannot shadow it inside filter().
    level1_data <- data %>%
      filter(Level1 == .env$level1, !is.na(activity))

    # Every Level2 x TF pair present in this Level1 category.
    combinations <- expand.grid(
      Level2 = unique(level1_data$Level2),
      TF = unique(level1_data$TF)
    )

    # seq_len() yields no iterations for an empty grid (1:0 would yield two).
    for (i in seq_len(nrow(combinations))) {
      level2 <- combinations$Level2[i]
      tf <- combinations$TF[i]

      # "one" = the Level2 subtype under test, "all" = every other subtype.
      one_data <- level1_data %>%
        filter(Level2 == .env$level2, TF == .env$tf)
      all_data <- level1_data %>%
        filter(Level2 != .env$level2, TF == .env$tf)

      mean_one <- mean(one_data$activity)
      mean_all <- mean(all_data$activity)

      # Require at least three observations in each group.
      if (nrow(one_data) >= 3 && nrow(all_data) >= 3) {
        # Two-sided Wilcoxon rank-sum test.
        test_result <- wilcox.test(
          one_data$activity,
          all_data$activity,
          alternative = "two.sided"
        )

        tidy_result <- tidy(test_result)
        tidy_result$Level1 <- level1
        tidy_result$Level2 <- level2
        tidy_result$TF <- tf
        tidy_result$mean_one <- mean_one
        tidy_result$mean_all <- mean_all
        results[[paste(level1, level2, tf)]] <- tidy_result
      }
    }
  }

  # Stack all comparisons; the list names become the `comparison` column.
  bind_rows(results, .id = "comparison")
}

In [3]:
library(dplyr)
library(broom)
#' One-vs-rest Wilcoxon tests between diseases within each Level2 subtype
#'
#' For every Level1 category, every (Level2, TF) pair observed inside it and
#' every value of `Disease` in that subset, the activity of that disease is
#' compared against the pooled activity of all other diseases using a
#' two-sided Wilcoxon rank-sum test.
#'
#' @param data Data frame with columns `Level1`, `Level2`, `TF`, `Disease`
#'   and `activity`. Rows with missing `activity` are ignored. Additional
#'   columns (including a lower-case `disease`) are allowed.
#' @return A tibble with one row per test that was run: the `broom::tidy()`
#'   columns of the test plus `comparison` (the list key), `Level1`, `Level2`,
#'   `TF`, `Disease`, `mean_one` and `mean_all`. A comparison is skipped when
#'   either group has fewer than three observations.
compare_disease_within_level2 <- function(data) {
  results <- list()

  for (level1 in unique(data$Level1)) {
    level1_data <- data %>%
      filter(Level1 == .env$level1, !is.na(activity))

    combinations <- expand.grid(
      Level2 = unique(level1_data$Level2),
      TF = unique(level1_data$TF)
    )

    # seq_len() yields no iterations for an empty grid (1:0 would yield two).
    for (i in seq_len(nrow(combinations))) {
      level2 <- combinations$Level2[i]
      tf <- combinations$TF[i]

      subset_data <- level1_data %>%
        filter(Level2 == .env$level2, TF == .env$tf)

      for (disease in unique(subset_data$Disease)) {
        # `.env$disease` is required here: without it, a `disease` column in
        # the data takes precedence over the loop variable, the comparison
        # becomes column-vs-column and the "all others" group ends up empty.
        one_group <- subset_data %>% filter(Disease == .env$disease)
        all_others <- subset_data %>% filter(Disease != .env$disease)

        mean_one <- mean(one_group$activity, na.rm = TRUE)
        mean_all <- mean(all_others$activity, na.rm = TRUE)

        # Require at least three observations in each group.
        if (nrow(one_group) >= 3 && nrow(all_others) >= 3) {
          test_result <- wilcox.test(
            one_group$activity,
            all_others$activity,
            alternative = "two.sided"
          )

          tidy_result <- tidy(test_result)
          tidy_result$Level1 <- level1
          tidy_result$Level2 <- level2
          tidy_result$TF <- tf
          tidy_result$Disease <- disease
          tidy_result$mean_one <- mean_one
          tidy_result$mean_all <- mean_all

          results[[paste(level1, level2, tf, disease, sep = "_")]] <- tidy_result
        }
      }
    }
  }

  # Stack all comparisons; the list names become the `comparison` column.
  bind_rows(results, .id = "comparison")
}

**Parameters**

In [4]:
# ---- Parameters ----
# Diseases for which a per-disease TF-activity table is read below.
diseases <- c("SLE", "Cirrhosis", "Flu", "HNSCC")

# Project root and the gene-regulatory-network task folder inside it.
workDir <- "/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/"
task_dir <- paste0(workDir, "03_downstream_analysis/07_gene_regulatory_network/")

# ---- Input ----
results_path <- paste0(task_dir, "/results/")
data_folder <- paste0(
  results_path,
  "test_revision_20250402/TFactivity_level2/results/"
)

# ---- Output ----
outputpath <- paste0(data_folder, "figures/")

# ---- Palette ----
# Sourced for its colour definitions; presumably this is where `cell_colors`
# and `cell_level2_colors` (used by the plotting cells) come from; verify.
color_palette_path <- paste0(task_dir, "data/colors_palette.R")
source(color_palette_path)

# Separate plots per disease

In [12]:
# For each disease: load its TF-activity table, reshape it to long format and
# save a box plot faceted by TF (rows) and Level1 cell type (columns).
for (disease in diseases) {

  # Per-disease table holding one column per TF (STAT1, SP1).
  inputpath <- file.path(data_folder, paste0("STAT1_SP1_Level2_", disease, ".csv"))
  level2 <- read.csv(inputpath, header = TRUE)

  # Factor level orders; cell-type orders follow the palette name order.
  tf_order <- c("STAT1", "SP1")
  l1_cell_type_order <- names(cell_colors)
  l2_cell_type_order <- names(cell_level2_colors)

  # Drop the excluded Level1 groups, go from one column per TF to one row per
  # (observation, TF), then fix the factor orders.
  toplot_level2_all <- level2 %>%
    filter(!(Level1 %in% c("Cycling_cells", "Progenitors", "Platelets"))) %>%
    pivot_longer(cols = c(STAT1, SP1), names_to = "TF", values_to = "activity") %>%
    mutate(Level1 = factor(Level1, levels = l1_cell_type_order)) %>%
    mutate(Level2 = factor(Level2, levels = l2_cell_type_order)) %>%
    mutate(TF = factor(TF, levels = tf_order))

  # One box per Level2 subtype; x axis is free so each Level1 panel only shows
  # its own subtypes.
  plot_l2 <- ggboxplot(toplot_level2_all, x = "Level2", y = "activity", fill = "Level2") +
    facet_grid(TF ~ Level1, scales = "free_x") +
    scale_fill_manual(values = cell_level2_colors) +
    theme_bw() +
    ggtitle(paste0(disease, " TF activity (Level2)")) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      strip.text = element_text(size = 8)
    )

  # Write <disease>_TFactivity_level2.pdf into the figures folder.
  ggsave(
    filename = paste0(disease, "_TFactivity_level2.pdf"),
    path = outputpath,
    plot = plot_l2,
    device = "pdf",
    width = 20,
    height = 8
  )
}

# Mono - all diseases together

In [14]:
library(dplyr)
library(tidyr)
library(ggpubr)
library(ggplot2)

# Collect the Mono rows of every disease into one long table, tagging each row
# with the disease whose file it was read from.
mono_dfs <- list()

for (disease in diseases) {

  inputpath <- file.path(data_folder, paste0("STAT1_SP1_Level2_", disease, ".csv"))
  level2 <- read.csv(inputpath, header = TRUE)

  # Factor level orders; cell-type orders follow the palette name order.
  tf_order <- c("STAT1", "SP1")
  l1_cell_type_order <- names(cell_colors)
  l2_cell_type_order <- names(cell_level2_colors)

  toplot_level2_all <- level2 %>%
    filter(!Level1 %in% c("Cycling_cells", "Progenitors", "Platelets")) %>%
    pivot_longer(cols = c(STAT1, SP1), names_to = "TF", values_to = "activity") %>%
    mutate(
      Level1 = factor(Level1, levels = l1_cell_type_order),
      Level2 = factor(Level2, levels = l2_cell_type_order),
      TF = factor(TF, levels = tf_order),
      # The input tables carry their own `disease` column (see the head()
      # output of the combined table), which would shadow the loop variable
      # inside mutate(). `.env$` forces the loop variable, so `Disease` is
      # really the tag of the file being read.
      Disease = .env$disease
    )

  # Subset to Mono
  mono_dfs[[disease]] <- toplot_level2_all %>% filter(Level1 == "Mono")
}

# Combine all Mono rows
mono_all_diseases <- bind_rows(mono_dfs)

# Box plot of Mono TF activity: Level2 subtype on the x axis, one box per
# disease, one panel per TF with independent axes.
plot_l2 <- ggplot(
  data = mono_all_diseases,
  mapping = aes(x = Level2, y = activity, fill = Disease)
) +
  geom_boxplot(outlier.size = 0.5) +
  facet_wrap(~TF, scales = "free") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "TF Activity in Mono Cells by Level2 and Disease") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    strip.text = element_text(size = 10)
  )

# Save next to the per-disease figures.
ggsave(
  filename = "Mono_TFactivity_level2.pdf",
  path = outputpath,
  plot = plot_l2,
  device = "pdf",
  width = 20,
  height = 8
)

# P-value comparisons

**SLE and Flu**

In [10]:
# One-vs-rest Level2 comparisons for SLE and Flu; writes one table of raw and
# BH-adjusted p-values per disease.
for (disease in c("SLE", "Flu")) {
  # Read input
  inputpath <- file.path(data_folder, paste0("STAT1_SP1_Level2_", disease, ".csv"))
  level2 <- read.csv(inputpath, header = TRUE)

  # Orders
  tf_order <- c("STAT1", "SP1")
  l1_cell_type_order <- names(cell_colors)
  l2_cell_type_order <- names(cell_level2_colors)

  # Prepare data
  toplot_level2_all <- level2 %>%
    filter(!Level1 %in% c("Cycling_cells", "Progenitors", "Platelets")) %>%
    pivot_longer(cols = c(STAT1, SP1), names_to = "TF", values_to = "activity") %>%
    mutate(
      Level1 = factor(Level1, levels = l1_cell_type_order),
      Level2 = factor(Level2, levels = l2_cell_type_order),
      TF = factor(TF, levels = tf_order)
    )

  final_comparison_results <- compare_one_vs_all(toplot_level2_all)

  final_comparison_results_adjusted <- final_comparison_results %>%
    # Benjamini-Hochberg adjustment over all comparisons of this disease.
    mutate(
      pval_adj = p.adjust(p.value, method = "BH")
    ) %>%
    # Disabled alternative: adjust within each TF separately.
    #group_by(TF) %>%
    #mutate(
    #  pval_adj_indep = p.adjust(p.value, method = "BH")
    #) %>%
    #ungroup() %>%
    # Disabled: keep only rows significant on the adjusted p-value.
    #filter(pval_adj < 0.05) %>%
    # Sort by adjusted p-value within each Level1 and TF.
    arrange(Level1, TF, pval_adj)

  tables_path <- paste0(outputpath, "/supplementary_tables/", disease, "_Level2_pval.csv")
  # write.csv() does not create missing directories, so make sure the target
  # folder exists (a no-op when it already does).
  dir.create(dirname(tables_path), recursive = TRUE, showWarnings = FALSE)
  write.csv(final_comparison_results_adjusted, tables_path, row.names = FALSE)

}

In [None]:
# Example call of the one-vs-rest comparison on a prepared long table.
# NOTE(review): `toplot_level2` is not created anywhere in the visible
# notebook (the loops build `toplot_level2_all`); confirm which object is meant.
final_comparison_results <- toplot_level2 %>% compare_one_vs_all()

**Within cell type across diseases**

In [6]:
# Rebuild the combined Mono table (all diseases) and keep only its SP1 rows.
mono_dfs <- list()

# Loop over diseases
for (disease in diseases) {

  inputpath <- file.path(data_folder, paste0("STAT1_SP1_Level2_", disease, ".csv"))
  level2 <- read.csv(inputpath, header = TRUE)

  # Factor level orders; cell-type orders follow the palette name order.
  tf_order <- c("STAT1", "SP1")
  l1_cell_type_order <- names(cell_colors)
  l2_cell_type_order <- names(cell_level2_colors)

  toplot_level2_all <- level2 %>%
    filter(!Level1 %in% c("Cycling_cells", "Progenitors", "Platelets")) %>%
    pivot_longer(cols = c(STAT1, SP1), names_to = "TF", values_to = "activity") %>%
    mutate(
      Level1 = factor(Level1, levels = l1_cell_type_order),
      Level2 = factor(Level2, levels = l2_cell_type_order),
      TF = factor(TF, levels = tf_order),
      # The input tables already contain a `disease` column, which would
      # shadow the loop variable inside mutate(). `.env$` forces the loop
      # variable, so `Disease` is the tag of the file being read.
      Disease = .env$disease
    )

  # Subset to Mono
  mono_dfs[[disease]] <- toplot_level2_all %>% filter(Level1 == "Mono")
}

# Combine all Mono rows
mono_all_diseases <- bind_rows(mono_dfs)
# Restrict to the SP1 transcription factor.
mono_all_diseases_sp1 <- mono_all_diseases[mono_all_diseases$TF == "SP1", ]

In [7]:
# Preview the first six SP1 rows of the combined Mono table.
head(mono_all_diseases_sp1, n = 6L)

Level2,level_0,level_1,Level1,studyID,sex,sampleID,disease,binned_age,TF,activity,Disease
<fct>,<chr>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<dbl>,<chr>
Mono_IFNresponse,Mono,Perez2022_1004_T0_Mono_IFNresponse-3,Mono,Perez2022,female,Perez2022_1004_T0,SLE,31-40,SP1,0.3311216,SLE
Mono_IFNresponse,Mono,Perez2022_1014_T0_Mono_IFNresponse-3,Mono,Perez2022,female,Perez2022_1014_T0,SLE,31-40,SP1,-1.5119505,SLE
Mono_IFNresponse,Mono,Perez2022_1015_T0_Mono_IFNresponse-3,Mono,Perez2022,female,Perez2022_1015_T0,SLE,51-60,SP1,-1.659106,SLE
Mono_IFNresponse,Mono,Perez2022_1019_T0_Mono_IFNresponse-3,Mono,Perez2022,female,Perez2022_1019_T0,SLE,61-70,SP1,0.2892323,SLE
Mono_IFNresponse,Mono,Perez2022_1046_T0_Mono_IFNresponse-3,Mono,Perez2022,female,Perez2022_1046_T0,SLE,51-60,SP1,-0.4458592,SLE
Mono_IFNresponse,Mono,Perez2022_1051_T0_Mono_IFNresponse-3,Mono,Perez2022,male,Perez2022_1051_T0,SLE,41-50,SP1,-0.4720872,SLE


In [8]:
# Working objects for the across-disease comparison of SP1 activity in Mono.
# `results` collects one tidy test result per comparison.
results <- list()
data <- mono_all_diseases_sp1

# One row per (Level2, TF) pair. Disease is deliberately not part of the grid;
# an earlier three-way grid was overwritten immediately and has been removed.
combinations <- expand.grid(Level2 = unique(data$Level2), TF = unique(data$TF))
combinations

Level2,TF
<fct>,<fct>
Mono_IFNresponse,SP1
Mono_classical,SP1
Mono_inflammatory,SP1
Mono_nonClassical,SP1
Mono_regulatory,SP1


In [9]:
# Spot check: take the first (Level2, TF) pair, filter the `disease` column
# for "SLE" and list which disease values remain.
i <- 1
level2 <- combinations$Level2[i]
tf <- combinations$TF[i]
sel_disease <- "SLE"
one_data <- filter(data, Level2 == level2, TF == tf, disease == sel_disease)
unique(one_data$disease)

In [10]:
# For every (Level2, TF) pair, compare each disease against all other diseases
# with a two-sided Wilcoxon rank-sum test and append the tidy result to
# `results`.
level1 <- "Mono"  # every row of `data` is a Mono row

# seq_len() yields no iterations for an empty grid (1:0 would yield two).
for (i in seq_len(nrow(combinations))) {
  level2 <- combinations$Level2[i]
  tf <- combinations$TF[i]

  for (sel_disease in unique(data$Disease)) {
    # Filter on the same `Disease` column that is being iterated over.
    # `.env$` pins the loop variables so a same-named data column cannot
    # shadow them.
    one_data <- data %>%
      filter(Level2 == .env$level2, TF == .env$tf, Disease == .env$sel_disease)
    all_data <- data %>%
      filter(Level2 == .env$level2, TF == .env$tf, Disease != .env$sel_disease)

    mean_one <- mean(one_data$activity)
    mean_all <- mean(all_data$activity)

    # Require at least three observations in each group.
    if (nrow(one_data) >= 3 && nrow(all_data) >= 3) {
      # Two-sided Wilcoxon rank-sum test.
      test_result <- wilcox.test(
        one_data$activity,
        all_data$activity,
        alternative = "two.sided"
      )

      # Annotate the tidy result and store it under a unique key.
      tidy_result <- tidy(test_result)
      tidy_result$Level1 <- level1
      tidy_result$Level2 <- level2
      tidy_result$Disease <- sel_disease
      tidy_result$TF <- tf
      tidy_result$mean_one <- mean_one
      tidy_result$mean_all <- mean_all
      results[[paste(level1, level2, tf, sel_disease)]] <- tidy_result
    }
  }
}

In [11]:
# Stack the per-comparison rows; the list names become the `comparison` column.
final_results <- results %>% bind_rows(.id = "comparison")
final_results

comparison,statistic,p.value,method,alternative,Level1,Level2,Disease,TF,mean_one,mean_all
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<fct>,<chr>,<fct>,<dbl>,<dbl>
Mono Mono_IFNresponse SP1 SLE,1920,0.03183529,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_IFNresponse,SLE,SP1,-1.03507372,-0.71736185
Mono Mono_IFNresponse SP1 Cirrhosis,1086,0.002858592,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_IFNresponse,Cirrhosis,SP1,-0.10472738,-1.01115242
Mono Mono_IFNresponse SP1 Flu,379,0.03930354,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_IFNresponse,Flu,SP1,-1.65480322,-0.93689729
Mono Mono_IFNresponse SP1 HNSCC,2003,0.03382681,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_IFNresponse,HNSCC,SP1,-0.5874391,-1.01904188
Mono Mono_classical SP1 SLE,2204,0.05143995,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,SLE,SP1,-0.18994491,0.12157425
Mono Mono_classical SP1 Cirrhosis,1040,0.01646189,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,Cirrhosis,SP1,0.76706407,-0.1639728
Mono Mono_classical SP1 Flu,641,0.1849295,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,Flu,SP1,-0.70612975,-0.08871043
Mono Mono_classical SP1 HNSCC,2111,0.05651618,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,HNSCC,SP1,0.26981813,-0.17435601
Mono Mono_inflammatory SP1 SLE,1379,0.0110242,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_inflammatory,SLE,SP1,0.02458958,0.59241248
Mono Mono_inflammatory SP1 Cirrhosis,900,0.02531521,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_inflammatory,Cirrhosis,SP1,1.07772825,0.08086133


In [12]:
# Add Benjamini-Hochberg adjusted p-values, computed over all Mono comparisons
# at once, then sort by adjusted p-value within each Level2 subtype and TF.
# No rows are filtered out.
final_comparison_results_adjusted <- arrange(
    mutate(final_results, pval_adj = p.adjust(p.value, method = "BH")),
    Level2, TF, pval_adj
)

# Show the adjusted table.
final_comparison_results_adjusted

comparison,statistic,p.value,method,alternative,Level1,Level2,Disease,TF,mean_one,mean_all,pval_adj
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<fct>,<chr>,<fct>,<dbl>,<dbl>,<dbl>
Mono Mono_classical SP1 Cirrhosis,1040,0.01646189,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,Cirrhosis,SP1,0.76706407,-0.1639728,0.0557871204
Mono Mono_classical SP1 SLE,2204,0.05143995,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,SLE,SP1,-0.18994491,0.12157425,0.0888508263
Mono Mono_classical SP1 HNSCC,2111,0.05651618,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,HNSCC,SP1,0.26981813,-0.17435601,0.0894839502
Mono Mono_classical SP1 Flu,641,0.1849295,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_classical,Flu,SP1,-0.70612975,-0.08871043,0.2509757246
Mono Mono_nonClassical SP1 SLE,1282,2.648907e-05,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_nonClassical,SLE,SP1,-1.33527118,-0.53157489,0.0005032924
Mono Mono_nonClassical SP1 HNSCC,2333,7.708345e-05,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_nonClassical,HNSCC,SP1,-0.33347321,-1.28167243,0.0007322927
Mono Mono_nonClassical SP1 Cirrhosis,977,0.01761699,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_nonClassical,Cirrhosis,SP1,-0.38076315,-1.20954538,0.0557871204
Mono Mono_nonClassical SP1 Flu,524,0.69958,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_nonClassical,Flu,SP1,-1.26993596,-1.16653933,0.781883558
Mono Mono_inflammatory SP1 SLE,1379,0.0110242,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_inflammatory,SLE,SP1,0.02458958,0.59241248,0.0523649342
Mono Mono_inflammatory SP1 Cirrhosis,900,0.02531521,Wilcoxon rank sum test with continuity correction,two.sided,Mono,Mono_inflammatory,Cirrhosis,SP1,1.07772825,0.08086133,0.0687127182


In [13]:
# Export the adjusted across-disease Mono comparisons as a supplementary table.
tables_path <- paste0(
  outputpath,
  "/supplementary_tables/",
  "Mono_comparison_across_diseases.csv"
)
write.csv(
  x = final_comparison_results_adjusted,
  file = tables_path,
  row.names = FALSE
)


In [None]:


# Re-run the BH adjustment on whatever `final_comparison_results` currently
# holds and sort by adjusted p-value within each Level1 and TF. No rows are
# filtered out.
# NOTE(review): the output name is hard-coded to SLE and the path is relative
# to the working directory; confirm that `final_comparison_results` really
# holds the SLE comparisons before running this cell.
final_comparison_results_adjusted <- arrange(
  mutate(final_comparison_results, pval_adj = p.adjust(p.value, method = "BH")),
  Level1, TF, pval_adj
)

output_path <- "results/supplementary_tables/SLE_Level2_pval.csv"
write.csv(
  x = final_comparison_results_adjusted,
  file = output_path,
  row.names = FALSE
)

# Display the adjusted table.
final_comparison_results_adjusted

**Boxplots**

In [None]:
# Box plot of activity per Level2 subtype, faceted by TF (rows) and Level1
# (columns); the x axis is free so each panel only shows its own subtypes.
# NOTE(review): `toplot_level2` is not created anywhere in the visible
# notebook; confirm which table is meant.
plot_l2 <- ggboxplot(
  toplot_level2,
  x = "Level2",
  y = "activity",
  fill = "Level2"
) +
  facet_grid(TF ~ Level1, scales = "free_x") +
  scale_fill_manual(values = cell_level2_colors) +
  theme_bw() +
  ggtitle("") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5))

In [None]:
# Auto-print `plot_l2` so the notebook renders the figure.
plot_l2

In [None]:
# Save the current `plot_l2` as a test PDF in the figures folder.
ggsave(
  filename = "sle_TFactivity_level2_test.pdf",
  plot = plot_l2,
  path = outputpath,
  device = "pdf",
  width = 20,
  height = 8
)