In [1]:
# suppress warnings globally
# NOTE(review): warn = -1 silences every warning for the rest of the session,
# including any deprecation or data-shape warnings raised by later cells --
# confirm this is intended before relying on the outputs.
options(warn = -1)

# load libraries
library(readxl)
library(tidyr)
library(tidyverse)

# set working directory
# NOTE(review): absolute, machine-specific path; every relative path used
# below resolves against it.
setwd('/lustre/home/reynaj/Projects/20241011.Byrd_Lab.IBD_NuLisa')

# set the output directory and make sure it exists: write.table() does not
# create missing directories, so the save steps would otherwise fail
outdir <- "results/aggregated/comp_data/"
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mpurrr    [39m 1.0.2
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtibble   [39m 3.2.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


## Process the clinical data

In [2]:
# path to the biospecimen manifest workbook (relative to the working directory)
fn <- "results/raw/ADA_IBD_Saliva_Biospecimen Manifest Form-for-NULISA_241022.V2.Updated_Sample20.xlsx"

# read the "Aliquot Information" sheet as the clinical table
clinical_data <- fn %>%
  read_excel(sheet = "Aliquot Information")

In [3]:
# rename columns for programming use
#
# Normalise a character vector of labels: lower-case everything, then apply
# the substitutions below in order. Each substitution is a
# (regex pattern, replacement) pair handed to gsub().
#
# x: character vector of labels
# returns: character vector of the same length with the cleaned labels
transform_string <- function(x) {
  substitutions <- list(
    c(" ", "_"),     # spaces become "_"
    c("\\(", "_"),   # "(" becomes "_"
    c("\\)", "")     # ")" is removed
  )

  cleaned <- tolower(x)
  for (rule in substitutions) {
    cleaned <- gsub(rule[[1]], rule[[2]], cleaned)
  }
  cleaned
}

# Apply the function to every column name. vapply() guarantees a character
# vector with one cleaned name per column (sapply() would silently return a
# list for empty input); the result is named by the original column names.
transformed_cols <- vapply(
  colnames(clinical_data),
  transform_string,
  character(1)
)
colnames(clinical_data) <- transformed_cols

In [4]:
# keep only the rows that have a project name (rows where it is NA are dropped)
clinical_data <- clinical_data %>%
  filter(!is.na(project_name))

# subject ids of the remaining samples
clinical_sample_names <- clinical_data[["original_subject_id"]]

In [5]:
# Build the rows for the three sample-control replicates as a dataframe.
# All three share the same diagnosis label, disease activity and
# matched_subject_id, so those columns are repeated once per replicate.
new_row <- local({
  control_ids <- c("SC_Rep01", "SC_Rep02", "SC_Rep03")
  n_controls <- length(control_ids)

  data.frame(
    original_subject_id = control_ids,
    sample_id = control_ids,
    ibd_diagnosis = rep("Alamar_Sample_Control", n_controls),
    disease_activity = rep("N/A", n_controls),
    matched_subject_id = rep(999, n_controls)
  )
})

In [6]:
# Append the control rows to the clinical table; columns that new_row does
# not provide are filled with NA by bind_rows()
clinical_data <- clinical_data %>%
  bind_rows(new_row)

In [7]:
# def indicator function
#
# Label each element of x with one of two categories depending on whether it
# appears in check_list. Works on a single value (as before) and on whole
# vectors.
#
# x:          value or vector of values to classify
# check_list: values that map to the first category
# categories: two labels; categories[[1]] is used for members of check_list,
#             categories[[2]] for everything else (including NA, because
#             %in% never returns NA)
# returns:    vector of labels with the same length as x
get_indicator <- function(x, check_list, categories) {
  # start with the "not in list" label everywhere, then overwrite the members
  labels <- rep(categories[[2]], length(x))
  labels[x %in% check_list] <- categories[[1]]
  labels
}

# add ibd disease indicator
# Any diagnosis that is not in the check list (including missing values and
# the "Alamar_Sample_Control" rows) is labelled "Control Super Group".
# vapply() pins the result to one character label per row.
ibd_check_list <- c("CD", "UC", "IBD-U")
ibd_indicator <- vapply(
  clinical_data$ibd_diagnosis,
  get_indicator,
  character(1),
  check_list = ibd_check_list,
  categories = c("IBD Super Group", "Control Super Group")
)
clinical_data$ibd_indicator <- ibd_indicator

# add disease activity indicator
# Any activity value that is not in the check list (including missing values
# and "N/A") is labelled "In-active Disease".
# NOTE(review): only "Moderate" and "Mild" count as active -- confirm no other
# active level (e.g. a severe grade) can occur in the manifest.
da_check_list <- c("Moderate", "Mild")
da_indicator <- vapply(
  clinical_data$disease_activity,
  get_indicator,
  character(1),
  check_list = da_check_list,
  categories = c("Active Disease", "In-active Disease")
)
clinical_data$disease_activity_indicator <- da_indicator

In [8]:
# preview the first three rows of the clinical table
clinical_data[seq_len(3), ]

project_name,sample_type,sample_id,collection_date,box_id,box_location,total_volume__ml,original_subject_id,event_name,ibd_diagnosis,date_collection,disease_activity,matched_subject_id,notes,ibd_indicator,disease_activity_indicator
<chr>,<chr>,<chr>,<dttm>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dttm>,<chr>,<dbl>,<chr>,<chr>,<chr>
Byrd_IBD_Saliva_sup,Saliva,TRP-34082,2023-11-30 15:34:05,1,A1,1100,TRP-34082,1,CD,2023-11-30,Quiescent,1,,IBD Super Group,In-active Disease
Byrd_IBD_Saliva_sup,Saliva,TRP-34086,2023-11-30 15:34:05,1,A2,1100,TRP-34086,1,CD,2023-11-30,Quiescent,3,,IBD Super Group,In-active Disease
Byrd_IBD_Saliva_sup,Saliva,TRP-42335,2023-12-20 16:51:00,1,A3,1000,TRP-42335,1,CD,2023-12-20,Quiescent,7,,IBD Super Group,In-active Disease


## Process the protein levels

In [9]:
# load assay data from the "NPQ Counts" sheet of the assay workbook
fn <- "results/raw/P-000458_ADA_NULISAseq_Inflammation Panel_1-NPQ Counts_2-Target Detectability_3-Sample Information_2024_08_26.xlsx"
data <- fn %>%
  read_excel(sheet = "NPQ Counts")

# strip every "<capital letter>_<digits>_" fragment (trailing underscore
# included) from the sample names
new_sample_names <- gsub(
  pattern = "[A-Z]_[0-9]+_",
  replacement = "",
  x = data$SampleName
)
data$SampleName <- new_sample_names

In [10]:
# pivot the data to make a matrix: one row per SampleName, one column per
# Target, filled with the NPQ values
wide_data <- data %>%
  pivot_wider(id_cols = SampleName, names_from = Target, values_from = NPQ)

# join with the wetlab data to aggregate replicate values
# NOTE(review): assay samples without a manifest entry get
# matched_subject_id = NA and would be pooled into a single NA group below --
# confirm none are expected.
wide_data <- wide_data %>%
  left_join(clinical_data, by = c("SampleName" = "original_subject_id"))

# aggregate protein levels on matched_subject_id
# The mean is passed as an explicit function: supplying extra arguments
# through across(..., mean, na.rm = TRUE) is deprecated since dplyr 1.1.0.
# Numeric clinical columns are averaged here too; they are discarded by the
# column filter below.
wide_data <- wide_data %>%
  group_by(matched_subject_id) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))

# get a final/shared protein list: the id column first, then every assay
# target that survived as a column of the aggregated table
prot_targets <- c("matched_subject_id", unique(data$Target))
final_prot_targets <- prot_targets[prot_targets %in% colnames(wide_data)]

# filter by this final list
wide_data <- wide_data[, final_prot_targets]

In [11]:
# transpose the data to be in the correct format for limma
# rows = genes
# columns = samples
t_data <- t(wide_data)

# the first row holds the matched_subject_id values: promote it to the column
# names, then drop it from the matrix body
colnames(t_data) <- t_data[1, ]
t_data <- t_data[-1, ]

# save as a tab-separated file (row names are written, values are unquoted)
outfn <- file.path(outdir, "protein_levels.npq.tsv")
write.table(
  x = t_data,
  file = outfn,
  quote = FALSE,
  sep = "\t",
  col.names = TRUE
)

In [12]:
# preview the first three rows of the transposed matrix
t_data[seq_len(3), ]

Unnamed: 0,1,3,4,5,6,7,8,9,10,11,⋯,56,57,58,59,61,63,64,67,201,999
AGER,5.493463,5.58393,8.194804,6.063246,2.69924,6.4691,5.507946,3.547912,0.0,6.089471,⋯,0.0,5.509312,0.0,7.240361,8.127315,6.178675,0.0,7.956293,7.745765,13.157
AGRP,8.765074,10.49972,11.190956,9.749962,7.201741,11.61736,8.958267,9.236556,10.76013,10.918441,⋯,11.42757,10.696368,7.636685,10.421485,9.25526,9.750478,9.101757,11.25174,10.561049,12.59489
ANGPT1,15.762125,16.95419,16.759397,19.285812,17.125303,17.49572,15.013518,16.14003,14.50187,17.668777,⋯,13.85228,17.370012,16.297275,16.403978,16.844907,18.141203,17.822363,18.138904,16.18778,13.15632


## Harmonize and save data

In [13]:
# aggregate the clinical data: keep exactly one row per matched_subject_id,
# the one with the largest event_name (with_ties = FALSE returns a single row
# even when several rows share that value). ungroup() drops the grouping so
# the result behaves like a plain table in the steps below.
final_clinical_data <- clinical_data %>%
  group_by(matched_subject_id) %>%
  slice_max(order_by = event_name, with_ties = FALSE) %>%
  ungroup()

# order the dataframe so its rows follow the column order of t_data
# (columns of t_data without a matching matched_subject_id yield an all-NA row)
final_clinical_data <- final_clinical_data[match(colnames(t_data), final_clinical_data$matched_subject_id), ]

# save
# NOTE(review): quote = FALSE leaves free-text columns such as notes unquoted;
# a tab or newline inside them would break the TSV -- confirm they are clean.
outfn <- file.path(outdir, "clinical_data.tsv")
write.table(final_clinical_data, outfn, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)