In [13]:
library(readxl)
library(tidyr)
library(tidyverse)

# Suppress warnings globally
# NOTE(review): a negative `warn` value makes R ignore every warning for the
# rest of the session, so problems raised by the steps below stay hidden --
# confirm this is intended.
options(warn = -1)

# set working directory
# The relative "results/..." paths used in this notebook resolve against it.
setwd('/lustre/home/reynaj/Projects/20241011.Byrd_Lab.IBD_NuLisa')

## Process the protein levels

In [14]:
# read the "NPQ Counts" sheet of the assay workbook into `data`
fn <- "results/raw/P-000458_ADA_NULISAseq_Inflammation Panel_1-NPQ Counts_2-Target Detectability_3-Sample Information_2024_08_26.xlsx"
data <- read_excel(path = fn, sheet = "NPQ Counts")

In [15]:
# preview the first three rows of the assay table
data[1:3,]

Panel,PanelLotNumber,PlateID,SampleName,SampleType,Target,AlamarTargetID,UniProtID,ProteinName,SampleQC,LOD,NPQ
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
200-Plex Inflammation Panel v2,panelLot022,Plate_01,A_01_TRP-34082,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific receptor,PASS,0,5.493463
200-Plex Inflammation Panel v2,panelLot022,Plate_01,A_02_TRP-34086,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific receptor,PASS,0,4.453723
200-Plex Inflammation Panel v2,panelLot022,Plate_01,A_03_TRP-42335,Sample,AGER,t5521,Q15109,Advanced glycosylation end product-specific receptor,PASS,0,6.4691


In [16]:
# pivot the long assay table into one row per sample, one column per target
wide_data <- data %>%
  pivot_wider(id_cols = SampleName, names_from = Target, values_from = NPQ)

# build the table in the orientation limma expects
# rows = genes (targets)
# columns = samples
# The SampleName column is dropped before transposing: t() on the full table
# goes through a character matrix, which turns every NPQ value into a
# formatted string (padded, at most 7 significant digits by default).
target_cols <- setdiff(colnames(wide_data), "SampleName")
npq_matrix <- t(as.matrix(wide_data[, target_cols]))
colnames(npq_matrix) <- wide_data$SampleName
t_data <- as.data.frame(npq_matrix)

# get the column names
sample_names <- colnames(t_data)

# remove the "<letter>_<digits>_" tag, e.g. the "A_01_" in "A_01_TRP-34082"
new_sample_names <- gsub("[A-Z]_[0-9]+_", "", sample_names)

# assign the new column names back to the data frame
colnames(t_data) <- new_sample_names

# write targets (as row names) x samples; values stay numeric, so they are
# written at full precision rather than as pre-formatted strings
write.table(t_data, "results/comp_data/protein_levels.npq.tsv", sep = "\t", col.names = TRUE, quote = FALSE)

## Process the clinical data

In [17]:
# read the "Aliquot Information" sheet of the manifest into `clinical_data`
fn <- "results/raw/ADA_IBD_Saliva_Biospecimen Manifest Form-for-NULISA_241022.V2.Updated_Sample20.xlsx"
clinical_data <- read_excel(path = fn, sheet = "Aliquot Information")

In [18]:
# Turn a column label into a programming-friendly name:
# lowercase, spaces and "(" become "_", and ")" is dropped.
transform_string <- function(x) {
  lowered <- tolower(x)
  underscored <- gsub("[ (]", "_", lowered)
  gsub(")", "", underscored, fixed = TRUE)
}

# Normalise every clinical column name with transform_string().
# vapply() guarantees a character vector with one name per column, whereas
# sapply() changes its return type depending on the input.
transformed_cols <- vapply(
  as.vector(colnames(clinical_data)),
  transform_string,
  FUN.VALUE = character(1)
)
colnames(clinical_data) <- transformed_cols

In [19]:
# keep only rows that have a project name, then record their subject ids
has_project <- !is.na(clinical_data$project_name)
clinical_data <- clinical_data[has_project, ]
clinical_sample_names <- clinical_data$original_subject_id

In [20]:
# Build three placeholder clinical records, one per sample-control replicate
control_ids <- c("SC_Rep01", "SC_Rep02", "SC_Rep03")
n_controls <- length(control_ids)
new_row <- data.frame(
  original_subject_id = control_ids,
  sample_id = control_ids,
  ibd_diagnosis = rep("Alamar_Sample_Control", n_controls),
  disease_activity = rep("N/A", n_controls)
)

In [21]:
# Add the sample-control rows to the clinical table
clinical_data <- bind_rows(clinical_data, new_row)

# order the dataframe based on the order of new sample names
sample_idx <- match(new_sample_names, clinical_data$original_subject_id)

# A sample without a clinical record yields an all-NA row here. Report such
# samples instead of letting them pass silently; message() is used because
# warnings are suppressed for the whole session.
unmatched_samples <- new_sample_names[is.na(sample_idx)]
if (length(unmatched_samples) > 0) {
  message(
    "No clinical record found for sample(s): ",
    paste(unmatched_samples, collapse = ", ")
  )
}

clinical_data <- clinical_data[sample_idx, ]

In [22]:
# Map values to one of two category labels.
#
# x          : value(s) to classify; may be a single value or a vector.
# check_list : values that belong to the first category.
# categories : two labels; categories[[1]] is used for values found in
#              check_list, categories[[2]] for everything else.
#
# Returns one label per element of x. Values that are not in check_list,
# including NA, get categories[[2]].
get_indicator <- function(x, check_list, categories) {
  # ifelse() keeps the function vectorised; a bare `if` accepts only a
  # length-one condition.
  ifelse(x %in% check_list, categories[[1]], categories[[2]])
}

# add ibd disease indicator
# NOTE(review): every diagnosis outside ibd_check_list, including missing
# values and the "Alamar_Sample_Control" rows, lands in "Control Super Group"
# -- confirm this is intended.
ibd_check_list <- c("CD", "UC", "IBD-U")
ibd_indicator <- vapply(
  as.vector(clinical_data$ibd_diagnosis),
  get_indicator,
  FUN.VALUE = character(1),
  check_list = ibd_check_list,
  categories = c("IBD Super Group", "Control Super Group")
)
clinical_data$ibd_indicator <- ibd_indicator

# add disease activity indicator
# NOTE(review): any activity other than "Moderate"/"Mild", including "N/A"
# and missing values, is labelled "In-active Disease" -- confirm.
da_check_list <- c("Moderate", "Mild")
da_indicator <- vapply(
  as.vector(clinical_data$disease_activity),
  get_indicator,
  FUN.VALUE = character(1),
  check_list = da_check_list,
  categories = c("Active Disease", "In-active Disease")
)
clinical_data$disease_activity_indicator <- da_indicator

In [23]:
# write the clinical table as a tab-separated file with a header row,
# without row names and without quoting
write.table(
  clinical_data,
  file = "results/comp_data/clinical_data.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)