population_antigen_relation.Rmd

---
title: "DBS and plasma relation investigation"
author: "Creator: Leo Dahl, Contributors: Tea Dodig-Crnkovic, Leo Dahl"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: true
    code_folding: hide
    toc_float:
      collapsed: false 
---

```{r setup, include=FALSE}
# Chunk default for the whole document: include the source code of each chunk in the output
knitr::opts_chunk$set(echo = TRUE)
```


<style type="text/css">

h1 { 
  font-size: 25px;
  margin-top: 1.5cm;
  margin-bottom: 0.5cm;
}
h2 { 
  font-size: 18px;
  margin-top: 1cm;
  margin-bottom: 0.5cm;
}
h3 {
	font-size: 15px;
	margin-top: 0.5cm;
	margin-bottom: 0.5cm;
}
</style>

Description
===========

> This script compares antigen MFI measurements with Olink protein measurements by computing the correlation between them. 


```{r packages, message=F, results="hide"}
# Keep strings as character vectors when data frames are built. Written with
# FALSE rather than F, since F is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
library(dplyr)
library(DT)
library(stringr)
library(tibble)
library(tidyr)
library(psych)
library(pheatmap)
library(reshape2)
library(ggplot2)
library(umap)
library(patchwork)
library(openxlsx)
```


Load data
=========

```{r load data}

# Run the helper script; load_cov_npx() used on the next line is presumably defined there
source("load_cov_npx.R")
# Load the "abspqn_adj" data set. The returned list is read below for its npx,
# sinfo and binder elements, and in the report text for data_set and data_path.
cov_npx <- load_cov_npx("abspqn_adj")

# Olink data
npx_ol <- cov_npx$npx
sinfo_ol <- cov_npx$sinfo
binder_ol <- cov_npx$binder
# Use the unique binder ID as row name so binder info can be looked up by ID
rownames(binder_ol) <- binder_ol$unique_id

# IgG/IgM assay batch 1
# load() restores the objects stored in the .RData file into the workspace.
# The file is expected to provide sinfo, mfi and binder, which are copied to
# batch-specific names straight away.
b1_path <- "../data/dbs_batch1.RData"
load(b1_path)
sinfo_b1 <- sinfo
mfi_b1 <- mfi
binder_b1 <- binder

# Batch 2
# Loaded the same way. This overwrites sinfo, mfi and binder from batch 1,
# which is why the batch 1 copies have to be made before this load().
b2_path <- "../data/dbs_batch2.RData"
load(b2_path)
sinfo_b2 <- sinfo
mfi_b2 <- mfi
binder_b2 <- binder

# Directory that the result files of this report are written to
out_dir <- "../results/population_antigen_relation/"

```

Used NPX data:  
**`r cov_npx$data_set`**

Path to NPX data:  
**`r cov_npx$data_path`**

Path to data batch 1:  
**`r b1_path`**

Path to data batch 2:  
**`r b2_path`**

Data prep
---------

```{r prepare data}

# Get overlap of samples in the assay batches and the Olink set.
# Labels for replicates and such are removed first: only the part of the ID
# before the first "*" is kept. The bare name is stored in a new column that
# is used for picking rows later.
# vapply() always returns a character vector, also for empty input, where
# sapply() would return a list.
strip_sample_label <- function(ids) {
  vapply(
    strsplit(ids, "*", fixed = TRUE),
    function(parts) parts[1],
    character(1)
  )
}

sinfo_b1$samp_name <- strip_sample_label(sinfo_b1$id)
sinfo_b2$samp_name <- strip_sample_label(sinfo_b2$id)

# The Olink sample info has "." where the assay sample names have "+", so the
# Olink IDs are converted to the "+" form before intersecting
samp_overlap <- intersect(
  c(sinfo_b1$samp_name, sinfo_b2$samp_name),
  gsub(".", "+", sinfo_ol$unique_participant_ID, fixed = TRUE)
)

# Data contains a UMAP-plate and a SPK-plate, assayed on 2020-06-23 and
# 2020-11-13 respectively.
sinfo_overlap <- sinfo_ol %>%
  # Olink IDs use ".", so convert the overlap back before matching
  filter(unique_participant_ID %in% gsub("+", ".", samp_overlap, fixed = TRUE)) %>%
  # Keep only the population and bridge groups (drops e.g. longitudinal samples)
  filter(grp %in% c("Population", "Bridge")) %>%
  # Remove bridge samples from the SPK-plate
  filter(!(grp == "Bridge" & Analysis_date == "2020-11-13"))

# Find participants that appear more than once, to merge the NPX of those
# that are the same
dupl_samp <- unique(
  sinfo_overlap[duplicated(sinfo_overlap$unique_participant_ID),
                "unique_participant_ID", drop = TRUE]
)

# Isolate the duplicated samples to calculate means, and remove them from the
# main sample info and NPX data for now.
# drop = FALSE keeps the two-dimensional shape even when only one row or
# column is left.
sinfo_dupl <- sinfo_overlap %>% filter(unique_participant_ID %in% dupl_samp)
npx_dupl <- npx_ol[sinfo_dupl$sample_id, , drop = FALSE]
sinfo_overlap <- sinfo_overlap %>% filter(!(unique_participant_ID %in% dupl_samp))
npx_ol <- npx_ol[!(rownames(npx_ol) %in% rownames(npx_dupl)), , drop = FALSE]

# Initialise matrix for merged duplicated samples, one row per participant
npx_dupl_merge <- matrix(nrow = length(dupl_samp),
                         ncol = ncol(npx_ol),
                         dimnames = list(dupl_samp, colnames(npx_ol)))
for (samp in dupl_samp) {
  # All measurements belonging to this participant
  repl_ids <- sinfo_dupl %>%
    filter(unique_participant_ID == samp) %>%
    pull(sample_id)
  # Per-protein mean over the replicates
  npx_dupl_merge[samp, ] <- colMeans(npx_dupl[repl_ids, , drop = FALSE])
}

# Add the new merged NPX values to the rest, add one sinfo entry (the first
# one) for each duplicated sample.
# !duplicated() is used instead of -which(duplicated()), because a negative
# index built from an empty which() would select no rows at all.
npx_ol <- rbind(npx_ol, npx_dupl_merge)
sinfo_overlap <- rbind(sinfo_overlap,
                       sinfo_dupl[!duplicated(sinfo_dupl$unique_participant_ID), ])
rownames(sinfo_overlap) <- sinfo_overlap$unique_participant_ID

# Update the overlapping sample vector to the participants that remain in
# sinfo_overlap (its row names are the participant IDs)
samp_overlap <- rownames(sinfo_overlap)
# Change rownames to not have to substitute . for + or vice versa all the time
rownames(mfi_b1) <- gsub("+", ".", rownames(mfi_b1), fixed = TRUE)
rownames(mfi_b2) <- gsub("+", ".", rownames(mfi_b2), fixed = TRUE)

# In this case there are no samples with any labels that end up in the final
# selection, can just use samp_overlap to directly pick out samples.
# drop = FALSE keeps the result two-dimensional even if a single sample is left.
mfi_b1 <- mfi_b1[rownames(mfi_b1) %in% samp_overlap, , drop = FALSE]
mfi_b2 <- mfi_b2[rownames(mfi_b2) %in% samp_overlap, , drop = FALSE]
npx_ol <- npx_ol[samp_overlap, , drop = FALSE]

# Take average of related binders in batch 1 and batch 2, such as SPK_01 and
# SPK_02, or RBD_01, 02, 03, 04 etc since they are measuring the same-ish thing.
# IgG and IgM are kept separate.
antigens <- c("SPK", "RBD", "NCP") #"EBN"
igs <- c("IgG", "IgM")

# Merge the binders of one batch into one column per antigen and Ig type.
#   mfi       MFI values, samples in rows, columns addressed by binder ID
#   binder    binder info with the columns Name, detect_Ab and id
#   antigens  antigen tags that are searched for in binder$Name
#   igs       Ig types that are compared with binder$detect_Ab
#   batch     batch number, stored in the overview of merged binders
# Returns a list with
#   mfi       matrix with the same rows as the input and one column per
#             combination, named "<antigen>_<Ig>"
#   binders   data frame with columns Batch, Antigen and one column per Ig type
#             that lists the IDs of the binders that were merged
# Stops with an error if no binder matches a combination.
merge_batch_binders <- function(mfi, binder, antigens, igs, batch) {
  combos <- paste0(antigens, "_", rep(igs, each = length(antigens)))
  merged <- matrix(NA_real_,
                   nrow = nrow(mfi),
                   ncol = length(combos),
                   dimnames = list(rownames(mfi), combos))
  used <- data.frame(Batch = batch, Antigen = antigens)

  for (ig in igs) {
    used[[ig]] <- rep(NA_character_, nrow(used))
    for (ag in antigens) {
      ids <- binder %>%
        filter(str_detect(Name, ag) & detect_Ab == ig) %>%
        pull(id)
      if (length(ids) == 0) {
        stop("No ", ig, " binder found for antigen ", ag, " in batch ", batch,
             call. = FALSE)
      }
      used[used$Antigen == ag, ig] <- paste0(ids, collapse = ", ")
      # With drop = FALSE a single binder is handled like several binders:
      # the row mean of one column is the column itself
      merged[, paste0(ag, "_", ig)] <-
        rowMeans(mfi[rownames(merged), ids, drop = FALSE])
    }
  }

  list(mfi = merged, binders = used)
}

# The merging
b1_merged <- merge_batch_binders(mfi_b1, binder_b1, antigens, igs, batch = 1L)
b2_merged <- merge_batch_binders(mfi_b2, binder_b2, antigens, igs, batch = 2L)
mfi_b1_red <- b1_merged$mfi
mfi_b2_red <- b2_merged$mfi

# Data frame with the names of the merged binders, one row per batch and antigen
merged_b <- rbind(b1_merged$binders, b2_merged$binders)

# Stack both batches and put the rows in the same order as the NPX data
mfi_merge <- rbind(mfi_b1_red, mfi_b2_red)[rownames(npx_ol), , drop = FALSE]

```

Samples present in both the Olink protein data and the IgG/IgM assay data (batches 1 and 2).  
`r datatable(sinfo_overlap[, c("subj.id", "Population_batch", "umap_seropositive_IgG_IgM", "serostatus")])`

Number of samples in total: `r nrow(sinfo_overlap)`  
Number of samples analysed on 2020-06-23 ("UMAP-plate"): `r sinfo_overlap %>% filter(Analysis_date == "2020-06-23") %>% nrow()`  
Number of samples analysed on 2020-11-13 ("SPK-plate"): `r sinfo_overlap %>% filter(Analysis_date == "2020-11-13") %>% nrow()`  
Number of UMAP-positive samples: `r sinfo_overlap %>% filter(umap_seropositive_IgG_IgM == 1) %>% nrow()`  
Number of spike positive (IgG + IgM) samples: `r sinfo_overlap %>% filter(serostatus == "IgG + IgM") %>% nrow()`  
Overlap of UMAP and spike-positive: `r sinfo_overlap %>% filter(umap_seropositive_IgG_IgM == 1 & serostatus == "IgG + IgM") %>% nrow()`

`r if(nrow(sinfo_dupl) > 0){"The following samples were assayed multiple times, the replicates were merged by taking their means.  "}`
`r if(nrow(sinfo_dupl) > 0){knitr::kable(table(sinfo_dupl$unique_participant_ID), col.names = c("Sample", "Frequency")) %>% kableExtra::kable_styling(full_width=F)}`

Merged antigens for each batch, antigen type (SPK, RBD, or NCP) and Ig type (IgG, IgM).  
`r datatable(merged_b, rownames=F)`

Also load studyset 3

```{r}

source("sep_datasets.R")

# Studyset 3: load the matching "ds3_" data set and pass it through merge_repl()
ds3 <- merge_repl(load_cov_npx(paste0("ds3_", cov_npx$data_set)))
binder_ds3 <- ds3$binder
# The grp column is called serostatus from here on
# (a filter on both_pos | !any_pos was tried here and is currently not applied)
sinfo_ds3 <- rename(ds3$sinfo, serostatus = grp)

# Merge values from assays measuring the same antigen, skip antigens not in
# the main data sets. Only samples that also have NPX data are used.
mfi_ds3 <- ds3$mfi %>%
  filter(sample_id %in% ds3$npx$sample_id) %>%
  select(
    sample_id,
    contains(c("Spike", "Nucleocapsid", "RBD")),
    -contains("0.4")
  ) %>%
  remove_rownames() %>%
  # Long format: one row per sample and assay
  pivot_longer(cols = -sample_id, names_to = "ag", values_to = "mfi") %>%
  # Translate the assay names to the antigen labels used for the other sets
  mutate(
    ag = case_when(
      str_detect(ag, "^Spike") ~ "SPK_IgG",
      str_detect(ag, "^Nucleo") ~ "NCP_IgG",
      str_detect(ag, "^RBD") ~ "RBD_IgG"
    )
  ) %>%
  # Average per sample and antigen, keep one row for each combination
  group_by(sample_id, ag) %>%
  mutate(mean_mfi = mean(mfi)) %>%
  distinct(sample_id, ag, mean_mfi) %>%
  # Back to wide format, samples as row names
  pivot_wider(
    id_cols = sample_id,
    names_from = "ag",
    values_from = "mean_mfi"
  ) %>%
  column_to_rownames("sample_id")

# Collect NPX, MFI and sample info for the samples in sinfo_overlap that were
# analysed on the given date. NPX and MFI rows are matched to the sample info
# by row name.
studyset_by_date <- function(plate_date) {
  sinfo_sub <- sinfo_overlap %>% filter(Analysis_date == plate_date)
  keep <- rownames(sinfo_sub)
  list(
    "npx" = npx_ol[match(keep, rownames(npx_ol)), ],
    "mfi" = mfi_merge[match(keep, rownames(mfi_merge)), ],
    "sinfo" = sinfo_sub
  )
}

# Make list of data to use, one entry per study set
dat_list <- list(
  "studyset1" = studyset_by_date("2020-06-23"),
  "studyset2" = studyset_by_date("2020-11-13"),
  "studyset3" = list(
    "npx" = ds3$npx %>% column_to_rownames("sample_id"),
    "mfi" = mfi_ds3,
    "sinfo" = sinfo_ds3 %>% filter(sample_id %in% ds3$npx$sample_id)
  )
)
```

Correlation plots
=================

```{r}
# Correlate every protein (NPX) with every merged antigen (MFI) in each study
# set. The result is a list with one long-format table per study set, with the
# columns protein, antigen, correlation, pvalue, fdr and ig.
cor_res <- lapply(dat_list, function(x) {
  # MFI rows are put in the same order as the NPX rows before correlating.
  # corr.test() applies a Holm adjustment by default; depending on the psych
  # version this can end up in the returned p matrix. The p-values are used as
  # nominal values below and get their own FDR adjustment, so no adjustment is
  # requested here.
  corr <- corr.test(x$npx, x$mfi[rownames(x$npx), ], adjust = "none")

  r_long <- corr$r %>%
    as.data.frame() %>%
    rownames_to_column("protein") %>%
    pivot_longer(cols = -protein, names_to = "antigen", values_to = "correlation")

  p_long <- corr$p %>%
    as.data.frame() %>%
    rownames_to_column("protein") %>%
    pivot_longer(cols = -protein, names_to = "antigen", values_to = "pvalue")

  r_long %>%
    # Add p-values
    left_join(p_long, by = c("protein", "antigen")) %>%
    # Add FDR, per antigen
    group_by(antigen) %>%
    mutate(fdr = p.adjust(pvalue, method = "fdr")) %>%
    ungroup() %>%
    # Add what Ig is being looked at, taken from the antigen label
    mutate(ig = str_extract(antigen, "Ig\\w"))
})
```

```{r}
# One heatmap per study set and Ig type, antigens in rows and proteins in
# columns. Plain loops are used because the plots are drawn as a side effect;
# wrapping them in lapply() printed a list of NULLs into the report.
for (set_name in names(cor_res)) {
  for (which_ig in sort(unique(cor_res[[set_name]]$ig))) {
    cor_res[[set_name]] %>%
      filter(ig == which_ig) %>%
      select(protein, antigen, correlation) %>%
      pivot_wider(id_cols = "protein", names_from = "antigen", values_from = "correlation") %>%
      column_to_rownames("protein") %>%
      t() %>%
      pheatmap(cellwidth = 2, border_color = NA, fontsize_col = 2.3,
               main = paste0(set_name, ", ", which_ig))
  }
}
```

Tables of correlation
=====================

Displaying correlations with nominal p-values below 0.05

```{r}
# One interactive table per study set with the nominally significant
# correlations, sorted by p-value. Wrapped in a tagList so that all tables
# are rendered.
htmltools::tagList(Map(
  function(res, set_name) {
    nominal <- res %>%
      filter(pvalue < 0.05) %>%
      arrange(pvalue)
    tbl <- datatable(
      nominal,
      caption = paste0("Correlations with nominal p-values below 0.05 in ", set_name)
    )
    # Columns 3 to 5 are shown with three significant digits
    formatSignif(tbl, columns = 3:5, digits = 3)
  },
  cor_res,
  names(cor_res)
))
```

## Volcano plots of correlations

```{r}
# One volcano plot per study set, with one panel per Ig type
lapply(cor_res, function(ds) {
  volcano <- ggplot(ds, aes(x = correlation, y = -log10(pvalue)))
  volcano +
    geom_point() +
    facet_wrap(~ ig, nrow = 1) +
    theme_classic()
})

# All study sets in one figure: study sets in rows, Ig types in columns.
# The list names of cor_res become the studyset column.
data.table::rbindlist(cor_res, idcol = "studyset") %>%
  ggplot(aes(x = correlation, y = -log10(pvalue))) +
  geom_point() +
  xlim(-1, 1) +
  ggh4x::facet_grid2(
    rows = vars(studyset),
    cols = vars(ig),
    scales = "free",
    independent = TRUE
  ) +
  theme_classic()
```

Save table of correlations
==========================

```{r}
# Combine the results of the three study sets into one wide table: one row per
# protein, antigen and Ig type, with correlation, p-value and FDR of each study
# set in separate columns. Protein annotation is added from the Olink binder info.
tbl_out <- bind_rows(
  cor_res[c("studyset1", "studyset2", "studyset3")],
  .id = "studyset"
) %>%
  # ig is recreated below when the antigen label is split
  select(-ig) %>%
  rename(corr = correlation, pval = pvalue) %>%
  separate_wider_delim(cols = antigen, delim = "_", names = c("antigen", "ig")) %>%
  pivot_wider(id_cols = c(protein, antigen, ig),
              names_from = studyset,
              values_from = c(corr, pval, fdr)) %>%
  left_join(binder_ol %>% select(unique_id, assay, gene_name, uniprot_ID),
            by = c("protein" = "unique_id")) %>%
  select(-protein) %>%
  relocate(assay, gene_name, uniprot_ID)

# write.xlsx() fails if the target directory is missing, so create it first
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}

# File name is prefixed with a timestamp so earlier results are not overwritten
write.xlsx(tbl_out,
           paste0(out_dir, format(Sys.time(), "%Y-%m-%d_%H%M%S"), "_corr_results.xlsx"),
           asTable = TRUE)

```

Session information
===================

```{r}
# Print R version and attached/loaded packages so the report documents the environment it was rendered in
sessionInfo()
```