-
Notifications
You must be signed in to change notification settings - Fork 0
/
population_antigen_relation.Rmd
373 lines (304 loc) · 12.9 KB
/
population_antigen_relation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
---
title: "DBS and plasma relation investigation"
author: "Creator: Leo Dahl, Contributors: Tea Dodig-Crnkovic, Leo Dahl"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
toc: true
code_folding: hide
toc_float:
collapsed: false
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
<style type="text/css">
h1 {
font-size: 25px;
margin-top: 1.5cm;
margin-bottom: 0.5cm;
}
h2 {
font-size: 18px;
margin-top: 1cm;
margin-bottom: 0.5cm;
}
h3 {
font-size: 15px;
margin-top: 0.5cm;
margin-bottom: 0.5cm;
}
</style>
Description
===========
> This script compares antigen MFI measurements with olink protein measurements by looking at correlation.
```{r packages, message=F, results="hide"}
options(stringsAsFactors = F)
library(dplyr)
library(DT)
library(stringr)
library(tibble)
library(tidyr)
library(psych)
library(pheatmap)
library(reshape2)
library(ggplot2)
library(umap)
library(patchwork)
library(openxlsx)
```
Load data
=========
```{r load data}
source("load_cov_npx.R")
cov_npx <- load_cov_npx("abspqn_adj")
# Olink data
npx_ol <- cov_npx$npx
sinfo_ol <- cov_npx$sinfo
binder_ol <- cov_npx$binder
rownames(binder_ol) <- binder_ol$unique_id
# IgG/IgM assay batch 1
b1_path <- "../data/dbs_batch1.RData"
load(b1_path)
sinfo_b1 <- sinfo
mfi_b1 <- mfi
binder_b1 <- binder
# Batch 2
b2_path <- "../data/dbs_batch2.RData"
load(b2_path)
sinfo_b2 <- sinfo
mfi_b2 <- mfi
binder_b2 <- binder
out_dir <- "../results/population_antigen_relation/"
```
Used NPX data:
**`r cov_npx$data_set`**
Path to NPX data:
**`r cov_npx$data_path`**
Path to data batch 1:
**`r b1_path`**
Path to data batch 2:
**`r b2_path`**
Data prep
---------
```{r prepare data}
# Get overlap of samples in the plasma and olink sets, remove labels for replicates and such
# Add column of name after strsplit to use for picking rows later
sinfo_b1$samp_name <- sinfo_b1$id %>%
strsplit("\\*") %>%
sapply(., function(x) x[1])
sinfo_b2$samp_name <- sinfo_b2$id %>%
strsplit("\\*") %>%
sapply(., function(x) x[1])
# olink info has "." instead of "+" in unique sample IDs, replace with "." for matching
samp_overlap <- intersect(c(sinfo_b1$samp_name, sinfo_b2$samp_name),
gsub("\\.", "\\+", sinfo_ol$unique_participant_ID))
# Data contains a UMAP-plate and a SPK-plate, assayed on 2020-06-23 and 2020-11-13 respectively. Keep the ones on the UMAP plate (studyset 1)
sinfo_overlap <- sinfo_ol %>%
filter(unique_participant_ID %in% gsub("\\+", "\\.", samp_overlap)) %>%
# Remove bridge samples from SPK-plate and longitudinal samples
filter(grp %in% c("Population", "Bridge")) %>%
filter(!(grp == "Bridge" & Analysis_date == "2020-11-13"))
# Find samples that appear more than once, to merge the NPX of those that are the same
dupl_samp <- unique(sinfo_overlap[which(duplicated(sinfo_overlap$unique_participant_ID)), "unique_participant_ID", drop = T])
# Isolate the duplicated samples to calculate means
sinfo_dupl <- sinfo_overlap %>% filter(unique_participant_ID %in% dupl_samp)
npx_dupl <- npx_ol[sinfo_dupl$sample_id, ]
sinfo_overlap <- sinfo_overlap %>% filter(!(unique_participant_ID %in% dupl_samp))
npx_ol <- npx_ol[!(rownames(npx_ol) %in% rownames(npx_dupl)), ]
# Initialise matrix for merged duplicated samples
npx_dupl_merge <- matrix(nrow = length(dupl_samp),
ncol = ncol(npx_ol),
dimnames = list(dupl_samp, colnames(npx_ol)))
for (samp in dupl_samp) {
npx_dupl_merge[samp, ] <-
colMeans(npx_dupl[sinfo_dupl %>% filter(unique_participant_ID == samp) %>% pull(sample_id), ])
}
# Add the new merged NPX values to the rest, add one sinfo entry for each duplicated sample
npx_ol <- rbind(npx_ol, npx_dupl_merge)
sinfo_overlap <- rbind(sinfo_overlap,
sinfo_dupl[-which(duplicated(sinfo_dupl$unique_participant_ID)), ])
rownames(sinfo_overlap) <- sinfo_overlap$unique_participant_ID
# Update overlapping sample vector to only have UMAP- or spike-positive samples
samp_overlap <- rownames(sinfo_overlap)
# Change rownames to not have to substitute . for + or vice versa all the time
rownames(mfi_b1) <- gsub("\\+", "\\.", rownames(mfi_b1))
rownames(mfi_b2) <- gsub("\\+", "\\.", rownames(mfi_b2))
# In this case there are no samples with any labels that end up in the final selection, can just use samp_overlap to directly pick out samples
mfi_b1 <- mfi_b1[rownames(mfi_b1) %in% samp_overlap, ]
mfi_b2 <- mfi_b2[rownames(mfi_b2) %in% samp_overlap, ]
npx_ol <- npx_ol[samp_overlap, ]
# Take average of related binders in batch 1 and batch 2, such as SPK_01 and SPK_02, or RBD_01, 02, 03, 04 etc since they are measuring the same-ish thing
# Initialise matrices to hold SPK, RBD and NCP values
# Also keep IgG and IgM separate
antigens <- c("SPK", "RBD", "NCP") #"EBN"
igs <- c("IgG", "IgM")
mfi_b1_red <- matrix(nrow = nrow(mfi_b1),
ncol = length(antigens)*2,
dimnames = list(rownames(mfi_b1),
paste0(antigens, "_", rep(igs, each=length(antigens)))))
mfi_b2_red <- matrix(nrow = nrow(mfi_b2),
ncol = length(antigens)*2,
dimnames = list(rownames(mfi_b2),
paste0(antigens, "_", rep(igs, each=length(antigens)))))
# dataframe for saving names of merged binders
merged_b <- data.frame(matrix(nrow = length(antigens)*2, ncol = 4,
dimnames = list(1:(length(antigens)*2), c("Batch", "Antigen", "IgG", "IgM"))))
merged_b$Batch <- rep(1:2, each = length(antigens)); merged_b$Antigen <- rep(antigens, 2)
# The merging
for (ag in antigens) {
for (ig in igs) {
b1_merge_binders <- binder_b1 %>% filter(str_detect(Name, ag) & detect_Ab == ig)
merged_b[merged_b$Batch == 1 &
merged_b$Antigen == ag, ig] <- paste0(b1_merge_binders$id, collapse = ", ")
# Only merge if there is something to merge, i.e. more than one antigen
if (nrow(b1_merge_binders) < 2) {
mfi_b1_red[, paste0(ag, "_", ig)] <-
mfi_b1[rownames(mfi_b1_red), b1_merge_binders$id]
} else {
mfi_b1_red[, paste0(ag, "_", ig)] <-
rowMeans(mfi_b1[rownames(mfi_b1_red), b1_merge_binders$id])
}
b2_merge_binders <- binder_b2 %>% filter(str_detect(Name, ag) & detect_Ab == ig)
merged_b[merged_b$Batch == 2 &
merged_b$Antigen == ag, ig] <- paste0(b2_merge_binders$id, collapse = ", ")
if (nrow(b2_merge_binders) < 2) {
mfi_b2_red[, paste0(ag, "_", ig)] <-
mfi_b2[rownames(mfi_b2_red), b2_merge_binders$id]
} else {
mfi_b2_red[, paste0(ag, "_", ig)] <-
rowMeans(mfi_b2[rownames(mfi_b2_red), b2_merge_binders$id])
}
}
}
mfi_merge <- rbind(mfi_b1_red, mfi_b2_red)[rownames(npx_ol), ] # Put in same row-order as NPX
```
Samples that overlapped between the Olink proteins and batches 1&2.
`r datatable(sinfo_overlap[, c("subj.id", "Population_batch", "umap_seropositive_IgG_IgM", "serostatus")])`
Number of samples in total: `r nrow(sinfo_overlap)`
Number of samples analysed on 2020-06-23 ("UMAP-plate"): `r sinfo_overlap %>% filter(Analysis_date == "2020-06-23") %>% nrow()`
Number of samples analysed on 2020-11-13 ("SPK-plate"): `r sinfo_overlap %>% filter(Analysis_date == "2020-11-13") %>% nrow()`
Number of UMAP-positive samples: `r sinfo_overlap %>% filter(umap_seropositive_IgG_IgM == 1) %>% nrow()`
Number of spike positive (IgG + IgM) samples: `r sinfo_overlap %>% filter(serostatus == "IgG + IgM") %>% nrow()`
Overlap of UMAP and spike-positive: `r sinfo_overlap %>% filter(umap_seropositive_IgG_IgM == 1 & serostatus == "IgG + IgM") %>% nrow()`
`r if(nrow(sinfo_dupl) > 0){"The following samples were assayed multiple times, the replicates were merged by taking their means. "}`
`r if(nrow(sinfo_dupl) > 0){knitr::kable(table(sinfo_dupl$unique_participant_ID), col.names = c("Sample", "Frequency")) %>% kableExtra::kable_styling(full_width=F)}`
Merged antigens for each batch, antigen type (SPK, RBD, or NCP) and Ig type (IgG, IgM).
`r datatable(merged_b, rownames=F)`
Also load studyset 3
```{r}
source("sep_datasets.R")
ds3 <- load_cov_npx(paste0("ds3_", cov_npx$data_set)) %>% merge_repl()
binder_ds3 <- ds3$binder
sinfo_ds3 <- ds3$sinfo %>% rename(serostatus = grp) # %>% filter(both_pos | !any_pos)
# Merge values from assays measuring the same antigen, skip antigens not in main data sets
mfi_ds3 <- ds3$mfi %>% filter(sample_id %in% ds3$npx$sample_id) %>%
select(sample_id, contains(c("Spike", "Nucleocapsid", "RBD")), -contains("0.4")) %>%
remove_rownames() %>%
# Merge antigens by making long format data, grouping by sample and antigen, taking average, and transforming back to wide format
pivot_longer(cols = -sample_id, names_to = "ag", values_to = "mfi") %>%
mutate(ag = case_when(str_detect(ag, "^Spike") ~ "SPK_IgG",
str_detect(ag, "^Nucleo") ~ "NCP_IgG",
str_detect(ag, "^RBD") ~ "RBD_IgG")) %>%
group_by(sample_id, ag) %>%
mutate(mean_mfi = mean(mfi)) %>%
distinct(sample_id, ag, mean_mfi) %>%
pivot_wider(id_cols = sample_id, names_from = "ag", values_from = "mean_mfi") %>%
column_to_rownames("sample_id")
# Make list of data to use
dat_list <- list(
"studyset1" = list(
"npx" = npx_ol[match(sinfo_overlap %>% filter(Analysis_date == "2020-06-23") %>% rownames(), rownames(npx_ol)), ],
"mfi" = mfi_merge[match(sinfo_overlap %>% filter(Analysis_date == "2020-06-23") %>% rownames(), rownames(mfi_merge)), ],
"sinfo" = sinfo_overlap %>% filter(Analysis_date == "2020-06-23")
),
"studyset2" = list(
"npx" = npx_ol[match(sinfo_overlap %>% filter(Analysis_date == "2020-11-13") %>% rownames(), rownames(npx_ol)), ],
"mfi" = mfi_merge[match(sinfo_overlap %>% filter(Analysis_date == "2020-11-13") %>% rownames(), rownames(mfi_merge)), ],
"sinfo" = sinfo_overlap %>% filter(Analysis_date == "2020-11-13")
),
"studyset3" = list(
"npx" = ds3$npx %>% column_to_rownames("sample_id"),
"mfi" = mfi_ds3,
"sinfo" = sinfo_ds3 %>% filter(sample_id %in% ds3$npx$sample_id)
)
)
```
Correlation plots
=================
```{r}
cor_res <- lapply(dat_list, function(x) {
corr <- corr.test(x$npx, x$mfi[rownames(x$npx), ])
corr_long <- corr$r %>% as.data.frame() %>%
rownames_to_column("protein") %>%
pivot_longer(cols = -protein, names_to = "antigen", values_to = "correlation") %>%
# Add p-values
left_join((corr$p %>% as.data.frame() %>% rownames_to_column("protein") %>%
pivot_longer(cols = -protein, names_to = "antigen", values_to = "pvalue")), by = c("protein", "antigen")) %>%
# Add FDR, per antigen
group_by(antigen) %>%
mutate(fdr = p.adjust(pvalue, method = "fdr")) %>%
ungroup() %>%
# Add what Ig is being looked at
mutate(ig = str_extract(antigen, "Ig\\w"))
return(corr_long)
})
```
```{r}
lapply(names(cor_res), function(x) {
for (which_ig in sort(unique(cor_res[[x]]$ig))) {
cor_res[[x]] %>% filter(ig == which_ig) %>%
select(protein, antigen, correlation) %>%
pivot_wider(id_cols = "protein", names_from = "antigen", values_from = "correlation") %>%
column_to_rownames("protein") %>% t() %>%
pheatmap(., cellwidth = 2, border_color = NA, fontsize_col = 2.3, main = paste0(x, ", ", which_ig))
}
})
```
Tables of correlation
=====================
Displaying correlations with nominal p-values below 0.05
```{r}
htmltools::tagList(sapply(names(cor_res), function(x) {
datatable(cor_res[[x]] %>% filter(pvalue < 0.05) %>% arrange(pvalue),
caption = paste0("Correlations with nominal p-values below 0.05 in ", x)) %>%
formatSignif(columns = 3:5, digits = 3)
}, simplify = F))
```
## Volcano plots of correlations
```{r}
lapply(cor_res, function(ds) {
ggplot(ds, aes(x = correlation, y = -log10(pvalue))) +
geom_point() +
facet_wrap(~ ig, nrow = 1) +
theme_classic()
})
lapply(names(cor_res), function(ds) {
cor_res[[ds]] %>% mutate(studyset = ds)
}) %>% data.table::rbindlist() %>%
ggplot(aes(x = correlation, y = -log10(pvalue))) +
geom_point() +
xlim(-1, 1) +
ggh4x::facet_grid2(rows = vars(studyset), cols = vars(ig), scales = "free", independent = T) +
theme_classic()
```
Save table of correlations
==========================
```{r}
tbl_out <- rbind(
cor_res$studyset1 %>% mutate(studyset = "studyset1"),
cor_res$studyset2 %>% mutate(studyset = "studyset2"),
cor_res$studyset3 %>% mutate(studyset = "studyset3")
) %>% select(-ig) %>%
rename(corr = correlation, pval = pvalue) %>%
separate_wider_delim(cols = antigen, delim = "_", names = c("antigen", "ig")) %>%
pivot_wider(id_cols = c(protein, antigen, ig), names_from = studyset, values_from = c(corr, pval, fdr)) %>%
left_join(binder_ol %>% select(unique_id, assay, gene_name, uniprot_ID), by = c("protein" = "unique_id")) %>%
select(-protein) %>% relocate(assay, gene_name, uniprot_ID)
write.xlsx(tbl_out, paste0(out_dir, format(Sys.time(), "%Y-%m-%d_%H%M%S"), "_corr_results.xlsx"), asTable = T)
```
Session information
===================
```{r}
sessionInfo()
```