/
1_QC.Rmd
537 lines (471 loc) · 17.6 KB
/
1_QC.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
---
title: "QC of Psen2S4Ter Zebrafish"
author: "Steve Pederson"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
fig_caption: yes
number_sections: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
autodep = TRUE,
echo = TRUE,
warning = FALSE,
message = FALSE
)
```
# Setup
```{r loadPackages}
library(ngsReports)
library(tidyverse)
library(magrittr)
library(edgeR)
library(AnnotationHub)
library(ensembldb)
library(scales)
library(pander)
library(cowplot)
library(corrplot)
```
```{r setOptions}
if (interactive()) setwd(here::here())
theme_set(theme_bw())
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
twoCols <- c(rgb(0.8, 0.1, 0.1), rgb(0.2, 0.2, 0.8))
```
```{r annotationSetup}
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]]
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]
```
```{r samplesAndLabels}
samples <- read_csv("data/samples.csv") %>%
mutate(reads = str_extract(fqName, "R[12]")) %>%
dplyr::rename(Filename = fqName)
labels <- structure(
samples$sampleID,
names = samples$Filename
)
```
In order to perform adequate QC, an `EnsDb` object was obtained for Ensembl release `r ensemblVersion(ensDb)` using the `AnnotationHub` package.
This provided the GC content and length for each of the `r comma(length(grTrans))` transcripts contained in that release.
Metadata for each fastq file was also loaded.
Reads were provided as paired-end reads, with $n=4$ samples for each genotype.
# Raw Data
## Library Sizes
```{r rawFqc}
rawFqc <- list.files(
path = "data/0_rawData/FastQC/",
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
```
Library Sizes for the raw, unprocessed dataset ranged between `r pander(comma(range(readTotals(rawFqc)$Total_Sequences)))` reads.
```{r plotLibSizes, fig.cap = "*Library Sizes for each sample before any processing was undertaken.*"}
r1 <- grepl("R1", fqName(rawFqc))
plotReadTotals(rawFqc[r1], labels = labels[r1], barCols = twoCols)
```
## GC Content
As this was a total RNA dataset, GC content will provide a clear marker of the success of rRNA depletion.
This was plotted, with all R2 reads showing a far greater spike in GC content at 81%, which is likely due to incomplete rRNA depletion.
In particular, the mutant samples appeared most significantly affected raising possibility that overall GC content may be affected in these samples.
```{r gcPlots, fig.cap="*GC content for R1 and R2 reads. Notably, the R2 reads had clearer spikes in GC content above 70%*", fig.height=5}
gcPlots <- list(
r1 = plotGcContent(
x = rawFqc[r1],
labels = labels[r1],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
),
r2 = plotGcContent(
x = rawFqc[!r1],
labels = labels[!r1],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
)
)
lg <- get_legend(gcPlots$r2 + theme(legend.position = "bottom"))
plot_grid(
plot_grid(
r1 = gcPlots$r1 +
ggtitle("R1: GC Distribution", subtitle = c()) +
theme(legend.position = "none"),
r2 = gcPlots$r2 +
ggtitle("R2: GC Distribution", subtitle = c()) +
theme(legend.position = "none")
),
lg = lg,
nrow = 2,
rel_heights = c(5,2)
)
```
```{r gcPerc, fig.cap=paste("*Percentages of each library which contain >70% GC. Using the known theoretical distribution, this should be", percent_format(0.01)(sum(dplyr::filter(getGC(gcTheoretical, "Drerio", type = "Trans"), GC_Content > 70)$Drerio)), "of the total library.*"), fig.width=8}
gc <- getModule(rawFqc, "Per_sequence_GC")
rawGC <- gc %>%
group_by(Filename) %>%
mutate(Freq = Count / sum(Count)) %>%
dplyr::filter(GC_Content > 70) %>%
summarise(Freq = sum(Freq)) %>%
arrange(desc(Freq)) %>%
left_join(samples)
rawGC %>%
ggplot(aes(sampleID, Freq, fill = reads)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~genotype, scales = "free_x") +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = twoCols) +
labs(x = "Sample", y = "Percent of Total")
```
As an alternative viewpoint, the standard deviation of observed GC frequencies from the expected GC frequencies, as obtained from the known transcriptome, were calculated.
This may be a less subjective approach, which is more applicable to other datasets.
Very similar patterns were seen as to those using GC% > 70.
```{r gcDev}
gcDev <- gc %>%
left_join(samples) %>%
group_by(sampleName, sampleID) %>%
mutate(Freq = Count / sum(Count)) %>%
left_join(
getGC(gcTheoretical, "Drerio", "Trans")
) %>%
dplyr::rename(actual = Drerio) %>%
mutate(res = Freq - actual) %>%
summarise(ss = sum(res^2), n = n()) %>%
ungroup() %>%
mutate(sd = sqrt(ss / (n - 1)))
```
```{r, fig.cap = "*Standard deviations of observed GC frequency Vs Expected GC frequency using the theoretical GC across both sets of reads.*"}
gcDev %>%
left_join(samples) %>%
ggplot(aes(sampleID, sd)) +
geom_bar(stat= "identity", position ="dodge") +
facet_wrap(~genotype, scales = "free_x") +
scale_fill_manual()
```
## Over-represented Sequences
The top 30 Overrepresented sequences were analysed using `blastn` and most were found to be mitochondrial rRNA sequences.
```{r}
getModule(rawFqc, "Overrep") %>%
group_by(Sequence, Possible_Source) %>%
summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>%
arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
ungroup() %>%
dplyr::slice(1:30) %>%
mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
pander(
justify = "llrr",
caption = paste(
"*Top", nrow(.),"Overrepresented sequences.",
"The number of samples they were found in is shown,",
"along with the percentage of the most 'contaminated' sample.*"
)
)
```
## Trimming
```{r trimStats}
trimFqc <- list.files(
path = "data/1_trimmedData/FastQC/",
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
dplyr::filter(grepl("R1", Filename)) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
```
After adapter trimming between `r pander(range(percent_format(0.01)(trimStats$Discarded)))` of reads were discarded.
No other improvement was noted.
# Aligned Data
Trimmed reads were:
1. aligned to the reference genome using `STAR 2.7.0d` and summarised to each gene using `featureCounts`. These counts were to be used for all gene-level analysis
2. aligned using kallisto to a modified transcriptome which contained the *psen2^S4Ter^* sequence as an 'alternative' transcript of *psen2*. This data was used for genotype checking, and assessing any GC bias in the data. Whilst transcript-level information is included for rRNA genes in the `EnsDb` object, these transcripts are **excluded** by default from the reference transcriptome provided by Ensembl.
```{r catchKallisto, include=FALSE}
kallisto <- list.dirs("data/3_kallisto/") %>%
setdiff("data/3_kallisto/") %>%
catchKallisto()
```
## Genotype Checking
```{r checkGenotype, fig.cap = "*CPM obtained for both WT and S4Ter mutant transcripts of psen2*", fig.width=8}
cpm(kallisto$counts) %>%
.[c("ENSDART00000006381.8", "psen2S4Ter"),] %>%
as.data.frame() %>%
rownames_to_column("psen2") %>%
pivot_longer(cols = contains("kall"), names_to = "Sample", values_to = "CPM") %>%
mutate(
Sample = basename(Sample),
psen2 = case_when(
psen2 == "psen2S4Ter" ~ "S4Ter",
psen2 != "psen2S4Ter" ~ "WT"
)
) %>%
left_join(samples, by = c("Sample" = "sampleName")) %>%
ggplot(aes(sampleID, CPM, fill = psen2)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = twoCols) +
facet_wrap(~genotype, nrow = 1, scales = "free_x")
```
As all samples demonstrated the expected expression patterns, no mislabelling was detected in this dataset.
## GC Content
```{r gcInfo}
gcInfo <- kallisto$counts %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
dplyr::filter(
tx_id != "psen2S4Ter"
) %>%
as_tibble() %>%
pivot_longer(
cols = contains("kall"),
names_to = "sampleName",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
mutate(
sampleName = basename(sampleName),
tx_id_version = tx_id,
tx_id = str_replace(tx_id, "(ENSDART[0-9]+).[0-9]+", "\\1")
) %>%
left_join(
mcols(grTrans) %>% as.data.frame()
) %>%
dplyr::select(
ends_with("id"), sampleName, counts, gc_content, length
) %>%
split(f = .$sampleName) %>%
lapply(function(x){
DataFrame(
gc = Rle(x$gc_content/100, x$counts),
logLen = Rle(log10(x$length), x$counts)
)
}
)
gcSummary <- gcInfo %>%
vapply(function(x){
c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
}, numeric(4)
) %>%
t() %>%
set_colnames(
c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
) %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
as_tibble() %>%
left_join(dplyr::filter(samples, reads == "R1")) %>%
dplyr::select(starts_with("sample"), genotype, contains("_"))
```
```{r gcCors}
gcCors <- rawGC %>%
dplyr::filter(reads == "R1") %>%
dplyr::select(starts_with("sample"), genotype, Freq, reads) %>%
left_join(gcSummary) %>%
left_join(gcDev) %>%
dplyr::select(Freq, mn_gc, mn_logLen, sd) %>%
cor()
```
The initial QC steps identified an unusually large proportion of reads with >70% GC content, and noticeable deviations from the expected GC content.
As such an exploration of any residual impacts this may have on the remainder of the library was performed.
A *run length encoded* (RLE) vector was formed for each sample taking the number of reads for each transcript as the run lengths, and both the GC content and length of each transcript as the values.
Transcript lengths were transformed to the log~10~ scale due to the wide variety of lengths contained in the transcriptome.
From these RLEs, the mean GC and mean length was calculated for each sample, and these values were compared to the proportion of raw reads with > 70% GC, taking these values from the R1 libraries only.
- **A correlation of `r percent(gcCors["Freq", "mn_gc"])` was found between the mean GC content after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This makes intuitive sense and suggests that the amount of high-GC reads in the initial libraries, as representative of incomplete rRNA removal, negatively biases the GC composition of the non-ribosomal RNA component of the library.
- **A correlation of `r percent(gcCors["Freq", "mn_logLen"])` was found was found between the mean transcript length of the libraries after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This is far less obvious mechanistically, but suggests that **incomplete rRNA removal also biases the fragmentation or length selection process**.
- **Similar patterns were seen when using the standard deviations** as compared to the expected values.
Given the more successful rRNA-removal in the wild-type samples, this dataset is likely to contain significant GC and Length bias which is non-biological in origin.
```{r plotBias, fig.cap = "*Comparison of bias introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*", fig.width=8, fig.height=4}
a <- gcSummary %>%
left_join(rawGC) %>%
dplyr::filter(reads == "R1") %>%
ggplot(aes(Freq, mn_logLen)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
labs(
x = "Proportion of initial library with > 70% GC",
y = "Mean log(length)",
colour = "Genotype"
)
b <- gcSummary %>%
left_join(rawGC) %>%
dplyr::filter(reads == "R1") %>%
ggplot(aes(Freq, mn_gc)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
scale_y_continuous(labels = percent) +
labs(
x = "Proportion of initial library with > 70% GC",
y = "Mean GC Content",
colour = "Genotype"
)
plot_grid(
a + theme(legend.position = "none"),
b + theme(legend.position = "none"),
get_legend(b),
nrow = 1,
rel_widths = c(4,4,1)
)
```
```{r plotGcSD, fig.cap = "*Comparison of bias introduced by incomplete rRNA removal, measured by the standard deviation of observed GC Vs expected GC. Regression lines are shown along with standard error bands for each comparison.*", fig.width=8, fig.height=4}
a <- gcSummary %>%
left_join(gcDev) %>%
ggplot(aes(sd, mn_logLen)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
labs(
x = "Standard Deviation\n(Observed Vs Expected)",
y = "Mean log(length)",
colour = "Genotype"
)
b <- gcSummary %>%
left_join(gcDev) %>%
ggplot(aes(sd, mn_gc)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
scale_y_continuous(labels = percent) +
labs(
x = "Standard Deviation\n(Observed Vs Expected)",
y = "Mean GC Content",
colour = "Genotype"
)
plot_grid(
a + theme(legend.position = "none"),
b + theme(legend.position = "none"),
get_legend(b),
nrow = 1,
rel_widths = c(4,4,1)
)
```
## PCA
An initial PCA was performed using transcript-level counts to assess general patterns in the data.
Transcripts were included if >12 reads over all libraries were allocated to that identifiers.
Correlations were checked between the first three components and mean GC content (as described above), mean log~10~ transcript length, genotype and the proportion of the initial libraries with GC content > 70%.
The confounding of genotype and GC variability may need careful handling in this dataset.
```{r transDGE}
transDGE <- kallisto$counts %>%
set_colnames(basename(colnames(.))) %>%
divide_by(kallisto$annotation$Overdispersion) %>%
.[rowSums(.) > 12,] %>%
DGEList(
samples = tibble(
sampleName = colnames(.)
) %>%
left_join(samples) %>%
dplyr::select(sampleName, sample = sampleID, genotype) %>%
distinct(sampleName, .keep_all = TRUE)
)
```
```{r pca}
pca <- cpm(transDGE, log = TRUE) %>%
t() %>%
prcomp()
```
```{r pcaCorrs, fig.cap="*Correlations between the first three principal components and measured variables. Genotypes were converted to an ordered categorical variable for the purposes of visualisation*"}
pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(gcSummary) %>%
as_tibble() %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
left_join(gcDev) %>%
dplyr::select(
PC1, PC2, PC3,
Mean_GC = mn_gc,
Mean_Length = mn_logLen,
Initial_GC70 = Freq,
SD = sd,
genotype
) %>%
mutate(genotype = as.numeric(as.factor(genotype))) %>%
cor() %>%
corrplot(
type = "lower",
diag = FALSE,
addCoef.col = 1, addCoefasPercent = TRUE
)
```
```{r plotPCA, fig.width=10, fig.cap = "*PCA plot showing genotype, initial GC content > 70%, deviation from theoretical and mean GC content after summarisation to transcript-level.*"}
a <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
as_tibble() %>%
ggplot(aes(PC1, PC2, colour = genotype, size = Freq)) +
geom_point() +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")"),
size = "Raw GC > 70%",
colour = "Genotype"
)
b <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(gcSummary) %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_gc)) +
geom_point(aes(colour = genotype, size = Freq)) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean GC",
size = "Raw GC > 70%",
colour = "Genotype"
)
c <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(gcDev) %>%
left_join(
rawGC %>% dplyr::filter(reads == "R1")
) %>%
ggplot(aes(PC1, sd)) +
geom_point(aes(colour = genotype, size = Freq)) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Standard Deviation",
size = "Raw GC > 70%",
colour = "Genotype"
)
plot_grid(
plot_grid(
a + theme(legend.position = "none"),
b + theme(legend.position = "none"),
c + theme(legend.position = "none"),
nrow = 1
),
get_legend(b + theme(legend.position = "bottom")),
nrow = 2,
rel_heights = c(4,1)
)
```