/
1_QC.Rmd
449 lines (391 loc) · 14.7 KB
/
1_QC.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
---
title: "QC of Psen2S4Ter Zebrafish"
author: "Steve Pederson"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
fig_caption: yes
number_sections: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
autodep = TRUE,
echo = TRUE,
warning = FALSE,
message = FALSE
)
```
# Setup
```{r loadPackages}
library(ngsReports)
library(tidyverse)
library(magrittr)
library(edgeR)
library(AnnotationHub)
library(ensembldb)
library(scales)
library(pander)
library(cowplot)
library(corrplot)
```
```{r setOptions}
if (interactive()) setwd(here::here())
theme_set(theme_bw())
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
twoCols <- c(rgb(0.8, 0.1, 0.1), rgb(0.2, 0.2, 0.8))
```
```{r annotationSetup}
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]]
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]
```
```{r samplesAndLabels}
samples <- read_csv("data/samples.csv") %>%
mutate(reads = str_extract(fqName, "R[12]")) %>%
dplyr::rename(Filename = fqName)
labels <- structure(
samples$sampleID,
names = samples$Filename
)
```
In order to perform adequate QC, an `EnsDb` object was obtained for Ensembl release `r ensemblVersion(ensDb)` using the `AnnotationHub` package.
This provided the GC content and length for each of the `r comma(length(grTrans))` transcripts contained in that release.
Metadata for each fastq file was also loaded.
Reads were provided as paired-end reads, with $n=4$ samples for each genotype.
# Raw Data
## Library Sizes
```{r rawFqc}
rawFqc <- list.files(
path = "data/0_rawData/FastQC/",
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
```
Library Sizes for the raw, unprocessed dataset ranged between `r pander(comma(range(readTotals(rawFqc)$Total_Sequences)))` reads.
```{r plotLibSizes, fig.cap = "*Library Sizes for each sample before any processing was undertaken.*"}
r1 <- grepl("R1", fqName(rawFqc))
plotReadTotals(rawFqc[r1], labels = labels[r1], barCols = twoCols)
```
## GC Content
As this was a total RNA dataset, GC content will provide a clear marker of the success of rRNA depletion.
This was plotted, with all R2 reads showing a far greater spike in GC content at 81%, which is likely due to incomplete rRNA depletion.
In particular, the mutant samples appeared most significantly affected raising possibility that overall GC content may be affected in these samples.
```{r gcPlots, fig.cap="*GC content for R1 and R2 reads. Notably, the R2 reads had clearer spikes in GC content above 70%*", fig.height=5}
gcPlots <- list(
r1 = plotGcContent(
x = rawFqc[r1],
labels = labels[r1],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
),
r2 = plotGcContent(
x = rawFqc[!r1],
labels = labels[!r1],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
)
)
lg <- get_legend(gcPlots$r2 + theme(legend.position = "bottom"))
plot_grid(
plot_grid(
r1 = gcPlots$r1 +
ggtitle("R1: GC Distribution", subtitle = c()) +
theme(legend.position = "none"),
r2 = gcPlots$r2 +
ggtitle("R2: GC Distribution", subtitle = c()) +
theme(legend.position = "none")
),
lg = lg,
nrow = 2,
rel_heights = c(5,2)
)
```
```{r gcPerc, fig.cap=paste("*Percentages of each library which contain >70% GC. Using the known theoretical distribution, this should be", percent_format(0.01)(sum(dplyr::filter(getGC(gcTheoretical, "Drerio", type = "Trans"), GC_Content > 70)$Drerio)), "of the total library.*"), fig.width=8}
rawGC <- getModule(rawFqc, "Per_sequence_GC") %>%
group_by(Filename) %>%
mutate(Freq = Count / sum(Count)) %>%
dplyr::filter(GC_Content > 70) %>%
summarise(Freq = sum(Freq)) %>%
arrange(desc(Freq)) %>%
left_join(samples)
rawGC %>%
ggplot(aes(sampleID, Freq, fill = reads)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~genotype, scales = "free_x") +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = twoCols) +
labs(x = "Sample", y = "Percent of Total")
```
## Over-represented Sequences
The top 30 Overrepresented sequences were analysed using `blastn` and most were found to be mitochondrial rRNA sequences.
```{r}
getModule(rawFqc, "Overrep") %>%
group_by(Sequence, Possible_Source) %>%
summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>%
arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
ungroup() %>%
dplyr::slice(1:30) %>%
mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
pander(
justify = "llrr",
caption = paste(
"*Top", nrow(.),"Overrepresented sequences.",
"The number of samples they were found in is shown,",
"along with the percentage of the most 'contaminated' sample.*"
)
)
```
## Trimming
```{r trimStats}
trimFqc <- list.files(
path = "data/1_trimmedData/FastQC/",
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
dplyr::filter(grepl("R1", Filename)) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
```
After adapter trimming between `r pander(range(percent_format(0.01)(trimStats$Discarded)))` of reads were discarded.
No other improvement was noted.
# Aligned Data
Trimmed reads were:
1. aligned to the reference genome using `STAR 2.7.0d` and summarised to each gene using `featureCounts`. These counts were to be used for all gene-level analysis
2. aligned using kallisto to a modified transcriptome which contained the *psen2^S4Ter^* sequence as an 'alternative' transcript of *psen2*. This data was used for genotype checking, and assessing any GC bias in the data. Whilst transcript-level information is included for rRNA genes in the `EnsDb` object, these transcripts are **excluded** by default from the reference transcriptome provided by Ensembl.
```{r catchKallisto, include=FALSE}
kallisto <- list.dirs("data/3_kallisto/") %>%
setdiff("data/3_kallisto/") %>%
catchKallisto()
```
## Genotype Checking
```{r checkGenotype, fig.cap = "*CPM obtained for both WT and S4Ter mutant transcripts of psen2*", fig.width=8}
cpm(kallisto$counts) %>%
.[c("ENSDART00000006381.8", "psen2S4Ter"),] %>%
as.data.frame() %>%
rownames_to_column("psen2") %>%
pivot_longer(cols = contains("kall"), names_to = "Sample", values_to = "CPM") %>%
mutate(
Sample = basename(Sample),
psen2 = case_when(
psen2 == "psen2S4Ter" ~ "S4Ter",
psen2 != "psen2S4Ter" ~ "WT"
)
) %>%
left_join(samples, by = c("Sample" = "sampleName")) %>%
ggplot(aes(sampleID, CPM, fill = psen2)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = twoCols) +
facet_wrap(~genotype, nrow = 1, scales = "free_x")
```
As all samples demonstrated the expected expression patterns, no mislabelling was detected in this dataset.
## GC Content
```{r gcInfo}
gcInfo <- kallisto$counts %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
dplyr::filter(
tx_id != "psen2S4Ter"
) %>%
as_tibble() %>%
pivot_longer(
cols = contains("kall"),
names_to = "sampleName",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
mutate(
sampleName = basename(sampleName),
tx_id_version = tx_id,
tx_id = str_replace(tx_id, "(ENSDART[0-9]+).[0-9]+", "\\1")
) %>%
left_join(
mcols(grTrans) %>% as.data.frame()
) %>%
dplyr::select(
ends_with("id"), sampleName, counts, gc_content, length
) %>%
split(f = .$sampleName) %>%
lapply(function(x){
DataFrame(
gc = Rle(x$gc_content/100, x$counts),
logLen = Rle(log10(x$length), x$counts)
)
}
)
gcSummary <- gcInfo %>%
vapply(function(x){
c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
}, numeric(4)
) %>%
t() %>%
set_colnames(
c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
) %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
as_tibble() %>%
left_join(dplyr::filter(samples, reads == "R1")) %>%
dplyr::select(starts_with("sample"), genotype, contains("_"))
```
```{r gcCors}
gcCors <- rawGC %>%
dplyr::filter(reads == "R1") %>%
dplyr::select(starts_with("sample"), genotype, Freq, reads) %>%
left_join(gcSummary) %>%
dplyr::select(Freq, mn_gc, mn_logLen) %>%
cor()
```
The initial QC steps identified an unusually large proportion of reads with >70% GC content.
As such an exploration of any residual impacts this may have on the remainder of the library was performed.
A *run length encoded* (RLE) vector was formed for each sample taking the number of reads for each transcript as the run lengths, and both the GC content and length of each transcript as the values.
Transcript lengths were transformed to the log~10~ scale due to the wide variety of lengths contained in the transcriptome.
From these RLEs, the mean GC and mean length was calculated for each sample, and these values were compared to the proportion of raw reads with > 70% GC, taking these values from the R1 libraries only.
- **A correlation of `r percent(gcCors["Freq", "mn_gc"])` was found between the mean GC content after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This makes intuitive sense and suggests that the amount of high-GC reads in the initial libraries, as representative of incomplete rRNA removal, negatively biases the GC composition of the non-ribosomal RNA component of the library.
- **A correlation of `r percent(gcCors["Freq", "mn_logLen"])` was found was found between the mean transcript length of the libraries after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This is far less obvious mechanistically, but suggests that **incomplete rRNA removal also biases the fragmentation or length selection process**.
Given the more successful rRNA-removal in the wild-type samples, this dataset is likely to contain significant GC and Length bias which is non-biological in origin.
```{r plotBias, fig.cap = "*Comparison of bias introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*", fig.width=8, fig.height=4}
a <- gcSummary %>%
left_join(rawGC) %>%
dplyr::filter(reads == "R1") %>%
ggplot(aes(Freq, mn_logLen)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
labs(
x = "Proportion of initial library with > 70% GC",
y = "Mean log(length)",
colour = "Genotype"
)
b <- gcSummary %>%
left_join(rawGC) %>%
dplyr::filter(reads == "R1") %>%
ggplot(aes(Freq, mn_gc)) +
geom_point(aes(colour = genotype), size = 3) +
geom_smooth(method = "lm") +
scale_shape_manual(values = c(19, 1)) +
scale_y_continuous(labels = percent) +
labs(
x = "Proportion of initial library with > 70% GC",
y = "Mean GC Content",
colour = "Genotype"
)
plot_grid(
a + theme(legend.position = "none"),
b + theme(legend.position = "none"),
get_legend(b),
nrow = 1,
rel_widths = c(4,4,1)
)
```
## PCA
An initial PCA was performed using transcript-level counts to assess general patterns in the data.
Transcripts were included if >12 reads over all libraries were allocated to that identifiers.
Correlations were checked between the first three components and mean GC content (as described above), mean log~10~ transcript length, genotype and the proportion of the initial libraries with GC content > 70%.
The confounding of genotype and GC variability may need careful handling in this dataset.
```{r transDGE}
transDGE <- kallisto$counts %>%
set_colnames(basename(colnames(.))) %>%
divide_by(kallisto$annotation$Overdispersion) %>%
.[rowSums(.) > 12,] %>%
DGEList(
samples = tibble(
sampleName = colnames(.)
) %>%
left_join(samples) %>%
dplyr::select(sampleName, sample = sampleID, genotype) %>%
distinct(sampleName, .keep_all = TRUE)
)
```
```{r pca}
pca <- cpm(transDGE, log = TRUE) %>%
t() %>%
prcomp()
```
```{r pcaCorrs, fig.cap="*Correlations between the first three principal components and measured variables. Genotypes were converted to an ordered categorical variable for the purposes of visualisation*"}
pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(gcSummary) %>%
as_tibble() %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
dplyr::select(
PC1, PC2, PC3,
Mean_GC = mn_gc,
Mean_Length = mn_logLen,
Initial_GC70 = Freq,
genotype
) %>%
mutate(genotype = as.numeric(as.factor(genotype))) %>%
cor() %>%
corrplot(
type = "lower",
diag = FALSE,
addCoef.col = 1, addCoefasPercent = TRUE
)
```
```{r plotPCA, fig.cap = "*PCA plot showing genotype, initial GC content > 70% and mean GC content after summarisation to transcript-level.*"}
a <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
as_tibble() %>%
ggplot(aes(PC1, PC2, colour = genotype, size = Freq)) +
geom_point() +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")"),
size = "Raw GC > 70%",
colour = "Genotype"
)
b <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sampleName") %>%
left_join(gcSummary) %>%
left_join(
dplyr::filter(rawGC, reads == "R1")
) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_gc)) +
geom_point(aes(colour = genotype, size = Freq)) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean GC",
size = "Raw GC > 70%",
colour = "Genotype"
)
plot_grid(
plot_grid(
a + theme(legend.position = "none"),
b + theme(legend.position = "none")
),
get_legend(b + theme(legend.position = "bottom")),
nrow = 2,
rel_heights = c(4,1)
)
```