analysis/1_QC.Rmd

---
title: "QC of Psen2S4Ter Zebrafish"
author: "Steve Pederson"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: 
  html_document: 
    fig_caption: yes
    number_sections: yes
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
    autodep = TRUE,
	echo = TRUE,
	warning = FALSE,
	message = FALSE
)
```

# Setup

```{r loadPackages}
library(ngsReports)
library(tidyverse)
library(magrittr)
library(edgeR)
library(AnnotationHub)
library(ensembldb)
library(scales)
library(pander)
library(cowplot)
library(corrplot)
```

```{r setOptions}
if (interactive()) setwd(here::here())
theme_set(theme_bw())
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
twoCols <- c(rgb(0.8, 0.1, 0.1), rgb(0.2, 0.2, 0.8))
```


```{r annotationSetup}
ah <- AnnotationHub() %>%
	subset(species == "Danio rerio") %>%
	subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]]
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
	width() %>%
	vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]
```


```{r samplesAndLabels}
samples <- read_csv("data/samples.csv") %>%
    mutate(reads = str_extract(fqName, "R[12]")) %>%
    dplyr::rename(Filename = fqName)
labels <- structure(
    samples$sampleID,
    names = samples$Filename
)
```

In order to perform adequate QC, an `EnsDb` object was obtained for Ensembl release `r ensemblVersion(ensDb)` using the  `AnnotationHub` package.
This provided the GC content and length for each of the `r comma(length(grTrans))` transcripts contained in that release.

Metadata for each fastq file was also loaded.
Reads were provided as paired-end reads, with $n=4$ samples for each genotype.

# Raw Data

## Library Sizes

```{r rawFqc}
rawFqc <- list.files(
    path = "data/0_rawData/FastQC/",
    pattern = "zip",
    full.names = TRUE
    ) %>%
    FastqcDataList()
```

Library Sizes for the raw, unprocessed dataset ranged between `r pander(comma(range(readTotals(rawFqc)$Total_Sequences)))` reads.

```{r plotLibSizes, fig.cap = "*Library Sizes for each sample before any processing was undertaken.*"}
r1 <- grepl("R1", fqName(rawFqc))
plotReadTotals(rawFqc[r1], labels = labels[r1], barCols = twoCols)
```

## GC Content

As this was a total RNA dataset, GC content will provide a clear marker of the success of rRNA depletion.
This was plotted, with all R2 reads showing a far greater spike in GC content at 81%, which is likely due to incomplete rRNA depletion.
In particular, the mutant samples appeared most significantly affected raising possibility that overall GC content may be affected in these samples.


```{r gcPlots, fig.cap="*GC content for R1 and R2 reads. Notably, the R2 reads had clearer spikes in GC content above 70%*", fig.height=5}
gcPlots <- list(
    r1 = plotGcContent(
        x = rawFqc[r1], 
        labels = labels[r1],
        plotType = "line",
        gcType = "Transcriptome",
        species = "Drerio"
    ),
    r2 = plotGcContent(
        x = rawFqc[!r1], 
        labels = labels[!r1],
        plotType = "line",
        gcType = "Transcriptome",
        species = "Drerio"
    )
)
lg <- get_legend(gcPlots$r2 + theme(legend.position = "bottom")) 
plot_grid(
    plot_grid(
        r1 = gcPlots$r1 + 
            ggtitle("R1: GC Distribution", subtitle = c()) +
            theme(legend.position = "none"),
        r2 = gcPlots$r2 + 
            ggtitle("R2: GC Distribution", subtitle = c()) +
            theme(legend.position = "none")
    ),
    lg = lg,
    nrow = 2,
    rel_heights = c(5,2)
)
```


```{r gcPerc, fig.cap=paste("*Percentages of each library which contain >70% GC. Using the known theoretical distribution, this should be", percent_format(0.01)(sum(dplyr::filter(getGC(gcTheoretical, "Drerio", type = "Trans"), GC_Content > 70)$Drerio)), "of the total library.*"), fig.width=8}
gc <- getModule(rawFqc, "Per_sequence_GC") 
rawGC <- gc %>% 
    group_by(Filename) %>% 
    mutate(Freq = Count / sum(Count)) %>%
    dplyr::filter(GC_Content > 70) %>% 
    summarise(Freq = sum(Freq)) %>% 
    arrange(desc(Freq)) %>%
    left_join(samples) 
rawGC %>%
    ggplot(aes(sampleID, Freq, fill = reads)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_wrap(~genotype, scales = "free_x") +
    scale_y_continuous(labels = percent) +
    scale_fill_manual(values = twoCols) +
    labs(x = "Sample", y = "Percent of Total")
```

As an alternative viewpoint, the standard deviation of observed GC frequencies from the expected GC frequencies, as obtained from the known transcriptome, were calculated.
This may be a less subjective approach, which is more applicable to other datasets.
Very similar patterns were seen as to those using GC% > 70.

```{r gcDev}
gcDev <- gc %>%
    left_join(samples) %>%
    group_by(sampleName, sampleID) %>%
    mutate(Freq = Count / sum(Count)) %>%
    left_join(
        getGC(gcTheoretical, "Drerio", "Trans")
    ) %>%
    dplyr::rename(actual = Drerio) %>%
    mutate(res = Freq - actual) %>%
    summarise(ss = sum(res^2), n = n()) %>%
    ungroup() %>%
    mutate(sd = sqrt(ss / (n - 1)))
```

```{r, fig.cap = "*Standard deviations of observed GC frequency Vs Expected GC frequency using the theoretical GC across both sets of reads.*"}
gcDev %>%
    left_join(samples) %>%
    ggplot(aes(sampleID, sd)) +
    geom_bar(stat= "identity", position ="dodge") +
    facet_wrap(~genotype, scales = "free_x") +
    scale_fill_manual()
```


## Over-represented Sequences

The top 30 Overrepresented sequences were analysed using `blastn` and most were found to be mitochondrial rRNA sequences.

```{r}
getModule(rawFqc, "Overrep") %>% 
    group_by(Sequence, Possible_Source) %>% 
    summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>% 
    arrange(desc(`Highest Percentage`), desc(`Found In`)) %>% 
    ungroup() %>% 
    dplyr::slice(1:30) %>%
    mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
    pander(
        justify = "llrr",
        caption = paste(
            "*Top", nrow(.),"Overrepresented sequences.",
            "The number of samples they were found in is shown,",
            "along with the percentage of the most 'contaminated' sample.*"
            )
    )
```

## Trimming

```{r trimStats}
trimFqc <- list.files(
    path = "data/1_trimmedData/FastQC/", 
    pattern = "zip", 
    full.names = TRUE
) %>%
    FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
    dplyr::rename(Raw = Total_Sequences) %>%
    left_join(readTotals(trimFqc), by = "Filename") %>%
    dplyr::rename(Trimmed = Total_Sequences) %>%
    dplyr::filter(grepl("R1", Filename)) %>%
    mutate(
        Discarded = 1 - Trimmed/Raw,
        Retained = Trimmed / Raw
    )
```

After adapter trimming between `r pander(range(percent_format(0.01)(trimStats$Discarded)))` of reads were discarded.
No other improvement was noted.

# Aligned Data


Trimmed reads were:

1. aligned to the reference genome using `STAR 2.7.0d` and summarised to each gene using `featureCounts`. These counts were to be used for all gene-level analysis
2. aligned using kallisto to a modified transcriptome which contained the *psen2^S4Ter^* sequence as an 'alternative' transcript of *psen2*. This data was used for genotype checking, and assessing any GC bias in the data. Whilst transcript-level information is included for rRNA genes in the `EnsDb` object, these transcripts are **excluded** by default from the reference transcriptome provided by Ensembl.

```{r catchKallisto, include=FALSE}
kallisto <- list.dirs("data/3_kallisto/") %>%
	setdiff("data/3_kallisto/") %>%
	catchKallisto()
```

## Genotype Checking

```{r checkGenotype, fig.cap = "*CPM obtained for both WT and S4Ter mutant transcripts of psen2*", fig.width=8}
cpm(kallisto$counts) %>%
	.[c("ENSDART00000006381.8", "psen2S4Ter"),] %>%
	as.data.frame() %>%
	rownames_to_column("psen2") %>%
	pivot_longer(cols = contains("kall"), names_to = "Sample", values_to = "CPM") %>%
	mutate(
		Sample = basename(Sample),
		psen2 = case_when(
			psen2 == "psen2S4Ter" ~ "S4Ter",
			psen2 != "psen2S4Ter" ~ "WT"
		)
	) %>%
    left_join(samples, by = c("Sample" = "sampleName")) %>%
	ggplot(aes(sampleID, CPM, fill = psen2)) +
	geom_bar(stat = "identity") +
    scale_fill_manual(values = twoCols) +
	facet_wrap(~genotype, nrow = 1, scales = "free_x")
```

As all samples demonstrated the expected expression patterns, no mislabelling was detected in this dataset.

## GC Content


```{r gcInfo}
gcInfo <- kallisto$counts %>%
	as.data.frame() %>%
	rownames_to_column("tx_id") %>%
	dplyr::filter(
		tx_id != "psen2S4Ter"
	) %>%
	as_tibble() %>%
	pivot_longer(
		cols = contains("kall"), 
		names_to = "sampleName", 
		values_to = "counts"
	) %>%
	dplyr::filter(
		counts > 0
	) %>%
    mutate(
		sampleName = basename(sampleName),
		tx_id_version = tx_id,
		tx_id = str_replace(tx_id, "(ENSDART[0-9]+).[0-9]+", "\\1")
	) %>%
	left_join(
		mcols(grTrans) %>% as.data.frame()
	) %>%
	dplyr::select(
		ends_with("id"), sampleName, counts, gc_content, length
	) %>%
    split(f = .$sampleName) %>%
    lapply(function(x){
        DataFrame(
            gc = Rle(x$gc_content/100, x$counts),
            logLen = Rle(log10(x$length), x$counts)
        )
    }
    ) 
gcSummary <- gcInfo %>%
    vapply(function(x){
        c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
        }, numeric(4)
        ) %>%
    t() %>%
    set_colnames(
        c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
    ) %>%
    as.data.frame() %>%
    rownames_to_column("sampleName") %>%
    as_tibble() %>%
    left_join(dplyr::filter(samples, reads == "R1")) %>%
    dplyr::select(starts_with("sample"), genotype, contains("_"))
```

```{r gcCors}
gcCors <- rawGC %>%
    dplyr::filter(reads == "R1") %>%
    dplyr::select(starts_with("sample"), genotype, Freq, reads) %>%
    left_join(gcSummary) %>%
    left_join(gcDev) %>%
    dplyr::select(Freq, mn_gc, mn_logLen, sd) %>%
    cor()
```


The initial QC steps identified an unusually large proportion of reads with >70% GC content, and noticeable deviations from the expected GC content.
As such an exploration of any residual impacts this may have on the remainder of the library was performed.

A *run length encoded* (RLE) vector was formed for each sample taking the number of reads for each transcript as the run lengths, and both the GC content and length of each transcript as the values.
Transcript lengths were transformed to the log~10~ scale due to the wide variety of lengths contained in the transcriptome.

From these RLEs, the mean GC and mean length was calculated for each sample, and these values were compared to the proportion of raw reads with > 70% GC, taking these values from the R1 libraries only.

- **A correlation of `r percent(gcCors["Freq", "mn_gc"])` was found between the mean GC content after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This makes intuitive sense and suggests that the amount of high-GC reads in the initial libraries, as representative of incomplete rRNA removal, negatively biases the GC composition of the non-ribosomal RNA component of the library.
- **A correlation of `r percent(gcCors["Freq", "mn_logLen"])` was found was found between the mean transcript length of the libraries after summarisation to transcript-level counts and the proportion of the original sequence libraries with >70% GC.** This is far less obvious mechanistically, but suggests that **incomplete rRNA removal also biases the fragmentation or length selection process**.
- **Similar patterns were seen when using the standard deviations** as compared to the expected values.

Given the more successful rRNA-removal in the wild-type samples, this dataset is likely to contain significant GC and Length bias which is non-biological in origin.

```{r plotBias, fig.cap = "*Comparison of bias introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*", fig.width=8, fig.height=4}
a <- gcSummary %>%
    left_join(rawGC) %>%
    dplyr::filter(reads == "R1") %>%
    ggplot(aes(Freq, mn_logLen)) +
    geom_point(aes(colour = genotype), size = 3) +
    geom_smooth(method = "lm") +
    scale_shape_manual(values = c(19, 1)) +
    labs(
        x = "Proportion of initial library with > 70% GC",
        y = "Mean log(length)",
        colour = "Genotype"
    ) 
b <- gcSummary %>%
    left_join(rawGC) %>%
        dplyr::filter(reads == "R1") %>%
    ggplot(aes(Freq, mn_gc)) +
    geom_point(aes(colour = genotype), size = 3) +
    geom_smooth(method = "lm") +
    scale_shape_manual(values = c(19, 1)) +
    scale_y_continuous(labels = percent) +
    labs(
        x = "Proportion of initial library with > 70% GC",
        y = "Mean GC Content",
        colour = "Genotype"
    ) 
plot_grid(
    a + theme(legend.position = "none"),
    b + theme(legend.position = "none"),
    get_legend(b),
    nrow = 1,
    rel_widths = c(4,4,1)
)
```


```{r plotGcSD, fig.cap = "*Comparison of bias introduced by incomplete rRNA removal, measured by the standard deviation of observed GC Vs expected GC. Regression lines are shown along with standard error bands for each comparison.*", fig.width=8, fig.height=4}
a <- gcSummary %>%
    left_join(gcDev) %>%
    ggplot(aes(sd, mn_logLen)) +
    geom_point(aes(colour = genotype), size = 3) +
    geom_smooth(method = "lm") +
    scale_shape_manual(values = c(19, 1)) +
    labs(
        x = "Standard Deviation\n(Observed Vs Expected)",
        y = "Mean log(length)",
        colour = "Genotype"
    ) 
b <- gcSummary %>%
    left_join(gcDev) %>%
    ggplot(aes(sd, mn_gc)) +
    geom_point(aes(colour = genotype), size = 3) +
    geom_smooth(method = "lm") +
    scale_shape_manual(values = c(19, 1)) +
    scale_y_continuous(labels = percent) +
    labs(
        x = "Standard Deviation\n(Observed Vs Expected)",
        y = "Mean GC Content",
        colour = "Genotype"
    ) 
plot_grid(
    a + theme(legend.position = "none"),
    b + theme(legend.position = "none"),
    get_legend(b),
    nrow = 1,
    rel_widths = c(4,4,1)
)
```


## PCA

An initial PCA was performed using transcript-level counts to assess general patterns in the data.
Transcripts were included if >12 reads over all libraries were allocated to that identifiers.
Correlations were checked between the first three components and mean GC content (as described above), mean log~10~ transcript length, genotype and the proportion of the initial libraries with GC content > 70%.

The confounding of genotype and GC variability may need careful handling in this dataset.

```{r transDGE}
transDGE <- kallisto$counts %>%
    set_colnames(basename(colnames(.))) %>%
    divide_by(kallisto$annotation$Overdispersion) %>%
    .[rowSums(.) > 12,] %>%
    DGEList(
        samples = tibble(
            sampleName = colnames(.)
        ) %>%
            left_join(samples) %>%
            dplyr::select(sampleName, sample = sampleID, genotype) %>%
            distinct(sampleName, .keep_all = TRUE)
    ) 
```

```{r pca}
pca <- cpm(transDGE, log = TRUE) %>%
    t() %>%
    prcomp()
```

```{r pcaCorrs, fig.cap="*Correlations between the first three principal components and measured variables. Genotypes were converted to an ordered categorical variable for the purposes of visualisation*"}
pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sampleName") %>%
    left_join(gcSummary) %>%
    as_tibble() %>% 
    left_join(
        dplyr::filter(rawGC, reads == "R1")
        ) %>% 
    left_join(gcDev) %>%
    dplyr::select(
        PC1, PC2, PC3, 
        Mean_GC = mn_gc, 
        Mean_Length = mn_logLen, 
        Initial_GC70 = Freq,
        SD = sd,
        genotype
    ) %>% 
    mutate(genotype = as.numeric(as.factor(genotype))) %>%
    cor() %>% 
    corrplot(
        type = "lower", 
        diag = FALSE, 
        addCoef.col = 1, addCoefasPercent = TRUE
    )
```

```{r plotPCA, fig.width=10, fig.cap = "*PCA plot showing genotype, initial GC content > 70%, deviation from theoretical and mean GC content after summarisation to transcript-level.*"}
a <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sampleName") %>%
    left_join(
        dplyr::filter(rawGC, reads == "R1")
    ) %>%
    as_tibble() %>%
    ggplot(aes(PC1, PC2, colour = genotype, size = Freq)) +
    geom_point() +
    labs(
        x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
        y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")"),
        size = "Raw GC > 70%",
        colour = "Genotype"
    )
b <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sampleName") %>%
    left_join(gcSummary) %>%
    left_join(
        dplyr::filter(rawGC, reads == "R1")
    ) %>%
    as_tibble() %>%
    ggplot(aes(PC1, mn_gc)) +
    geom_point(aes(colour = genotype,  size = Freq)) +
    geom_smooth(method = "lm") +
    scale_y_continuous(labels = percent) +
    labs(
        x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
        y = "Mean GC",
        size = "Raw GC > 70%",
        colour = "Genotype"
    )
c <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sampleName") %>%
    left_join(gcDev) %>%
     left_join(
         rawGC %>% dplyr::filter(reads == "R1")
     ) %>%
     ggplot(aes(PC1, sd)) +
    geom_point(aes(colour = genotype, size = Freq)) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
        y = "Standard Deviation",
        size = "Raw GC > 70%",
        colour = "Genotype"
    )
plot_grid(
    plot_grid(
        a + theme(legend.position = "none"), 
        b + theme(legend.position = "none"),
        c + theme(legend.position = "none"),
        nrow = 1
    ),
    get_legend(b + theme(legend.position = "bottom")),
    nrow = 2, 
    rel_heights = c(4,1)
)
```