# RNA-Seq Analysis
## Author: Peter Allen

In [None]:
library(data.table)
library(readxl)
library(dplyr)
library(Haplin)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(GenomicFeatures)
library(edgeR)
library(statmod)
library(gplots)

## Importing metadata & expression data

In [None]:
# Read the sample sheet. Columns typed "skip" are not imported, and cells
# containing "." or "N/A" are read as missing values.
meta <- data.frame(read_excel(
  "data/RNA_SampleData.xlsx",
  col_types = c("skip", "text", "numeric", "skip", "skip", "text",
                "numeric", "text", "skip", "text", "skip"),
  na = c(".", "N/A")
))

##-- Filtering out bad samples/duplicates
# Keep rows that have an RNA-seq ID, are not male, and are not samples 34/36.
# `%in%` tests every row against BOTH excluded IDs; `!= c(34, 36)` would
# recycle the vector down the column (row 1 vs 34, row 2 vs 36, ...) and let
# an excluded sample through depending on its row position.
meta <- meta %>%
  filter(!is.na(RNA.seq.data.ID),
         Gender != "Male",
         !(RNA.seq.data.ID %in% c(34, 36)))

# Raw count matrix. read.delim() prefixes column names that start with a digit
# with "X"; strip only that leading prefix so the sample columns can be matched
# against the IDs in `meta`, leaving any other "X" in a name untouched.
expression_data <- read.delim("data/GSE196070_raw_counts_matrix.txt")
colnames(expression_data) <- sub("^X(?=[0-9])", "", colnames(expression_data),
                                 perl = TRUE)

# Row names must be unique, so repeated symbols get a "_<n>" suffix.
expression_data$symbol <- make.unique(as.character(expression_data$symbol),
                                      sep = "_")

# Use the de-duplicated symbols as row names, then drop the first column
# (presumably the symbol column itself -- confirm against the input file).
rownames(expression_data) <- expression_data$symbol
expression_data <- expression_data[, -1]

## Sorting the metasheet to match the Counts 

In [None]:
# For every column of the count table, find the matching metadata row (NA when
# the column is not a sample listed in `meta`), and keep the matched rows in
# count-table column order.
meta_rows <- match(colnames(expression_data), meta$RNA.seq.data.ID)
meta.sorted <- meta[meta_rows[!is.na(meta_rows)], ]

# Pull the sample columns out in the same order as the sorted metadata, then
# put the first three columns of the count table back in front of them.
sample_cols <- match(meta.sorted$RNA.seq.data.ID, colnames(expression_data))
counts.sorted <- expression_data[, sample_cols]
counts.sorted <- cbind(expression_data[, 1:3], counts.sorted)

# Grouping factor taken from the SLE.SSc metadata column.
group <- factor(meta.sorted$SLE.SSc)

# No-intercept design: one indicator column per group level, named after the
# level so contrasts can refer to the groups directly.
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

## Creating an edgeR object with the count and metadata stored -> Filtration -> Normalization

Here, an edgeR `DGEList` object is created from the sorted count table and the associated sample groups. The counts are then filtered with edgeR's `filterByExpr` function, which takes the library sizes and the experimental design into account when deciding which genes to keep.

After filtration, the data is then normalized using the trimmed mean of M values (TMM) method. 

In [None]:
# Columns 1-3 of counts.sorted are the columns carried over from the count
# table; the per-sample counts start at column 4.
y <- DGEList(
  counts = counts.sorted[, 4:ncol(counts.sorted)],
  group = group
)

# Flag genes with enough expression to keep, given the design matrix.
keep <- filterByExpr(y, design = design)

# Subset to the retained genes; keep.lib.sizes = FALSE so the library sizes
# are not carried over from the unfiltered object.
y <- y[keep, , keep.lib.sizes = FALSE]

# TMM normalisation factors, then counts-per-million from the normalised object.
y <- calcNormFactors(y, method = "TMM")
tmm <- cpm(y)

## Differences between Samples

To get a better understanding of what our data looks like, the samples were projected into two dimensions using a multi-dimensional scaling (MDS) plot. With the exception of the samples in the bottom-left quadrant, the samples do not appear to differ much from one another.

In [None]:
# One plotting symbol and one colour per group level, indexed by the level's
# integer code.
pch <- c(15, 16)
colors <- c("darkgreen", "red")

plotMDS(y, col = colors[as.integer(group)], pch = pch[as.integer(group)])
legend(
  "topleft",
  legend = levels(group),
  pch = pch,
  col = colors,
  ncol = 2
)

## Dispersion Estimation averaged over all genes

In [None]:
# Estimate dispersions under the design (robust = TRUE), then draw the
# biological coefficient of variation (BCV) plot.
y <- estimateDisp(y, design = design, robust = TRUE)

plotBCV(y)

## Processing the Count Data

To take into consideration the biological and non-biological variance in gene expression, a quasi-likelihood method was used and the data tested for differential expression.

In [None]:
# Quasi-likelihood GLM fit for every gene under the group design.
fit <- glmQLFit(y, design = design, robust = TRUE)

# Comparison of interest: SSc minus Control (names are the design columns).
contrast <- makeContrasts(SSc - Control, levels = design)

# Quasi-likelihood F-test for the contrast.
res <- glmQLFTest(fit, contrast = contrast)

# Classify each gene as up (1), down (-1) or not significant (0), and tabulate.
is.de <- decideTestsDGE(res)
summary(is.de)

# The 20 top-ranked genes for the contrast.
topTags(res, n = 20)

## Mean-Difference (MD) Plot

In [None]:
# Mean-difference plot of the test results. Genes with status 1 (up) are drawn
# in red and genes with status -1 (down) in blue.
plotMD(
  res,
  status = is.de,
  values = c(1, -1),
  col = c("red", "blue"),
  legend = "topright"
)

From the mean-difference (MD) plot and the summary, it should be noted that there were no significantly up-regulated or down-regulated genes between the two groups. This is most likely due to the small sample size; although no significant genes were found, the top-ranked genes could still be intersected with methylation data and analyzed for any potential biological effects occurring in Scleroderma.

## Heatmap Clustering of Data

In [None]:
# Log2 counts-per-million with a prior count of 2.
logCPM <- cpm(y, prior.count = 2, log = TRUE)

# Label each sample column "<group>-<index>"; seq_along() stays correct even
# for an empty sample table, unlike 1:length().
colnames(logCPM) <- paste(y$samples$group, seq_along(y$samples$group),
                          sep = "-")

# Keep the (up to) 30 genes with the smallest p-values. head() avoids the NA
# row indices that o[1:30] produces when fewer than 30 genes passed filtering,
# and drop = FALSE keeps the result a matrix.
o <- order(res$table$PValue)
logCPM <- logCPM[head(o, 30), , drop = FALSE]

# Centre and scale each gene (row) across samples; scale() works on columns,
# hence the double transpose.
logCPM <- t(scale(t(logCPM)))

# Blue-white-red palette; rows are already scaled, so heatmap.2 is told not to
# rescale. `margins` is spelled out rather than relying on partial matching.
col.pan <- colorpanel(100, "blue", "white", "red")
heatmap.2(logCPM, col = col.pan, Rowv = TRUE, scale = "none",
          trace = "none", dendrogram = "both", cexRow = 1, cexCol = 1.4,
          density.info = "none",
          margins = c(10, 9), lhei = c(2, 10), lwid = c(2, 6))
