
# Methylation Analysis
## Author: "Peter Allen"


<style type="text/css">

h1.title {
  font-size: 38px;
  color: DarkRed;
  text-align: center;
}
h4.author { /* Header 4 - the author and date headers use this too */
    font-size: 18px;
  font-family: "Times New Roman", Times, serif;
  color: DarkRed;
  text-align: center;
}

h1, h2, h3 {
  text-align: center;
}
</style>

## Importing necessary Libraries

In [None]:
library(tidyverse)
library(data.table)
library(Haplin)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(GenomicFeatures)
library(edgeR)
library(kableExtra)

## Importing metadata for samples that have both RNA-seq & methylation data


In [None]:
# Load the sample metadata in a single pipeline. The literal string "N/A" is
# the only value read as missing.
sampleSheet <- read_csv("data/methyl_sampleSheet.csv", na = "N/A") %>%
  mutate(
    # samples without a recorded disease subset are labelled "Healthy"
    disease_subset = replace_na(disease_subset, "Healthy"),
    # sample names use "." instead of "-" so they match the beta column names
    sample_name = gsub("-", ".", sample_name)
  ) %>%
  # index the sheet by sample name
  column_to_rownames(var = "sample_name")

## Importing Methylation Data

Importing the methylation data. There are two types of beta files: imputed and non-imputed. The imputed data are used to calculate principal components, which are then included as covariates to regress out potential technical variance downstream. The non-imputed data are used for the per-CpG regression itself.

In [None]:
# Read one beta-value table and return it as a data frame whose row names are
# taken from the file's first column and whose remaining (sample) columns are
# sorted by name.
#
# path: path to a delimited text file readable by data.table::fread(); the
#       first column holds the row identifiers, every other column is a sample.
read_betas <- function(path) {
  # data.frame() is kept on purpose: with its default check.names = TRUE it
  # makes the column names syntactic (e.g. prefixes an "X" to names that start
  # with a digit), which is the naming the rest of the script expects.
  betas <- data.frame(fread(path, stringsAsFactors = FALSE))
  rownames(betas) <- betas[, 1]
  betas <- betas[, -1, drop = FALSE]
  # Strip only the leading "X" prefix. A global gsub("X", "") would also delete
  # any "X" that is genuinely part of a sample name.
  colnames(betas) <- sub("^X", "", colnames(betas))
  betas[, order(colnames(betas)), drop = FALSE]
}

## Imputed Betas (autosomal rows stacked on top of X-chromosome rows)
finalBetasImp <- rbind(
  read_betas("data/finalBetas_autos_Imp.txt"),
  read_betas("data/finalBetas_X_Imp.txt")
)

## Non-imputed Betas (same layout as the imputed table)
finalBetasNonImp <- rbind(
  read_betas("data/finalBetas_autos_NonImp.txt"),
  read_betas("data/finalBetas_X_NonImp.txt")
)

# Ensure both datasets have the same samples in the same order. Downstream
# steps combine values from the two tables sample-by-sample, so a mismatch must
# stop the analysis rather than just print FALSE.
if (!identical(colnames(finalBetasImp), colnames(finalBetasNonImp))) {
  stop(
    "Imputed and non-imputed beta tables do not have identical sample columns.",
    call. = FALSE
  )
}

## Performing Principal Component Analysis

In [None]:
# Principal component scores from the imputed betas: rows with missing values
# are dropped, the table is transposed so that samples become rows, and only
# the score matrix (`x`) of the prcomp() result is kept.
pca <- finalBetasImp %>%
  na.omit() %>%
  t() %>%
  prcomp() %>%
  .[["x"]]

In [None]:
# Per-CpG regression ------------------------------------------------------------
# For every CpG, regress methylation on disease status, adjusting for age at
# enrollment and the first two principal components, and keep the p-value of
# the disease-status term.

# Align covariates to the beta columns BY NAME. The beta columns are sorted
# alphabetically on import while the sample sheet keeps its file order, so
# positional matching could silently pair a sample with another sample's
# phenotype.
sample_ids <- colnames(finalBetasNonImp)

missing_meta <- setdiff(sample_ids, rownames(sampleSheet))
if (length(missing_meta) > 0) {
  stop(
    "Samples missing from sampleSheet: ",
    paste(missing_meta, collapse = ", "),
    call. = FALSE
  )
}
missing_pca <- setdiff(sample_ids, rownames(pca))
if (length(missing_pca) > 0) {
  stop(
    "Samples missing from pca: ",
    paste(missing_pca, collapse = ", "),
    call. = FALSE
  )
}

sample_meta <- sampleSheet[sample_ids, , drop = FALSE]

# Covariates are coerced once here instead of once per CpG inside the loop.
covariates <- data.frame(
  status = as.factor(sample_meta$disease_status),
  age = as.numeric(sample_meta$age_at_enrollment),
  pc1 = as.numeric(pca[sample_ids, 1]),
  pc2 = as.numeric(pca[sample_ids, 2])
)

# Only CpGs without any missing value are tested.
betas_complete <- as.matrix(na.omit(finalBetasNonImp))

ssc_model <- apply(betas_complete, 1, function(x) {
  fit <- lm(as.numeric(x) ~ status + age + pc1 + pc2, data = covariates)
  # Row 2 is the first non-reference level of `status` (row 1 is the
  # intercept); column 4 is Pr(>|t|).
  coef(summary(fit))[2, 4]
})

# Organize & Annotate CpG Regression Results -----------------------------------------
# One row per CpG, sorted by raw p-value, with Benjamini-Hochberg adjusted
# p-values computed over all tested CpGs.
top_cpgs <- tibble(CpG = names(ssc_model), pval = unname(ssc_model)) %>%
  arrange(pval) %>%
  mutate(p.adjusted = p.adjust(pval, method = "BH"))

# Annotate the top-ranked CpGs with gene information.
#
# x: data frame / tibble with columns `CpG`, `pval` and `p.adjusted`, already
#    sorted so that the rows of interest come first.
# n: number of leading rows of `x` to annotate (default 500, the original
#    behaviour). If `x` has fewer rows, all of them are used instead of padding
#    the result with NA rows.
#
# Returns the matchGenes() result as a tibble with the `CpG`, `pval` and
# `p.adjusted` columns of the annotated rows appended. CpGs that are not found
# in the EPIC annotation are dropped with a warning.
gene_match_500 <- function(x, n = 500) {
  required_cols <- c("CpG", "pval", "p.adjusted")
  if (!all(required_cols %in% names(x))) {
    stop(
      "`x` must contain the columns: ",
      paste(required_cols, collapse = ", "),
      call. = FALSE
    )
  }
  n <- min(n, nrow(x))
  if (n < 1) {
    stop("`x` has no rows to annotate.", call. = FALSE)
  }
  top <- x[seq_len(n), ]

  # Look up each CpG in the EPIC array annotation by probe name.
  epic <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
  epic_rows <- match(top$CpG, rownames(epic))

  # An unmatched CpG would otherwise become an all-NA annotation row.
  found <- !is.na(epic_rows)
  if (!all(found)) {
    warning(
      sum(!found), " CpG(s) not found in the EPIC annotation were dropped.",
      call. = FALSE
    )
    top <- top[found, ]
    epic_rows <- epic_rows[found]
  }
  epic <- epic[epic_rows, ]

  # One genomic range per CpG, spanning pos .. pos + 5 on its chromosome.
  # Columns are picked by name rather than by position.
  cpg_ranges <- makeGRangesFromDataFrame(
    data.frame(
      chr = epic$chr,
      start = epic$pos,
      stop = epic$pos + 5,
      CpG = epic$Name
    )
  )

  genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene)

  matchGenes(cpg_ranges, genes) %>%
    as_tibble() %>%
    add_column(CpG = top$CpG, pval = top$pval, p.adjusted = top$p.adjusted)
}

# Annotate the top-ranked CpGs via gene_match_500().
top_cpgs_annotated <- gene_match_500(top_cpgs)

# Preview up to the first 20 annotated CpGs. head() is used instead of
# `[1:20, ]` so that a shorter table is shown as-is rather than padded with
# all-NA rows.
top_cpgs_annotated %>%
  head(20) %>%
  dplyr::select(CpG, name, annotation, description, pval, p.adjusted)

In [None]:
# Write the annotated CpG table to disk. The output directory is created first
# (a no-op when it already exists) so the write does not fail on a fresh
# checkout. The delimiter is left at write_delim()'s default, so the file
# format is unchanged.
dir.create("output", showWarnings = FALSE, recursive = TRUE)
write_delim(top_cpgs_annotated, file = "output/methylated_genes.txt")