This notebook contains the instructions for reproducing the results presented in "*Environmental and genealogical signals on DNA methylation in a widespread apomictic dandelion lineage*" by V.N. Ibañez, M. van Antro, C. Peña Ponton, S. Ivanovic, C.A.M. Wagemaker, F. Gawehns, K.J.F. Verhoeven.

## Load data and set R environment

In this section, we will download the dataset and helper script needed to run the analysis, and configure the working directory and the R environment.

In [19]:
#@title Load files
%load_ext rpy2.ipython
!rm -r *
!mkdir results rawData annotation scripts plots tmp
!wget -c -O scripts/commonFunctions.R https://raw.githubusercontent.com/VeronicaNoe/epiTree/main/Rscripts/commonFunctions.R
!wget -c -O rawData/00_DMC_table.csv https://raw.githubusercontent.com/VeronicaNoe/epiTree/main/data4r/00_DMC_table.csv

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
--2022-09-17 16:41:23--  https://raw.githubusercontent.com/VeronicaNoe/epiTree/main/Rscripts/commonFunctions.R
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18802 (18K) [text/plain]
Saving to: ‘scripts/commonFunctions.R’


2022-09-17 16:41:23 (47.6 MB/s) - ‘scripts/commonFunctions.R’ saved [18802/18802]

--2022-09-17 16:41:23--  https://raw.githubusercontent.com/VeronicaNoe/epiTree/main/data4r/00_DMC_table.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting respons

In [15]:
%%R
#@title Set R environment
# Reset the R workspace so the notebook starts from a clean state.
rm(list = ls())
wd <- getwd()
# Project root: the working directory without a trailing "/results" component.
# The pattern is anchored to the end of the path so that directories which
# merely contain "/results" elsewhere (e.g. "/data/results_2022/run1") are
# left intact.
baseDir <- sub("/results/?$", "", wd)
# Folder holding the downloaded helper scripts.
scriptDir <- file.path(baseDir, "scripts")


In [16]:
%%R
#@title Install R packages
# Packages required by the analysis cells.
pkgs <- c("data.table", "qqman", "dplyr", "plyr", "tidyr")
# Install only what is not available yet, so re-running this cell does not
# reinstall every package. Note: already-installed packages are not upgraded.
isInstalled <- vapply(
  pkgs,
  function(p) nzchar(system.file(package = p)),
  logical(1)
)
missingPkgs <- pkgs[!isInstalled]
if (length(missingPkgs) > 0) {
  install.packages(missingPkgs, quiet = TRUE)
}

In [25]:
%%R
#@title Load packages silently
## Attach the analysis packages and the project helpers without startup chatter
suppressPackageStartupMessages({
  # Same attach order as before: data.table (file reading), qqman,
  # plyr, dplyr, tidyr.
  invisible(lapply(
    c("data.table", "qqman", "plyr", "dplyr", "tidyr"),
    library,
    character.only = TRUE
  ))
  # Project-specific helper functions downloaded into the scripts folder.
  source(file.path(scriptDir, "commonFunctions.R"))
})


# Analyzing data step-by-step

In this section, we will go through the code chunk by chunk to filter and plot a single dataset: *AseI-NsiI*


## Load and explore data

In [22]:
#@title
%%R
# Read the tab-separated DMC table from the rawData folder and preview it.
infileName <- file.path(baseDir, "rawData", "00_DMC_table.csv")
data <- read.delim(infileName, header = TRUE)
head(data)

    chr pos      stat     pvals      fdrs        RE factor context feature
1 12089  21 0.6110785 0.5411476 0.5660592 AseI-NsiI    Acc      CG    gene
2 12089  22 0.7122345 0.4763196 0.5401882 AseI-NsiI    Acc      CG    gene
3 12089  51 0.5364717 0.5916326 0.5997154 AseI-NsiI    Acc      CG    gene
4 12089  52 0.7385639 0.4601718 0.5401882 AseI-NsiI    Acc      CG    gene
5 12089  55 0.8021251 0.4224806 0.5401882 AseI-NsiI    Acc      CG    gene
6 12089  56 0.7732340 0.4393839 0.5401882 AseI-NsiI    Acc      CG    gene


In [26]:
#@title
%%R
# Restriction-enzyme combination (dataset) analysed in this walkthrough.
RE <- "AseI-NsiI"
toUse <- RE
# Keep the rows of the selected dataset that have an FDR value and belong to
# the "Treat" factor. Inside filter(), `RE` refers to the column of `data`,
# which is why the selected value is passed through `toUse`.
# Missing FDRs are tested with is.na(); comparing against the string "NA"
# only worked through NA propagation.
dfDMC <- dplyr::filter(data, RE == toUse, !is.na(fdrs), factor == "Treat")
# Add a "<chr>_<pos>" identifier column while keeping the original columns.
dfDMC <- unite(dfDMC, chrPos, c(chr, pos), sep = "_", remove = FALSE)
# The chromosome (fragment) column is used as a numeric id from here on.
dfDMC$chr <- as.numeric(dfDMC$chr)

In [27]:
%%R
#@title
# Significant cytosines (FDR of at most 0.05) ...
DMC <- dfDMC %>% dplyr::filter(fdrs <= 0.05)
# ... and the distinct fragments on which they occur.
uniReg <- unique(DMC[["chr"]])

In [29]:
%%R
#@title
# Count how many significant cytosines fall on each fragment in `uniReg`.
# vapply() also copes with an empty `uniReg`; the former `1:length(uniReg)`
# loop iterated over c(1, 0) in that case and failed on the first assignment.
hits <- vapply(uniReg, function(region) sum(DMC$chr == region), numeric(1))
# Two-column numeric matrix: fragment id and its number of significant sites
# (zero rows when nothing is significant).
outdf <- matrix(
  c(uniReg, hits),
  ncol = 2,
  dimnames = list(NULL, c("chr", "ocurrences"))
)


In [30]:
%%R
#@title
# Turn the count matrix into a data frame ranked by number of hits.
outdf <- as.data.frame(outdf)
outdf <- outdf[order(-outdf[["ocurrences"]]), ]
# Keep fragments with at least five significant cytosines, ordered by id.
out <- outdf %>% dplyr::filter(ocurrences >= 5)
out <- out[order(out[["chr"]]), ]
# Store the table as <RE>_DMRegions.csv in the results folder.
regionsFile <- file.path(baseDir, "results", paste0(RE, "_DMRegions.csv"))
write.csv(out, regionsFile, row.names = FALSE)


In [34]:
%%R  
#@title
# Common prefix (plots folder + dataset name) of the PDF files written below.
outDir <- paste0(baseDir, "/plots/", RE, "_")
# Sequence contexts; one Manhattan plot is written per context.
ctxt <- c("CG", "CHG", "CHH")
for (i in seq_along(ctxt)) {
  # All tested cytosines of this context (the plotted points).
  df <- dplyr::filter(dfDMC, context == ctxt[i])
  # Significant cytosines of this context.
  toPlot <- dplyr::filter(DMC, context == ctxt[i])
  # Fragments to highlight: significant in this context and present in `out`
  # (fragments with at least five significant cytosines).
  snpOfInterest <- sort(intersect(toPlot$chr, out$chr))
  if (nrow(df) == 0) {
    # Nothing to draw; skip instead of failing inside manhattan().
    message("No ", ctxt[i], " sites for ", RE, "; skipping plot")
    next
  }
  pdf(paste0(outDir, ctxt[i], "_manhattanPlot.pdf"))
  # Always close the PDF device, even if manhattan() stops with an error;
  # otherwise the device stays open and the file is left incomplete.
  tryCatch(
    manhattan(df, chr = "chr", bp = "pos", p = "fdrs",
              highlight = snpOfInterest, snp = "chr",
              annotatePval = 0.05, annotateTop = TRUE, col = c("gray60"),
              chrlabs = NULL,
              main = paste0("DMC_for_", ctxt[i], "_context", " (", RE, ")"),
              suggestiveline = -log10(5e-02), xlab = "epiGBS fragment",
              ylab = expression('-log'[10]*' (FDR)')),
    finally = dev.off()
  )
}

# Make the Manhattan plots for both datasets in each context

In this section, the code will run the previous steps for both datasets: *AseI-NsiI* and *Csp6I-NsiI*

In [36]:
%%R
#@title Visualize both data set: AseI-NsiI and Csp6I-NsiI
## Process both datasets: filter, count significant sites per fragment,
## write the regions table and draw one Manhattan plot per context.
infileName <- file.path(paste0(baseDir,"/rawData/","00_DMC_table.csv"))
data <- read.csv(infileName, header = TRUE, sep = "\t")
head(data)

# Restriction-enzyme combinations (datasets) and sequence contexts to process.
RE <- c("AseI-NsiI", "Csp6I-NsiI")
ctxt <- c("CG", "CHG", "CHH")
for (r in seq_along(RE)) {
  toUse <- RE[r]
  # Rows of the current dataset with an FDR value, for the "Treat" factor.
  # Inside filter(), `RE` refers to the column of `data`, not to the vector
  # defined above, hence the comparison against `toUse`.
  dfDMC <- dplyr::filter(data, RE == toUse, !is.na(fdrs), factor == "Treat")
  # Add a "<chr>_<pos>" identifier column while keeping the original columns.
  dfDMC <- unite(dfDMC, chrPos, c(chr, pos), sep = "_", remove = FALSE)
  dfDMC$chr <- as.numeric(dfDMC$chr)

  # Significant cytosines (FDR <= 0.05) and the fragments they occur on.
  DMC <- dplyr::filter(dfDMC, fdrs <= 0.05)
  uniReg <- unique(DMC$chr)

  # Number of significant cytosines per fragment. vapply() also handles an
  # empty `uniReg`, where a `1:length(uniReg)` loop would fail.
  hits <- vapply(uniReg, function(region) sum(DMC$chr == region), numeric(1))
  outdf <- data.frame(chr = uniReg, ocurrences = hits)
  outdf <- outdf[order(-outdf$ocurrences), ]
  # Fragments with at least five significant cytosines, ordered by id.
  out <- dplyr::filter(outdf, ocurrences >= 5)
  out <- out[order(out$chr), ]
  # Write into the results folder; the path used to be built as
  # paste0(baseDir, "_", ...), which placed the file beside the project folder.
  write.csv(out,
            file.path(baseDir, "results", paste0(RE[r], "_Regions.csv")),
            row.names = FALSE)

  # Common prefix (plots folder + dataset name) of the PDF files.
  outDir <- paste0(baseDir, "/plots/", RE[r], "_")
  for (i in seq_along(ctxt)) {
    # All tested cytosines of this context (the plotted points).
    df <- dplyr::filter(dfDMC, context == ctxt[i])
    # Significant cytosines of this context.
    toPlot <- dplyr::filter(DMC, context == ctxt[i])
    # Fragments to highlight: significant here and present in `out`.
    snpOfInterest <- sort(intersect(toPlot$chr, out$chr))
    if (nrow(df) == 0) {
      # Nothing to draw; skip instead of failing inside manhattan().
      message("No ", ctxt[i], " sites for ", RE[r], "; skipping plot")
      next
    }
    pdf(paste0(outDir, ctxt[i], "_manhattanPlot.pdf"))
    # Always close the PDF device, even if manhattan() stops with an error.
    tryCatch(
      manhattan(df, chr = "chr", bp = "pos", p = "fdrs",
                highlight = snpOfInterest, snp = "chr",
                annotatePval = 0.05, annotateTop = TRUE, col = c("gray60"),
                chrlabs = NULL,
                main = paste0("DMC_for_", ctxt[i], "_context",
                              " (", RE[r], ")"),
                suggestiveline = -log10(5e-02), xlab = "epiGBS fragment",
                ylab = expression('-log'[10]*' (FDR)')),
      finally = dev.off()
    )
  }
}