In [1]:
library(tidyverse)
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'hash' was built under R version 3.6.3"hash-3.0.1 provided by Decision Patterns

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map

Loading required package: grid
Loading required package: futile.logger


create a function to find overlaps in lists

In [2]:
private_of_three <- function(df1, df2, df3) {
    df1[!(df1 %in% df2) & !(df1 %in% df3)]
}

overlap_genes_two <- function(df1, df2, df3) {
    two <- intersect(as.character(df1), as.character(df2))
    allthree <- overlap_genes_three(df1, df2, df3)
    two[!two %in% allthree]
}

overlap_genes_three <- function(df1, df2, df3) {
    intersect(as.character(df3), intersect(as.character(df1), as.character(df2)))
}

make the gene lists in each of the three rootstocks (the ones used in the venn diagrams)

store all of the overlapping gene lists

In [3]:
tissues <- c('Leaf')
years <- c('2018')
phenos <- c('Anthesis', 'Veraison', 'Harvest')
rootstocks <- c('Ungrafted', '1103P', '3309C', 'SO4')
dir <- c('Up', 'Down')

prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/'
results_prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/ownGO/overlaps/'

stats <- list()
overlaps <- list()

In [4]:
# filter according to log2foldchange value
log2foldchange <- function(data, alpha, log2FoldChangeVal, d) {
 
    # only keeps significant values
    data.sig <- data %>% filter(padj < alpha)
    
    # separates upregulated/downregulated
    if (d == 'Up') {
        data.adj <- data.sig %>% filter(log2FoldChange > log2FoldChangeVal)
    } else if (d == 'Down') {
         data.adj <- data.sig %>% filter(log2FoldChange < -log2FoldChangeVal)
    } else {
        print(paste(d, "not a valid direction"))
    }

    return(data.adj)
}

In [5]:
# all data combos
for (t in tissues){
    for (y in years){
        for (p in phenos){
            for (direction in dir){
                
                meta <- paste(y, p, t, direction, sep='_')
                metaformat <- paste(y, p, t, direction, sep=' ')
                overlaps[[meta]] <- list()
                
                for (j in 2:length(rootstocks)) {             
                    # read in the data
                    rs <- str_interp("Ungrafted-${rootstocks[j]}")
                    data <- read.csv(paste(prefix, paste(t, y, p, rs, sep='_'), '.csv', sep=''))
                    fname <- paste(meta,rs,sep='_')
                    # filter and save the data
                    data.adj <- log2foldchange(data, 0.05, 0, direction)
                    stats[[rs]] <- data.adj
                }
                
                # Ungrafted-1103P x Ungrafted-3309C x Ungrafted-SO4
                overlaps[[meta]][['1103P-3309C-SO4']] <- list()
                overlaps[[meta]][['1103P-3309C-SO4']][['genes']] <- overlap_genes_three(stats[['Ungrafted-1103P']][['X']],
                                                                    stats[['Ungrafted-3309C']][['X']],
                                                                    stats[['Ungrafted-SO4']][['X']])
                
                # Ungrafted-1103P x Ungrafted-3309C , NOT Ungrafted-SO4
                overlaps[[meta]][['1103P-3309C']] <- list()
                overlaps[[meta]][['1103P-3309C']][['genes']] <- overlap_genes_two(stats[['Ungrafted-1103P']][['X']],
                                                              stats[['Ungrafted-3309C']][['X']],
                                                              stats[['Ungrafted-SO4']][['X']])
                
                # Ungrafted-1103P x Ungrafted-SO4 , NOT Ungrafted-3309C
                overlaps[[meta]][['1103P-SO4']] <- list()
                overlaps[[meta]][['1103P-SO4']][['genes']] <- overlap_genes_two(stats[['Ungrafted-1103P']][['X']],
                                                            stats[['Ungrafted-SO4']][['X']],
                                                            stats[['Ungrafted-3309C']][['X']])
                
                # Ungrafted-3309C x Ungrafted-SO4 , NOT Ungrafted-1103P
                overlaps[[meta]][['3309C-SO4']] <- list()
                overlaps[[meta]][['3309C-SO4']][['genes']] <- overlap_genes_two(stats[['Ungrafted-3309C']][['X']],
                                                            stats[['Ungrafted-SO4']][['X']],
                                                            stats[['Ungrafted-1103P']][['X']])
                
                # Private to Ungrafted-1103P
                overlaps[[meta]][['1103P']] <- list()
                overlaps[[meta]][['1103P']][['genes']] <- private_of_three(stats[['Ungrafted-1103P']][['X']],
                                                       stats[['Ungrafted-3309C']][['X']],
                                                       stats[['Ungrafted-SO4']][['X']])
                
                # Private to Ungrafted-3309C
                overlaps[[meta]][['3309C']] <- list()
                overlaps[[meta]][['3309C']][['genes']] <- private_of_three(stats[['Ungrafted-3309C']][['X']],
                                                       stats[['Ungrafted-1103P']][['X']],
                                                       stats[['Ungrafted-SO4']][['X']])
                
                # Private to Ungrafted-SO4
                overlaps[[meta]][['SO4']] <- list()
                overlaps[[meta]][['SO4']][['genes']] <- private_of_three(stats[['Ungrafted-SO4']][['X']],
                                                     stats[['Ungrafted-1103P']][['X']],
                                                     stats[['Ungrafted-3309C']][['X']])
            }
        }
    }
}

In [6]:
df <- NULL
for (set in names(overlaps)){
    for (rsset in names(overlaps[[set]])) {
        df <- rbind(df, data.frame(
            "category" = set,
            "overlapping_rootstocks" = rsset,
            "number_genes" = length(overlaps[[set]][[rsset]][['gemes']])
        ))
    }
}

In [7]:
# Use gprofiler2 to perform gene ontology term enrichment - ZNH edits
gochart <- read.csv("gotable.csv")
full_go_enrichment_test <- function(set_of_genes, go_chart = gochart, print_res=FALSE){
    go_testset <- go_chart[go_chart$genename %in% set_of_genes, ]
    terms <- unique(go_testset$go_term)
    if(length(terms) != 0){
        print(paste("TERMS:", length(terms)))
        enriched_terms <- c()
        for (term in terms){ 
            genes_to_term_in_set <- nrow(go_testset[go_testset$go_term == term,])
            genes_to_term_in_genome <- nrow(go_chart[go_chart$go_term == term,])
            genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
            size_of_set <- length(set_of_genes)
            test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                           genes_not_term_in_genome, size_of_set, lower.tail=FALSE)


            enriched_terms$term <- c(enriched_terms$term, term)
            enriched_terms$pval <- c(enriched_terms$pval, test)
            enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
            enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
            enriched_terms$genes_not_term_in_genome <- c(enriched_terms$genes_not_term_in_genome, genes_not_term_in_genome)
            enriched_terms$size_of_set <- c(enriched_terms$size_of_set, size_of_set)
        }

        enriched_terms <- as.data.frame(enriched_terms)
        if(nrow(enriched_terms) != 0){
            enriched_terms <- enriched_terms %>% filter( genes_to_term_in_genome >= 10)
            enriched_terms$p.adj <- p.adjust(enriched_terms$pval, method='fdr')
            enriched_terms <- enriched_terms %>% filter(p.adj < 0.05)
            return(enriched_terms)
        }
    } 
}

In [8]:
for (set in names(overlaps)){
    for (rsset in names(overlaps[[set]])) {
        if ((length(overlaps[[set]][[rsset]][['genes']]) > 0)) {
            # run GO
            res <- full_go_enrichment_test(overlaps[[set]][[rsset]][['genes']])
            fname <- paste(set,rsset,sep="_")
            if(length(res) > 0) {
                overlaps[[set]][[rsset]][['GOresults']] <- res
                print(fname)
            }
            else{overlaps[[set]][[rsset]][['GOresults']] <- NULL}
        }
        else{overlaps[[set]][[rsset]][['GOresults']] <- NULL}
        print("================")
    }
}

[1] "TERMS: 30"
[1] "2018_Anthesis_Leaf_Up_1103P-3309C-SO4"
[1] "TERMS: 14"
[1] "2018_Anthesis_Leaf_Up_1103P-3309C"
[1] "TERMS: 3"
[1] "2018_Anthesis_Leaf_Up_1103P-SO4"
[1] "TERMS: 26"
[1] "2018_Anthesis_Leaf_Up_3309C-SO4"
[1] "TERMS: 41"
[1] "2018_Anthesis_Leaf_Up_1103P"
[1] "TERMS: 164"
[1] "2018_Anthesis_Leaf_Up_3309C"
[1] "TERMS: 150"
[1] "2018_Anthesis_Leaf_Up_SO4"
[1] "TERMS: 59"
[1] "2018_Anthesis_Leaf_Down_3309C-SO4"
[1] "TERMS: 7"
[1] "2018_Anthesis_Leaf_Down_1103P"
[1] "TERMS: 44"
[1] "2018_Anthesis_Leaf_Down_3309C"
[1] "TERMS: 222"
[1] "2018_Anthesis_Leaf_Down_SO4"
[1] "TERMS: 153"
[1] "2018_Veraison_Leaf_Up_1103P-3309C-SO4"
[1] "TERMS: 32"
[1] "2018_Veraison_Leaf_Up_1103P-3309C"
[1] "TERMS: 492"
[1] "2018_Veraison_Leaf_Up_1103P-SO4"
[1] "TERMS: 73"
[1] "2018_Veraison_Leaf_Up_3309C-SO4"
[1] "TERMS: 548"
[1] "2018_Veraison_Leaf_Up_1103P"
[1] "TERMS: 79"
[1] "2018_Veraison_Leaf_Up_3309C"
[1] "TERMS: 158"
[1] "2018_Veraison_Leaf_Up_SO4"
[1] "TERMS: 111"
[1] "2018_Veraison_Leaf_

In [9]:
overlaps[[set]][[rsset]][['GOresults']]

term,pval,genes_to_term_in_set,genes_to_term_in_genome,genes_not_term_in_genome,size_of_set,p.adj
GO:0005385,0.0005691829,2,12,24757,352,0.02219813
GO:0071577,0.0005691829,2,12,24757,352,0.02219813
GO:0006415,0.0003170966,2,10,24759,352,0.02219813


In [10]:
df2 <- NULL
for (set in names(overlaps)){
    for (rsset in names(overlaps[[set]])) {
        if (!is.null(overlaps[[set]][[rsset]][['GOresults']])) {
            
        }
    }
}

In [11]:
df2 <- NULL
for (set in names(overlaps)){
    for (rsset in names(overlaps[[set]])) {
        if (!is.null(overlaps[[set]][[rsset]][['GOresults']])) {
            df2 <- rbind(df2, data.frame(
                
                "name_full" = rep(paste(set, rsset, sep="_"), dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "year" = rep(str_split(set, "_")[[1]][1], dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "phenology" = rep(str_split(set, "_")[[1]][2], dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "tissue" = rep(str_split(set, "_")[[1]][3], dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "direction" = rep(str_split(set, "_")[[1]][4], dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "overlapping.rootstocks" = rep(paste(rsset), dim(overlaps[[set]][[rsset]][['GOresults']])[1]),
                "term" = overlaps[[set]][[rsset]][['GOresults']][['term']],
                "pval" = overlaps[[set]][[rsset]][['GOresults']][['pval']],
                "genes_to_term_in_set" = overlaps[[set]][[rsset]][['GOresults']][['genes_to_term_in_set']],
                "genes_to_term_in_genome" = overlaps[[set]][[rsset]][['GOresults']][['genes_to_term_in_genome']],
                "genes_not_term_in_genome" = overlaps[[set]][[rsset]][['GOresults']][['genes_not_term_in_genome']],
                "size_of_set" = overlaps[[set]][[rsset]][['GOresults']][['size_of_set']]
            ))
            
            tmp <- overlaps[[set]][[rsset]][['GOresults']] %>% select(term, pval)
            finalname <- paste(results_prefix, paste("REVIGO", set, rsset, sep="_"), ".csv", sep="")
            write.table(tmp, "tmp.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
            filedata <-readChar("tmp.txt",file.info("tmp.txt")$size)

            httr::POST(
              url = "http://revigo.irb.hr/Revigo.aspx",
              body = list(
                cutoff = "0.5",
                valueType = "pvalue",
                speciesTaxon = "0",
                measure = "SIMREL",
                goList = filedata
              ),
              # application/x-www-form-urlencoded
              encode = "form"
            ) -> res

            dat <- httr::content(res, encoding = "UTF-8")

            # Write results to a file
            dat <- stri_replace_all_fixed(dat, "\r", "")
            cat(dat, file='tmpresults.html', fill = FALSE)
            resultsrevigo <- readHTMLTable('tmpresults.html')

            fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Term.ID", "Name", "Value", "Category"))
            if(!is.null(resultsrevigo$BiologicalProcess)){
                resultsrevigo$BiologicalProcess <- resultsrevigo$BiologicalProcess %>% 
                                                    filter(Eliminated=='False') %>% 
                                                    select(`Term ID`, Name, Value)
                resultsrevigo$BiologicalProcess['Category'] <- rep('BiologicalProcess', dim(resultsrevigo$BiologicalProcess)[1])
                fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$BiologicalProcess)
            }
            if(!is.null(resultsrevigo$CellularComponent)){
                resultsrevigo$CellularComponent <- resultsrevigo$CellularComponent %>% 
                                                    filter(Eliminated=='False') %>% 
                                                    select(`Term ID`, Name, Value)
                resultsrevigo$CellularComponent['Category'] <- rep('CellularComponent', dim(resultsrevigo$CellularComponent)[1])
                fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$CellularComponent)
            }
            if(!is.null(resultsrevigo$MolecularFunction)){
                resultsrevigo$MolecularFunction <- resultsrevigo$MolecularFunction %>% 
                                                    filter(Eliminated=='False') %>% 
                                                    select(`Term ID`, Name, Value)
                resultsrevigo$MolecularFunction['Category'] <- rep('MolecularFunction', dim(resultsrevigo$MolecularFunction)[1])
                fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$MolecularFunction)
            }

            write.csv(data.frame(fullresultsrevigo), finalname, row.names=FALSE)
        }
    }
}

"argument is not an atomic vector; coercing"

In [12]:
write.csv(df, paste(results_prefix, "GO_overlaps_stats.csv", sep=""))
write.csv(df2, paste(results_prefix, "GO_overlaps_go_term_list.csv", sep=""))