In [1]:
library(tidyverse)
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'hash' was built under R version 3.6.3"hash-3.0.1 provided by Decision Patterns

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map

Loading required package: grid
Loading required package: futile.logger


Create helper functions that find the shared and the private entries among three gene lists.

In [2]:
# Entries of df1 that appear in neither df2 nor df3.
# The result is a plain subset of df1, so its type and any repeated entries
# are carried over unchanged.
private_of_three <- function(df1, df2, df3) {
    seen_elsewhere <- (df1 %in% df2) | (df1 %in% df3)
    df1[!seen_elsewhere]
}

# Entries shared by df1 and df2 but absent from the three-way overlap with df3.
# Inputs are compared as character strings; the result is a character vector
# without repeats, ordered as the entries first appear in df1.
overlap_genes_two <- function(df1, df2, df3) {
    shared_by_all <- overlap_genes_three(df1, df2, df3)
    shared_by_pair <- intersect(as.character(df1), as.character(df2))
    setdiff(shared_by_pair, shared_by_all)
}

# Entries present in all three inputs.
# Inputs are compared as character strings; the result is a character vector
# without repeats, ordered as the entries first appear in df3.
overlap_genes_three <- function(df1, df2, df3) {
    in_first_two <- intersect(as.character(df1), as.character(df2))
    intersect(as.character(df3), in_first_two)
}

Make the gene lists for each of the three rootstocks (the ones used in the Venn diagrams).

Store all of the overlapping gene lists.

In [3]:
# Settings for the GO enrichment and its output.
# alpha: passed to gost() as `user_threshold` in the enrichment loop below.
alpha <- 4.6e-4
# results_prefix: prepended to the name of every CSV this notebook writes.
results_prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/revigo/overlaps/GOBP/'
# sourcepreferences: passed to gost() as `sources`.
sourcepreferences <- "GO:BP"

In [4]:
# Experimental grid: every tissue x year x phenology x direction combination is
# processed by the loop below and becomes one "<year>_<pheno>_<tissue>_<direction>"
# category.
tissues <- c('Leaf')
years <- c('2018')
phenos <- c('Anthesis', 'Veraison', 'Harvest')
# The first entry ('Ungrafted') is the common side of every comparison; the
# loop below starts at the second entry and reads one "Ungrafted-<rootstock>" file each.
rootstocks <- c('Ungrafted', '1103P', '3309C', 'SO4')
# Direction labels handed to log2foldchange(). This variable shares its name
# with the base function dir().
dir <- c('Up', 'Down')
# Directory prefix of the input comparison CSVs.
prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/'
# stats: filtered comparison table per "Ungrafted-<rootstock>" key (overwritten
# for each combination). This variable shares its name with the stats package.
stats <- list()
# overlaps: nested list, category -> overlap region -> 'v3_genes' (and later 'GOresults').
overlaps <- list()

In [5]:
# Filter a comparison table by significance and by fold-change direction.
#
# Arguments:
#   data               data frame with `padj` and `log2FoldChange` columns
#   alpha              adjusted p-value cutoff; rows are kept when padj < alpha
#   log2FoldChangeVal  fold-change cutoff
#   d                  'Up'   keeps rows with log2FoldChange >  log2FoldChangeVal
#                      'Down' keeps rows with log2FoldChange < -log2FoldChangeVal
# Returns the rows of `data` that pass both filters.
# Stops with "<d> not a valid direction" for any other value of `d`.
log2foldchange <- function(data, alpha, log2FoldChangeVal, d) {

    # Check the direction before doing any work. Previously a bad value was only
    # printed and the function then failed on the undefined `data.adj`.
    if (length(d) != 1 || is.na(d) || !(d %in% c('Up', 'Down'))) {
        stop(paste(paste(d, collapse = ", "), "not a valid direction"), call. = FALSE)
    }

    # only keeps significant values
    data.sig <- data %>% filter(padj < alpha)

    # separates upregulated/downregulated
    if (d == 'Up') {
        data.adj <- data.sig %>% filter(log2FoldChange > log2FoldChangeVal)
    } else {
        data.adj <- data.sig %>% filter(log2FoldChange < -log2FoldChangeVal)
    }

    data.adj
}

In [6]:
# all data combos
# For every tissue x year x phenology x direction combination:
#   1. read each "Ungrafted-<rootstock>" comparison table and keep the genes
#      that pass log2foldchange() in the current direction;
#   2. split the three resulting gene lists (column `X`) into the seven regions
#      of a three-set Venn diagram and store them under overlaps[[meta]].
for (t in tissues) {
    for (y in years) {
        for (p in phenos) {
            for (direction in dir) {

                meta <- paste(y, p, t, direction, sep = '_')
                overlaps[[meta]] <- list()

                # rootstocks[1] ('Ungrafted') is the common side of every
                # comparison, so only the remaining entries name a file.
                # Dropping the first element (instead of 2:length(rootstocks))
                # gives an empty loop rather than counting backwards when no
                # other rootstock is listed.
                for (rootstock in rootstocks[-1]) {
                    rs <- paste0('Ungrafted-', rootstock)
                    comparison <- read.csv(paste0(prefix, paste(t, y, p, rs, sep = '_'), '.csv'))
                    # Fixed cutoffs: padj < 0.05 and no minimum fold change.
                    # The global `alpha` is only used for the GO enrichment.
                    stats[[rs]] <- log2foldchange(comparison, 0.05, 0, direction)
                }

                genes_1103P <- stats[['Ungrafted-1103P']][['X']]
                genes_3309C <- stats[['Ungrafted-3309C']][['X']]
                genes_SO4 <- stats[['Ungrafted-SO4']][['X']]

                # Region label -> gene list. The order here is the order in
                # which the regions are stored and later iterated.
                regions <- list(
                    # Ungrafted-1103P x Ungrafted-3309C x Ungrafted-SO4
                    '1103P-3309C-SO4' = overlap_genes_three(genes_1103P, genes_3309C, genes_SO4),
                    # Ungrafted-1103P x Ungrafted-3309C , NOT Ungrafted-SO4
                    '1103P-3309C' = overlap_genes_two(genes_1103P, genes_3309C, genes_SO4),
                    # Ungrafted-1103P x Ungrafted-SO4 , NOT Ungrafted-3309C
                    '1103P-SO4' = overlap_genes_two(genes_1103P, genes_SO4, genes_3309C),
                    # Ungrafted-3309C x Ungrafted-SO4 , NOT Ungrafted-1103P
                    '3309C-SO4' = overlap_genes_two(genes_3309C, genes_SO4, genes_1103P),
                    # Private to Ungrafted-1103P
                    '1103P' = private_of_three(genes_1103P, genes_3309C, genes_SO4),
                    # Private to Ungrafted-3309C
                    '3309C' = private_of_three(genes_3309C, genes_1103P, genes_SO4),
                    # Private to Ungrafted-SO4
                    'SO4' = private_of_three(genes_SO4, genes_1103P, genes_3309C)
                )

                for (region in names(regions)) {
                    overlaps[[meta]][[region]] <- list()
                    overlaps[[meta]][[region]][['v3_genes']] <- regions[[region]]
                }
            }
        }
    }
}

In [7]:
# Tabulate how many genes fall into each overlap region of each category.
# Rows are collected in a list and bound once, instead of re-copying the
# growing data frame with rbind() on every iteration.
stat_rows <- list()
for (set in names(overlaps)) {
    for (rsset in names(overlaps[[set]])) {
        stat_rows[[length(stat_rows) + 1]] <- data.frame(
            "category" = set,
            "overlapping_rootstocks" = rsset,
            "number_genes" = length(overlaps[[set]][[rsset]][['v3_genes']])
        )
    }
}
# do.call(rbind, list()) is NULL, so df is NULL when there is nothing to tabulate.
df <- do.call(rbind, stat_rows)

In [8]:
# method to convert vitis (v3) to vit_ (v1)
# Loads a header-less, two-column TSV from the working directory and builds a
# hash from it: column 1 supplies the keys, column 2 the values. convert()
# below looks a gene up by key and splits the value on ','.
# The captured cell output reports 15 rows that contain only one column, so
# those keys have no usable alias value.
aliasmap <- data.frame(read_tsv('v3_mapNamesToAlias.tsv', col_names = FALSE))
# NOTE(review): convert() looks up v3 gene names as keys and searches the values
# for 'VIT_' names, so these two labels look swapped relative to how the
# columns are used. They are internal labels only; confirm before relying on them.
colnames(aliasmap) <- c('12Xv1', 'VCOST.v3')
aliashash <- hash(keys=aliasmap$'12Xv1', values=aliasmap$'VCOST.v3')

# Look up the VIT_ alias(es) of one gene identifier in `aliashash`.
#   i   a single gene identifier; it is converted with as.character() before
#       the lookup
# Returns a character vector holding those comma-separated aliases that
# contain 'VIT_'. Returns character(0) when the identifier is not in the map,
# has no alias value, or has no VIT_ alias.
convert <- function(i){
  vitvi <- as.character(i)
  # An identifier that is not in the map must not abort the whole conversion:
  # the lookup may yield NULL or raise, and strsplit() stops on NULL input.
  aliases <- tryCatch(aliashash[[vitvi]], error = function(e) NULL)
  if (is.null(aliases)) {
    return(character(0))
  }
  vitvi.alias <- unlist(strsplit(as.character(aliases), ','))
  vitvi.alias[grepl('VIT_', vitvi.alias, fixed=TRUE)]
}

Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character()
)
"15 parsing failures.
 row col  expected    actual                     file
9454  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9459  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9465  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9466  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9468  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
.... ... ......... ......... ........................
See problems(...) for more details.
"

In [9]:
# Run a gost() enrichment query for every overlap region and keep the returned
# object under 'GOresults' only when it contains at least one term.
for (set in names(overlaps)) {
    for (rsset in names(overlaps[[set]])) {

        # name conversion, v3 to v1
        data.converted <- unlist(lapply(X = overlaps[[set]][[rsset]][['v3_genes']], FUN = convert))

        # stays NULL unless the query below comes back with at least one term
        go_hit <- NULL

        if (length(data.converted) > 0) {
            # run GO
            results <- gost(query = data.converted,
                        organism = "vvinifera", ordered_query = FALSE,
                        multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
                        measure_underrepresentation = FALSE, evcodes = FALSE,
                        user_threshold = alpha, correction_method = "fdr",
                        domain_scope = "annotated", custom_bg = NULL,
                        numeric_ns = "", sources = sourcepreferences, as_short_link = FALSE)
            fname <- paste(set, rsset, sep = "_")

            if (length(results$result$source) > 0) {
                go_hit <- results
                print(fname)
            }
        }

        # Assigning NULL leaves the region without a 'GOresults' entry (and
        # drops one left over from an earlier run).
        overlaps[[set]][[rsset]][['GOresults']] <- go_hit
    }
}

No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Anthesis_Leaf_Up_3309C-SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Anthesis_Leaf_Up_3309C"
[1] "2018_Anthesis_Leaf_Up_SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Anthesis_Leaf_Down_3309C-SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Anthesis_Leaf_Down_3309C"
[1] "2018_Anthesis_Leaf_Down_SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Veraison_Leaf_Up_1103P-SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Veraison_Leaf_Up_1103P"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Veraison_Leaf_Down_SO4"
[1] "2018_Harvest_Leaf_Up_1103P-3309C-SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Harvest_Leaf_Up_1103P-SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "2018_Harvest_Leaf_Up_SO4"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


In [10]:
# For every overlap region that has GO results:
#   1. append its enriched terms to `df2`;
#   2. submit the term-id / p-value list to REVIGO;
#   3. write the terms REVIGO did not eliminate to
#      <results_prefix>REVIGO_<set>_<rsset>.csv.
df2 <- NULL
# Table names looked up in the parsed REVIGO response, in output order.
revigo_categories <- c('BiologicalProcess', 'CellularComponent', 'MolecularFunction')
for (set in names(overlaps)){
    for (rsset in names(overlaps[[set]])) {
        if (!is.null(overlaps[[set]][[rsset]][['GOresults']])) {
            go_table <- overlaps[[set]][[rsset]][['GOresults']][['result']]
            # set is "<year>_<phenology>_<tissue>_<direction>"
            set_parts <- str_split(set, "_")[[1]]

            # NOTE(review): the last four arguments are one-column data frames,
            # not vectors. Check whether the output columns end up with the
            # GO.* labels or with the original gost names. The construction is
            # kept as is so the written CSV does not change.
            df2 <- rbind(df2, data.frame(
                "name_full" = paste(set, rsset, sep="_"),
                "year" = set_parts[1],
                "phenology" = set_parts[2],
                "tissue" = set_parts[3],
                "direction" = set_parts[4],
                "overlapping.rootstocks" = paste(rsset),
                "GO.source" = go_table %>% select('source'),
                "GO.term.id" = go_table %>% select('term_id'),
                "GO.term.name" = go_table %>% select('term_name'),
                "p.value" = go_table %>% select('p_value'))
            )

            tmp <- go_table %>% select(term_id, p_value)
            finalname <- paste0(results_prefix, paste("REVIGO", set, rsset, sep="_"), ".csv")

            # Serialise the query through a scratch file in the session temp
            # directory, so nothing is left behind in the working directory.
            query_file <- tempfile(fileext = ".txt")
            write.table(tmp, query_file, quote=FALSE, row.names = FALSE, col.names = FALSE)
            filedata <- readChar(query_file, file.info(query_file)$size)
            unlink(query_file)

            res <- httr::POST(
              url = "http://revigo.irb.hr/Revigo.aspx",
              body = list(
                cutoff = "0.5",
                valueType = "pvalue",
                speciesTaxon = "0",
                measure = "SIMREL",
                goList = filedata
              ),
              # application/x-www-form-urlencoded
              encode = "form"
            )
            # Report a failed request but keep going with the other regions.
            httr::warn_for_status(res)

            # Ask for the body as text. The default returns a parsed document,
            # which the string replacement below then has to coerce (the
            # "argument is not an atomic vector; coercing" warning).
            dat <- httr::content(res, as = "text", encoding = "UTF-8")

            # Write results to a file
            dat <- stri_replace_all_fixed(dat, "\r", "")
            html_file <- tempfile(fileext = ".html")
            cat(dat, file = html_file, fill = FALSE)
            resultsrevigo <- readHTMLTable(html_file)
            unlink(html_file)

            # Empty template: this is what gets written when no table is found.
            fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Term.ID", "Name", "Value", "Category"))
            for (category in revigo_categories) {
                category_table <- resultsrevigo[[category]]
                if (is.null(category_table)) {
                    next
                }
                category_table <- category_table %>%
                                    filter(Eliminated=='False') %>%
                                    select(`Term ID`, Name, Value)
                # rep() keeps the assignment valid when no row survives the filter
                category_table['Category'] <- rep(category, dim(category_table)[1])
                fullresultsrevigo <- rbind(fullresultsrevigo, category_table)
            }

            write.csv(data.frame(fullresultsrevigo), finalname, row.names=FALSE)
        }
    }
}

"argument is not an atomic vector; coercing"

In [12]:
# Save the per-region gene counts next to the other results.
write.csv(df, file = paste0(results_prefix, "GO_overlaps_stats.csv"))

In [13]:
# Save the combined list of enriched GO terms next to the other results.
write.csv(df2, file = paste0(results_prefix, "GO_overlaps_go_term_list.csv"))