In [1]:
library(tidyverse)
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'hash' was built under R version 3.6.3"hash-3.0.1 provided by Decision Patterns

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map

Loading required package: grid
Loading required package: futile.logger


In [2]:
# Directory prefix under which the REVIGO tables and summary CSVs are written.
results_prefix <- "/data/projects/julia.pratt/CS1_genomeSelection/scripts/revigo/contrasts/COMPLETE/"

# Significance threshold handed to the GO enrichment step (go()).
alpha <- 4.6e-4

# Value forwarded as `sources` to the enrichment call; NULL leaves the choice
# to gost()'s own default.
sourcepreferences <- NULL

In [3]:
# Scope of the analysis. Only 2018 is processed here; to cover more data, add
# entries to the matching vector (years, phenological stages, tissues, ...).
prefix <- "/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/"
tissues <- "Leaf"
years <- "2018"
phenos <- c("Anthesis", "Veraison", "Harvest")
rootstocks <- c("Ungrafted", "1103P", "3309C", "SO4")

# Accumulators filled by the GO pipeline loop, keyed by
# "<tissue>_<year>_<stage>_<contrast>_<Up|Down>":
#   stats   - enrichment result tables
#   dats    - the gene tables behind each result
#   skipped - names of result sets that produced no enriched terms
skipped <- list()
dats <- list()
stats <- list()

In [4]:
# Alias lookup used to translate gene identifiers (v3 names -> VIT_ names).
# The TSV has no header row; its two columns are labelled below and loaded
# into a hash keyed by the first column.
# NOTE(review): convert() looks keys up by the gene id taken from the
# expression tables and searches the *values* for "VIT_" aliases, so the two
# column labels look swapped relative to their content - confirm before
# relying on the names.
aliasmap <- data.frame(read_tsv("v3_mapNamesToAlias.tsv", col_names = FALSE))
names(aliasmap) <- c("12Xv1", "VCOST.v3")
aliashash <- hash(
  keys = aliasmap[["12Xv1"]],
  values = aliasmap[["VCOST.v3"]]
)

# Translate one gene identifier into its "VIT_" alias(es) via `aliashash`.
#
# Arguments:
#   i  a single gene identifier (anything as.character() can handle).
#
# Returns:
#   A character vector holding every comma-separated alias of `i` that
#   contains "VIT_". The result is character(0) when the identifier is
#   missing/NA/empty, when the lookup yields nothing for it, or when none of
#   its aliases contains "VIT_". An identifier can map to several aliases.
convert <- function(i){
  vitvi <- as.character(i)

  # Nothing sensible to look up: treat as "no alias" instead of failing.
  if (length(vitvi) == 0L || anyNA(vitvi) || !all(nzchar(vitvi))) {
    return(character(0))
  }

  aliases <- aliashash[[vitvi]]

  # A lookup that comes back NULL would make strsplit() stop with
  # "non-character argument"; report it as "no alias" instead.
  if (is.null(aliases)) {
    return(character(0))
  }

  vitvi.alias <- unlist(strsplit(as.character(aliases), ',', fixed = TRUE))

  # An NA alias string splits to NA, which grepl() reports as FALSE, so the
  # result is empty in that case as well.
  vitvi.alias[grepl('VIT_', vitvi.alias, fixed = TRUE)]
}

Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character()
)
"15 parsing failures.
 row col  expected    actual                     file
9454  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9459  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9465  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9466  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
9468  -- 2 columns 1 columns 'v3_mapNamesToAlias.tsv'
.... ... ......... ......... ........................
See problems(...) for more details.
"

In [5]:
# Prepare one differential-expression table for the gene ontology step.
#
# Arguments:
#   data               data frame with columns `padj`, `log2FoldChange` and
#                      `X` (the gene identifiers passed to convert()).
#   alpha              adjusted p-value cutoff; only rows with padj < alpha
#                      are kept (rows with NA padj are dropped by filter()).
#   log2FoldChangeVal  fold-change cutoff, used as a magnitude: genes are
#                      "up" above +|cutoff| and "down" below -|cutoff|.
#                      With the cutoff 0 this is simply positive vs negative.
#
# Returns:
#   A list with
#     up, down      converted gene names for each direction,
#     upv3, downv3  the filtered rows of `data` for each direction.
#   `up` / `down` are left out of the list when their direction has no rows
#   (assigning NULL drops the element), so read them with [["up"]] /
#   [["down"]]: `$up` would then partially match `upv3`.
prep <- function(data, alpha, log2FoldChangeVal) {
    data.clean <- list()
    lfc.cutoff <- abs(log2FoldChangeVal)

    # only keeps significant values
    data.sig <- data %>% filter(padj < alpha)

    # separates upregulated/downregulated; the down cutoff is the mirror
    # image of the up cutoff
    data.up.v3 <- data.sig %>% filter(log2FoldChange > lfc.cutoff)
    data.down.v3 <- data.sig %>% filter(log2FoldChange < -lfc.cutoff)

    # converts names from v3 to v1 for use with gprofiler2 (go)
    data.up.v1 <- unlist(lapply(X = data.up.v3$X, FUN = convert))
    data.down.v1 <- unlist(lapply(X = data.down.v3$X, FUN = convert))

    # prints the number of v3 and v1 names
    # make sure that these numbers are not hugely different (generally, they
    # are very similar): not all v3 names have a v1 name, and some have more
    # than one.
    print(paste('upv3: ', length(data.up.v3$X)))
    print(paste('upv1: ', length(data.up.v1)))
    print(paste('downv3: ', length(data.down.v3$X)))
    print(paste('downv1: ', length(data.down.v1)))

    # store and return the cleaned data
    data.clean$up <- data.up.v1
    data.clean$down <- data.down.v1
    data.clean$upv3 <- data.up.v3
    data.clean$downv3 <- data.down.v3

    return(data.clean)
}

In [6]:
# Use gprofiler2 to perform gene ontology term enrichment, separately for the
# up-regulated and the down-regulated gene lists.
#
# Arguments:
#   prepped.data        list with gene-name vectors in elements `up` and
#                       `down` (as produced by prep()).
#   alpha               significance threshold passed to gost() as
#                       `user_threshold`.
#   source_preferences  value passed to gost() as `sources`.
#
# Returns:
#   A list that holds gost()'s return value under `up` and/or `down`. A
#   direction is absent from the list when it has no genes or when gost()
#   returned NULL for it.
go <- function(prepped.data, alpha=0.05, source_preferences=NULL) {
    # One enrichment query; both directions use identical settings.
    enrich <- function(genes) {
        # No genes, nothing to enrich: skip the web request so gost() is
        # never handed an empty query.
        if (length(genes) == 0) {
            return(NULL)
        }
        gost(query = genes,
             organism = "vvinifera", ordered_query = FALSE,
             multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
             measure_underrepresentation = FALSE, evcodes = FALSE,
             user_threshold = alpha, correction_method = "fdr",
             domain_scope = "annotated", custom_bg = NULL,
             numeric_ns = "", sources = source_preferences,
             as_short_link = FALSE)
    }

    d <- list()

    # `[[` matches names exactly. With `$`, a list that has no `up` element
    # but does have `upv3` would hand back `upv3` through partial matching,
    # and the wrong object would be sent as the query.
    d$up <- enrich(prepped.data[["up"]])
    d$down <- enrich(prepped.data[["down"]])

    return(d)
}

In [7]:
# Gene ontology pipeline: for every tissue / year / phenological stage and
# every unordered pair of rootstocks, read the contrast table
# "<prefix><tissue>_<year>_<stage>_<rootstockA>-<rootstockB>.csv", split it
# into up/down gene lists with prep(), and run GO enrichment with go().
#
# Results are stored under "<file name>_Up" / "<file name>_Down":
#   stats[[...]]  enrichment result table
#   dats[[...]]   the gene rows behind that result
#   skipped       names of result sets without any enriched term
for (t in tissues) {
    for (y in years) {
        for (p in phenos) {
            for (i in seq_along(rootstocks)) {
                for (j in seq_along(rootstocks)) {
                    # visit each pair once, earlier-listed rootstock first
                    if (i >= j) {
                        next
                    }

                    rs <- paste(rootstocks[i], rootstocks[j], sep = "-")
                    filename <- paste(t, y, p, rs, sep = "_")
                    print(filename)
                    fullpath <- paste0(prefix, filename, ".csv")

                    # read in the data
                    data <- read.csv(fullpath)
                    # clean the data - this alpha filter is 0.05
                    data.clean <- prep(data, 0.05, 0)

                    # `[[` on purpose: `$up` would partially match `upv3`
                    # when prep() left the `up` element out.
                    no.genes <- length(data.clean[["up"]]) == 0 &&
                        length(data.clean[["down"]]) == 0

                    if (no.genes) {
                        # nothing to enrich in either direction: record both
                        # result sets as skipped and move on
                        for (direction in c("Up", "Down")) {
                            fname <- paste(filename, direction, sep = "_")
                            skipped <- append(skipped, fname)
                        }
                        print('-------------------')
                        next
                    }

                    # run gene ontology - terms are kept if significant to
                    # the global `alpha` (4.6e-4)
                    res <- go(data.clean, alpha, sourcepreferences)

                    for (direction in c("Up", "Down")) {
                        key <- tolower(direction)
                        fname <- paste(filename, direction, sep = "_")
                        enrichment <- res[[key]]$result

                        if (length(enrichment$source) > 0) {
                            # save info - GO results, gene lists
                            stats[[fname]] <- enrichment
                            dats[[fname]] <- data.clean[[paste0(key, "v3")]]
                            print(fname)
                        } else {
                            # keep track of which result sets are skipped
                            # to prevent future errors
                            skipped <- append(skipped, fname)
                        }
                    }

                    print('-------------------')
                }
            }
        }
    }
}

[1] "Leaf_2018_Anthesis_Ungrafted-1103P"
[1] "upv3:  116"
[1] "upv1:  109"
[1] "downv3:  13"
[1] "downv1:  12"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "Leaf_2018_Anthesis_Ungrafted-1103P_Up"
[1] "-------------------"
[1] "Leaf_2018_Anthesis_Ungrafted-3309C"
[1] "upv3:  497"
[1] "upv1:  476"
[1] "downv3:  158"
[1] "downv1:  178"
[1] "Leaf_2018_Anthesis_Ungrafted-3309C_Up"
[1] "Leaf_2018_Anthesis_Ungrafted-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Anthesis_Ungrafted-SO4"
[1] "upv3:  315"
[1] "upv1:  312"
[1] "downv3:  687"
[1] "downv1:  730"
[1] "Leaf_2018_Anthesis_Ungrafted-SO4_Up"
[1] "Leaf_2018_Anthesis_Ungrafted-SO4_Down"
[1] "-------------------"
[1] "Leaf_2018_Anthesis_1103P-3309C"
[1] "upv3:  408"
[1] "upv1:  381"
[1] "downv3:  130"
[1] "downv1:  143"
[1] "Leaf_2018_Anthesis_1103P-3309C_Up"
[1] "Leaf_2018_Anthesis_1103P-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Anthesis_1103P-SO4"
[1] "upv3:  28"
[1] "upv1:  26"
[1] "downv3:  394"
[1] "downv1:  394"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "Leaf_2018_Anthesis_1103P-SO4_Down"
[1] "-------------------"
[1] "Leaf_2018_Anthesis_3309C-SO4"
[1] "upv3:  5"
[1] "upv1:  5"
[1] "downv3:  330"
[1] "downv1:  304"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "Leaf_2018_Anthesis_3309C-SO4_Down"
[1] "-------------------"
[1] "Leaf_2018_Veraison_Ungrafted-1103P"
[1] "upv3:  3586"
[1] "upv1:  3522"
[1] "downv3:  2376"
[1] "downv1:  2377"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "Leaf_2018_Veraison_Ungrafted-1103P_Up"
[1] "-------------------"
[1] "Leaf_2018_Veraison_Ungrafted-3309C"
[1] "upv3:  522"
[1] "upv1:  521"
[1] "downv3:  391"
[1] "downv1:  404"
[1] "Leaf_2018_Veraison_Ungrafted-3309C_Up"
[1] "Leaf_2018_Veraison_Ungrafted-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Veraison_Ungrafted-SO4"
[1] "upv3:  2342"
[1] "upv1:  2303"
[1] "downv3:  1630"
[1] "downv1:  1675"
[1] "Leaf_2018_Veraison_Ungrafted-SO4_Up"
[1] "Leaf_2018_Veraison_Ungrafted-SO4_Down"
[1] "-------------------"
[1] "Leaf_2018_Veraison_1103P-3309C"
[1] "upv3:  1492"
[1] "upv1:  1483"
[1] "downv3:  2404"
[1] "downv1:  2394"
[1] "Leaf_2018_Veraison_1103P-3309C_Up"
[1] "Leaf_2018_Veraison_1103P-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Veraison_1103P-SO4"
[1] "upv3:  51"
[1] "upv1:  54"
[1] "downv3:  55"
[1] "downv1:  55"


No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "-------------------"
[1] "Leaf_2018_Veraison_3309C-SO4"
[1] "upv3:  400"
[1] "upv1:  399"
[1] "downv3:  337"
[1] "downv1:  353"


No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "Leaf_2018_Veraison_3309C-SO4_Up"
[1] "-------------------"
[1] "Leaf_2018_Harvest_Ungrafted-1103P"
[1] "upv3:  4415"
[1] "upv1:  4327"
[1] "downv3:  2520"
[1] "downv1:  2504"
[1] "Leaf_2018_Harvest_Ungrafted-1103P_Up"
[1] "Leaf_2018_Harvest_Ungrafted-1103P_Down"
[1] "-------------------"
[1] "Leaf_2018_Harvest_Ungrafted-3309C"
[1] "upv3:  3043"
[1] "upv1:  2993"
[1] "downv3:  1265"
[1] "downv1:  1274"
[1] "Leaf_2018_Harvest_Ungrafted-3309C_Up"
[1] "Leaf_2018_Harvest_Ungrafted-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Harvest_Ungrafted-SO4"
[1] "upv3:  4098"
[1] "upv1:  4042"
[1] "downv3:  1810"
[1] "downv1:  1807"
[1] "Leaf_2018_Harvest_Ungrafted-SO4_Up"
[1] "Leaf_2018_Harvest_Ungrafted-SO4_Down"
[1] "-------------------"
[1] "Leaf_2018_Harvest_1103P-3309C"
[1] "upv3:  32"
[1] "upv1:  30"
[1] "downv3:  52"
[1] "downv1:  51"
[1] "Leaf_2018_Harvest_1103P-3309C_Up"
[1] "Leaf_2018_Harvest_1103P-3309C_Down"
[1] "-------------------"
[1] "Leaf_2018_Harvest_1103P-SO4"
[1] "upv

No results to show
Please make sure that the organism is correct or set significant = FALSE
No results to show
Please make sure that the organism is correct or set significant = FALSE


[1] "-------------------"
[1] "Leaf_2018_Harvest_3309C-SO4"
[1] "upv3:  205"
[1] "upv1:  202"
[1] "downv3:  21"
[1] "downv1:  21"
[1] "Leaf_2018_Harvest_3309C-SO4_Up"
[1] "Leaf_2018_Harvest_3309C-SO4_Down"
[1] "-------------------"


In [8]:
# Summary table with one row per stored enrichment result:
#   name      key of the result set in `stats` / `dats`
#   numTerms  number of rows in the enrichment table
#   numGenes  number of rows in the matching gene table
#   alpha     the GO significance threshold in effect
# Columns are built as whole vectors so the counts and alpha stay numeric
# (combining them with the name through c() would turn them into strings).
stat.names <- as.character(names(stats))
df <- data.frame(
    name = stat.names,
    numTerms = vapply(stat.names, function(nm) nrow(stats[[nm]]),
                      integer(1), USE.NAMES = FALSE),
    numGenes = vapply(stat.names, function(nm) nrow(dats[[nm]]),
                      integer(1), USE.NAMES = FALSE),
    alpha = rep(alpha, length(stat.names)),
    stringsAsFactors = FALSE
)

In [9]:
# For every stored enrichment result:
#   1. add its terms to the long-format table `df2` (one row per GO term,
#      annotated with the pieces of the result-set name), and
#   2. submit its (term id, p-value) list to the REVIGO web form, keep the
#      terms REVIGO did not eliminate, and write them to
#      "<results_prefix>REVIGO_<name>.csv".
#
# `df2` ends up NULL when `stats` is empty. Its GO columns carry the names of
# the enrichment table (source, term_id, term_name, p_value).
revigo.namespaces <- c("BiologicalProcess", "CellularComponent", "MolecularFunction")
df2.parts <- list()

for (name in names(stats)) {
    enrichment <- stats[[name]]

    # name layout: tissue_year_phenology_contrast_direction
    parts <- str_split(name, "_")[[1]]

    df2.parts[[name]] <- data.frame(
        "name_full" = name,
        "tissue" = parts[1],
        "year" = parts[2],
        "phenology" = parts[3],
        "direction" = parts[5],
        "contrast" = parts[4],
        enrichment %>% select('source', 'term_id', 'term_name', 'p_value')
    )

    # REVIGO input: one "term_id p_value" line per term. A scratch file in
    # the session temp dir is used so nothing is left in the working dir.
    tmp <- enrichment %>% select(term_id, p_value)
    finalname <- paste0(results_prefix, "REVIGO_", name, ".csv")
    golist.path <- tempfile(fileext = ".txt")
    write.table(tmp, golist.path, quote = FALSE, row.names = FALSE, col.names = FALSE)
    filedata <- readChar(golist.path, file.info(golist.path)$size)
    unlink(golist.path)

    res <- httr::POST(
        url = "http://revigo.irb.hr/Revigo.aspx",
        body = list(
            cutoff = "0.5",
            valueType = "pvalue",
            speciesTaxon = "0",
            measure = "SIMREL",
            goList = filedata
        ),
        # application/x-www-form-urlencoded
        encode = "form"
    )
    # Surface HTTP failures without aborting the remaining result sets.
    httr::warn_for_status(res)

    # Ask for the body as text: the default parsed document is not an atomic
    # vector and would have to be coerced by the string replacement below.
    dat <- httr::content(res, as = "text", encoding = "UTF-8")

    # Write the page to a scratch file and parse its tables
    dat <- stri_replace_all_fixed(dat, "\r", "")
    html.path <- tempfile(fileext = ".html")
    cat(dat, file = html.path, fill = FALSE)
    resultsrevigo <- readHTMLTable(html.path)
    unlink(html.path)

    # Collect the surviving terms of each namespace table that is present.
    fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)),
                                  c("Term.ID", "Name", "Value", "Category"))
    for (ns in revigo.namespaces) {
        ns.table <- resultsrevigo[[ns]]
        if (is.null(ns.table)) {
            next
        }
        ns.table <- ns.table %>%
            filter(Eliminated == 'False') %>%
            select(`Term ID`, Name, Value)
        ns.table['Category'] <- rep(ns, nrow(ns.table))
        fullresultsrevigo <- rbind(fullresultsrevigo, ns.table)
    }

    # data.frame() normalises the header ("Term ID" becomes "Term.ID")
    write.csv(data.frame(fullresultsrevigo), finalname, row.names = FALSE)
}

# Bind once at the end; unname() keeps rbind() from prefixing the row names
# with the result-set names.
df2 <- do.call(rbind, unname(df2.parts))

"argument is not an atomic vector; coercing"

In [10]:
# Save the per-result summary (df) and the long GO term list (df2) next to
# the REVIGO tables.
stats.path <- paste0(results_prefix, "GO_contrasts_stats.csv")
terms.path <- paste0(results_prefix, "GO_contrasts_go_term_list.csv")
write.csv(df, stats.path)
write.csv(df2, terms.path)