In [None]:
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)
library(tidyverse)

In [None]:
# Analysis scope: only the 2018 season is analysed here.
# To cover more years, add them to `years`; adjust the tissues and the
# phenological time points in the same way as necessary/desired.

tissues <- "Leaf"
years <- "2018"
phenos <- c("Anthesis", "Veraison", "Harvest")
rootstocks <- c("Ungrafted", "1103P", "3309C", "SO4")

# `prefix` is prepended to each contrast CSV name when it is read;
# `results_prefix` is prepended to every output file that is written.
prefix <- "/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/"
results_prefix <- "/data/projects/julia.pratt/CS1_genomeSelection/scripts/ownGO/contrasts/"

# Accumulators filled by the pipeline loop further down.
stats <- list()
dats <- list()
skipped <- list()
res <- list()

In [None]:
# Prep the data to enter gene ontology function
#
# Splits a differential-expression table into its significantly up- and
# down-regulated rows.
#
# Args:
#   data: data frame with numeric columns `padj` and `log2FoldChange`.
#   alpha: significance cutoff; only rows with padj < alpha are kept
#     (rows whose padj is NA are dropped).
#   log2FoldChangeVal: fold-change threshold. Up-regulated rows have
#     log2FoldChange > threshold, down-regulated rows have
#     log2FoldChange < -threshold, so a non-zero threshold is applied
#     symmetrically. With a threshold of 0 this is a plain sign split.
#
# Returns:
#   A list with two data frames, `up` and `down`.
prep <- function(data, alpha, log2FoldChangeVal) {
    fold_change <- data$log2FoldChange

    # NA comparisons are treated as "not selected"
    is_sig <- !is.na(data$padj) & data$padj < alpha & !is.na(fold_change)

    up <- data[is_sig & fold_change > log2FoldChangeVal, , drop = FALSE]
    down <- data[is_sig & fold_change < -log2FoldChangeVal, , drop = FALSE]

    # renumber the rows of each subset from 1
    rownames(up) <- NULL
    rownames(down) <- NULL

    list(up = up, down = down)
}

In [None]:
# Load the parsed GO table used for gene ontology term enrichment - ZNH edits.
# The enrichment function reads its `genename` and `go_term` columns.
gochart <- read.csv(file = "gotable.csv")
# Hypergeometric GO-term enrichment test for a set of genes.
#
# Args:
#   set_of_genes: vector of gene identifiers (duplicates are ignored).
#   go_chart: data frame with one row per gene/term annotation and the
#     columns `genename` and `go_term`. Defaults to the global `gochart`.
#   print_res: if TRUE, also print the table of significant terms.
#   genome_size: number of genes in the background; the default is the
#     value that used to be hard-coded here.
#
# Returns:
#   A data frame with one row per significantly enriched term (terms with
#   at least 10 annotations in go_chart and FDR-adjusted p < 0.05) and the
#   columns term, pval, genes_to_term_in_set, genes_to_term_in_genome,
#   genes_not_term_in_genome, size_of_set and p.adj. The frame has zero
#   rows when nothing is enriched or when no gene in the set is annotated.
full_go_enrichment_test <- function(set_of_genes, go_chart = gochart, print_res = FALSE,
                                    genome_size = 24769) {
    set_of_genes <- unique(set_of_genes)
    size_of_set <- length(set_of_genes)

    go_testset <- go_chart[go_chart$genename %in% set_of_genes, , drop = FALSE]
    terms <- unique(as.character(go_testset$go_term))
    print(paste("TERMS:", length(terms)))

    # count annotation rows per term, in the gene set and in the whole table
    set_idx <- match(as.character(go_testset$go_term), terms)
    genome_idx <- match(as.character(go_chart$go_term), terms)
    genes_to_term_in_set <- tabulate(set_idx[!is.na(set_idx)], nbins = length(terms))
    genes_to_term_in_genome <- tabulate(genome_idx[!is.na(genome_idx)], nbins = length(terms))
    genes_not_term_in_genome <- genome_size - genes_to_term_in_genome

    # phyper(q, lower.tail = FALSE) is P(X > q); enrichment needs P(X >= observed),
    # hence the "- 1"
    pval <- phyper(genes_to_term_in_set - 1, genes_to_term_in_genome,
                   genes_not_term_in_genome, size_of_set, lower.tail = FALSE)

    enriched_terms <- data.frame(
        term = terms,
        pval = pval,
        genes_to_term_in_set = genes_to_term_in_set,
        genes_to_term_in_genome = genes_to_term_in_genome,
        genes_not_term_in_genome = genes_not_term_in_genome,
        size_of_set = rep(size_of_set, length(terms)),
        stringsAsFactors = FALSE
    )

    # drop sparsely annotated terms before the multiple-testing correction
    keep <- enriched_terms$genes_to_term_in_genome >= 10
    enriched_terms <- enriched_terms[keep, , drop = FALSE]
    enriched_terms$p.adj <- p.adjust(enriched_terms$pval, method = "fdr")
    enriched_terms <- enriched_terms[which(enriched_terms$p.adj < 0.05), , drop = FALSE]
    rownames(enriched_terms) <- NULL

    if (isTRUE(print_res)) {
        print(enriched_terms)
    }
    print(paste("SIG TERMS:", nrow(enriched_terms)))

    enriched_terms
}

In [None]:
# cycle through all the data
# this is essentially the gene ontology pipeline
#
# For every tissue / year / phenology stage and every unordered pair of
# rootstocks, the contrast CSV is read, split into up- and down-regulated
# genes and tested for GO enrichment. Results with at least one significant
# term are stored in `stats` (enrichment table) and `dats` (gene table)
# under "<file>_Up" / "<file>_Down"; everything else is recorded in `skipped`.

# all rootstock pairs (first, second) with first listed before second
rootstock_pairs <- if (length(rootstocks) >= 2) {
    combn(rootstocks, 2, simplify = FALSE)
} else {
    list()
}

for (t in tissues) {
    for (y in years) {
        for (p in phenos) {
            for (pair in rootstock_pairs) {
                rs <- paste(pair[1], pair[2], sep = "-")
                filename <- paste(t, y, p, rs, sep = "_")
                print(filename)
                fullpath <- paste0(prefix, filename, ".csv")

                # a missing contrast should not abort the whole run
                if (!file.exists(fullpath)) {
                    warning("Contrast file not found, skipping: ", fullpath, call. = FALSE)
                    skipped <- append(skipped, filename)
                    print('-------------------')
                    next
                }

                # read in the data
                data <- read.csv(fullpath)
                # clean the data - this alpha filter is 0.05
                data.clean <- prep(data, 0.05, 0)

                for (direction in c("Up", "Down")) {
                    fname <- paste(filename, direction, sep = "_")
                    key <- tolower(direction)
                    genes <- data.clean[[key]]

                    # make sure there is data for this direction
                    if (nrow(genes) == 0) {
                        skipped <- append(skipped, fname)
                        next
                    }

                    # run gene ontology
                    res[[key]] <- full_go_enrichment_test(genes$X)

                    # make sure there are significant results; nrow() is used
                    # because length() of a data frame counts its columns
                    if (nrow(res[[key]]) > 0) {
                        stats[[fname]] <- res[[key]]
                        dats[[fname]] <- genes
                        print(fname)
                    } else {
                        skipped <- append(skipped, fname)
                    }
                }
                print('-------------------')
            }
        }
    }
}

In [None]:
# Per-contrast summary: one row per entry of `stats` with the number of
# enriched terms and the number of genes that went into the test.
# The counts are kept as integers; combining them with the name through c()
# would turn them into character strings.
df <- data.frame(
    name = as.character(names(stats)),
    numTerms = vapply(stats, function(s) length(s[["term"]]), integer(1), USE.NAMES = FALSE),
    numGenes = vapply(dats[names(stats)], function(d) length(d[["X"]]), integer(1),
                      USE.NAMES = FALSE),
    stringsAsFactors = FALSE
)

In [None]:
# Long table of every enriched term: one row per term per entry of `stats`.
# The entry name is split on "_" and fields 2-5 are stored as
# year, phenology, contrast and direction.
# df2 is NULL when `stats` is empty.
df2 <- do.call(rbind, lapply(names(stats), function(nm) {
    enrichment <- stats[[nm]]
    n_terms <- length(enrichment[["term"]])
    name_fields <- str_split(nm, "_")[[1]]
    data.frame(
        "name_full" = rep(nm, n_terms),
        "year" = rep(name_fields[2], n_terms),
        "phenology" = rep(name_fields[3], n_terms),
        "direction" = rep(name_fields[5], n_terms),
        "contrast" = rep(name_fields[4], n_terms),
        "term" = enrichment[['term']],
        "pval" = enrichment[['pval']],
        "genes_to_term_in_set" = enrichment[['genes_to_term_in_set']],
        "genes_to_term_in_genome" = enrichment[['genes_to_term_in_genome']],
        "genes_not_term_in_genome" = enrichment[['genes_not_term_in_genome']],
        "size_of_set" = enrichment[['size_of_set']]
    )
}))

# Send each term list to REVIGO and write the non-eliminated terms of every
# ontology category to "<results_prefix>REVIGO_<name>.csv".
# NOTE(review): confirm that this endpoint and the table ids parsed below
# still match what the REVIGO service returns.
revigo_categories <- c("BiologicalProcess", "CellularComponent", "MolecularFunction")

for (name in names(stats)) {
    finalname <- paste0(results_prefix, "REVIGO_", name, ".csv")
    fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)),
                                  c("Term.ID", "Name", "Value", "Category"))
    resultsrevigo <- list()

    # nothing to submit when the contrast has no terms; the empty table is still written
    if (length(stats[[name]][["term"]]) > 0) {
        # per-session temp files instead of fixed names in the working directory
        input_file <- tempfile(fileext = ".txt")
        html_file <- tempfile(fileext = ".html")

        tmp <- data.frame(stats[[name]]) %>% select(term, pval)
        write.table(tmp, input_file, quote = FALSE, row.names = FALSE, col.names = FALSE)
        filedata <- readChar(input_file, file.info(input_file)$size)

        response <- httr::POST(
            url = "http://revigo.irb.hr/Revigo.aspx",
            body = list(
                cutoff = "0.5",
                valueType = "pvalue",
                speciesTaxon = "0",
                measure = "SIMREL",
                goList = filedata
            ),
            # application/x-www-form-urlencoded
            encode = "form"
        )
        # surface HTTP failures instead of silently parsing an error page
        httr::warn_for_status(response)

        dat <- httr::content(response, encoding = "UTF-8")

        # Write results to a file so the HTML tables can be parsed
        dat <- stri_replace_all_fixed(dat, "\r", "")
        cat(dat, file = html_file, fill = FALSE)
        resultsrevigo <- readHTMLTable(html_file)
        unlink(c(input_file, html_file))

        for (category in revigo_categories) {
            category_table <- resultsrevigo[[category]]
            if (is.null(category_table)) {
                next
            }
            category_table <- category_table %>%
                filter(Eliminated == 'False') %>%
                select(`Term ID`, Name, Value)
            category_table['Category'] <- rep(category, nrow(category_table))
            # keep the filtered table in resultsrevigo for later inspection
            resultsrevigo[[category]] <- category_table
            fullresultsrevigo <- rbind(fullresultsrevigo, category_table)
        }
    }

    write.csv(data.frame(fullresultsrevigo), finalname, row.names = FALSE)
}

In [None]:
# Show the REVIGO tables parsed for the last contrast handled by the loop above
resultsrevigo

In [None]:
# Show the combined REVIGO table built for the last contrast handled by the loop above
fullresultsrevigo

In [None]:
# Show the enrichment table of the last contrast; `name` is left over from the loop above
stats[[name]]

In [None]:
# Save the per-contrast summary (df) and the long per-term table (df2)
# under results_prefix.
stats_csv <- paste0(results_prefix, "GO_contrasts_stats.csv")
terms_csv <- paste0(results_prefix, "GO_contrasts_go_term_list.csv")
write.csv(df, file = stats_csv)
write.csv(df2, file = terms_csv)

In [None]:
# # Earlier version of full_go_enrichment_test, kept for reference: the same
# # hypergeometric test with a Bonferroni cutoff (it does not use gprofiler2)
# gochart <- read.csv("gotable.csv")
# full_go_enrichment_test <- function(set_of_genes, go_chart = gochart, print_res=FALSE){
#     go_testset <- go_chart[go_chart$genename %in% set_of_genes, ]
#     terms <- unique(go_testset$go_term)
#     print(paste("TERMS:", length(terms)))
#     enriched_terms <- c()
#     for (term in terms){ 
#         genes_to_term_in_set <- nrow(go_testset[go_testset$go_term == term,])
#         genes_to_term_in_genome <- nrow(go_chart[go_chart$go_term == term,])
#         genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
#         size_of_set <- length(set_of_genes)
#         test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
#                        genes_not_term_in_genome, size_of_set, lower.tail=FALSE)
#         # bonferroni correction
#         if (test < 0.05/length(terms) & genes_to_term_in_genome >= 10){
#             if (print_res == TRUE){
#                 print(c(term, test, genes_to_term_in_set, genes_to_term_in_genome))
#             }
#             enriched_terms$term <- c(enriched_terms$term, term)
#             enriched_terms$pval <- c(enriched_terms$pval, test)
#             enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
#             enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
#             enriched_terms$genes_not_term_in_genome <- c(enriched_terms$genes_not_term_in_genome, genes_not_term_in_genome)
#             enriched_terms$size_of_set <- c(enriched_terms$size_of_set, size_of_set)
#         }
#     }
    
#     return(enriched_terms)
# }