In [1]:
library(tidyverse)
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'hash' was built under R version 3.6.3"hash-3.0.1 provided by Decision Patterns

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map

Loading required package: grid
Loading required package: futile.logger


In [2]:
tissues <- c('Leaf')
years <- c('2018')
phenos <- c('Anthesis', 'Veraison', 'Harvest')
rootstocks <- c('Ungrafted', '1103P', '3309C', 'SO4')
dir <- c('Up', 'Down')

prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/PCs_for_julia/'
results_prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/ownGO/overlaps/'

stats <- list()
dats <- list()
skipped <- list()

In [3]:
# Use GO parsed table to perform gene ontology term enrichment - ZNH edits
gochart <- read.csv("gotable.csv")
full_go_enrichment_test <- function(set_of_genes, go_chart = gochart, print_res=FALSE){
    go_testset <- go_chart[go_chart$genename %in% set_of_genes, ]
    terms <- unique(go_testset$go_term)
    print(paste("TERMS:", length(terms)))
    enriched_terms <- c()
    for (term in terms){ 
        genes_to_term_in_set <- nrow(go_testset[go_testset$go_term == term,])
        genes_to_term_in_genome <- nrow(go_chart[go_chart$go_term == term,])
        genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
        size_of_set <- length(set_of_genes)
        test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                       genes_not_term_in_genome, size_of_set, lower.tail=FALSE)

        
        enriched_terms$term <- c(enriched_terms$term, term)
        enriched_terms$pval <- c(enriched_terms$pval, test)
        enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
        enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
        enriched_terms$genes_not_term_in_genome <- c(enriched_terms$genes_not_term_in_genome, genes_not_term_in_genome)
        enriched_terms$size_of_set <- c(enriched_terms$size_of_set, size_of_set)
    }
    
    enriched_terms <- as.data.frame(enriched_terms)
    enriched_terms <- enriched_terms %>% filter( genes_to_term_in_genome >= 10)
    enriched_terms$p.adj <- p.adjust(enriched_terms$pval, method='fdr')
    enriched_terms <- enriched_terms %>% filter(p.adj < 0.05)
    
    print(paste("SIG TERMS:", nrow(enriched_terms)))
    
    return(enriched_terms)
}

In [4]:
# cycle through all the data
# this is essentially the gene ontology pipeline

for (f in list.files(prefix) ) {
    # read in the data
    fullpath <- paste(prefix, f, sep='')
    data <- read.csv(fullpath)
    fname <- substr(f,1,nchar(f)-4)
    # run gene ontology
    if(length(data) > 0){
        res <- full_go_enrichment_test(data$x)
        if(length(res) > 0){
            # save info
            stats[[fname]] <- res$up
            dats[[fname]] <- data$up
            print(fname)
        }
        else{skipped <- append(skipped, fname)}
    }
    else{skipped <- append(skipped, fname)}
}

[1] "TERMS: 181"
[1] "SIG TERMS: 0"
[1] "PC105_high_loaders"
[1] "TERMS: 157"
[1] "SIG TERMS: 0"
[1] "PC105_low_loaders"
[1] "TERMS: 324"
[1] "SIG TERMS: 17"
[1] "PC10_high_loaders"
[1] "TERMS: 300"
[1] "SIG TERMS: 9"
[1] "PC10_low_loaders"
[1] "TERMS: 314"
[1] "SIG TERMS: 24"
[1] "PC11_high_loaders"
[1] "TERMS: 266"
[1] "SIG TERMS: 0"
[1] "PC11_low_loaders"
[1] "TERMS: 337"
[1] "SIG TERMS: 17"
[1] "PC12_high_loaders"
[1] "TERMS: 246"
[1] "SIG TERMS: 29"
[1] "PC12_low_loaders"
[1] "TERMS: 230"
[1] "SIG TERMS: 0"
[1] "PC13_high_loaders"
[1] "TERMS: 371"
[1] "SIG TERMS: 44"
[1] "PC13_low_loaders"
[1] "TERMS: 188"
[1] "SIG TERMS: 0"
[1] "PC145_high_loaders"
[1] "TERMS: 176"
[1] "SIG TERMS: 0"
[1] "PC145_low_loaders"
[1] "TERMS: 203"
[1] "SIG TERMS: 0"
[1] "PC151_high_loaders"
[1] "TERMS: 182"
[1] "SIG TERMS: 0"
[1] "PC151_low_loaders"
[1] "TERMS: 322"
[1] "SIG TERMS: 26"
[1] "PC15_high_loaders"
[1] "TERMS: 177"
[1] "SIG TERMS: 14"
[1] "PC15_low_loaders"
[1] "TERMS: 288"
[1] "SIG TERMS: 19

In [5]:
df <- setNames(data.frame(matrix(ncol = 3, nrow = 0)), c("name", "numTerms", "numGenes"))
for (name in names(stats)){
    df[nrow(df) + 1,] = c(name, dim(stats[[name]])[1], length(dats[[name]]))
    print(stats[[name]][['term_id']])
    print(stats[[name]][['term_name']])
    print("=======================")
}

In [6]:
df2 <- NULL
for (name in names(stats)){
    df2 <- rbind(df2, data.frame(
        "name" = rep(name, dim(stats[[name]])[1]),
        "term" = stats[[name]][['term']],
        "pval" = stats[[name]][['pval']],
        "genes_to_term_in_set" = stats[[name]][['genes_to_term_in_set']],
        "genes_to_term_in_genome" = stats[[name]][['genes_to_term_in_genome']],
        "genes_not_term_in_genome" = stats[[name]][['genes_not_term_in_genome']],
        "size_of_set" = stats[[name]][['size_of_set']]
    ))
    
    tmp <- stats[[name]] %>% select(term_id, p_value)
    finalname <- paste(results_prefix, paste("REVIGO", name, sep="_"), ".csv", sep="")
    write.table(tmp, "tmp.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
    filedata <-readChar("tmp.txt",file.info("tmp.txt")$size)
    
    httr::POST(
      url = "http://revigo.irb.hr/Revigo.aspx",
      body = list(
        cutoff = "0.5",
        valueType = "pvalue",
        speciesTaxon = "0",
        measure = "SIMREL",
        goList = filedata
      ),
      # application/x-www-form-urlencoded
      encode = "form"
    ) -> res

    dat <- httr::content(res, encoding = "UTF-8")

    # Write results to a file
    dat <- stri_replace_all_fixed(dat, "\r", "")
    cat(dat, file='tmpresults.html', fill = FALSE)
    resultsrevigo <- readHTMLTable('tmpresults.html')
    
    fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Term.ID", "Name", "Value", "Category"))
    if(!is.null(resultsrevigo$BiologicalProcess)){
        resultsrevigo$BiologicalProcess <- resultsrevigo$BiologicalProcess %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$BiologicalProcess['Category'] <- rep('BiologicalProcess', dim(resultsrevigo$BiologicalProcess)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$BiologicalProcess)
    }
    if(!is.null(resultsrevigo$CellularComponent)){
        resultsrevigo$CellularComponent <- resultsrevigo$CellularComponent %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$CellularComponent['Category'] <- rep('CellularComponent', dim(resultsrevigo$CellularComponent)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$CellularComponent)
    }
    if(!is.null(resultsrevigo$MolecularFunction)){
        resultsrevigo$MolecularFunction <- resultsrevigo$MolecularFunction %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$MolecularFunction['Category'] <- rep('MolecularFunction', dim(resultsrevigo$MolecularFunction)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$MolecularFunction)
    }
    
    write.csv(data.frame(fullresultsrevigo), finalname, row.names=FALSE)
}

In [7]:
write.csv(df, paste(results_prefix, "GO_pcs_stats.csv", sep=""))
write.csv(df2, paste(results_prefix, "GO_pcs_go_term_list.csv", sep=""))