In [1]:
library(tidyverse)
library(hash)
library(gprofiler2)
library(viridis)
library(VennDiagram)
library(XML)
library(httr)
library(stringi)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
"package 'hash' was built under R version 3.6.3"hash-3.0.1 provided by Decision Patterns

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map

Loading required package: grid
Loading required package: futile.logger


In [2]:
stats <- list()
dats <- list()
skipped <- list()

fullpath <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/1719_geneExpression_SOM_clusters_9x9.csv'
results_prefix <- '/data/projects/julia.pratt/CS1_genomeSelection/scripts/revigo/SOMs/GOBP/'

data <- read.csv(fullpath)

In [3]:
# Use GO parsed table to perform gene ontology term enrichment - ZNH edits
gochart <- read.csv("gotable.csv")
full_go_enrichment_test <- function(set_of_genes, go_chart = gochart, print_res=FALSE){
    go_testset <- go_chart[go_chart$genename %in% set_of_genes, ]
    terms <- unique(go_testset$go_term)
    print(paste("TERMS:", length(terms)))
    enriched_terms <- c()
    for (term in terms){ 
        genes_to_term_in_set <- nrow(go_testset[go_testset$go_term == term,])
        genes_to_term_in_genome <- nrow(go_chart[go_chart$go_term == term,])
        genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
        size_of_set <- length(set_of_genes)
        test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                       genes_not_term_in_genome, size_of_set, lower.tail=FALSE)

        
        enriched_terms$term <- c(enriched_terms$term, term)
        enriched_terms$pval <- c(enriched_terms$pval, test)
        enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
        enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
        enriched_terms$genes_not_term_in_genome <- c(enriched_terms$genes_not_term_in_genome, genes_not_term_in_genome)
        enriched_terms$size_of_set <- c(enriched_terms$size_of_set, size_of_set)
    }
    
    enriched_terms <- as.data.frame(enriched_terms)
    enriched_terms <- enriched_terms %>% filter( genes_to_term_in_genome >= 10)
    enriched_terms$p.adj <- p.adjust(enriched_terms$pval, method='fdr')
    enriched_terms <- enriched_terms %>% filter(p.adj < 0.05)
    
    print(paste("SIG TERMS:", nrow(enriched_terms)))
    
    return(enriched_terms)
}

In [4]:
# cycle through all the data
# this is essentially the gene ontology pipeline
for (clusterNumber in seq(1,81)) {
    d <- data %>% filter(cluster == clusterNumber)
    fname <- paste('SOM_cluster_', clusterNumber, sep="")
    # run gene ontology
    if(length(d$gene) > 0){
        res <- full_go_enrichment_test(d$gene)
        if(length(res) > 0){
            # save info
            stats[[fname]] <- res$result
            dats[[fname]] <- d$gene
            print(fname)
        }
        # keep track of which years & time points are skipped to prevent future errors
        else{skipped <- append(skipped, fname)}
    }
    else{skipped <- append(skipped, fname)}
}

[1] "TERMS: 49"
[1] "SIG TERMS: 17"
[1] "SOM_cluster_1"
[1] "TERMS: 32"
[1] "SIG TERMS: 12"
[1] "SOM_cluster_2"
[1] "TERMS: 24"
[1] "SIG TERMS: 7"
[1] "SOM_cluster_3"
[1] "TERMS: 30"
[1] "SIG TERMS: 9"
[1] "SOM_cluster_4"
[1] "TERMS: 37"
[1] "SIG TERMS: 20"
[1] "SOM_cluster_7"
[1] "TERMS: 45"
[1] "SIG TERMS: 14"
[1] "SOM_cluster_8"
[1] "TERMS: 28"
[1] "SIG TERMS: 10"
[1] "SOM_cluster_9"
[1] "TERMS: 30"
[1] "SIG TERMS: 9"
[1] "SOM_cluster_10"
[1] "TERMS: 37"
[1] "SIG TERMS: 18"
[1] "SOM_cluster_11"
[1] "TERMS: 51"
[1] "SIG TERMS: 19"
[1] "SOM_cluster_12"
[1] "TERMS: 34"
[1] "SIG TERMS: 13"
[1] "SOM_cluster_13"
[1] "TERMS: 40"
[1] "SIG TERMS: 24"
[1] "SOM_cluster_14"
[1] "TERMS: 18"
[1] "SIG TERMS: 16"
[1] "SOM_cluster_17"
[1] "TERMS: 25"
[1] "SIG TERMS: 16"
[1] "SOM_cluster_18"
[1] "TERMS: 15"
[1] "SIG TERMS: 3"
[1] "SOM_cluster_21"
[1] "TERMS: 21"
[1] "SIG TERMS: 13"
[1] "SOM_cluster_23"
[1] "TERMS: 17"
[1] "SIG TERMS: 13"
[1] "SOM_cluster_26"
[1] "TERMS: 41"
[1] "SIG TERMS: 13"
[1] "S

In [5]:
names(stats)

NULL

In [6]:
# term, the pval, the description, and the label (like GP:BP)
df <- setNames(data.frame(matrix(ncol = 3, nrow = 0)), c("name", "numTerms", "numGenes"))
for (name in names(stats)){
    df[nrow(df) + 1,] = c(name, dim(stats[[name]])[1], length(dats[[name]]))
}

In [7]:
df

name,numTerms,numGenes


In [8]:
df2 <- NULL
for (name in names(stats)){
    df2 <- rbind(df2, data.frame(
        "name" = rep(name, dim(stats[[name]])[1]),
        "GO.source" = stats[[name]] %>% select('source'),
        "GO.term.id" = stats[[name]] %>% select('term_id'),
        "GO.term.aname" = stats[[name]] %>% select('term_name'),
        "p.value" = stats[[name]] %>% select('p_value'))
    )
    
    tmp <- stats[[name]] %>% select(term_id, p_value)
    finalname <- paste(results_prefix, paste("REVIGO", name, sep="_"), ".csv", sep="")
    write.table(tmp, "tmp.txt", quote=FALSE, row.names = FALSE, col.names = FALSE)
    filedata <-readChar("tmp.txt",file.info("tmp.txt")$size)
    
    httr::POST(
      url = "http://revigo.irb.hr/Revigo.aspx",
      body = list(
        cutoff = "0.5",
        valueType = "pvalue",
        speciesTaxon = "0",
        measure = "SIMREL",
        goList = filedata
      ),
      # application/x-www-form-urlencoded
      encode = "form"
    ) -> res

    dat <- httr::content(res, encoding = "UTF-8")

    # Write results to a file
    dat <- stri_replace_all_fixed(dat, "\r", "")
    cat(dat, file='tmpresults.html', fill = FALSE)
    resultsrevigo <- readHTMLTable('tmpresults.html')
    
    fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Term.ID", "Name", "Value", "Category"))
    if(!is.null(resultsrevigo$BiologicalProcess)){
        resultsrevigo$BiologicalProcess <- resultsrevigo$BiologicalProcess %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$BiologicalProcess['Category'] <- rep('BiologicalProcess', dim(resultsrevigo$BiologicalProcess)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$BiologicalProcess)
    }
    if(!is.null(resultsrevigo$CellularComponent)){
        
        print(name)
        resultsrevigo$CellularComponent <- resultsrevigo$CellularComponent %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$CellularComponent['Category'] <- rep('CellularComponent', dim(resultsrevigo$CellularComponent)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$CellularComponent)
    }
    if(!is.null(resultsrevigo$MolecularFunction)){
        
        print(name)
        resultsrevigo$MolecularFunction <- resultsrevigo$MolecularFunction %>% 
                                            filter(Eliminated=='False') %>% 
                                            select(`Term ID`, Name, Value)
        resultsrevigo$MolecularFunction['Category'] <- rep('MolecularFunction', dim(resultsrevigo$MolecularFunction)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$MolecularFunction)
    }
    
    write.csv(data.frame(fullresultsrevigo), finalname, row.names=FALSE)
}

In [9]:
write.csv(df, paste(results_prefix, "GO_SOMS_stats.csv", sep=""))
write.csv(df2, paste(results_prefix, "GO_SOMS_go_term_list.csv", sep=""))