In [1]:
library(tidyverse)
library(XML)
library(httr)
library(stringi)
library(lubridate)
library(GOSemSim)
library(org.Hs.eg.db)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date


GOSemSim v2.12.0  For help: https://guangchuangyu.github.io/GOSemSim

If you use GOSemSim in published research, please cite:
Guangchuang Yu, Fei Li, Yide Qin, Xiaochen Bo, Yibo Wu, Shengqi Wang. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products Bioinformatics 2010, 26(7):976-978
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocG

In [2]:
# go term enrichment
# inputs
# required to include a list of genes for which to find go enrichments
# optional to include a new gochart for comparing files, a new total_genes (for pval calc), and a path&name for saving outputs
# notes: 
#      go table parsing can be found in create_go_table.ipynb

gochart <- read.csv("gotable.csv")
go_enrichment_test <- function(set_of_genes, go_chart = gochart, total_genes = 24769, outputfile=NULL, v=TRUE){
    go_testset <- go_chart[go_chart$genename %in% set_of_genes, ]
    terms <- unique(go_testset$go_term)
    if(v){
        print(paste("TERMS: ", length(terms)), sep="")
    }
    
    enriched_terms <- c()
    for (term in terms){ 
        genes_to_term_in_set <- nrow(go_testset[go_testset$go_term == term,])
        genes_to_term_in_genome <- nrow(go_chart[go_chart$go_term == term,])
        genes_not_term_in_genome <- total_genes - genes_to_term_in_genome
        size_of_set <- length(set_of_genes)
        test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                       genes_not_term_in_genome, size_of_set, lower.tail=FALSE)        
        enriched_terms$term <- c(enriched_terms$term, term)
        enriched_terms$pval <- c(enriched_terms$pval, test)
        enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
        enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
        enriched_terms$genes_not_term_in_genome <- c(enriched_terms$genes_not_term_in_genome, genes_not_term_in_genome)
        enriched_terms$size_of_set <- c(enriched_terms$size_of_set, size_of_set)
    }
    
    enriched_terms <- as.data.frame(enriched_terms)
    enriched_terms <- enriched_terms %>% filter(genes_to_term_in_genome >= 10)
    enriched_terms$p.adj <- p.adjust(enriched_terms$pval, method='fdr')
    enriched_terms <- enriched_terms %>% filter(p.adj < 0.05)
    
    if(v){
        print(paste("SIG TERMS:", nrow(enriched_terms)))
    }
    
    if(!is.null(outputfile)) {
        write.csv(enriched_terms, outputfile)
    }
    return(enriched_terms)
}   

In [3]:
# example of use
filename_1 <- '/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/Leaf_2018_Anthesis_Ungrafted-1103P.csv'
data_1 <- read.csv(filename_1)
genelist_1 <- data_1 %>% filter(padj < 0.05) %>% filter(log2FoldChange > 0)
go_results_1 <- go_enrichment_test(genelist_1$X)

[1] "TERMS:  68"
[1] "SIG TERMS: 24"


In [4]:
# example of use
filename_2 <- '/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/Leaf_2018_Anthesis_Ungrafted-3309C.csv'
data_2 <- read.csv(filename_2)
genelist_2 <- data_2 %>% filter(padj < 0.05) %>% filter(log2FoldChange > 0)
go_results_2 <- go_enrichment_test(genelist_2$X)

[1] "TERMS:  183"
[1] "SIG TERMS: 23"


In [5]:
# revigo

# inputs: 
# required two matching lists: one of terms and the other of pvals
# optional to specify a directory to store tmp files and a file path&name for saving output

run_revigo <- function(terms, pvals, tmp_dir="./", outputfile=NULL) {
    tmp <- data.frame("term"=terms, "pval"=pvals)
    tmpfile <- paste(tmp_dir, "tmp.txt", sep="")
    write.table(tmp, tmpfile, quote=FALSE, row.names = FALSE, col.names = FALSE)
    filedata <-readChar(tmpfile,file.info(tmpfile)$size)
    
    httr::POST(
      url = "http://revigo.irb.hr/Revigo.aspx",
      body = list(
        cutoff = "0.5",
        valueType = "pvalue",
        speciesTaxon = "0",
        measure = "SIMREL",
        goList = filedata
      ),
      # application/x-www-form-urlencoded
      encode = "form"
    ) -> res

    dat <- httr::content(res, encoding = "UTF-8")

    # Write results to a file
    dat <- stri_replace_all_fixed(dat, "\r", "")
    tmpfile <- paste(tmp_dir, "tmpresults.html", sep="")
    cat(dat, file=tmpfile, fill = FALSE)
    resultsrevigo <- readHTMLTable(tmpfile)   
    fullresultsrevigo <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Term.ID", "Name", "Value", "Category"))
    if(!is.null(resultsrevigo$BiologicalProcess)){
        resultsrevigo$BiologicalProcess <- resultsrevigo$BiologicalProcess %>% 
                                            filter(Eliminated=='False') %>% 
                                            dplyr::select(`Term ID`, Name, Value)
        resultsrevigo$BiologicalProcess['Category'] <- rep('BiologicalProcess', dim(resultsrevigo$BiologicalProcess)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$BiologicalProcess)
    }   
    if(!is.null(resultsrevigo$CellularComponent)){
        resultsrevigo$CellularComponent <- resultsrevigo$CellularComponent %>% 
                                            filter(Eliminated=='False') %>% 
                                            dplyr::select(`Term ID`, Name, Value)
        resultsrevigo$CellularComponent['Category'] <- rep('CellularComponent', dim(resultsrevigo$CellularComponent)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$CellularComponent)
    }
    if(!is.null(resultsrevigo$MolecularFunction)){
        resultsrevigo$MolecularFunction <- resultsrevigo$MolecularFunction %>% 
                                            filter(Eliminated=='False') %>% 
                                            dplyr::select(`Term ID`, Name, Value)
        resultsrevigo$MolecularFunction['Category'] <- rep('MolecularFunction', dim(resultsrevigo$MolecularFunction)[1])
        fullresultsrevigo <- rbind(fullresultsrevigo, resultsrevigo$MolecularFunction)
    }
    
    if(!is.null(outputfile)){
        write.csv(data.frame(fullresultsrevigo), finalname, row.names=FALSE)
    }
    return(data.frame(fullresultsrevigo))
}

In [6]:
# example of use, see also go_enrichment_test
run_revigo(terms=go_results_2[["term"]], pvals=go_results_2[["pval"]])

"argument is not an atomic vector; coercing"

Term.ID,Name,Value,Category
GO:0008272,sulfate transport,-2.4316,BiologicalProcess
GO:0042545,cell wall modification,-6.389,BiologicalProcess
GO:0045944,positive regulation of transcription by RNA polymerase II,-2.4306,BiologicalProcess
GO:0006073,cellular glucan metabolic process,-2.2031,BiologicalProcess
GO:0015629,actin cytoskeleton,-5.4447,CellularComponent
GO:0005618,cell wall,-2.2031,CellularComponent
GO:0005576,extracellular region,-3.9942,CellularComponent
GO:0004857,enzyme inhibitor activity,-11.4355,MolecularFunction
GO:0005509,calcium ion binding,-4.1249,MolecularFunction
GO:0015116,sulfate transmembrane transporter activity,-2.4316,MolecularFunction


In [73]:
# ipr enrichment
# inputs:


# scanned_file <- read.csv('/data/projects/julia.pratt/CS1_genomeSelection/scripts/vitvi.vcostv3.clean.pep.txt', sep='\t')
ipr_enrichment_test <- function(set_of_genes, scan=scanned_file, v=TRUE){
    ipr_sub <- scan %>%
    dplyr::select(genename, e.value, ipr_accession) %>%
    dplyr::filter(e.value < 1e-05) %>% 
    dplyr::filter(ipr_accession != '') %>%
    dplyr::select(genename, ipr_accession) %>%
    unique()
    
    set_of_genes <- set_of_genes %>% paste('.t01', sep='')
    ipr_testset <- ipr_sub[ipr_sub$genename %in% set_of_genes,]
    terms <- unique(ipr_testset$ipr_accession)
    
    if(v){
        print(str_interp("analyzing ${length(terms)} IPR accessions..."))
    }
    
    enriched_terms <- c() 
    for (term in terms){
        genes_to_term_in_set <- nrow(ipr_testset[ipr_testset$ipr_accession == term,])
        genes_to_term_in_genome <- nrow(ipr_sub[ipr_sub$ipr_accession == term,])
        genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
        size_of_set <- length(set_of_genes)
        test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                       genes_not_term_in_genome, size_of_set, lower.tail=FALSE)
        
        if (test < 0.05/length(terms) & 
            genes_to_term_in_genome >= 10 & 
            genes_to_term_in_set >= 3){
            if (v) {
                print(c(term, test, genes_to_term_in_set, genes_to_term_in_genome))
            }
            enriched_terms$term <- c(enriched_terms$term, term)
            enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
            enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
            enriched_terms$pval <- c(enriched_terms$pval, test)
        }
    }
    return(enriched_terms)
}

In [74]:
# example of use
ipr_results <- ipr_enrichment_test(as.character(genelist_1$X))
ipr_results

[1] "analyzing 154 IPR accessions..."
[1] "IPR001471"            "1.07609723465595e-17" "15"                  
[4] "147"                 
[1] "IPR036955"            "1.20193093598421e-17" "15"                  
[4] "148"                 
[1] "IPR016177"            "3.85411345356568e-17" "15"                  
[4] "159"                 
[1] "IPR002048"           "4.2353739574481e-07" "7"                  
[4] "145"                
[1] "IPR011992"            "4.39768215039543e-06" "7"                   
[4] "198"                 
[1] "IPR000757"            "4.89451036565296e-05" "3"                   
[4] "43"                  
[1] "IPR010713"            "2.98084961180961e-05" "3"                   
[4] "38"                  


In [76]:
# pfam enrichment
# inputs:
# outputs: 

scan <- read.csv('/data/projects/julia.pratt/CS1_genomeSelection/scripts/vitvi.vcostv3.clean.pep.txt', sep='\t')

# scanned_file <- read.csv('/data/projects/julia.pratt/CS1_genomeSelection/scripts/vitvi.vcostv3.clean.pep.txt', sep='\t')
pfam_enrichment_test <- function(set_of_genes, scan=scanned_file, v=TRUE){
    pfam_sub <- scan %>%
    dplyr::select(genename, analysis, signature_accession, signature_description, e.value, ipr_accession) %>%
    dplyr::filter(e.value < 1e-05) %>% 
    dplyr::filter(analysis == 'Pfam') %>%
    dplyr::select(genename, signature_accession, signature_description, ipr_accession) %>%
    unique()
    
    set_of_genes <- set_of_genes %>% paste('.t01', sep='')
    pfam_testset <- pfam_sub[pfam_sub$genename %in% set_of_genes,]
    terms <- unique(pfam_testset$signature_accession)
    
    if(v){
        print(str_interp("analyzing ${length(terms)} Pfam domains..."))
    }
    
    enriched_terms <- c() 
    for (term in terms){
        genes_to_term_in_set <- nrow(pfam_testset[pfam_testset$signature_accession == term,])
        genes_to_term_in_genome <- nrow(pfam_sub[pfam_sub$signature_accession == term,])
        genes_not_term_in_genome <- 24769 - genes_to_term_in_genome
        size_of_set <- length(set_of_genes)
        test <- phyper(genes_to_term_in_set, genes_to_term_in_genome,
                       genes_not_term_in_genome, size_of_set, lower.tail=FALSE)
        
        if (test < 0.05/length(terms) & 
            genes_to_term_in_genome >= 10 & 
            genes_to_term_in_set >= 3){
            if (v) {
                print(c(term, test, genes_to_term_in_set, genes_to_term_in_genome))
            }
            enriched_terms$term <- c(enriched_terms$term, term)
            enriched_terms$genes_to_term_in_set <- c(enriched_terms$genes_to_term_in_set, genes_to_term_in_set)
            enriched_terms$genes_to_term_in_genome <- c(enriched_terms$genes_to_term_in_genome, genes_to_term_in_genome)
            enriched_terms$pval <- c(enriched_terms$pval, test)
        }
    }
    return(enriched_terms)
}

In [77]:
# example of use
pfam_results <- pfam_enrichment_test(as.character(genelist_1$X))
pfam_results

[1] "analyzing 85 Pfam domains..."
[1] "PF00847"              "6.11267309335109e-18" "15"                  
[4] "142"                 
[1] "PF13499"             "9.3548808237208e-07" "6"                  
[4] "111"                
[1] "PF00722"              "4.89451036565296e-05" "3"                   
[4] "43"                  
[1] "PF06955"              "2.98084961180961e-05" "3"                   
[4] "38"                  
[1] "PF13833"              "5.01617790663546e-09" "5"                   
[4] "30"                  


In [78]:
# gosemsim
# inputs: two lists of go terms, as characters, filtered by ontology (passed in as "BP", "CC", "MF")
# measurement recommended (default): "Wang"

compare_two <- function(go1, go2, ont, measure="Wang") {
    hsGO <- godata('org.Hs.eg.db', ont=ont)
    measure <- mgoSim(go1, go2, semData=hsGO, measure=measure)
    gosemsims <- rbind(gosemsims, data.frame(
        "filename" = f,
        "ont" = ont,
        "semsim" = measure)
    )
}

In [82]:
go_results_1

term,pval,genes_to_term_in_set,genes_to_term_in_genome,genes_not_term_in_genome,size_of_set,p.adj
GO:0004568,0.005158789,1,23,24746,116,0.01418667
GO:0006032,0.005158789,1,23,24746,116,0.01418667
GO:0016998,0.005158789,1,23,24746,116,0.01418667
GO:0006334,0.001907271,1,14,24755,116,0.009536357
GO:0003677,2.781172e-06,17,1080,23689,116,3.824112e-05
GO:0003700,8.756623e-13,16,355,24414,116,4.816142e-11
GO:0006355,1.055418e-09,17,638,24131,116,2.902399e-08
GO:0006457,0.003855627,2,67,24702,116,0.0141373
GO:0051082,0.002199133,2,55,24714,116,0.01007936
GO:0005509,2.167969e-06,7,180,24589,116,3.824112e-05


In [81]:
# example of use, see also go_enrichment_test
go1 <- go_results_1 %>% filter(Category == "BiologicalProcess")
# go2 <- go_results_2 %>% filter(Category == "BiologicalProcess")
# compare_two(as.character(go_results_1[['term']]), as.character(go_results_2[['term']]))

ERROR: Error: Problem with `filter()` input `..1`.
i Input `..1` is `Category == "BiologicalProcess"`.
x object 'Category' not found
