In [31]:
library(GO.db)
library(glue)
library(tidyverse)
library(tidytext)
library(quanteda.textstats)
library(quanteda)

if (str_detect(getwd(), "Bio_SDD")) {
    wd <- "/home/shannc/Bio_SDD/MUIC_senior_project/workflow"
    env <- "/home/shannc/Bio_SDD/miniconda3/envs/reticulate"
} else {
    wd <- "/home/shannc/workflow"
    env <- "/home/shannc/anaconda3/envs/reticulate"
}

go_file <- glue("{wd}/data/reference/go_data.tsv")
if (file.exists(go_file)) {
    info_tb <- read_tsv(go_file)
} else {
    all_gos <- as.list(GOTERM) %>% names()
    info_tb <- goInfoTb(all_gos)
    write_tsv(info_tb, go_file)
}

showFrequent <- function(tb, core_word = NULL, filter_unwanted = TRUE) {
    if (!is.null(core_word)) {
        tb <- tb %>%
            filter(grepl(core_word, term))
    }
    tb <- tb %>%
        unnest_tokens(word, term) %>%
        count(word, sort = TRUE)
    if (filter_unwanted) {
        tb %>% filter(!word %in% UNWANTED)
    } else {
        tb
    }
}


[1mRows: [22m[34m42443[39m [1mColumns: [22m[34m4[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): GO_IDs, term, definition, ontology

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [2]:
UNWANTED <- c(
    "of", "to", "cell", "in", "complex", "activity", "regulation", "process", "cellular", "stimulus",
    "response"
)

qualifiers <- c("positive", "negative", "catabolic", "involved")

In [3]:
info_tb

GO_IDs,term,definition,ontology
<chr>,<chr>,<chr>,<chr>
GO:0000001,mitochondrion inheritance,"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.",BP
GO:0000002,mitochondrial genome maintenance,The maintenance of the structure and integrity of the mitochondrial genome; includes replication and segregation of the mitochondrial chromosome.,BP
GO:0000003,reproduction,The production of new individuals that contain some portion of genetic material inherited from one or more parent organisms.,BP
GO:0000006,high-affinity zinc transmembrane transporter activity,"Enables the transfer of zinc ions (Zn2+) from one side of a membrane to the other, probably powered by proton motive force. In high-affinity transport the transporter is able to bind the solute even if it is only present at very low concentrations.",MF
GO:0000007,low-affinity zinc ion transmembrane transporter activity,"Enables the transfer of a solute or solutes from one side of a membrane to the other according to the reaction: Zn2+ = Zn2+, probably powered by proton motive force. In low-affinity transport the transporter is able to bind the solute only if it is present at very high concentrations.",MF
GO:0000009,"alpha-1,6-mannosyltransferase activity","Catalysis of the transfer of a mannose residue to an oligosaccharide, forming an alpha-(1->6) linkage.",MF
GO:0000010,trans-hexaprenyltranstransferase activity,"Catalysis of the reaction: (2E,6E)-farnesyl diphosphate + 4 isopentenyl diphosphate = 4 diphosphate + all-trans-heptaprenyl diphosphate.",MF
GO:0000011,vacuole inheritance,"The distribution of vacuoles into daughter cells after mitosis or meiosis, mediated by interactions between vacuoles and the cytoskeleton.",BP
GO:0000012,single strand break repair,The repair of single strand breaks in DNA. Repair of such breaks is mediated by the same enzyme systems as are used in base excision repair.,BP
GO:0000014,single-stranded DNA endodeoxyribonuclease activity,Catalysis of the hydrolysis of ester linkages within a single-stranded deoxyribonucleic acid molecule by creating internal breaks.,MF


# Get all GO terms

In [4]:
corpus <- corpus(info_tb, text_field = "term", docid_field = "GO_IDs")
docvars(corpus) <- info_tb %>% select(., ontology, definition)
tokens <- tokens(corpus, remove_punct = TRUE)

## View top words

In [35]:
dfm <- dfm(tokens)
topfeatures(dfm, 150)

## Find common phrases and patterns

In [37]:
tokens %>% kwic("multicellular", window = 10)

docname,from,to,pre,keyword,post,pattern
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<fct>
GO:0007275,1,1,,multicellular,organism development,multicellular
GO:0022412,7,7,cellular process involved in reproduction in,multicellular,organism,multicellular
GO:0022608,1,1,,multicellular,organism adhesion,multicellular
GO:0022609,1,1,,multicellular,organism adhesion to substrate,multicellular
GO:0032501,1,1,,multicellular,organismal process,multicellular
GO:0032504,1,1,,multicellular,organism reproduction,multicellular
GO:0033555,1,1,,multicellular,organismal response to stress,multicellular
GO:0035212,5,5,cell competition in a,multicellular,organism,multicellular
GO:0035264,1,1,,multicellular,organism growth,multicellular
GO:0040014,3,3,regulation of,multicellular,organism growth,multicellular


In [38]:
tokens %>%
    kwic("multicellular", window = 10) %>%
    purrr::pluck("post") %>%
    unique() %>%
    discard(., \(x) str_detect(x, "involved")) %>%
    write_lines("words")

In [7]:
tokens %>%
    kwic("binding") %>%
    pluck("pre") %>%
    unique() %>%
    discard(., \(x) str_detect(x, ".*of.*|.*by.*|.*via.*")) %>%
    discard(\(x) length(str_split_1(x, " ")) > 3) %>%
    write_lines("words")


In [8]:
tokens %>% kwic("pathway") %>% pluck("pre") %>% unique() %>% discard(., \(x) str_detect(x, ".*of.*|.*by.*|.*via.*")) %>% write_lines("words")

In [9]:
tokens %>% kwic("response") %>% pluck("pre") %>% unique() %>% discard(., \(x) str_detect(x, ".*of.*|.*by.*|.*via.*")) %>% write_lines("words")

In [10]:
tokens %>% kwic("system") %>% pluck("pre") %>% discard(., \(x) str_detect(x, ".*of.*|.*by.*")) %>% unique() %>%  write_lines("systems")

In [11]:
# tokens %>% kwic("process") %>% pluck("pre") %>% unique() %>%
#     discard(., \(x) str_detect(x, ".*catabolic.*|.*biosynthetic.*|.*of.*")) %>% write_lines("processes")
read_lines("processes") %>% unique() %>% write_lines("processes")

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'unique': 'processes' does not exist in current working directory ('/home/shannc/Bio_SDD/MUIC_senior_project/workflow/docs').


In [None]:
tokens %>%
    kwic("cell") %>%
    pluck("post") %>%
    unique() %>%
    discard(\(x) str_detect(x, "to$|of$|.*across.*|.*in.*|.*to.*|.*of.*|.*by.*|.*with.*|.*via.*"))

read_lines("~/cell_post") %>%
    unique() %>%
    write_lines("cell_post")


In [None]:
tokens %>% kwic("membrane") %>% pluck("pre") %>% unique() %>%
    discard(\(x) str_detect(x, "to$|of$|.*across.*|.*in.*|.*to.*|.*of.*|.*by.*|.*with.*|.*via.*")) %>% unique() %>% write_lines(file = "../bin/R/.membrane_nouns")

### Get collocations

In [None]:
collocations <- tokens %>% textstat_collocations() %>% as_tibble()

In [None]:
collocations <- collocations %>% arrange(desc(count))
a <- collocations %>% filter(!grepl("process|activity|regulation", collocation))
b <- collocations %>% filter(grepl("receptor", collocation))

In [None]:
corpus %>%
  corpus_subset(., grepl("process,", as.character(corpus))) %>%
  tokens(remove_punct = TRUE) %>%
  textstat_collocations()

Unnamed: 0_level_0,collocation,count,count_nested,length,lambda,z
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
1,regulation of,14,0,2,8.251612,5.24826
2,nuclear-transcribed mrna,21,0,2,10.23971,5.088414
3,isopentenyl diphosphate,9,0,2,8.357572,5.017141
4,mrna catabolic,21,0,2,7.267232,4.950533
5,propionate metabolic,2,0,2,5.941632,4.509197
6,of nuclear-transcribed,9,0,2,6.730143,4.495138
7,3 5,2,0,2,5.427883,4.477459
8,5 3,2,0,2,5.427883,4.477459
9,biosynthetic process,31,0,2,6.275268,4.366745
10,sulfate proteoglycan,5,0,2,8.92439,4.362586


### Searches on specific terms

In [None]:
process <- tokens %>% kwic("process")
process$pre %>% unique()

In [None]:
activity <- tokens %>% kwic("activity")

In [None]:
pathways <- tokens %>% kwic("pathway")
pathways

docname,from,to,pre,keyword,post,pattern
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<fct>
GO:0000754,4,4,adaptation of signaling,pathway,by response to pheromone involved,pathway
GO:0000766,5,5,negative adaptation of signaling,pathway,by response to pheromone involved,pathway
GO:0000947,9,9,process to alcohol via Ehrlich,pathway,,pathway
GO:0000948,10,10,to carboxylic acid via Ehrlich,pathway,,pathway
GO:0000949,11,11,process to alcohol via Ehrlich,pathway,,pathway
GO:0000950,10,10,process to alcohol via Ehrlich,pathway,,pathway
GO:0000952,12,12,to carboxylic acid via Ehrlich,pathway,,pathway
GO:0000953,11,11,to carboxylic acid via Ehrlich,pathway,,pathway
GO:0000955,7,7,acid catabolic process via Ehrlich,pathway,,pathway
GO:0001867,4,4,complement activation lectin,pathway,,pathway


In [None]:
tokens %>% kwic("cell")

docname,from,to,pre,keyword,post,pattern
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<fct>
GO:0000032,1,1,,cell,wall mannoprotein biosynthetic process,cell
GO:0000075,1,1,,cell,cycle checkpoint signaling,cell
GO:0000082,6,6,G1 S transition of mitotic,cell,cycle,cell
GO:0000086,6,6,G2 M transition of mitotic,cell,cycle,cell
GO:0000196,1,1,,cell,wall integrity MAPK cascade,cell
GO:0000278,2,2,mitotic,cell,cycle,cell
GO:0000320,4,4,re-entry into mitotic,cell,cycle,cell
GO:0000321,4,4,re-entry into mitotic,cell,cycle after pheromone arrest,cell
GO:0000751,2,2,mitotic,cell,cycle G1 arrest in response,cell
GO:0000753,1,1,,cell,morphogenesis involved in conjugation with,cell


### Test compounding

In [None]:
tokens_compound(tokens, pattern = "of * activity", valuetype = "regex") %>% kwic("activity")

docname,from,to,pre,keyword,post,pattern
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<fct>
GO:0000006,5,5,high-affinity zinc transmembrane transporter,activity,,activity
GO:0000007,6,6,low-affinity zinc ion transmembrane transporter,activity,,activity
GO:0000009,2,2,"alpha-1,6-mannosyltransferase",activity,,activity
GO:0000010,2,2,trans-hexaprenyltranstransferase,activity,,activity
GO:0000014,4,4,single-stranded DNA endodeoxyribonuclease,activity,,activity
GO:0000016,2,2,lactase,activity,,activity
GO:0000026,2,2,"alpha-1,2-mannosyltransferase",activity,,activity
GO:0000030,2,2,mannosyltransferase,activity,,activity
GO:0000031,3,3,mannosylphosphate transferase,activity,,activity
GO:0000033,2,2,"alpha-1,3-mannosyltransferase",activity,,activity


quanteda is good for defined phrases, but it looks like you'll have to rely on stringr for advanced regex first

### Terms with "activity"

In [None]:
activity <- info_tb %>% filter(grepl("activity", term))
samples <- activity$term

compoundActivity <- function(str) {
  match <- str_match(str, "^(?:.*of)?(.*)activity(?:.*in)?(.*)?")
  first <- match[, 2] %>%
    str_trim() %>%
    str_replace_all(., " ", "_")
  sec <- match[, 3] %>%
    str_trim() %>%
    str_replace_all(., " ", "_")
  return(str_trim(paste0(first, " ", sec)))
}

compoundActivity(samples) %>% tail()

In [None]:
activity$term

In [None]:
# General version of above
compoundSpecial <- function(str, pattern, expected_matches) {
  match <- str_match(str, pattern)
  selected <- lapply(seq(2, expected_matches + 1), \(x)  {
    match[, x] %>%
      str_trim() %>%
      str_replace_all(., " ", "_")
  })
  reduce(selected, \(x, y) paste0(x, " ", y)) %>% str_trim()
}

compoundSpecial(samples, "^(?:.*of)?(.*)activity(?:.*in)?(.*)?", 2) %>% tail()

### Terms with "regulation"

In [None]:
regulation <- info_tb %>% filter(ontology == "BP") %>% filter(grepl("regulation", term))
regulation$term %>% keep(\(x) grepl("activity", x)) %>%
    compoundSpecial("^(?:.*of)?(.*)activity(?:.*in)?(.*)?", 2)

# compoundSpecial(regulation$term, "^(?:.*of)?(.*)(?:.*from)?(.*)", 2)

### Terms with "pathway"

In [None]:
pathways <- info_tb %>% filter(grepl("pathway", term))
pathways$term

In [None]:
# bind pathways into single tokens, replacing this in the original name
f <- function(x) {
    str_replace(x, " ", "_")
}

pnames <- pathways$term %>% compoundSpecial("(?:.* in)?(?:.*of)?(?:.*via)?(.*pathway)", 1)
context1 <- pathways$term %>% str_extract(".*( in| via| of)") %>% map_chr(., \(x) ifelse(is.na(x), "", x))
context2 <- pathways$term %>% str_extract("pathway (.*)", group = 1) %>% map_chr(., \(x) ifelse(is.na(x), "", x))
paste0(context1, " ", pnames, " ", context2)

### Terms with "process"

In [None]:
process <- info_tb %>% filter(grepl("process", term) & grepl("biosynthetic|metabolic|catabolic", term)) %>%
  mutate(type = map_chr(term, \(x) {
    case_when(
      str_detect(x, "catabolic") ~ "catabolic",
      str_detect(x, "biosynthetic") ~ "anabolic",
      .default = "unspecified"
    )
  }))

In [None]:
process$term %>% corpus() %>% tokens() %>% textstat_collocations() %>% as_tibble() %>% arrange(desc(count))

collocation,count,count_nested,length,lambda,z
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
biosynthetic process,2012,0,2,9.608525,6.792738
metabolic process,1729,0,2,9.372527,6.625806
catabolic process,1482,0,2,9.150128,6.468459
regulation of,1234,0,2,12.016198,25.099031
positive regulation,389,0,2,10.417621,7.356709
negative regulation,379,0,2,10.365670,7.320021
process from,124,0,2,7.744914,14.194976
acid biosynthetic,122,0,2,1.603711,13.625071
process to,121,0,2,6.821337,18.935297
acid metabolic,104,0,2,1.524768,12.483604


In [None]:
process$term %>% corpus() %>% tokens() %>% kwic("fructose")

docname,from,to,pre,keyword,post,pattern
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<fct>
text106,1,1,,fructose,metabolic process,fructose
text107,1,1,,fructose,catabolic process,fructose
text108,1,1,,fructose,6-phosphate metabolic process,fructose
text109,1,1,,fructose,"2,6-bisphosphate metabolic process",fructose
text1346,2,2,anaerobic,fructose,catabolic process,fructose
text1531,2,2,aerobic,fructose,catabolic process,fructose
text1686,1,1,,fructose,"1,6-bisphosphate metabolic process",fructose
text2895,1,1,,fructose,biosynthetic process,fructose
text3285,3,3,regulation of,fructose,"1,6-bisphosphate metabolic process",fructose
text3286,4,4,positive regulation of,fructose,"1,6-bisphosphate metabolic process",fructose


In [None]:
process$term %>% compoundSpecial(., "^(?:.*of)?(.*)(?:metabolic|catabolic|biosynthetic.*)", 1)

# Find verbs used in each sub-ontology

In [None]:
cc <- corpus_subset(corpus, ontology == "CC")
cc_tokens <- cc %>% tokens(remove_punct = TRUE)
cc_tokens %>% textstat_collocations()

Unnamed: 0_level_0,collocation,count,count_nested,length,lambda,z
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
1,host cell,75,0,2,5.774402,21.19685
2,collagen type,29,0,2,6.913928,16.99967
3,endoplasmic reticulum,44,0,2,8.514725,16.99019
4,transport vesicle,33,0,2,5.502378,16.51155
5,RNA polymerase,31,0,2,7.969458,15.31399
6,transcription factor,22,0,2,4.854412,15.17161
7,mitotic spindle,18,0,2,6.876903,15.06987
8,extracellular matrix,17,0,2,6.357154,14.97250
9,spindle pole,22,0,2,7.726801,14.80247
10,cytoplasmic side,18,0,2,4.953262,14.71483


In [None]:
mf <- corpus_subset(corpus, ontology == "MF")
mf_tokens <- mf %>% tokens(remove_punct = TRUE)
mf_tokens %>% textstat_collocations() %>% as_tibble() %>% arrange(desc(count))

collocation,count,count_nested,length,lambda,z
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
transporter activity,600,0,2,4.9839297,15.997437
synthase activity,501,0,2,3.5184860,20.726862
transmembrane transporter,482,0,2,8.9616555,32.813459
receptor activity,388,0,2,0.8559265,11.867779
dehydrogenase activity,379,0,2,1.6941425,18.282288
receptor binding,348,0,2,2.8766101,37.422710
reductase activity,247,0,2,1.7956953,15.099087
kinase activity,240,0,2,1.9523105,15.297772
hydrolase activity,228,0,2,4.0766588,12.303215
ligase activity,187,0,2,2.9264204,13.599953


In [None]:
bp <- corpus_subset(corpus, ontology == "BP")
bp_tokens <- bp %>% tokens(remove_punct = TRUE)
bp_tokens %>% textstat_collocations() %>% as_tibble() %>% arrange(desc(count))

collocation,count,count_nested,length,lambda,z
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
regulation of,9736,0,2,9.6516094,50.227898
positive regulation,3082,0,2,8.0561942,45.446861
negative regulation,3046,0,2,8.4939214,38.202519
biosynthetic process,2012,0,2,11.6569969,8.241132
metabolic process,1729,0,2,9.8211314,15.512242
response to,1647,0,2,8.2377968,54.182891
catabolic process,1479,0,2,11.2126894,7.926736
involved in,1376,0,2,13.1005530,9.257985
signaling pathway,1031,0,2,8.8804867,67.245386
cell differentiation,617,0,2,4.4113716,60.010643


In [None]:
pathways <- info_tb %>% filter(grepl("pathway", term))
pathways_token <- bp %>% tokens(remove_punct = TRUE)
pathways_token %>% textstat_collocations() %>% as_tibble() %>% arrange(desc(count))
# kwic(pathways_token, "pathway")

collocation,count,count_nested,length,lambda,z
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>
regulation of,9736,0,2,9.6516094,50.227898
positive regulation,3082,0,2,8.0561942,45.446861
negative regulation,3046,0,2,8.4939214,38.202519
biosynthetic process,2012,0,2,11.6569969,8.241132
metabolic process,1729,0,2,9.8211314,15.512242
response to,1647,0,2,8.2377968,54.182891
catabolic process,1479,0,2,11.2126894,7.926736
involved in,1376,0,2,13.1005530,9.257985
signaling pathway,1031,0,2,8.8804867,67.245386
cell differentiation,617,0,2,4.4113716,60.010643
