In [260]:
import matplotlib.pyplot as plt
%matplotlib inline
import polars as pl
import numpy as np
import networkx as nx
import obonet
import os
import json

go_data = pl.read_csv("/home/shannc/Bio_SDD/MUIC_senior_project/workflow/data/reference/go_data.tsv", separator="\t")
level_file = "/home/shannc/Bio_SDD/MUIC_senior_project/workflow/data/reference/go_level_map.tsv"
mapping_file = "/home/shannc/Bio_SDD/MUIC_senior_project/workflow/data/reference/go_map_generic.tsv"
all_custom = {}
go_path = "/home/shannc/Bio_SDD/MUIC_senior_project/workflow/data/reference/go.obo"


ROOTS = {"BP": "GO:0008150", "CC": "GO:0005575", "MF": "GO:0003674"}

In [2]:
toxin_terms = ["toxin", "venom"]

toxin_gos = go_data.filter(pl.col("term").str.contains_any(toxin_terms))
toxin_gos.write_csv("./toxin_gos.tsv", separator="\t")
all_custom["toxin_related"] = toxin_gos["GO_IDs"].to_list()

In [3]:
all_custom["toxin_related"] = (["GO:0019835", "GO:0042268", "GO:0045918", "GO:0045919", "GO:0051715" ,"GO:0101004", "GO:1904856"] + ["GO:0004623", "GO:0016005", "GO:0019834", "GO:0032429", "GO:0032430", "GO:0032431", "GO:0047498", "GO:0047499", "GO:1900138"] + # Phospholipases A2
                               [ "GO:0044179" ] # hemolysis
+ ["GO:0004222", "GO:0008191", "GO:0008235", "GO:0008237", "GO:1902963", "GO:1902964", "GO:1904464", "GO:1904465", "GO:1904466", "GO:1904683", "GO:1904684", "GO:1904685", "GO:1905048", "GO:1905049", "GO:1905050", "GO:1990773"] # Metalloproteinase-related
)




In [4]:
with open("../config/custom_go_groups.json", "w") as c:
    json.dump(all_custom, c, indent=4)

In [239]:
def relation_path(paths: list[tuple], relation: str) -> list:
    """Find the path from a list of paths (which are edge lists)
    that consists only of `relation`
    """
    result = []
    for path in paths:
        if all(map(lambda x: x[2] == relation, path)):
            for p in path:
                result.append((*p[:2][::-1], p[2]))
        # Note: This changes the direction of the obonet GO graph so that successors are children and predecessors are ancestors
        break
    return result

def get_filtered_go(go_path, go_data):
    GO_temp = obonet.read_obo(go_path)
    go_data = go_data.filter(pl.col("GO_IDs").is_in(list(GO_temp.nodes)))

    GO: nx.MultiDiGraph = nx.MultiDiGraph()
    id2root = dict(zip(go_data["GO_IDs"], go_data["ontology"]))
    id2name = dict(zip(go_data["GO_IDs"], go_data["term"]))

    GO.add_nodes_from(GO_temp.nodes)
    for go in GO_temp.nodes:
        if go in id2root:
            paths = nx.all_simple_edge_paths(GO_temp, source=go, target=ROOTS[id2root[go]])
            is_a = relation_path(paths, "is_a")
            if is_a:
                GO.add_edges_from(is_a)

    nx.set_node_attributes(GO, id2name, "name")
    return GO


In [241]:
saved_graph = "filtered_go.json"
if os.path.exists(saved_graph):
    with open("filtered_go.json", "r") as r:
        GO = nx.node_link_graph(json.load(r))
else:
    GO = get_filtered_go(go_path, go_data)
    with open("filtered_go.json", "w") as w:
        json.dump(nx.node_link_data(GO), w)

In [242]:
# Goal: map a specific GO term variant to its most general version
# e.g. double-strand break repair involved in meiotic recombination -> double-strand break repair
QUALIFIER_WORDS = {"of", "negative", "positive", "involved", "via"} # these words indicate that a GO term is a variant of another

def n_children_with(node: str, words: set[str]) -> tuple[str, int]:
    tree = nx.bfs_tree(GO, node)
    rare_vars = []
    if (self_name := GO.nodes[node].get("name")):
        self_words = set(self_name.split(" "))
    else:
        self_words = {" "}
    count = 0
    if "transcription" in node:
        print(node)
    for n in tree.nodes:
        name = GO.nodes[n].get("name")
        if name:
            name_words = set(name.split(" "))
            if (len(name_words & words) > 1) or (len(name_words & self_words) > 1):
                count += 1
                rare_vars.append(n)
    return count, rare_vars

In [207]:
def find_generic():
    term_data: dict = {"GO_IDs": [], "n_children_with": [], "level": []}
    generic2variants: dict = {}
    for ontology in ROOTS.values():
        for level, layer in enumerate(nx.bfs_layers(GO, ontology)):
            for node in layer:
                term_data["level"].append(level)
                term_data["GO_IDs"].append(node)
                found = n_children_with(node, QUALIFIER_WORDS)
                term_data["n_children_with"].append(found[0])
                if found[0] > 0:
                    generic2variants[node] = found[1]
    df = pl.DataFrame(term_data)
    return df, generic2variants

In [243]:
df, g2v = find_generic()

In [244]:
ontology2n = {}
for o in ROOTS.keys():
    filtered = df.join(go_data, on="GO_IDs").filter((pl.col("level") > 3) & (pl.col("n_children_with") > 0) & (pl.col("ontology") == o))
    print(o, filtered["n_children_with"].mean())

BP 2.8011789710069377
CC 1.4229934924078091
MF 1.6058720591094693


In [245]:
candidates = df.filter((pl.col("level") > 3) & (pl.col("n_children_with") > 5)).join(go_data.select("GO_IDs", "term"), on="GO_IDs")
print(candidates.shape)
list(candidates["term"])

(2030, 4)


['regulation of DNA recombination',
 'mannosyltransferase activity',
 'transition metal ion transport',
 'cell cycle checkpoint signaling',
 'sulfur amino acid metabolic process',
 'sulfur amino acid biosynthetic process',
 'sulfur amino acid catabolic process',
 'negative regulation of transcription by RNA polymerase II',
 'histone acetyltransferase complex',
 'cytoplasmic ubiquitin ligase complex',
 'nucleotide binding',
 'DNA secondary structure binding',
 'microtubule cytoskeleton organization',
 'nitrilase activity',
 'peroxisome targeting sequence binding',
 'polysaccharide biosynthetic process',
 'polysaccharide catabolic process',
 'adenine nucleotide transmembrane transporter activity',
 'response to reactive oxygen species',
 'RNA splicing, via transesterification reactions',
 'regulation of transcription by galactose',
 'DNA-directed RNA polymerase complex',
 'carbon catabolite regulation of transcription from RNA polymerase II promoter',
 'endonucleolytic cleavage of tricis

In [261]:
def children2parent(parent_term: str):
    dct = {"child": [], "parent": []}
    for n in nx.bfs_tree(GO, parent_term).nodes:
        dct["parent"].append(parent_term)
        dct["child"].append(n)
    return pl.DataFrame(dct)

sample = children2parent("GO:0019843")
s2 = children2parent("GO:0019543")

all_map = pl.concat([children2parent(c) for c in candidates["GO_IDs"]])
all_map.write_csv(mapping_file, separator="\t")
df.select("GO_IDs", "level").write_csv(level_file, separator="\t")