Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change syn file format #113

Merged
merged 12 commits into from
May 8, 2023
7 changes: 4 additions & 3 deletions kubernetes/babel-outputs.k8s.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Kubernetes file for setting up a volume to use for Babel outputs.
#
# As of April 2022, Babel outputs take up around 95G, so a default
# of 200 Gi is probably good enough for the near future.
# As of April 2023, Babel outputs take up around 200 GB, including
# intermediate files (~50 GB), compendia (~30 GB) and synonyms (over 100 GB),
# so a request of 300Gi is probably enough.

apiVersion: v1
kind: PersistentVolumeClaim
Expand All @@ -14,5 +15,5 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 200Gi
storage: 300Gi
storageClassName: basic
23 changes: 18 additions & 5 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
synonym_factory = SynonymFactory(make_local_name(''))
ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv')
node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes)
with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
for slist in synonym_list:
node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes)
if node is not None:
Expand All @@ -226,10 +226,23 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
nw['ic'] = ic
nw['identifiers'] = [ {k[0]:v for k,v in nids.items()} for nids in node['identifiers']]
outf.write( nw )
synonyms = synonym_factory.get_synonyms(node)
if len(synonyms) > 0:
for p,o in synonyms:
sfile.write(f'{node["identifiers"][0]["identifier"]}\t{p}\t{o}\n')

# get_synonyms() returns tuples in the form ('http://www.geneontology.org/formats/oboInOwl#hasExactSynonym', 'Caudal articular process of eighteenth thoracic vertebra')
# But we're only interested in the synonyms themselves, so we can skip the relationship for now.
synonyms = [result[1] for result in synonym_factory.get_synonyms(node)]
synonyms_list = sorted(synonyms,key=lambda x:len(x))
try:
document = {"curie": node["identifiers"][0]["identifier"],
"names": synonyms_list,
"types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink:
if "label" in node["identifiers"][0]:
document["preferred_name"] = node["identifiers"][0]["label"]
sfile.write( document )
except Exception as ex:
print(f"Exception thrown while write_compendium() was generating {ofname}: {ex}")
print(node["type"])
print(node_factory.get_ancestors(node["type"]))
exit()

def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}):
"""We want to construct sets containing equivalent identifiers.
Expand Down
50 changes: 40 additions & 10 deletions src/createcompendia/leftover_umls.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from datetime import datetime
import json
import jsonlines
from pathlib import Path

from snakemake.logging import Logger
from bmt import Toolkit

from src.node import NodeFactory
from src.datahandlers import umls
from src.prefixes import UMLS
from src.categories import ACTIVITY, AGENT, DEVICE, DRUG, FOOD, SMALL_MOLECULE, PHYSICAL_ENTITY, PUBLICATION, PROCEDURE


def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, done):
def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, done, biolink_version):
"""
Search for "leftover" UMLS concepts, i.e. those that are defined and valid in MRCONSO but are not
mapped to a concept in Babel.
Expand Down Expand Up @@ -68,6 +70,8 @@ def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, um
# print(umls_ids_in_other_compendia)

# Load all the semantic types.
umls_type_by_id = dict()
preferred_name_by_id = dict()
types_by_id = dict()
types_by_tui = dict()
with open(mrsty, 'r') as inf:
Expand Down Expand Up @@ -161,6 +165,8 @@ def umls_type_to_biolink_type(umls_tui):
count_multiple_umls_type += 1
continue
biolink_type = list(biolink_types)[0]
umls_type_by_id[umls_id] = biolink_type
preferred_name_by_id[umls_id] = label

# Write this UMLS term to UMLS.txt as a single-identifier term.
cluster = {
Expand All @@ -180,19 +186,43 @@ def umls_type_to_biolink_type(umls_tui):
logging.info(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.\n")

# Write out synonyms for all IDs in this compendium.
synonym_ids = set()
count_synonyms = 0
with open(synonyms, 'r') as synonymsf, open(umls_synonyms, 'w') as umls_synonymsf:
# Collect synonyms for all IDs in this compendium.
synonyms_by_id = dict()
with open(synonyms, 'r') as synonymsf:
for line in synonymsf:
id, relation, synonym = line.rstrip().split('\t')
if id in umls_ids_in_this_compendium:
synonym_ids.add(id)
count_synonyms += 1
umls_synonymsf.write(f"{id}\t{relation}\t{synonym}\n")
# Add this synonym to the set of synonyms for this identifier.
if id not in synonyms_by_id:
synonyms_by_id[id] = set()
synonyms_by_id[id].add(synonym)

# We don't record the synonym relation (https://github.com/TranslatorSRI/Babel/pull/113#issuecomment-1516450124),
# so we don't need to write that out now.

logging.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
node_factory = NodeFactory('babel_downloads/UMLS/labels', biolink_version)
count_synonym_objs = 0
with jsonlines.open(umls_synonyms, 'w') as umls_synonymsf:
for id in synonyms_by_id:
document = {
"curie": id,
"names": list(sorted(list(synonyms_by_id[id]), key=lambda syn:len(syn))),
"types": [ t[8:] for t in node_factory.get_ancestors(umls_type_by_id[id])]
}

if id in preferred_name_by_id:
document["preferred_name"] = preferred_name_by_id[id]

umls_synonymsf.write(document)
count_synonym_objs += 1

logging.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
reportf.write(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.\n")

logging.info(f"Wrote {count_synonyms} synonyms for {len(synonym_ids)} UMLS IDs into the leftover UMLS synonyms file.")
reportf.write(f"Wrote {count_synonyms} synonyms for {len(synonym_ids)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out `done` file.
with open(done, 'w') as outf:
Expand Down
2 changes: 1 addition & 1 deletion src/datahandlers/ncbigene.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ def pull_ncbigene_labels_and_synonyms():
others = set(x[13].split('|'))
syns.update(others)
for syn in syns:
synfile.write(f'{gene_id}\t{syn}\n')
synfile.write(f'{gene_id}\thttp://www.geneontology.org/formats/oboInOwl#hasSynonym\t{syn}\n')


2 changes: 1 addition & 1 deletion src/snakefiles/leftover_umls.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ rule leftover_umls:
report = config['output_directory'] + "/reports/umls.txt",
done = config['output_directory'] + "/reports/umls_done"
run:
leftover_umls.write_leftover_umls(input.input_compendia, input.mrconso, input.mrsty, input.synonyms, output.umls_compendium, output.umls_synonyms, output.report, output.done)
leftover_umls.write_leftover_umls(input.input_compendia, input.mrconso, input.mrsty, input.synonyms, output.umls_compendium, output.umls_synonyms, output.report, output.done, config['biolink_version'])