### Convert predicted entities and relations into the evaluation format

Define the imports

In [28]:
import copy
import json

Define paths to the prediction files

In [None]:
# Input files, given relative to the notebook's working directory.
# Both are parsed with json.load further down.
PATH_NER_PREDICTIONS_EVAL_FORMAT = "../Predictions/NER/predicted_entities_eval_format.json"
PATH_RE_PREDICTIONS = "../Predictions/RE/predicted_relations.json"

Define output paths

In [30]:
# Full merged dictionary (entities, relations and the derived relation views).
PATH_OUTPUT_MERGED_PREDICTIONS = "../Predictions/predictions_eval_format.json"
# One export file per task; each receives a single field of the merged dictionary.
# NOTE(review): the NER file name still contains what look like unfilled template
# placeholders ("runID_systemDesc"), while the three RE files use "4_RobertaLarge"
# -- confirm this name is intended before submitting.
PATH_OUTPUT_61_NER = "../Eval/BIU_ONLP_T61_runID_systemDesc.json"
PATH_OUTPUT_621_BINARY_TAG_RE = "../Eval/BIU_ONLP_T621_4_RobertaLarge.json"
PATH_OUTPUT_622_TERNARY_TAG_RE = "../Eval/BIU_ONLP_T622_4_RobertaLarge.json"
PATH_OUTPUT_623_TERNARY_MENTION_RE = "../Eval/BIU_ONLP_T623_4_RobertaLarge.json"

Load the input files into dictionary variables

In [31]:
# Parse the NER predictions into a dictionary.
with open(PATH_NER_PREDICTIONS_EVAL_FORMAT, mode='r', encoding='utf-8') as file:
    ner_predictions = json.loads(file.read())

# Parse the RE predictions the same way.
with open(PATH_RE_PREDICTIONS, mode='r', encoding='utf-8') as file:
    re_predictions = json.loads(file.read())

#### Define the functions to process RE predictions

Map subjects and objects of predicted relations to entities

In [32]:
def map_predicted_relations_to_entities(re_data, ner_data):
    """Attach every predicted relation to the article it belongs to.

    Each relation identifies its document through a ``"<pmid> || <title>"``
    string and its subject/object through ``h_idx``/``t_idx``, which index
    into the ``entities`` list of the matching article in ``ner_data``.
    Articles are matched on their title; when several articles share that
    title, the PMID parsed from the relation is used to pick the right one
    (if none of them has that PMID as its key, all of them receive the
    relation, as before).

    Args:
        re_data: Iterable of dicts with the keys ``'title'``, ``'h_idx'``,
            ``'t_idx'`` and ``'r'``.
        ner_data: Dict mapping an article key to a dict holding
            ``'metadata'`` (with a ``'title'``), ``'entities'`` and,
            optionally, ``'relations'``.

    Returns:
        ``ner_data`` itself (mutated in place): every matched article has the
        relation appended to its ``'relations'`` list, which is created when
        missing.  Relations without a matching article are reported on stdout.

    Raises:
        IndexError: If ``h_idx`` or ``t_idx`` is out of range for the matched
            article's entity list.
    """
    for rel in re_data:
        # Split on the FIRST separator only, so a title that itself contains
        # '||' is kept whole instead of being cut at its first '||'.
        doc_pmid, separator, title = rel['title'].partition('||')
        doc_pmid = doc_pmid.strip()
        title = title.strip()
        h_idx = rel['h_idx']
        t_idx = rel['t_idx']
        predicate = rel['r'].lower()

        # Without a separator there is no title to match on; such a relation
        # falls through to the "no match" report below.
        matches = []
        if separator:
            matches = [
                (pmid, article)
                for pmid, article in ner_data.items()
                if article['metadata']['title'] == title
            ]

        # Duplicate titles: use the PMID carried by the relation to disambiguate.
        if len(matches) > 1:
            same_pmid = [match for match in matches if str(match[0]).strip() == doc_pmid]
            if same_pmid:
                matches = same_pmid

        for _, article in matches:
            subject_entity = article['entities'][h_idx]
            object_entity = article['entities'][t_idx]
            # setdefault: tolerate articles that carry no relation list yet.
            article.setdefault('relations', []).append({
                "subject_start_idx": subject_entity['start_idx'],
                "subject_end_idx": subject_entity['end_idx'],
                "subject_location": subject_entity['location'],
                "subject_text_span": subject_entity['text_span'],
                "subject_label": subject_entity['label'],
                "predicate": predicate,
                "object_start_idx": object_entity['start_idx'],
                "object_end_idx": object_entity['end_idx'],
                "object_location": object_entity['location'],
                "object_text_span": object_entity['text_span'],
                "object_label": object_entity['label']
            })

        if not matches:
            print(f'NO MATCH FOR RELATION: {rel}')

    return ner_data

In [33]:
# The mapping function appends to each article's relation list in place and
# returns the very dictionary it was given.  Passing a deep copy keeps
# `ner_predictions` exactly as loaded, so re-running this cell does not add
# every relation a second time.
merged_predictions = map_predicted_relations_to_entities(re_predictions, copy.deepcopy(ner_predictions))

Remove relations not defined in the annotation guidelines and complete conversion to evaluation format

In [34]:
# Whitelist of (subject_label, predicate, object_label) triples.
# remove_illegal_relations keeps a predicted relation only if its triple is an
# exact member of this list, so labels and predicates must match character
# for character (everything here is lower case).
LEGAL_RELATIONS = [
    ("ddf", "affect", "ddf"),
    ("microbiome", "is linked to", "ddf"),
    ("ddf", "target", "human"),
    ("drug", "change effect", "ddf"),
    ("ddf", "is a", "ddf"),
    ("microbiome", "located in", "human"),
    ("chemical", "influence", "ddf"),
    ("dietary supplement", "influence", "ddf"),
    ("ddf", "target", "animal"),
    ("chemical", "impact", "microbiome"),
    ("anatomical location", "located in", "animal"),
    ("microbiome", "located in", "animal"),
    ("chemical", "located in", "anatomical location"),
    ("bacteria", "part of", "microbiome"),
    ("ddf", "strike", "anatomical location"),
    ("drug", "administered", "animal"),
    ("bacteria", "influence", "ddf"),
    ("drug", "impact", "microbiome"),
    ("ddf", "change abundance", "microbiome"),
    ("microbiome", "located in", "anatomical location"),
    ("microbiome", "used by", "biomedical technique"),
    ("chemical", "produced by", "microbiome"),
    ("dietary supplement", "impact", "microbiome"),
    ("bacteria", "located in", "animal"),
    ("animal", "used by", "biomedical technique"),
    ("chemical", "impact", "bacteria"),
    ("chemical", "located in", "animal"),
    ("food", "impact", "bacteria"),
    ("microbiome", "compared to", "microbiome"),
    ("human", "used by", "biomedical technique"),
    ("bacteria", "change expression", "gene"),
    ("chemical", "located in", "human"),
    ("drug", "interact", "chemical"),
    ("food", "administered", "human"),
    ("ddf", "change abundance", "bacteria"),
    ("chemical", "interact", "chemical"),
    ("chemical", "part of", "chemical"),
    ("dietary supplement", "impact", "bacteria"),
    ("ddf", "interact", "chemical"),
    ("food", "impact", "microbiome"),
    ("food", "influence", "ddf"),
    ("bacteria", "located in", "human"),
    ("dietary supplement", "administered", "human"),
    ("bacteria", "interact", "chemical"),
    ("drug", "change expression", "gene"),
    ("drug", "impact", "bacteria"),
    ("drug", "administered", "human"),
    ("anatomical location", "located in", "human"),
    ("dietary supplement", "change expression", "gene"),
    ("chemical", "change expression", "gene"),
    ("bacteria", "interact", "bacteria"),
    ("drug", "interact", "drug"),
    ("microbiome", "change expression", "gene"),
    ("bacteria", "interact", "drug"),
    ("food", "change expression", "gene")
]

def remove_illegal_relations(data):
    """Return a filtered copy of the predictions in evaluation format.

    For every article the metadata is carried over, each entity is rebuilt
    with its five fields, and a relation is kept only when its
    (subject_label, predicate, object_label) triple appears in
    LEGAL_RELATIONS.  In the rebuilt entities and kept relations the label
    'ddf' is written as 'DDF'; all other labels are passed through.

    Counts of seen/kept/discarded relations and the distinct discarded
    triples are printed to stdout.

    Args:
        data: Dict mapping an article key to a dict with 'metadata',
            'entities' and 'relations'.

    Returns:
        A new dict with the same keys, each holding 'metadata', 'entities'
        and 'relations'.
    """
    def _eval_label(label):
        # Only this one label is re-cased for the output.
        return 'DDF' if label == 'ddf' else label

    cleaned = {}
    n_total = 0
    n_kept = 0
    n_dropped = 0
    dropped_triples = set()

    for pmid, article in data.items():
        metadata = article['metadata']

        converted_entities = [
            {
                "start_idx": entity["start_idx"],
                "end_idx": entity["end_idx"],
                "location": entity["location"],
                "text_span": entity["text_span"],
                "label": _eval_label(entity["label"]),
            }
            for entity in article['entities']
        ]

        converted_relations = []
        for relation in article['relations']:
            n_total += 1
            triple = (relation["subject_label"], relation["predicate"], relation["object_label"])

            # Guard clause: anything outside the whitelist is only tallied.
            if triple not in LEGAL_RELATIONS:
                n_dropped += 1
                dropped_triples.add(triple)
                continue

            n_kept += 1
            converted_relations.append({
                "subject_start_idx": relation["subject_start_idx"],
                "subject_end_idx": relation["subject_end_idx"],
                "subject_location": relation["subject_location"],
                "subject_text_span": relation["subject_text_span"],
                "subject_label": _eval_label(relation["subject_label"]),
                "predicate": relation["predicate"],
                "object_start_idx": relation["object_start_idx"],
                "object_end_idx": relation["object_end_idx"],
                "object_location": relation["object_location"],
                "object_text_span": relation["object_text_span"],
                "object_label": _eval_label(relation["object_label"]),
            })

        cleaned[pmid] = {
            'metadata': metadata,
            'entities': converted_entities,
            'relations': converted_relations,
        }

    # Summary report.
    print(f'total_rels: {n_total}')
    print(f'kept_rels: {n_kept}')
    print(f'discared_rels: {n_dropped}')
    print()
    print(f'discared_rels_set: {dropped_triples}')
    for triple in dropped_triples:
        print(triple)

    return cleaned


In [35]:
# Builds a new dictionary holding only whitelisted relations; the call also
# prints how many relations were kept and discarded.
dump_dict = remove_illegal_relations(merged_predictions)

total_rels: 519
kept_rels: 514
discared_rels: 5

discared_rels_set: {('gene', 'influence', 'ddf'), ('chemical', 'change effect', 'ddf'), ('chemical', 'affect', 'ddf'), ('human', 'target', 'human'), ('microbiome', 'is linked to', 'human')}
('gene', 'influence', 'ddf')
('chemical', 'change effect', 'ddf')
('chemical', 'affect', 'ddf')
('human', 'target', 'human')
('microbiome', 'is linked to', 'human')


Sort entities and relations

In [36]:
def sort_entities(release_dict):
    """Sort every article's entity list in place.

    Entities whose location is "title" come first, all others after them;
    within each group entities are ordered by ascending ``start_idx``.
    Nothing is returned.
    """
    for article in release_dict.values():
        # False sorts before True, so title entities lead.
        article["entities"].sort(
            key=lambda entity: (entity["location"] != "title", entity["start_idx"])
        )

In [37]:
# Reorders the entity lists inside dump_dict in place; the call returns nothing.
sort_entities(dump_dict)

In [38]:
def sort_relations(release_dict):
    """Sort every article's relation list in place.

    Relations whose subject is located in the "title" come first, all others
    after them; within each group relations are ordered by ascending
    ``subject_start_idx``.  Nothing is returned.
    """
    for article in release_dict.values():
        # False sorts before True, so relations with a title subject lead.
        article["relations"].sort(
            key=lambda relation: (
                relation["subject_location"] != "title",
                relation["subject_start_idx"],
            )
        )

In [39]:
# Reorders the relation lists inside dump_dict in place; the call returns nothing.
sort_relations(dump_dict)

Generate Binary Tag Based Relations

In [40]:
def add_binary_tag_based_relations_to_release_dict(release_dict):
    """Add the distinct (subject_label, object_label) pairs of each article.

    For every article the pairs found in its ``"relations"`` list are stored
    under ``"binary_tag_based_relations"`` as dicts with the keys
    ``"subject_label"`` and ``"object_label"``.

    The pairs are emitted in order of first appearance in ``"relations"``, so
    the result is reproducible from run to run.  Pairs already present in an
    existing ``"binary_tag_based_relations"`` list are not added again, which
    makes repeated calls harmless.

    Args:
        release_dict: Dict mapping an article key to a dict that holds a
            ``"relations"`` list.  Modified in place.

    Returns:
        None.
    """
    fields = ("subject_label", "object_label")
    for article in release_dict.values():
        # dict.fromkeys de-duplicates like a set but keeps insertion order;
        # iterating a set of strings yields an order that can differ per run.
        pairs = dict.fromkeys(
            tuple(relation[field] for field in fields)
            for relation in article["relations"]
        )
        target = article.setdefault("binary_tag_based_relations", [])
        present = {tuple(entry.get(field) for field in fields) for entry in target}
        target.extend(
            dict(zip(fields, pair)) for pair in pairs if pair not in present
        )

In [41]:
# Adds the "binary_tag_based_relations" list to every article of dump_dict (in place).
add_binary_tag_based_relations_to_release_dict(dump_dict)

Generate Ternary Tag Based Relations

In [42]:
def add_ternary_tag_based_relations_to_release_dict(release_dict):
    """Add the distinct (subject_label, predicate, object_label) triples of each article.

    For every article the triples found in its ``"relations"`` list are
    stored under ``"ternary_tag_based_relations"`` as dicts with the keys
    ``"subject_label"``, ``"predicate"`` and ``"object_label"``.

    The triples are emitted in order of first appearance in ``"relations"``,
    so the result is reproducible from run to run.  Triples already present
    in an existing ``"ternary_tag_based_relations"`` list are not added
    again, which makes repeated calls harmless.

    Args:
        release_dict: Dict mapping an article key to a dict that holds a
            ``"relations"`` list.  Modified in place.

    Returns:
        None.
    """
    fields = ("subject_label", "predicate", "object_label")
    for article in release_dict.values():
        # dict.fromkeys de-duplicates like a set but keeps insertion order;
        # iterating a set of strings yields an order that can differ per run.
        triplets = dict.fromkeys(
            tuple(relation[field] for field in fields)
            for relation in article["relations"]
        )
        target = article.setdefault("ternary_tag_based_relations", [])
        present = {tuple(entry.get(field) for field in fields) for entry in target}
        target.extend(
            dict(zip(fields, triplet)) for triplet in triplets if triplet not in present
        )

In [43]:

# Adds the "ternary_tag_based_relations" list to every article of dump_dict (in place).
add_ternary_tag_based_relations_to_release_dict(dump_dict)

Generate Ternary Mention Based Relations

In [44]:
def add_ternary_mention_based_relations_to_release_dict(release_dict):
    """Add the distinct mention-level relation tuples of each article.

    For every article the distinct combinations of ``subject_text_span``,
    ``subject_label``, ``predicate``, ``object_text_span`` and
    ``object_label`` found in its ``"relations"`` list are stored under
    ``"ternary_mention_based_relations"`` as dicts with exactly those keys.

    The tuples are emitted in order of first appearance in ``"relations"``,
    so the result is reproducible from run to run.  Tuples already present in
    an existing ``"ternary_mention_based_relations"`` list are not added
    again, which makes repeated calls harmless.

    Args:
        release_dict: Dict mapping an article key to a dict that holds a
            ``"relations"`` list.  Modified in place.

    Returns:
        None.
    """
    fields = (
        "subject_text_span",
        "subject_label",
        "predicate",
        "object_text_span",
        "object_label",
    )
    for article in release_dict.values():
        # dict.fromkeys de-duplicates like a set but keeps insertion order;
        # iterating a set of strings yields an order that can differ per run.
        mentions = dict.fromkeys(
            tuple(relation[field] for field in fields)
            for relation in article["relations"]
        )
        target = article.setdefault("ternary_mention_based_relations", [])
        present = {tuple(entry.get(field) for field in fields) for entry in target}
        target.extend(
            dict(zip(fields, mention)) for mention in mentions if mention not in present
        )

In [45]:
# Adds the "ternary_mention_based_relations" list to every article of dump_dict (in place).
add_ternary_mention_based_relations_to_release_dict(dump_dict)

In [46]:
# Persist the complete dictionary (all fields of every article) as indented JSON.
with open(PATH_OUTPUT_MERGED_PREDICTIONS, mode='w', encoding='utf-8') as file:
    file.write(json.dumps(dump_dict, indent=2))

In [47]:
# Export holding only the 'entities' list of each article.
task_61 = {
    pmid: {'entities': article['entities']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_61_NER, mode='w', encoding='utf-8') as file:
    json.dump(task_61, file, indent=2)

In [48]:
# Export holding only the 'binary_tag_based_relations' list of each article.
task_621 = {
    pmid: {'binary_tag_based_relations': article['binary_tag_based_relations']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_621_BINARY_TAG_RE, mode='w', encoding='utf-8') as file:
    json.dump(task_621, file, indent=2)

In [49]:
# Export holding only the 'ternary_tag_based_relations' list of each article.
task_622 = {
    pmid: {'ternary_tag_based_relations': article['ternary_tag_based_relations']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_622_TERNARY_TAG_RE, mode='w', encoding='utf-8') as file:
    json.dump(task_622, file, indent=2)

In [50]:
# Export holding only the 'ternary_mention_based_relations' list of each article.
task_623 = {
    pmid: {'ternary_mention_based_relations': article['ternary_mention_based_relations']}
    for pmid, article in dump_dict.items()
}

with open(PATH_OUTPUT_623_TERNARY_MENTION_RE, mode='w', encoding='utf-8') as file:
    json.dump(task_623, file, indent=2)