In [85]:
# Construction of dataset

import os, itertools, time, pickle, sys, glob, requests
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline  

In [167]:
def flatten(l):
    """Concatenate the items of every sub-iterable in `l` into one flat list (one level only)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

class Ontology():
    """In-memory view of an OWL ontology serialised as RDF/XML, parsed with ``xml.dom.minidom``.

    On construction the document is parsed once and the following attributes are filled:

    * ``mapping_dict`` / ``mapping_dict_inv`` -- class ID -> ``rdfs:label`` text and its inverse.
    * ``subclasses`` -- raw ``(node, node, relation)`` tuples found under ``rdfs:subClassOf``.
    * ``object_properties`` / ``data_properties`` -- property DOM elements.
    * ``triples`` -- ``(name, name, relation)`` string triples (see ``parse_triples``).
    * ``parents_dict`` -- child class name -> list of direct parent class names.
    * ``classes`` -- every class name, plus every name appearing in a triple.
    """

    def __init__(self, ontology):
        """Parse `ontology` (a file name or an open file object accepted by ``minidom.parse``)."""
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        self.construct_mapping_dict()

        self.parents_dict = {}
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def construct_mapping_dict(self):
        """Build ``mapping_dict`` (raw class ID -> label text) and its inverse.

        Only ``owl:Class`` elements that have a direct ``rdfs:label`` child are included.
        """
        self.mapping_dict = {}
        for el in self.root.getElementsByTagName("owl:Class"):
            labels = self.get_child_node(el, "rdfs:label")
            if labels:
                # check_coded=False: we want the raw ID here, not its (not yet known) label.
                self.mapping_dict[self.extract_ID(el, False)] = labels[0].firstChild.nodeValue
        self.mapping_dict_inv = {label: key for key, label in self.mapping_dict.items()}
        return

    def get_child_node(self, element, tag):
        """Return the direct child elements of `element` whose tag name equals `tag`."""
        return [e for e in element.childNodes if isinstance(e, minidom.Element) and e.tagName == tag]

    def has_attribute_value(self, element, attribute, value):
        """Return True when the part of `attribute` after the last ``#`` equals `value`."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self):
        """Return the subclass/restriction triples and rebuild ``parents_dict``.

        Each stored ``(a, b, c)`` tuple is returned as ``(b, a, c)``. For ``subclass_of``
        tuples ``b`` is the child and ``a`` the parent, so the result reads
        ``(child, parent, "subclass_of")``.

        ``parents_dict`` is rebuilt from scratch on every call; previously it was appended
        to, so every extra ``get_triples()`` call duplicated all parent entries.
        """
        subclasses = self.get_subclasses()
        parents = {}
        for (a, b, c) in subclasses:
            if c == "subclass_of" and a != "Thing" and b != "Thing":
                parents.setdefault(b, []).append(a)
        self.parents_dict = parents
        return [(b, a, c) for (a, b, c) in subclasses]

    def parse_triples(self, union_flag=0, subclass_of=True):
        """Collect ``(domain, range, property)`` triples, optionally with subclass triples.

        Args:
            union_flag: 0 emits one triple per (domain, range) combination; any other value
                emits a single triple whose domain and range names are joined with ``###``.
            subclass_of: when true, the result of ``get_subclass_triples`` is appended.

        Returns:
            De-duplicated list of triples; triples whose first or second member is
            ``"Thing"`` are dropped.
        """
        all_triples = []
        for prop in self.object_properties + self.data_properties:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            if not domain_children or not range_children:
                continue
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            # No ID directly on rdfs:domain / rdfs:range: fall back to the owl:Class
            # elements nested inside the first such child.
            if not domain_prop:
                domain_prop = self.filter_null(
                    [self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null(
                    [self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                prop_id = self.extract_ID(prop)
                if union_flag == 0:
                    all_triples.extend(
                        (dom, rng, prop_id) for dom, rng in itertools.product(domain_prop, range_prop))
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())

        return [triple for triple in set(all_triples) if triple[0] != "Thing" and triple[1] != "Thing"]

    def get_triples(self, union_flag=0, subclass_of=True):
        """Re-run ``parse_triples`` with the given options and return its result."""
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        """Collect raw ``(node, node, relation)`` tuples from every ``rdfs:subClassOf`` element.

        Three shapes are handled:

        * the element carries an ID itself -> ``(element, owning class, "subclass_of")``;
        * it wraps an ``owl:Class`` -> that class (or, if it has no ID, every nested
          ``owl:Class`` with an ID) paired with the owning class as ``"subclass_of"``;
        * it wraps an ``owl:Restriction`` with ``owl:onProperty`` and
          ``owl:someValuesFrom`` -> ``(owning class, target node, property name)``.

        `union_flag` is accepted but not used.
        """
        subclass_pairs = []
        for el in self.root.getElementsByTagName("rdfs:subClassOf"):
            owner = el.parentNode
            if self.extract_ID(el):
                subclass_pairs.append((el, owner, "subclass_of"))
                continue

            level1_class = self.get_child_node(el, "owl:Class")
            if level1_class:
                if self.extract_ID(level1_class[0]):
                    subclass_pairs.append((level1_class[0], owner, "subclass_of"))
                else:
                    level2classes = level1_class[0].getElementsByTagName("owl:Class")
                    subclass_pairs.extend(
                        [(elem, owner, "subclass_of") for elem in level2classes if self.extract_ID(elem)])
                continue

            restriction = el.getElementsByTagName("owl:Restriction")
            if not restriction:
                continue
            prop = self.get_child_node(restriction[0], "owl:onProperty")
            some_vals = self.get_child_node(restriction[0], "owl:someValuesFrom")
            if not prop or not some_vals:
                continue
            try:
                # The property name is either on owl:onProperty itself or on a nested
                # owl:ObjectProperty; likewise the target is either owl:someValuesFrom
                # itself or a nested owl:Class.
                prop_id = self.extract_ID(prop[0])
                if not prop_id:
                    prop_id = self.extract_ID(self.get_child_node(prop[0], "owl:ObjectProperty")[0])
                target = some_vals[0]
                if not self.extract_ID(target):
                    target = self.get_child_node(target, "owl:Class")[0]
                subclass_pairs.append((owner, target, prop_id))
            except IndexError as e:
                # The expected nested element is missing; report and skip this restriction.
                print ("error", e)
                continue
        return subclass_pairs

    def get_subclasses(self):
        """Return ``self.subclasses`` with nodes replaced by names, dropping incomplete tuples."""
        subclasses = [(self.extract_ID(a), self.extract_ID(b), c) for (a, b, c) in self.subclasses]
        return [el for el in subclasses if all(el)]

    def filter_null(self, data):
        """Return the truthy items of `data`, in order."""
        return [el for el in data if el]

    def extract_ID(self, element, check_coded = True):
        """Return the name of `element` taken from ``rdf:ID``, ``rdf:resource`` or ``rdf:about``.

        Only the part after the last ``#`` is kept. When `check_coded` is true and the ID
        looks coded (contains ``_`` and at least three digits) its label from
        ``mapping_dict`` is returned instead. A coded ID without a label falls through to
        the plain clean-up below rather than raising ``KeyError``.
        """
        element_id = (element.getAttribute("rdf:ID")
                      or element.getAttribute("rdf:resource")
                      or element.getAttribute("rdf:about"))
        element_id = element_id.split("#")[-1]
        if check_coded and "_" in element_id and sum(ch.isdigit() for ch in element_id) >= 3:
            if element_id in self.mapping_dict:
                return self.mapping_dict[element_id]
        return element_id.replace("UNDEFINED_", "").replace("DO_", "")

    def parse_classes(self):
        """Return the unique non-empty names of all classes and of all triple members."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        triple_members = list(set(itertools.chain.from_iterable(el[:-1] for el in self.triples)))
        return list(set(self.filter_null(class_elems + triple_members)))

    def get_classes(self):
        """Return the class names computed at construction time."""
        return self.classes

    def get_entities(self):
        """Return the unique non-empty names of all ``owl:Class`` elements in the document."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def _parse_properties(self, tag, type_name):
        """Return top-level property elements of one kind.

        These are the direct children of the root tagged `tag`, followed by the
        ``owl:FunctionalProperty`` and then ``owl:InverseFunctionalProperty`` children that
        have an ``rdf:type`` child whose ``rdf:resource`` ends in `type_name`.
        """
        found = list(self.get_child_node(self.root, tag))
        for wrapper_tag in ("owl:FunctionalProperty", "owl:InverseFunctionalProperty"):
            for el in self.get_child_node(self.root, wrapper_tag):
                type_nodes = self.get_child_node(el, "rdf:type")
                if any(self.has_attribute_value(t, "rdf:resource", type_name) for t in type_nodes):
                    found.append(el)
        return found

    def parse_data_properties(self):
        """Return the DOM elements of all datatype properties declared under the root."""
        return self._parse_properties("owl:DatatypeProperty", "DatatypeProperty")

    def parse_object_properties(self):
        """Return the DOM elements of all object properties declared under the root."""
        return self._parse_properties("owl:ObjectProperty", "ObjectProperty")

    def get_object_properties(self):
        """Return the unique non-empty names of the object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return the unique non-empty names of the datatype properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))




In [180]:
# Folder holding the locally cached Universal Sentence Encoder model. The default is a
# machine-specific path; set the USE_FOLDER environment variable to point elsewhere
# without editing the notebook.
USE_folder = os.environ.get("USE_FOLDER", "/home/vlead/USE")
# Folder containing the reference alignment files.
alignment_folder = "reference-alignment/"

# Load reference alignments 
def load_alignments(folder):
    """Read every alignment file in `folder` and return its entity pairs.

    Each file is parsed as XML; its ``entity1`` and ``entity2`` elements are paired in
    document order and the ``rdf:resource`` attribute of each is returned.

    Args:
        folder: directory path, with or without a trailing separator.

    Returns:
        List of ``(entity1_resource, entity2_resource)`` tuples over all files, in
        ``os.listdir`` order.
    """
    alignments = []
    for f in os.listdir(folder):
        # os.path.join instead of string concatenation, so a folder given without a
        # trailing slash no longer produces a broken path.
        doc = minidom.parse(os.path.join(folder, f))
        ls = zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2'))
        alignments.extend((a.getAttribute('rdf:resource'), b.getAttribute('rdf:resource')) for (a, b) in ls)
    return alignments
        
# Conference-track reference alignments: pairs of full entity URIs.
reference_alignments = load_alignments(alignment_folder)

# The anatomy reference alignment refers to classes by coded ID; translate each ID to
# its label and prefix it with the ontology name taken from the URI.
ra_anatomy_coded = load_alignments("../Anatomy/Alignments/")
ont1 = Ontology("../Anatomy/Ontologies/mouse.owl")
ont2 = Ontology("../Anatomy/Ontologies/human.owl")
ra_anatomy = []
for elem in ra_anatomy_coded:
    pre1 = elem[0].split("#")[0].split(".")[0].split("/")[-1]
    pre2 = elem[1].split("#")[0].split(".")[0].split("/")[-1]
    elem1 = elem[0].split("#")[-1]
    elem2 = elem[1].split("#")[-1]
    ra_anatomy.append((pre1 + "#" + ont1.mapping_dict[elem1], pre2 + "#" + ont2.mapping_dict[elem2]))

# Ground truth: "<ontology>#<entity>" pairs for the conference track, then anatomy.
gt_mappings = [tuple(uri.split("/")[-1] for uri in pair) for pair in reference_alignments]
gt_mappings.extend(ra_anatomy)

# One [ontology path, ontology path] entry per alignment file ("a-b.rdf" -> a.owl, b.owl).
# The directory is taken from `alignment_folder` (instead of repeating the literal) so it
# always matches the one the reference alignments were loaded from.
ontologies_in_alignment = [["conference_ontologies/" + el + ".owl" for el in l.split(".")[0].split("-")]
                           for l in os.listdir(alignment_folder)]
ontologies_in_alignment += [["../Anatomy/Ontologies/human.owl", "../Anatomy/Ontologies/mouse.owl"]]


In [112]:
# Combinatorial mapping generation

# Candidate mappings: for every aligned ontology pair except the last entry, pair up
# classes with classes, object properties with object properties and datatype properties
# with datatype properties, each name prefixed by its ontology's file stem.
all_mappings = []
for path1, path2 in ontologies_in_alignment[:-1]:
    ont1, ont2 = Ontology(path1), Ontology(path2)
    name1 = path1.split("/")[-1].split(".")[0]
    name2 = path2.split("/")[-1].split(".")[0]

    candidate_pairs = itertools.chain(
        itertools.product(ont1.get_entities(), ont2.get_entities()),
        itertools.product(ont1.get_object_properties(), ont2.get_object_properties()),
        itertools.product(ont1.get_data_properties(), ont2.get_data_properties()),
    )
    all_mappings.extend((name1 + "#" + left, name2 + "#" + right) for left, right in candidate_pairs)

# Label every candidate False, then mark (or add) every ground-truth mapping as True.
data = dict.fromkeys(all_mappings, False)
data.update(dict.fromkeys(set(gt_mappings), True))


error list index out of range
error list index out of range
error list index out of range
error list index out of range
error list index out of range
error list index out of range


In [118]:
# Abbreviation resolution preprocessing

abbreviations_dict = {}
final_dict = {}

for mapping in all_mappings:
    mapping = tuple(el.split("#")[1] for el in mapping)
    # Try both directions: an abbreviation (two or more consecutive capitals) on one side
    # that appears among the initials of the underscore-separated words on the other side.
    for side, short_name, long_name in (("left", mapping[0], mapping[1]),
                                        ("right", mapping[1], mapping[0])):
        is_abb = re.search("[A-Z][A-Z]+", short_name)
        if not is_abb:
            continue
        abbr_text = is_abb.group()
        long_words = long_name.split("_")
        abbreviation = "".join([w[0].upper() for w in long_words])
        if abbr_text not in abbreviation:
            continue

        # One initial per word, so positions in `abbreviation` are word indices.
        start = abbreviation.find(abbr_text)
        end = start + len(abbr_text)
        fullform = "_".join(long_words[start:end])
        print (side, mapping, abbreviation, fullform)

        # What is left of each name once the abbreviation / its expansion is removed.
        rest_first = " ".join([w for w in short_name.replace(abbr_text, "").split("_") if w]).lower()
        rest_second = " ".join(long_words[:start] + long_words[end:])
        final_dict.setdefault(abbr_text, []).append((fullform, rest_first, rest_second))

# Embed every non-empty remainder string once.
remainders = flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict])
keys = [el for el in list(set(remainders)) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))

# Score each candidate expansion by the similarity of the two remainders (0 if either is empty).
scored_dict = {}
for abbr in final_dict:
    sim_list = []
    for tup in final_dict[abbr]:
        if tup[1] and tup[2]:
            score = cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])
        else:
            score = 0
        sim_list.append((tup[0], tup[1], tup[2], score))
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x: x[-1], reverse=True)

# Keep the best-scoring expansion per abbreviation, and only those scoring above 0.9.
resolved_dict = {key: ranked[0] for key, ranked in scored_dict.items()}
filtered_dict = {key: best[0].replace("_", " ") for key, best in resolved_dict.items() if best[-1] > 0.9}


right ('Submitted_contribution', 'SC_Member') SC Submitted_contribution
right ('Steering_committee', 'SC_Member') SC Steering_committee
right ('Passive_conference_participant', 'PC_Chair') PCP Passive_conference
right ('Passive_conference_participant', 'PC_Member') PCP Passive_conference
right ('Organizing_committee', 'OC_Member') OC Organizing_committee
right ('Organizing_committee', 'OC_Chair') OC Organizing_committee
right ('Program_committee', 'PC_Chair') PC Program_committee
right ('Program_committee', 'PC_Member') PC Program_committee
left ('Chair_PC', 'PC_Chair') PC PC_Chair
left ('Member_PC', 'PC_Chair') PC PC_Chair
left ('PC_Chair', 'Program_Committee') PC Program_Committee
left ('PC_Chair', 'Program_Committee_member') PCM Program_Committee
left ('PC_Chair', 'Program_Chair') PC Program_Chair
left ('PC_Member', 'Program_Committee') PC Program_Committee
left ('PC_Member', 'Program_Committee_member') PCM Program_Committee
left ('PC_Member', 'Program_Chair') PC Program_Chair
left 

In [141]:
# Extracting USE embeddings

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def cos_sim(a,b):
    """Return the cosine similarity of vectors `a` and `b` (one minus SciPy's cosine distance)."""
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

def camel_case_split(identifier):
    """Split `identifier` at camel-case boundaries.

    A boundary lies between a lowercase and an uppercase letter, or between an uppercase
    letter and an uppercase letter that is followed by a lowercase one (so a run of
    capitals stays together). An empty string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    # The pattern has no capturing groups, so findall returns the full matches.
    return re.findall(boundary_pattern, identifier)

def parse(word):
    """Split `word` into tokens: first at camel-case boundaries, then at underscores."""
    tokens = []
    for chunk in camel_case_split(word):
        tokens.extend(chunk.split("_"))
    return tokens
    

# Gather "<ontology>#<name>" for every class, property and triple member of each
# ontology in all alignment entries except the last.
extracted_elems = []

for ont_name in set(flatten(ontologies_in_alignment[:-1])):
    ont = Ontology(ont_name)
    prefix = ont_name.split("/")[-1].split(".")[0] + "#"
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples())))
    extracted_elems += [prefix + elem for elem in entities + props + triples]

extracted_elems = list(set(extracted_elems))
# Text form of each element: its name split into space-separated tokens.
inp = [" ".join(parse(word.split("#")[1])) for word in extracted_elems]

# Resolving abbreviations to full forms
inp_resolved = []
for concept in inp:
    for key, full_form in filtered_dict.items():
        concept = concept.replace(key, full_form)
    # Lowering case except in abbreviations
    concept = " ".join(word if re.search("[A-Z][A-Z]+", word) else word.lower()
                       for word in concept.split(" "))
    inp_resolved.append(concept)


url = "https://montanaflynn-spellcheck.p.rapidapi.com/check/"

# SECURITY: the RapidAPI key is read from the environment instead of being hardcoded.
# The key that was previously committed in this cell should be treated as leaked and rotated.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable to run the spell-check step.")

headers = {
    'x-rapidapi-host': "montanaflynn-spellcheck.p.rapidapi.com",
    'x-rapidapi-key': rapidapi_key
    }

# Spell-check every concept. A concept is rewritten only when the service's suggestion
# differs from it AND it contains no abbreviation (two or more consecutive capitals).
inp_spellchecked = []
for concept in inp_resolved:
    querystring = {"text": concept}
    response = requests.request("GET", url, headers=headers, params=querystring).json()
    if response["suggestion"] != concept:
        corrections = response["corrections"]
        if re.search("[A-Z][A-Z]+", concept):
            # Abbreviated concepts are kept verbatim.
            resolved = concept
        else:
            resolved = " ".join(word.lower() for word in concept.split(" "))
            # Replace each flagged word by the service's first suggestion for it.
            for wrong, suggestions in corrections.items():
                resolved = resolved.replace(wrong.lower(), suggestions[0].lower())

        print (concept, resolved)
        inp_spellchecked.append(resolved)
    else:
        inp_spellchecked.append(concept)

print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

# Index 0 is reserved for the "<UNK>" placeholder.
extracted_elems = ["<UNK>"] + extracted_elems

# Only "has" is removed; a longer list ("has", "is", "a", "an", "the") was tried and dropped.
stopwords = ["has"]
# Drop stopwords from each concept, then turn hyphens into spaces.
inp_stemmed = [
    " ".join(word for word in elem.split() if word not in stopwords).replace("-", " ")
    for elem in inp_spellchecked
]

# Gather "<ontology>#<name>" for every class, property and triple member of the
# ontologies in the last alignment entry.
anatomy_terms = []
for ont_name in list(set(flatten(ontologies_in_alignment[-1:]))):
    ont = Ontology(ont_name)
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples())))
    anatomy_terms.extend([ont_name.split("/")[-1].split(".")[0] + "#" + elem for elem in entities + props + triples])

anatomy_terms = list(set(anatomy_terms))
inp_anatomy = [" ".join(parse(word.split("#")[1])) for word in anatomy_terms]

# Drop words that are Roman numerals or contain a digit, and spell "/" out as " or ".
# re.search takes (pattern, string): the arguments were previously swapped, so each word
# was compiled as a pattern and searched for inside the regex text itself. Digit-bearing
# words were never removed and a word containing regex metacharacters could raise re.error.
# Note that the Roman-numeral pattern also matches single capitals such as "C" or "M".
roman_regex = "^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$"
inp_anatomy = [" ".join([word.replace("/", " or ") for word in elem.split()
           if not re.search(roman_regex, word) and not re.search(r"\d", word)]) for elem in inp_anatomy]

extracted_elems.extend(anatomy_terms)
inp_stemmed.extend(inp_anatomy)

# Row 0 is an all-zero vector for "<UNK>"; the remaining rows are the USE embeddings of
# the cleaned element texts, in the same order as `extracted_elems[1:]`.
use_vectors = extractUSEEmbeddings(inp_stemmed)
embeds = np.array([np.zeros(512,)] + list(use_vectors))
# embeds = np.array([np.zeros(512,)] + list(extractUSEEmbeddings(inp_spellchecked)))
embeddings = dict(zip(extracted_elems, embeds))

# Parallel lookup structures over the dictionary's key order.
emb_vals = list(embeddings.values())
emb_indexer = {key: i for i, key in enumerate(embeddings)}
emb_indexer_inv = dict(enumerate(embeddings))

inp_anatomy

error list index out of range
LCD projector LCD projector
registeered applicant registered applicant
cheque exchequer
technically organises technically organists
IASTED member IASTED member
print hardcopy mailing manifests print hard copy mailing manifests
operating topicsystems operating topic systems
has a URL has a URL
contribution 1th-author contribution th-author
has VAT has VAT
conference www conference www
has an ISBN has an ISBN
accepts hardcopy submissions accepts hard copy submissions
flyer flayer
author cd proceedings included author cd proceedings included
SC member SC member
technically organised by technically organized by
organised by organized by
organisation organization
presentationed by presentation ed by
was a committe co-chair of was a committee co-chair of
registation deadline registration deadline
sponzorship sponsorship
nonauthor registration fee non author registration fee
accpet if room rating accept if room rating
scientifically organises scientifically organ

['gastroepiploic vein',
 'Brachial Plexus',
 'glomerular parietal epithelium',
 'foot digit bone',
 'Intestinal Epithelium',
 'Omotransverse',
 'Bronchus Smooth Muscle Tissue',
 'subscapular vein',
 'left adrenal gland',
 'Pancreatico-Duodenal Vein',
 'metacarpal or metatarsal bone',
 'Long Bone',
 'Respiratory System Part',
 'foot skin',
 'bile duct intrahepatic part',
 'incus',
 'gracilis',
 'bulboglandular',
 'Gracilis',
 'Epidermal Ridges',
 'sublingual gland',
 'apex of arytenoid',
 'Unipolar Neuron',
 'Roof of the Fourth Ventricle',
 'dental pulp',
 'Carpal Joint',
 'bulbourethral gland secretion',
 'Squamous Epithelium',
 'Bone Marrow Blood-Forming Cell',
 'Cornea',
 'ovarian vein',
 'Immature Lymphocyte',
 'phrenic artery',
 'Frontal Bone',
 'lower respiratory tract',
 'cecum',
 'Laryngeal Connective Tissue',
 'common bile duct',
 'External Maleolus',
 'forelimb skin',
 'Facial Nerve Nucleus',
 'Bile Duct Tissue',
 'Subependymal Tissue',
 'Corona Dentis',
 'T-Lymphoblast',
 'Mu

In [181]:
def path_to_root(elem, ont_mappings):
    """Collect the ancestors of `elem` by repeatedly following its first listed parent.

    At each step all parents of the current element are appended to the result, and the
    walk continues from the first of them. It stops at an element with no (or an empty)
    entry in `ont_mappings`.

    The walk is iterative and remembers visited elements, so a cyclic or self-referential
    parent map terminates instead of recursing without bound.

    Args:
        elem: starting element.
        ont_mappings: dict mapping an element to the list of its direct parents.

    Returns:
        List of ancestors in walk order; empty if `elem` has no parents.
    """
    output = []
    visited = set()
    current = elem
    while current not in visited and current in ont_mappings and ont_mappings[current]:
        visited.add(current)
        parents = ont_mappings[current]
        output.extend(parents)
        current = parents[0]
    return output


def get_one_hop_neighbours(ont, K=1):
    """Build, for each entity appearing in a triple of ontology `ont`, its neighbour list.

    The list starts with the entity itself, followed by the sorted, de-duplicated union of
    (a) every entity sharing a triple with it and (b) its ancestors from `path_to_root`.
    All names are prefixed with "<file stem of ont>#".

    Args:
        ont: path of the ontology file.
        K: accepted but not used.

    Returns:
        Dict mapping each prefixed entity name to its prefixed neighbour list.
    """
    ont_obj = Ontology(ont)
    entity_pairs = [(head, tail) for (head, tail, _) in ont_obj.get_triples()]

    # Every entity starts as its own first neighbour; each triple links both ways.
    neighbours_dict = {elem: [elem] for elem in list(set(flatten(entity_pairs)))}
    for head, tail in entity_pairs:
        neighbours_dict[head].append(tail)
        neighbours_dict[tail].append(head)

    # Add the ancestors of every entity that has a recorded parent.
    parents = ont_obj.parents_dict
    rootpath_dict = {elem: path_to_root(elem, parents) for elem in parents}
    for entity, neighbours in neighbours_dict.items():
        neighbours.extend(rootpath_dict.get(entity, []))

    ont_name = ont.split("/")[-1].split(".")[0]
    prefixed = {}
    for el, neighbours in neighbours_dict.items():
        # Keep the entity itself first; sort and de-duplicate everything after it.
        ordered = neighbours[:1] + sorted(set(neighbours[1:]))
        prefixed[ont_name + "#" + el] = [ont_name + "#" + e for e in ordered]
    return prefixed

# Neighbour lists for every ontology referenced by an alignment, keyed by file stem.
neighbours_dicts = {ont.split("/")[-1].split(".")[0]: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
# Longest neighbour list over all ontologies, and each list's length before padding.
max_neighbours = np.max(flatten([[len(el[e]) for e in el] for el in neighbours_dicts.values()]))
neighbours_lens = {ont: {key: len(neighbours_dicts[ont][key]) for key in neighbours_dicts[ont]}
                   for ont in neighbours_dicts}
# Right-pad every list with "<UNK>" up to max_neighbours. (The padding is built with a
# comprehension on purpose: multiplying a list by a NumPy integer does not repeat it.)
neighbours_dicts = {ont: {key: neighbours_dicts[ont][key] + ["<UNK>" for i in range(max_neighbours -len(neighbours_dicts[ont][key]))]
              for key in neighbours_dicts[ont]} for ont in neighbours_dicts}

# Reduce the ontology paths to bare file stems before saving.
ontologies_in_alignment = [[el.split("/")[-1].split(".")[0] for el in ont] for ont in ontologies_in_alignment]
# Use a context manager so the pickle is flushed and the handle closed; the file was
# previously opened and never closed.
with open("../data_generic_rootpath.pkl", "wb") as f:
    pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts, ontologies_in_alignment], f)



error list index out of range


In [9]:
# AML test
def is_test(test_onto, key):
    """Return True when the ontology names of mapping `key` (the parts before "#") form a tuple found in `test_onto`."""
    ontology_pair = tuple(el.split("#")[0] for el in key)
    return ontology_pair in test_onto

results = []
# Distinct (ontology, ontology) pairs occurring in the labelled data, evaluated 3 at a time.
all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))
for i in range(0, len(all_ont_pairs), 3):
    test_onto = all_ont_pairs[i:i+3]
    for ont_pair in test_onto:
        a, b, c = ont_pair[0], ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
        # Argument list built directly (not by splitting a string) so names containing
        # whitespace cannot break the command line.
        java_command = ["java", "-jar", "AML_v3.1/AgreementMakerLight.jar",
                        "-s", "conference_ontologies/" + a + ".owl",
                        "-t", "conference_ontologies/" + b + ".owl",
                        "-o", "AML-test-results/" + c + ".rdf", "-a"]
        process = subprocess.Popen(java_command, stdout=subprocess.PIPE)
        output, error = process.communicate()
    print (os.listdir("AML-test-results/"))
    pred_aml = load_alignments("AML-test-results/")
    pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
    # Build the lookup set once rather than once per ground-truth mapping.
    pred_set = set(pred_aml)

    # A prediction that is not a known candidate counts as a false positive instead of
    # raising KeyError.
    tp = len([elem for elem in pred_aml if data.get(elem, False)])
    fn = len([key for key in gt_mappings if key not in pred_set and is_test(test_onto, key)])
    fp = len(pred_aml) - tp

    # Score 0.0 instead of dividing by zero when a fold has no predictions or no positives.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1score = 2 * precision * recall / (precision + recall)
        f2score = 5 * precision * recall / (4 * precision + recall)
        f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    else:
        f1score = f2score = f0_5score = 0.0
    print (precision, recall, f1score, f2score, f0_5score)

    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)

    # Clear the output folder so the next fold only sees its own result files.
    for result_path in glob.glob('AML-test-results/*'):
        os.remove(result_path)

print ("Final Results:", np.mean(results, axis=0))

['confOf-sigkdd.rdf', 'iasted-sigkdd.rdf', 'cmt-ekaw.rdf']
0.8275862068965517 0.7272727272727273 0.7741935483870968 0.7453416149068324 0.8053691275167786
['confOf-iasted.rdf', 'conference-edas.rdf', 'cmt-sigkdd.rdf']
0.8148148148148148 0.5789473684210527 0.6769230769230768 0.6145251396648045 0.7534246575342465
['ekaw-sigkdd.rdf', 'conference-sigkdd.rdf', 'conference-confOf.rdf']
0.78125 0.6097560975609756 0.684931506849315 0.6377551020408163 0.7396449704142012
['confOf-edas.rdf', 'edas-iasted.rdf', 'cmt-conference.rdf']
0.7941176470588235 0.5094339622641509 0.6206896551724137 0.548780487804878 0.7142857142857143
['edas-sigkdd.rdf', 'conference-iasted.rdf', 'ekaw-iasted.rdf']
0.7916666666666666 0.48717948717948717 0.6031746031746031 0.5277777777777778 0.7037037037037036
['cmt-confOf.rdf', 'cmt-edas.rdf', 'edas-ekaw.rdf']
0.8181818181818182 0.5192307692307693 0.6352941176470589 0.5601659751037344 0.733695652173913
['conference-ekaw.rdf', 'confOf-ekaw.rdf', 'cmt-iasted.rdf']
0.79545454545

In [168]:
# Reduce every conference entry to bare ontology names and replace the last (anatomy)
# entry by ["human", "mouse"]. The last path component is taken with [-1] and the final
# row is sliced off before processing, so the cell also works when the names have already
# been stripped of their folder (index [1] raised IndexError in that case).
ontologies_in_alignment = [[el.split("/")[-1].split(".")[0] for el in ont] for ont in ontologies_in_alignment[:-1]] + [["human", "mouse"]]

In [8]:
# Preview the folds: consecutive groups of three ontology pairs, leaving out the last entry.
for i in range(0, len(ontologies_in_alignment) - 1, 3):
    test_onto = ontologies_in_alignment[i:i + 3]
    print (test_onto)

[['conference', 'ekaw'], ['confOf', 'ekaw'], ['ekaw', 'sigkdd']]
[['edas', 'sigkdd'], ['confOf', 'sigkdd'], ['iasted', 'sigkdd']]
[['confOf', 'iasted'], ['conference', 'iasted'], ['confOf', 'edas']]
[['edas', 'iasted'], ['conference', 'edas'], ['cmt', 'ekaw']]
[['cmt', 'confOf'], ['cmt', 'edas'], ['conference', 'sigkdd']]
[['cmt', 'sigkdd'], ['conference', 'confOf'], ['edas', 'ekaw']]
[['cmt', 'conference'], ['cmt', 'iasted'], ['ekaw', 'iasted']]


In [170]:
# Per ontology, the entities whose neighbour list has exactly 156 entries.
[[el for el, neighbours in per_ont.items() if len(neighbours) == 156] for per_ont in neighbours_dicts.values()]

[[], [], [], [], [], [], ['human#Brain_Part'], [], []]

In [20]:
def count_non_unk(elem):
    """Return how many entries of `elem` differ from the "<UNK>" padding token."""
    return sum(1 for l in elem if l != "<UNK>")
# Keep only entities with more than sys.argv[2] real (non-"<UNK>") neighbours and cut
# each kept list to its first sys.argv[1] entries.
# NOTE(review): inside a notebook kernel sys.argv holds the kernel's launch arguments --
# confirm this cell is only meant to run when the notebook is executed as a script.
neighbours_dicts = {
    ont: {el: neighbours[:int(sys.argv[1])]
          for el, neighbours in per_ont.items()
          if count_non_unk(neighbours) > int(sys.argv[2])}
    for ont, per_ont in neighbours_dicts.items()
}

(167, 1240)

In [27]:
# Shape check for an attention score: one 512-d vector (b) is subtracted from each of 22
# 512-d vectors (a) and every difference is projected to a scalar.
a = torch.randn(1, 22, 512)
b = torch.randn(1, 1, 512)
W_sub = nn.Linear(512, 1)

differences = a - b.squeeze(1).unsqueeze(1)
att_weights = W_sub(differences).squeeze(-1)

In [31]:
# Load the saved [fn, fp] pair. The file is opened in a context manager so the handle
# is closed after reading (it was previously left open).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("test_results.pkl", "rb") as results_file:
    fn, fp = pickle.load(results_file)

In [169]:
# Display the current value of max_neighbours.
max_neighbours

156

In [81]:
# Save the [fn, fp] pair. The context manager guarantees the data is flushed and the
# handle closed (the file object was previously never closed).
with open("test_best.pkl", "wb") as best_file:
    pickle.dump([fn, fp], best_file)

In [166]:
# Inspect the triples parsed from the mouse anatomy ontology.
Ontology("../Anatomy/Ontologies/mouse.owl").triples

[('tail muscle', 'muscle', 'subclass_of'),
 ('nail', 'Thing', 'subclass_of'),
 ('loop of henle ascending limb thick segment',
  'distal straight tubule macula densa',
  'part_of'),
 ('heart', 'mesocardium', 'part_of'),
 ('third ventricle', 'third ventricle choroid plexus', 'part_of'),
 ('cerebral cortex', 'neocortex', 'part_of'),
 ('outer renal medulla', 'outer medulla inner stripe', 'part_of'),
 ('external anal sphincter', 'skeletal muscle', 'subclass_of'),
 ('cerebellar cortex', 'cerebellar hemisphere', 'part_of'),
 ('upper leg nerve', 'leg nerve', 'subclass_of'),
 ('bronchus basement membrane', 'bronchus basal lamina', 'part_of'),
 ('internal spermatic artery', 'artery', 'subclass_of'),
 ('respiratory bronchiole', 'bronchiole', 'subclass_of'),
 ('kidney cortex', 'renal cortex tubule', 'part_of'),
 ('semicircular canal', 'semicircular duct', 'part_of'),
 ('hindbrain', 'myelencephalon', 'part_of'),
 ('hindlimb', 'foot', 'part_of'),
 ('limb muscle', 'muscle', 'subclass_of'),
 ('intrins

In [83]:
# Cosine similarity between the USE embeddings of two word-order variants of one phrase.
cos_sim(*extractUSEEmbeddings(["program committee member", "member program committee"]))

['gastroepiploic vein',
 'brachial plexus',
 'glomerular parietal epithelium',
 'foot digit bone',
 'intestinal epithelium',
 'omotransverse',
 'bronchus smooth muscle tissue',
 'subscapular vein',
 'left adrenal gland',
 'pancreatico-duodenal vein',
 'metacarpal/metatarsal bone',
 'long bone',
 'rejected paper',
 'respiratory system part',
 'foot skin',
 'bile duct intrahepatic part',
 'incus',
 'gracilis',
 'bulboglandular',
 'gracilis',
 'epidermal ridges',
 'sublingual gland',
 'apex of arytenoid',
 'unipolar neuron',
 'roof of the fourth ventricle',
 'dental pulp',
 'reject paper',
 'carpal joint',
 'bulbourethral gland secretion',
 'squamous epithelium',
 'bone marrow blood-forming cell',
 'ovarian vein',
 'cornea',
 'immature lymphocyte',
 'phrenic artery',
 'frontal bone',
 'lower respiratory tract',
 'cecum',
 'laryngeal connective tissue',
 'common bile duct',
 'external maleolus',
 'forelimb skin',
 'facial nerve nucleus',
 'bile duct tissue',
 'subependymal tissue',
 'confe

In [182]:
# Display the current list of ontology pairs.
ontologies_in_alignment

[['conference', 'ekaw'],
 ['confOf', 'ekaw'],
 ['ekaw', 'sigkdd'],
 ['edas', 'sigkdd'],
 ['confOf', 'sigkdd'],
 ['iasted', 'sigkdd'],
 ['confOf', 'iasted'],
 ['conference', 'iasted'],
 ['confOf', 'edas'],
 ['edas', 'iasted'],
 ['conference', 'edas'],
 ['cmt', 'ekaw'],
 ['cmt', 'confOf'],
 ['cmt', 'edas'],
 ['conference', 'sigkdd'],
 ['cmt', 'sigkdd'],
 ['conference', 'confOf'],
 ['edas', 'ekaw'],
 ['cmt', 'conference'],
 ['cmt', 'iasted'],
 ['ekaw', 'iasted'],
 ['human', 'mouse']]