In [17]:
# Construction of dataset

import os, itertools, time
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np


USE_folder = "/home/vlead/USE"
alignment_folder = "reference-alignment/"

# Load reference alignments 
def load_alignments(folder):
    alignments = []
    for f in os.listdir(folder):
        doc = minidom.parse(folder + f)
        ls = list(zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2')))
        alignments.extend([(a.getAttribute('rdf:resource'), b.getAttribute('rdf:resource')) for (a,b) in ls])
    return alignments
        
reference_alignments = load_alignments(alignment_folder)

In [2]:
flatten = lambda l: [item for sublist in l for item in sublist]

class Ontology():
    def __init__(self, ontology):
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()
    
    def get_child_node(self, element, tag):
        return [e for e in element._get_childNodes() if type(e)==minidom.Element and e._get_tagName() == tag]
        
    def has_attribute_value(self, element, attribute, value):
        return True if element.getAttribute(attribute).split("#")[-1] == value else False
    
    def get_subclass_triples(self):
        return [(a,b,"subclass_of") for (a,b) in self.get_subclasses()]
    
    def parse_triples(self, union_flag=0, subclass_of=True):
        obj_props = self.object_properties
        data_props = self.data_properties
        props = obj_props + data_props
        all_triples = []
        for prop in props:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            if not domain_children or not range_children:
                continue
            if not domain_prop:
                domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null([self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                if union_flag == 0:
                    all_triples.extend([(el[0], el[1], self.extract_ID(prop)) for el in list(itertools.product(domain_prop, range_prop))])
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), self.extract_ID(prop)))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return list(set(all_triples))
    
    def get_triples(self, union_flag=0, subclass_of=True):
        if union_flag == 0:
            return self.triples
        else:
            return self.parse_triples(union_flag = 1, subclass_of = False)

    def parse_subclasses(self, union_flag=0):
        subclasses = self.root.getElementsByTagName("rdfs:subClassOf")
        subclass_pairs = []
        for el in subclasses:
            inline_subclasses = self.extract_ID(el)
            if inline_subclasses:
                subclass_pairs.append((el, el.parentNode))
            else:
                level1_class = self.get_child_node(el, "owl:Class")
                if not level1_class:
                    continue
                if self.extract_ID(level1_class[0]):
                    subclass_pairs.append((level1_class[0], el.parentNode))
                else:
                    level2classes = level1_class[0].getElementsByTagName("owl:Class")
                    
                    subclass_pairs.extend([(elem, el.parentNode) for elem in level2classes if self.extract_ID(elem)])
        return subclass_pairs
        
    def get_subclasses(self):
        return [(self.extract_ID(a), self.extract_ID(b)) for (a,b) in self.subclasses]
    
    def filter_null(self, data):
        return [el for el in data if el]
    
    def extract_ID(self, element):
        element_id = element.getAttribute("rdf:ID") or element.getAttribute("rdf:resource") or element.getAttribute("rdf:about")
        return element_id.split("#")[-1]
    
    def parse_classes(self):
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        subclass_classes = list(set(flatten([el[:-1] for el in self.triples])))
        return list(set(self.filter_null(class_elems + subclass_classes)))
    
    def get_classes(self):
        return self.classes
    
    def get_entities(self):
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def parse_data_properties(self):
        data_properties = [el for el in self.get_child_node(self.root, 'owl:DatatypeProperty')]
        fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_data_properties = [el for el in fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        inv_fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_data_properties = [el for el in inv_fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        return data_properties + fn_data_properties + inv_fn_data_properties
        
    def parse_object_properties(self):
        obj_properties = [el for el in self.get_child_node(self.root, 'owl:ObjectProperty')]
        fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_obj_properties = [el for el in fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        inv_fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_obj_properties = [el for el in inv_fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        return obj_properties + fn_obj_properties + inv_fn_obj_properties
    
    def get_object_properties(self):
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))
    
    def get_data_properties(self):
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))




In [6]:
# Data Generation for RotatE embeddings from OpenKE

ontologies_in_alignment = [l.split(".")[0].split("-") for l in os.listdir("reference-alignment/")]

def generate(ontology, concept_prefix, id_prefix=[0,0]):
    ont = Ontology(ontology)
    classes_dict = {concept_prefix+"#"+elem : str(id_prefix[0] + i) for i,elem in enumerate(ont.get_classes())} 
    props = ont.get_object_properties() + ont.get_data_properties() + ["subclass_of"]
    prop_dict = {concept_prefix+"#"+elem : str(id_prefix[1] + i) for i,elem in enumerate(props)} 
    triplets = ["\t".join((classes_dict[concept_prefix+"#"+el[0]], classes_dict[concept_prefix+"#"+el[1]], prop_dict[concept_prefix+"#"+el[2]]))
                for el in ont.get_triples()]
    return classes_dict, prop_dict, triplets

    
for l in ontologies_in_alignment:
    ont1 = "conference_ontologies/" + l[0] + ".owl"
    ont2 = "conference_ontologies/" + l[1] + ".owl"
    
    benchmark_dir = "OpenKE/benchmarks/" + l[0] + "-" + l[1]
    if not os.path.isdir(benchmark_dir):
        os.mkdir(benchmark_dir)
    
    c1, p1, t1 = generate(ont1, l[0])
    c2, p2, t2 = generate(ont2, l[1], [len(c1), len(p1)])
    
    classes_dict = {**c1, **c2}
    prop_dict = {**p1, **p2}
    triplets = t1 + t2
    
    entity_str = str(len(classes_dict)) + "\n" + "\n".join(["\t".join(elem) for elem in classes_dict.items()])
    prop_str = str(len(prop_dict)) + "\n" + "\n".join(["\t".join(elem) for elem in prop_dict.items()])
    triplets_str = str(len(triplets)) + "\n" + "\n".join(triplets)

    open(benchmark_dir + "/entity2id.txt", "w+").write(entity_str)
    open(benchmark_dir + "/relation2id.txt", "w+").write(prop_str)
    open(benchmark_dir + "/train2id.txt", "w+").write(triplets_str)

In [26]:
# Extracting USE embeddings
def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def cos_sim(a,b):
    return 1 - spatial.distance.cosine(a, b)

def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0).lower() for m in matches]

def parse(word):
    return flatten([el.split("_") for el in camel_case_split(word)])
    

extracted_elems = []

for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples(subclass_of=False))))
    extracted_elems.extend(entities + props + triples)

extracted_elems = list(set(extracted_elems))

entities_parsed = [parse(word) for word in extracted_elems]
inp = [" ".join(e) for e in entities_parsed]
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
X = vectorizer.fit_transform(inp)
word2idx_tfidf = {word: i for (i, word)  in enumerate(vectorizer.get_feature_names())}
entity2idx_tfidf = {word: i for (i, word)  in enumerate(extracted_elems)}


extracted_elems.extend(flatten(entities_parsed))
extracted_elems = list(set(extracted_elems))

print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

embeds = extractUSEEmbeddings(extracted_elems)
embeddings = dict(zip(extracted_elems, embeds))    

Total number of extracted unique classes and properties from entire RA set:  1228


In [54]:
# Feature generation

features_dict = {}

def get_tfidf_score(word, phrase):
    temp = X[entity2idx_tfidf[phrase]]
    return temp[:,word2idx_tfidf[word]][0,0]

# max_weight = 0

def get_property_features(triple):
#     weight = np.sum([get_tfidf_score(word, triple[-1]) for word in parse(triple[-1])])
    feats = []
#     feats.append(weight*embeddings[triple[-1]]) # Property name    
    feats.append(np.sum([get_tfidf_score(word, triple[-1]) * embeddings[word] for word in parse(triple[-1])]))
    feats.append([embeddings[el] for el in triple[0].split("###")]) # Domain 
    feats.append([embeddings[el] for el in triple[1].split("###")]) # Range
    
#     global max_weight
#     if weight > max_weight:
#         max_weight = weight
    return feats
    
def get_entity_features(entity):
    feats = []
#     weight = np.sum([get_tfidf_score(word, entity) for word in parse(entity)])
#     feats.append(weight*embeddings[entity]) # Entity name
    feats.append(np.sum([get_tfidf_score(word, entity) * embeddings[word] for word in parse(entity)]))
#     global max_weight
#     if weight > max_weight:
#         max_weight = weight
    return feats
    
for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    
    triples = ont.get_triples(union_flag=1, subclass_of=False)
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()

    for triple in triples:
        features_dict[triple[-1]] = {"type": "triple", "features": get_property_features(triple)}
    for entity in entities:
        features_dict[entity] = {"type": "entity", "features": get_entity_features(entity)}
    for prop in props:
        if prop not in features_dict:
            features_dict[prop] = {"type": "property", "features": get_entity_features(prop)}
#     for entity in features_dict:
#         tmp = np.array(features_dict[entity]["features"][0])
#         features_dict[entity]["features"][0] = (tmp / max_weight)


In [55]:
# Combinatorial mapping generation

all_mappings = []
for l in ontologies_in_alignment:
    ont1 = Ontology("conference_ontologies/" + l[0] + ".owl")
    ont2 = Ontology("conference_ontologies/" + l[1] + ".owl")
    
    ent1 = ont1.get_entities()
    ent2 = ont2.get_entities()
    
    obj1 = ont1.get_object_properties()
    obj2 = ont2.get_object_properties()
    
    data1 = ont1.get_data_properties()
    data2 = ont2.get_data_properties()
    
    mappings = list(itertools.product(ent1, ent2)) + list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))
    
    all_mappings.extend([(l[0] + "#" + el[0], l[1] + "#" + el[1]) for el in mappings])
    

In [56]:
gt_mappings = [tuple([elem.split("/")[-1] for elem in el]) for el in reference_alignments]

def calc_sim_score(mapping):
    features = embedify(mapping)
    if features[0]["type"] != "triple" and features[1]["type"] != "triple":
        return cos_sim(*tuple([el["features"] for el in features]))
    elif features[0]["type"] == "triple"  and features[1]["type"] == "triple":
        sim_score = 0
        names, domains, ranges = [], [], []
        
        for elem in [el["features"] for el in features]:
            names.append(elem[0])
            domains.append(elem[1])
            ranges.append(elem[2])
        name_score = cos_sim(names[0], names[1])
#         domain_score = np.mean([cos_sim(*elem) for elem in itertools.product(domains[0], domains[1])])
#         ranges_score = np.mean([cos_sim(*elem) for elem in itertools.product(ranges[0], ranges[1])])
#         return ((name_score + domain_score + ranges_score)/(len(domains[0]) + len(ranges[0]) +  ))
#         return (name_score + domain_score + ranges_score)/3
        return name_score
    return cos_sim(*tuple([el["features"][0] for el in features]))
        
        

def embedify(mapping):
    removed_hash = tuple([elem.split("#")[1] for elem in mapping])
    return (features_dict[removed_hash[0]], features_dict[removed_hash[1]])

data = {}
for mapping in all_mappings:
    if mapping in gt_mappings:
        data[(mapping[0], mapping[1])] = (calc_sim_score(mapping), "T")
    else:
        data[(mapping[0], mapping[1])] = (calc_sim_score(mapping), "F")


In [None]:
def is_valid(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) not in test_onto

data = OrderedDict(sorted(data.items(),  key=lambda x:x[1][0], reverse=True))
all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))
results = []
failed = []
for i in list(range(0, len(all_ont_pairs), 3)):
    test_onto = all_ont_pairs[i:i+3]
    
    train_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) not in test_onto}
    test_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) in test_onto}

    opt_threshold, optimum_metrics = -1000, [-1000 for i in range(5)]
    t = time.time()
    for j,threshold in enumerate(np.arange(0.15, 1.0005, 0.01)):
        print ("threshold =", threshold, "Time = ", time.time()-t) 
        pred = []
        for i,key in enumerate(train_data):
            if train_data[key][0] > threshold:
                pred.append(key)

        tp = len([elem for elem in pred if train_data[elem][1] == "T"])
        fn = len([key for key in gt_mappings if key not in set(pred) and is_valid(test_onto, key)])
        fp = len([elem for elem in pred if train_data[elem][1] == "F"])

        try:
            precision = tp/(tp+fp)
            recall = tp/(tp+fn)
            f1score = 2 * precision * recall / (precision + recall)
            f2score = 5 * precision * recall / (4 * precision + recall)
            f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
        except Exception as e:
            print (e)
            continue
        print (precision, recall, f1score, f2score, f0_5score)

        if f1score > optimum_metrics[2]:
            optimum_metrics = [precision, recall, f1score, f2score, f0_5score]
            opt_threshold = threshold
    
    threshold = opt_threshold
    pred = []
    for i,key in enumerate(test_data):
        if test_data[key][0] > threshold:
            pred.append(key)

    curr = dict()
    
    curr["fn"] = [key for key in gt_mappings if key not in set(pred) and not is_valid(test_onto, key)]
    curr["fp"] = [elem for elem in pred if test_data[elem][1] == "F"]
    tp = len([elem for elem in pred if test_data[elem][1] == "T"])
    fn = len(curr["fn"])
    fp = len(curr["fp"])

    
    
    try:
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1score = 2 * precision * recall / (precision + recall)
        f2score = 5 * precision * recall / (4 * precision + recall)
        f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    except Exception as e:
        print (e)
        pass
            
    metrics = [precision, recall, f1score, f2score, f0_5score]
    failed.append(curr)
    results.append(metrics)

print ("Final Results:", np.mean(results, axis=0))

threshold = 0.15 Time =  2.3603439331054688e-05
0.0028753784825843553 0.967032967032967 0.005733708341025335 0.014207909069381956 0.0035915533196019906
threshold = 0.16 Time =  3.675703525543213


In [None]:
# AML test
results = []
for i in list(range(0, len(all_ont_pairs), 3)):
    test_onto = all_ont_pairs[i:i+3]
    for ont_pair in test_onto:
        a, b, c = ont_pair[0], ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
        java_command = "java -jar AML_v3.1/AgreementMakerLight.jar -s conference_ontologies/" + a + ".owl" + \
                            " -t conference_ontologies/" + b + ".owl -o AML-test-results/" + c + ".rdf -a"
    #     print (java_command)
        process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
    #     print (a,b,c)

    pred_aml = load_alignments("AML-test-results/")
    pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
    tp = len([elem for elem in pred_aml if data[elem][1] == "T"])
    fn = len([key for key in gt_mappings if key not in set(pred_aml) and not is_valid(test_onto, key)])
    fp = len([elem for elem in pred_aml if data[elem][1] == "F"])

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
    rm_command = "rm -rf AML-test-results/*"
    process = subprocess.Popen(rm_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    
print ("Final Results:", np.mean(results, axis=0))

In [24]:
opt_threshold

0.7700000000000006

[]

In [554]:
cos_sim([[[1,2,3], [3,4,5]]], [[[32,14,3], [42, 53, 52]]])

ValueError: Input vector should be 1-D.

In [576]:
list(itertools.product([[1,2], [3,4]], [[1,2], [3,4]]))

[([1, 2], [1, 2]), ([1, 2], [3, 4]), ([3, 4], [1, 2]), ([3, 4], [3, 4])]

In [14]:
len(list(set(flatten([parse(word) for word in extracted_elems])))), len(word2idx_tfidf.keys())

(533, 533)

In [639]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(inp)
word2idx_tfidf = {word: i for (i, word)  in enumerate(vectorizer.get_feature_names())}
entity2idx_tfidf = {word: i for (i, word)  in enumerate(extracted_elems)}


{'paperPresentedAs': 0,
 'ParallelAndDistributedComputingTopic': 1,
 'finalizePaperAssignment': 2,
 'is_signed_by': 3,
 'reviewCriteriaEnteredBy': 4,
 'ReviewForm': 5,
 'listsEvent': 6,
 'Building': 7,
 'is_held_after': 8,
 'partOf': 9,
 'was_a_committe_co-chair_of': 10,
 'hasKeyword': 11,
 'Worker_non_speaker': 12,
 'hasMember': 13,
 'is_sent_before': 14,
 'technicallyOrganises': 15,
 'name': 16,
 'Presenter': 17,
 'SignalProcessingTopic': 18,
 'hasBid': 19,
 'hasReviewHistory': 20,
 'has_a_track-workshop-tutorial_chair': 21,
 'has_title': 22,
 'Committee_member': 23,
 'hasLastName': 24,
 'Academic_Institution': 25,
 'ACM_SIGKDD': 26,
 'ComputerNetworksSensorTopic': 27,
 'publisherOf': 28,
 'Fee': 29,
 'WelcomeTalk': 30,
 'invites_co-reviewers': 31,
 'Chairman': 32,
 'Video_presentation': 33,
 'endReview': 34,
 'Invited_Talk': 35,
 'Call_for_paper': 36,
 'Trip': 37,
 'CallForManuscripts': 38,
 'ConferenceSession': 39,
 'Workshop_Paper': 40,
 'Paper': 41,
 'OrganizationalMeeting': 42,


In [47]:
tmp[0]

array([-3.32923084e-02, -6.61843270e-03, -1.69279389e-02, -1.89156849e-02,
       -3.77495140e-02, -1.86415445e-02,  5.27783819e-02,  6.30775169e-02,
       -1.89297069e-02, -9.57503170e-03,  3.84151302e-02,  7.43583497e-03,
        1.56472195e-02,  1.30205071e-02, -3.44151445e-02,  5.70965791e-03,
       -4.81052399e-02, -2.15850538e-03,  2.90137548e-02, -1.14982098e-03,
        4.35708500e-02,  5.33411242e-02, -9.89831984e-03, -7.55268112e-02,
       -3.03623429e-03, -1.04758637e-02, -4.64556646e-03,  5.28141037e-02,
       -3.03096529e-02,  7.37254024e-02,  7.13311741e-03,  1.19511643e-02,
        5.17832562e-02,  6.22331239e-02, -8.19792971e-02,  1.09951505e-02,
       -1.09791644e-02, -3.31103392e-02,  9.43025760e-03,  5.71564247e-04,
       -2.54904814e-02, -7.33156642e-03, -4.20193449e-02, -6.96072914e-03,
        7.61979818e-02,  5.74754141e-02,  1.99243743e-02,  2.04879674e-03,
        9.07520875e-02, -3.81723116e-03, -4.66725640e-02,  4.07577977e-02,
       -9.50154290e-03, -