In [1]:
# Construction of dataset

import os, itertools, time, pickle, sys, glob, requests
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import networkx as nx
import matplotlib.pyplot as plt
from orderedset import OrderedSet
from copy import deepcopy
%matplotlib inline  

In [2]:
flatten = lambda l: [item for sublist in l for item in sublist]

class Ontology():
    def __init__(self, ontology):
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        self.construct_mapping_dict()
        
        self.parents_dict = {}
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()        
    
    def construct_mapping_dict(self):
        self.mapping_dict = {self.extract_ID(el, False): self.get_child_node(el, "rdfs:label")[0].firstChild.nodeValue for el in self.root.getElementsByTagName("owl:Class") if self.get_child_node(el, "rdfs:label")}
        self.mapping_dict_inv = {self.mapping_dict[key]: key for key in self.mapping_dict}
        return
        
    def get_child_node(self, element, tag):
        return [e for e in element._get_childNodes() if type(e)==minidom.Element and e._get_tagName() == tag]
        
    def has_attribute_value(self, element, attribute, value):
        return True if element.getAttribute(attribute).split("#")[-1] == value else False
    
    def get_subclass_triples(self):
        subclasses = self.get_subclasses()
        self.parents_dict = {}
        for (a,b,c,d) in subclasses:
            if c == "subclass_of" and a!="Thing" and b!="Thing":
                if b not in self.parents_dict:
                    self.parents_dict[b] = [a]
                else:
                    self.parents_dict[b].append(a)
        return [(b,a,c,d) for (a,b,c,d) in subclasses]
    
    def parse_triples(self, union_flag=0, subclass_of=True):
        obj_props = [(prop, "Object Property") for prop in self.object_properties]
        data_props = [(prop, "Datatype Property") for prop in self.data_properties]
        props = obj_props + data_props
        all_triples = []
        for prop, prop_type in props:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            if not domain_children or not range_children:
                continue
            if not domain_prop:
                domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null([self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                if union_flag == 0:
                    all_triples.extend([(el[0], el[1], self.extract_ID(prop), prop_type) for el in list(itertools.product(domain_prop, range_prop))])
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), self.extract_ID(prop), prop_type))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return list(set(all_triples))
    
    def get_triples(self, union_flag=0, subclass_of=True):
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        subclasses = self.root.getElementsByTagName("rdfs:subClassOf")
        subclass_pairs = []
        for el in subclasses:
            inline_subclasses = self.extract_ID(el)
            if inline_subclasses:
                subclass_pairs.append((el, el.parentNode, "subclass_of", "Subclass"))
            else:
                level1_class = self.get_child_node(el, "owl:Class")
                if not level1_class:
                    restriction = el.getElementsByTagName("owl:Restriction")
                    if not restriction:
                        continue
                    prop = self.get_child_node(restriction[0], "owl:onProperty")
                    some_vals = self.get_child_node(restriction[0], "owl:someValuesFrom")
                    
                    if not prop or not some_vals:
                        continue
#                     print(self.extract_ID(el), "**", self.extract_ID(some_vals[0]), "**", self.extract_ID(prop[0]))
                    try:
                        if self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                            subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop[0]), "Object Property"))
                        elif self.extract_ID(prop[0]) and not self.extract_ID(some_vals[0]):
                            class_vals = self.get_child_node(some_vals[0], "owl:Class")
                            subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop[0]), "Object Property"))
                        elif not self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                            prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                            subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop_vals[0]), "Object Property"))
                        else:
                            prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                            class_vals = self.get_child_node(some_vals[0], "owl:Class")
                            subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop_vals[0]), "Object Property"))
                    except:
                        try:
                            if not self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                                prop_vals = self.get_child_node(prop[0], "owl:DatatypeProperty")
                                subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop_vals[0]), "Datatype Property"))
                            elif not self.extract_ID(prop[0]) and not self.extract_ID(some_vals[0]):
                                prop_vals = self.get_child_node(prop[0], "owl:DatatypeProperty")
                                class_vals = self.get_child_node(some_vals[0], "owl:Class")
                                subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop_vals[0]), "Datatype Property"))
                        except Exception as e:
                            print (e)
                            continue
                else:
                    if self.extract_ID(level1_class[0]):
                        subclass_pairs.append((level1_class[0], el.parentNode, "subclass_of", "Subclass"))
                    else:
#                         level2classes = level1_class[0].getElementsByTagName("owl:Class")
#                         subclass_pairs.extend([(elem, el.parentNode, "subclass_of", "Subclass") for elem in level2classes if self.extract_ID(elem)])
                        continue
        return subclass_pairs
        
    def get_subclasses(self):
        subclasses = [(self.extract_ID(a), self.extract_ID(b), c, d) for (a,b,c,d) in self.subclasses]
        return [el for el in subclasses if el[0] and el[1] and el[2] and el[0]!="Thing" and el[1]!="Thing"]
    
    def filter_null(self, data):
        return [el for el in data if el]
    
    def extract_ID(self, element, check_coded = True):
        element_id = element.getAttribute("rdf:ID") or element.getAttribute("rdf:resource") or element.getAttribute("rdf:about")
        element_id = element_id.split("#")[-1]
        if len(list(filter(str.isdigit, element_id))) >= 3 and "_" in element_id and check_coded:
            return self.mapping_dict[element_id]
        return element_id.replace("UNDEFINED_", "").replace("DO_", "")
    
    def parse_classes(self):
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        subclass_classes = list(set(flatten([el[:-2] for el in self.triples])))
        return list(set(self.filter_null(class_elems + subclass_classes)))
    
    def get_classes(self):
        return self.classes
    
    def get_entities(self):
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def parse_data_properties(self):
        data_properties = [el for el in self.get_child_node(self.root, 'owl:DatatypeProperty')]
        fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_data_properties = [el for el in fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        inv_fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_data_properties = [el for el in inv_fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        return data_properties + fn_data_properties + inv_fn_data_properties
        
    def parse_object_properties(self):
        obj_properties = [el for el in self.get_child_node(self.root, 'owl:ObjectProperty')]
        fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_obj_properties = [el for el in fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        inv_fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_obj_properties = [el for el in inv_fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        return obj_properties + fn_obj_properties + inv_fn_obj_properties
    
    def get_object_properties(self):
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))
    
    def get_data_properties(self):
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))


In [261]:
USE_folder = "/home/vlead/USE"
alignment_folder = "reference-alignment/"

# Load reference alignments 
def load_alignments(folder):
    alignments = []
    for f in os.listdir(folder):
        doc = minidom.parse(folder + f)
        ls = list(zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2'), doc.getElementsByTagName('measure')))
        alignments.extend([(a.getAttribute('rdf:resource'), b.getAttribute('rdf:resource')) for (a,b,c) in ls])
    return alignments

# Extracting USE embeddings

def extractUSEEmbeddings(words):
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed")
    embeds = model(words)
    return embeds.numpy()

def cos_sim(a,b):
    return 1 - spatial.distance.cosine(a, b)


reference_alignments = load_alignments(alignment_folder)

# ra_anatomy_coded = load_alignments("../Anatomy/Alignments/")
# ra_anatomy = []
# ont1 = Ontology("../Anatomy/Ontologies/mouse.owl")
# ont2 = Ontology("../Anatomy/Ontologies/human.owl")
# for elem in ra_anatomy_coded:
#     pre1, pre2 = elem[0].split("#")[0].split(".")[0].split("/")[-1], elem[1].split("#")[0].split(".")[0].split("/")[-1]
#     elem1, elem2 = elem[0].split("#")[-1], elem[1].split("#")[-1]
#     ra_anatomy.append(( pre1 + "#" + ont1.mapping_dict[elem1], pre2 + "#" + ont2.mapping_dict[elem2]))

gt_mappings = [tuple([elem.split("/")[-1] for elem in el]) for el in reference_alignments]
# gt_mappings.extend(ra_anatomy)


ontologies_in_alignment = pickle.load(open("../data_generic.pkl", "rb"))[-1][:-1]
# ontologies_in_alignment += [["../Anatomy/Ontologies/human.owl", "../Anatomy/Ontologies/mouse.owl"]]


In [139]:
# Combinatorial mapping generation
all_mappings = []
props = []
for l in ontologies_in_alignment:
    ont1 = Ontology("conference_ontologies/" + l[0] + ".owl")
    ont2 = Ontology("conference_ontologies/" + l[1] + ".owl")
    
    ent1 = ont1.get_entities()
    ent2 = ont2.get_entities()
    
    obj1 = ont1.get_object_properties()
    obj2 = ont2.get_object_properties()
    
    data1 = ont1.get_data_properties()
    data2 = ont2.get_data_properties()

    mappings = list(itertools.product(ent1, ent2)) + list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))
    prop = list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))

    all_mappings.extend([(l[0].split(".")[0].split("/")[-1] + "#" + el[0], l[1].split(".")[0].split("/")[-1] + "#" + el[1]) for el in mappings])
    props.extend([(l[0].split(".")[0].split("/")[-1] + "#" + el[0], l[1].split(".")[0].split("/")[-1] + "#" + el[1]) for el in prop])

data = {mapping: False for mapping in all_mappings}
for mapping in set(gt_mappings):
    data[tuple(mapping[:2])] = True

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range


In [170]:
sorted(Counter([float(el[-1]) for el in gt_mappings]).items())

[(0.6, 4),
 (0.6075, 3),
 (0.6123, 1),
 (0.6194, 1),
 (0.6248, 1),
 (0.627, 1),
 (0.6362, 1),
 (0.6534, 1),
 (0.6579, 1),
 (0.66, 7),
 (0.6621, 1),
 (0.6656, 4),
 (0.6674, 1),
 (0.6857, 1),
 (0.6939, 2),
 (0.7128, 1),
 (0.7316, 1),
 (0.7351, 7),
 (0.7361, 1),
 (0.7367, 3),
 (0.7635, 1),
 (0.7679, 1),
 (0.7753, 1),
 (0.78, 1),
 (0.7841, 1),
 (0.7875, 1),
 (0.8036, 4),
 (0.8168, 1),
 (0.8338, 1),
 (0.846, 2),
 (0.8571, 1),
 (0.8821, 1),
 (0.9603, 12),
 (0.9619, 2),
 (0.9653, 1),
 (0.9801, 65),
 (0.9802, 2),
 (0.9804, 1),
 (0.9807, 1),
 (0.9809, 1),
 (0.9812, 2),
 (0.9813, 1),
 (0.9823, 1),
 (0.9824, 1),
 (0.9827, 1),
 (0.9828, 1),
 (0.9835, 1),
 (0.9836, 3),
 (0.9847, 1),
 (0.9848, 2),
 (0.9855, 1),
 (0.9856, 1),
 (0.9857, 6),
 (0.9863, 1),
 (0.9865, 1),
 (0.9869, 2),
 (0.9874, 1),
 (0.9878, 1),
 (0.988, 1),
 (0.9884, 1),
 (0.9886, 1),
 (0.9888, 1),
 (0.989, 2),
 (0.9894, 1),
 (0.9897, 2),
 (0.9898, 2),
 (0.9899, 2),
 (0.99, 47),
 (1.0, 7)]

In [8]:
# Abbrevation resolution preprocessing

abbreviations_dict = {}
final_dict = {}

for mapping in all_mappings:
    mapping = tuple([el.split("#")[1] for el in mapping])
    is_abb = re.search("[A-Z][A-Z]+", mapping[0])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[1].split("_")])
        if is_abb.group() in abbreviation:
            
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[1].split("_")[start:end])
            print ("left", mapping, abbreviation, fullform)
            
            rest_first = " ".join([el for el in mapping[0].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[1].split("_")[:start] + mapping[1].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

    is_abb = re.search("[A-Z][A-Z]+", mapping[1])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[0].split("_")])
        
        if is_abb.group() in abbreviation:
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[0].split("_")[start:end])
            print ("right", mapping, abbreviation, fullform)

            rest_first = " ".join([el for el in mapping[1].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[0].split("_")[:start] + mapping[0].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

keys = [el for el in list(set(flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict]))) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))

scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)

resolved_dict = {key: scored_dict[key][0] for key in scored_dict}
filtered_dict = {key: " ".join(resolved_dict[key][0].split("_")) for key in resolved_dict if resolved_dict[key][-1] > 0.9}


left ('Member_PC', 'Program_Committee') PC Program_Committee
left ('Member_PC', 'Program_Committee_member') PCM Program_Committee
left ('Member_PC', 'Program_Chair') PC Program_Chair
left ('Chair_PC', 'Program_Committee') PC Program_Committee
left ('Chair_PC', 'Program_Committee_member') PCM Program_Committee
left ('Chair_PC', 'Program_Chair') PC Program_Chair
left ('Member_PC', 'Presenter_city') PC Presenter_city
left ('Chair_PC', 'Presenter_city') PC Presenter_city
left ('PC_Member', 'Program_Committee') PC Program_Committee
left ('PC_Member', 'Program_Committee_member') PCM Program_Committee
left ('PC_Member', 'Program_Chair') PC Program_Chair
left ('PC_Chair', 'Program_Committee') PC Program_Committee
left ('PC_Chair', 'Program_Committee_member') PCM Program_Committee
left ('PC_Chair', 'Program_Chair') PC Program_Chair
left ('OC_Member', 'Organizing_Committee') OC Organizing_Committee
left ('OC_Member', 'Organizing_Committee_member') OCM Organizing_Committee
left ('OC_Chair', 'Orga

In [31]:
filtered_dict = {}
def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]

def parse(word):
    return flatten([el.split("_") for el in camel_case_split(word)])
    

extracted_elems = []

for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten([(a,b,c) for (a,b,c,d) in ont.get_triples()])))
    extracted_elems.extend([ont_name + "#" + elem for elem in entities + props + triples])

extracted_elems = list(set(extracted_elems))
inp = [" ".join(parse(word.split("#")[1])) for word in extracted_elems]

# Resolving abbreviations to full forms
inp_resolved = []
for concept in inp:
    for key in filtered_dict:
        concept = concept.replace(key, filtered_dict[key])
    final_list = []
    # Lowering case except in abbreviations
    for word in concept.split(" "):
        if not re.search("[A-Z][A-Z]+", word):
            final_list.append(word.lower())
        else:
            final_list.append(word)
    concept = " ".join(final_list)
    inp_resolved.append(concept)


url = "https://montanaflynn-spellcheck.p.rapidapi.com/check/"

headers = {
    'x-rapidapi-host': "montanaflynn-spellcheck.p.rapidapi.com",
    'x-rapidapi-key': "9965b01207msh06291e57d6f2c55p1a6a16jsn0fb016da4a62"
    }

inp_spellchecked = []
for concept in inp_resolved:
    querystring = {"text": concept}
    response = requests.request("GET", url, headers=headers, params=querystring)
    
    if response.json()["suggestion"] != concept:
        resolved = str(concept)
        final_list = []
        for word in concept.split(" "):
            if not re.search("[A-Z][A-Z]+", concept):
                final_list.append(word.lower())
            else:
                final_list.append(word)
        resolved = " ".join(final_list)
#         print (resolved, "suggestion", response)
        for word in response["corrections"]:
            if not re.search("[A-Z][A-Z]+", concept):
                resolved = resolved.replace(word.lower(), response.json()["corrections"][word][0].lower())
                
        
        print (concept, resolved)
        inp_spellchecked.append(resolved)
    else:
        inp_spellchecked.append(concept)

print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

extracted_elems = ["<UNK>"] + extracted_elems

# stopwords = ["has", "is", "a", "an", "the"]
stopwords = ["has"]
inp_stemmed = []
for elem in inp_spellchecked:
    words = " ".join([word for word in elem.split() if word not in stopwords])
    words = words.replace("-", " ")
    inp_stemmed.append(words)


embeds = np.array([np.zeros(512,)] + list(extractUSEEmbeddings(inp_stemmed)))
# embeds = np.array([np.zeros(512,)] + list(extractUSEEmbeddings(inp_spellchecked)))
embeddings = dict(zip(extracted_elems, embeds))


emb_vals = list(embeddings.values())
emb_indexer = {key: i for i, key in enumerate(list(embeddings.keys()))}
emb_indexer_inv = {i: key for i, key in enumerate(list(embeddings.keys()))}



list index out of range


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [102]:
# import random
random.seed(0)
a = [1,2,3,4]
for i in range(6):

    random.shuffle(a)
    print (a)


[3, 1, 2, 4]
[3, 1, 4, 2]
[3, 4, 1, 2]
[4, 3, 2, 1]
[2, 4, 1, 3]
[3, 4, 1, 2]


In [74]:

from spellchecker import SpellChecker

spell = SpellChecker()

inp_spellchecked = []
for concept in inp_resolved:
    resolved = str(concept)
    final_list = []
    words = concept.split(" ")
    for word in words:
        if "-" in word:
            parts = word.split("-")
            temp = []
            for part in parts:
                part_corrected = spell.correction(part)
                if part!=part_corrected:
                    print ("-: {} corrected to {}".format(part, part_corrected), parts)
                    temp.append(part_corrected)
                else:
                    temp.append(part)
            word = "-".join(temp)
            final_list.append(word)
        else:
            word_corrected = spell.correction(word)
            if word!=word_corrected and not re.search("[A-Z][A-Z]+", concept):
                print ("{} corrected to {}".format(word, word_corrected), concept)
                final_list.append(word_corrected)
            else:
                final_list.append(word)
    resolved = " ".join(final_list)
    inp_spellchecked.append(resolved)

powerline corrected to powering powerline transmission topic
sponzorship corrected to sponsorship sponzorship
commitee corrected to committee technical commitee
nonauthor corrected to co-author nonauthor registration fee
www corrected to wow conference www
registation corrected to registration registation deadline
registeered corrected to registered registeered applicant
commtitee corrected to committee has a commtitee
accpet corrected to accept accpet if room rating
sponzor corrected to sponsor sponzor fee
writen corrected to written is writen by
attendee corrected to attended attendee
aapplications corrected to applications computer networks aapplications topic
presentationed corrected to presentation presentationed by
holded corrected to holder holded by
attendee corrected to attended author attendee book registration fee
attendee corrected to attended author attendee cd registration fee
committe corrected to committee was a committe co-chair of
organizator corrected to organization

In [125]:
import requests, urllib



url = "https://grammarbot.p.rapidapi.com/check"

headers = {
    'x-rapidapi-host': "grammarbot.p.rapidapi.com",
    'x-rapidapi-key': "9965b01207msh06291e57d6f2c55p1a6a16jsn0fb016da4a62",
    'content-type': "application/x-www-form-urlencoded"
    }

inp_spellchecked = []

for concept in inp_resolved:
    payload = "language=en-US&text=" + urllib.parse.quote_plus(concept)
    response = requests.request("POST", url, data=payload, headers=headers).json()
    concept_corrected = str(concept)
    for elem in response["matches"]:
        start, end = elem["offset"], elem["offset"] + elem["length"]
        concept_corrected = concept_corrected[:start] + elem["replacements"][0]["value"] + concept_corrected[end:]
    if concept.lower() != concept_corrected.lower():
        print ("{} corrected to {}".format(concept, concept_corrected))
        inp_spellchecked.append(concept_corrected)
    else:
        inp_spellchecked.append(concept)


payed by corrected to paid by
sponzorship corrected to sponsorship
technical commitee corrected to Technical committee
best student paper award corrected to the best student paper award
nonauthor registration fee corrected to non author registration fee
registation deadline corrected to registration deadline
technic activity corrected to ethnic activity
organises corrected to organisms
registeered applicant corrected to registered applicant
has programme corrected to Has programmed
has a commtitee corrected to Has a committee
accpet if room rating corrected to accept if room rating
operating topicsystems corrected to Operating topic systems
sponzor fee corrected to sponsor fee
ACM SIGKDD corrected to ACM Signed
is writen by corrected to Is written by
scientifically organises corrected to Scientifically organisms
computer networks aapplications topic corrected to Computer networks applications topic
non academic event corrected to non-academic event
presentationed by corrected to presen

In [76]:
_, _, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, features_dict, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_aml_bagofnbrs.pkl", "rb"))
# data_items = data.items()
# np.random.shuffle(list(data_items))
# data = OrderedDict(data_items)
pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, features_dict, ontologies_in_alignment], open("Input/data_conf_bagofnbrs.pkl", "wb"))

In [16]:

# Load reference alignments 
def load_alignments(folder):
    alignments = []
    for f in os.listdir(folder):
#         print (folder + f)
        doc = minidom.parse(folder + f)
        ls = list(zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2'), doc.getElementsByTagName('measure')))
        src = doc.getElementsByTagName('Ontology')[0].getAttribute("rdf:about").split("/")[-1].split(".")[0] + ".owl"
        targ = doc.getElementsByTagName('Ontology')[1].getAttribute("rdf:about").split("/")[-1].split(".")[0] + ".owl"
        print (src, targ)
        alignments.extend([(a.getAttribute('rdf:resource'), b.getAttribute('rdf:resource'), c.firstChild.nodeValue) for (a,b,c) in ls])
    return alignments

load_alignments("../VeeAlign/datasets/web-directory/alignments/")

google web
dmoz google
google yahoo
web yahoo
dmoz yahoo
dmoz web


[('http://localhost:80/google.owl#World_Deutsch_Online-Shops_Fotografie',
  'http://localhost:80/web.owl#Verzeichnis_Einkaufen-Sparen_Freizeit-Hobby_Fotografie',
  '1.0'),
 ('http://localhost:80/google.owl#World_Deutsch_Online-Shops_Freizeit',
  'http://localhost:80/web.owl#Verzeichnis_Einkaufen-Sparen_Freizeit-Hobby',
  '1.0'),
 ('http://localhost:80/google.owl#World_Deutsch_Online-Shops_Freizeit_Basteln',
  'http://localhost:80/web.owl#Verzeichnis_Einkaufen-Sparen_Freizeit-Hobby_Basteln-Handarbeit_Basteln-allgemein',
  '1.0'),
 ('http://localhost:80/google.owl#World_Deutsch_Online-Shops_Freizeit_Basteln_Kerzen',
  'http://localhost:80/web.owl#Verzeichnis_Einkaufen-Sparen_Freizeit-Hobby_Basteln-Handarbeit_Kerzen',
  '1.0'),
 ('http://localhost:80/google.owl#World_Deutsch_Online-Shops_Freizeit_Basteln_Window-Color',
  'http://localhost:80/web.owl#Verzeichnis_Einkaufen-Sparen_Freizeit-Hobby_Basteln-Handarbeit_Window-Color',
  '1.0'),
 ('http://localhost:80/google.owl#World_Deutsch_Onlin

In [183]:
def path_to_root(elem, ont_mappings, curr = [], rootpath=[]):
    curr.append(elem)
    if elem not in ont_mappings or not ont_mappings[elem]:
        rootpath.append(curr)
        return
    for node in ont_mappings[elem]:
        curr_orig = deepcopy(curr)
        _ = path_to_root(node, ont_mappings, curr, rootpath)
        curr = curr_orig
    return rootpath

def get_one_hop_neighbours(ont, K=1):
    ont_obj = Ontology("conference_ontologies/" + ont + ".owl")
    triples = ont_obj.get_triples()
    entities = [(a,b) for (a,b,c,d) in triples]
    neighbours_dict = {elem: [[] for i in range(4)] for elem in list(set(flatten(entities)))}
    print (ont)
    for (e1, e2, p, d) in triples:
        if e1==e2:
            continue
        if d == "Object Property":
            neighbours_dict[e1][2].append([p + " -> " + e2])
            neighbours_dict[e2][2].append([p + " -> " + e1])
        elif d == "Datatype Property":
            neighbours_dict[e1][3].append([p + " -> " + e2])
            neighbours_dict[e2][3].append([p + " -> " + e1])
        elif d == "Subclass":
            neighbours_dict[e2][1].append([e1])
        else:
            print ("Error wrong value of d: ", d)
    
    rootpath_dict = ont_obj.parents_dict
    rootpath_dict_new = {}
    for elem in rootpath_dict:
        rootpath_dict_new[elem] = path_to_root(elem, rootpath_dict, [], [])
    ont = ont.split("/")[-1].split(".")[0]

    
    for entity in neighbours_dict:
        if entity in rootpath_dict_new and len(rootpath_dict_new[entity]) > 0:
            neighbours_dict[entity][0].extend(rootpath_dict_new[entity])
        else:
            continue
    
#     prop_triples = ont_obj.get_triples(subclass_of=False)
#     neighbours_dict_props = {c: [c] for a,b,c in prop_triples}
#     for e1, e2, p in prop_triples:
#         neighbours_dict_props[p].extend([e1, e2])

    #neighbours_dict = {**neighbours_dict, **neighbours_dict_props}
    
    # for elem in ont_obj.get_entities() + ont_obj.get_object_properties() + ont_obj.get_data_properties():
    #     if elem not in neighbours_dict:
    #         neighbours_dict[elem] = [elem]

#     neighbours_dict = {elem: [key[:1] + sorted(list(set(key[1:]))) for key in neighbours_dict[elem]]
#                        for elem in neighbours_dict}
#     neighbours_dict = {el: neighbours_dict[el][:23] for el in neighbours_dict if len( neighbours_dict[el]) > 2}
#     ont = ont.split("/")[-1].split(".")[0]
    neighbours_dict = {ont + "#" + el: [[tuple([ont + "#" + node for node in path]) for path in nbr_type]
                                        for nbr_type in neighbours_dict[el]] 
                       for el in neighbours_dict}
    neighbours_dict = {el: [[list(path) for path in nbr_type] for nbr_type in neighbours_dict[el]]
                       for el in neighbours_dict}
    return neighbours_dict

# neighbours_dicts = {ont.split("/")[-1].split(".")[0]: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
neighbours_dicts = {}
for ont in list(set(flatten(ontologies_in_alignment))):
    neighbours_dicts = {**neighbours_dicts, **get_one_hop_neighbours(ont)}
# max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
# max_pathlen = np.max(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in neighbours_dicts.values()]), axis=0)
# neighbours_dicts_lenpadded = {elem: [[path + ["<UNK>" for i in range(max_pathlen -len(path))] for path in nbr_type]
#                                 for nbr_type in neighbours_dicts[elem]] for elem in neighbours_dicts}
# neighbours_dicts_pathpadded = {elem: [nbr_type + [["<UNK>" for j in range(max_pathlen)] for i in range(max_paths - len(nbr_type))]
#                                 for k,nbr_type in enumerate(neighbours_dicts_lenpadded[elem])] for elem in neighbours_dicts_lenpadded}
# neighbours_dicts_pathpadded = {elem: np.array(neighbours_dicts_pathpadded[elem]) for elem in neighbours_dicts_pathpadded}
# # ontologies_in_alignment_rev = [[el.split("/")[1].split(".")[0] for el in ont] for ont in ontologies_in_alignment]
# f = open("data_aml_uniqpath.pkl", "wb")
# pickle.dump([data, aml_data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts_pathpadded, ontologies_in_alignment], f)
# # # # neighbours_dicts

sigkdd
edas
list index out of range
confOf
conference
cmt
iasted
ekaw


In [273]:
from orderedset import OrderedSet
def get_context_inf(pair):
    final_results = []
    for elem in pair:
        if elem not in neighbours_dicts:
            return []
        context = neighbours_dicts[elem]
        ancestors = [" -> ".join([elem for elem in path]) for path in context[0]]
        children = list(flatten(context[1])) #shorten later
        oprop_neighbours = [el for el in flatten(context[2])]
        dprop_neighbours = [el for el in flatten(context[3])]
        final_results.append([elem, ancestors, children, oprop_neighbours, dprop_neighbours])
    children_a, children_b = final_results[0][2], final_results[1][2]
    all_children_pairs = [(elem_pair[0], elem_pair[1], cos_sim(*[emb_vals[emb_indexer[elem]] for elem in elem_pair])) 
     for elem_pair in list(itertools.product(children_a, children_b))]
    all_children_pairs = [elem[:2] for elem in sorted(all_children_pairs, key = lambda el: el[-1], reverse=True)][:6]
    all_children_pairs = [tuple([elem for elem in elem_pair]) for elem_pair in all_children_pairs]
    try:
        children_a, children_b = [list(OrderedSet(elem)) if elem else [] for elem in list(zip(*all_children_pairs))]
    except Exception as e:
#         print (e)
        children_a, children_b = [], []
    final_results[0][2] = children_a
    final_results[1][2] = children_b
    
    children_a, children_b = final_results[0][3], final_results[1][3]
    all_children_pairs = [(elem_pair[0], elem_pair[1], cos_sim(*[emb_vals[emb_indexer[elem.split(" -> ")[-1]]] for elem in elem_pair])) 
     for elem_pair in list(itertools.product(children_a, children_b))]
    all_children_pairs = [elem[:2] for elem in sorted(all_children_pairs, key = lambda el: el[-1], reverse=True)][:6]
    all_children_pairs = [tuple([elem for elem in elem_pair]) for elem_pair in all_children_pairs]
    try:
        children_a, children_b = [list(OrderedSet(elem)) if elem else [] for elem in list(zip(*all_children_pairs))]
    except Exception as e:
#         print (e)
        children_a, children_b = [], []
    final_results[0][3] = children_a
    final_results[1][3] = children_b
    return final_results

def group_cos_sim(a, b):
    return np.mean([cos_sim(emb_vals[emb_indexer[pair[0]]], emb_vals[emb_indexer[pair[1]]]) for pair in itertools.product(a,b)])

def sort_by_sim(unsorted_list):
    w_e = 1
    w_r = 0.7
    w_o, w_d = 0.5, 0.2
    w_c = 0.25
    sorted_list = []
    for elem_pair in unsorted_list:
        score = 0
        score += w_e * cos_sim(emb_vals[emb_indexer[elem_pair[0][0]]], emb_vals[emb_indexer[elem_pair[1][0]]])
        ancestors_score = [group_cos_sim(path_pair[0].split(" -> "), path_pair[1].split(" -> ")) for path_pair in itertools.product(elem_pair[0][1], elem_pair[1][1])]
        if ancestors_score:
            score += w_r * max(ancestors_score)
        children_score = sorted([cos_sim(emb_vals[emb_indexer[pair[0]]], emb_vals[emb_indexer[pair[1]]]) for pair in itertools.product(elem_pair[0][2], elem_pair[1][2])])[:3]
        if children_score:
            score += w_c * np.mean(children_score)
        obj_score = [cos_sim(emb_vals[emb_indexer[pair[0].split(" -> ")[1]]], emb_vals[emb_indexer[pair[1].split(" -> ")[1]]]) for pair in itertools.product(elem_pair[0][3], elem_pair[1][3])]
        if obj_score:
            score += w_o * max(obj_score)
        data_score = [cos_sim(emb_vals[emb_indexer[pair[0].split(" -> ")[1]]], emb_vals[emb_indexer[pair[1].split(" -> ")[1]]]) for pair in itertools.product(elem_pair[0][4], elem_pair[1][4])]
        if data_score:
            score += w_d * max(data_score)
        sorted_list.append([elem_pair[0], elem_pair[1], score])
    return sorted(sorted_list, key = lambda el:el[-1], reverse=True)
unsorted_entities = [elem for elem in [get_context_inf(el[:2]) for el in all_mappings] if elem]
sorted_entities = sort_by_sim(unsorted_entities)

In [274]:
# open ("aml_false_negatives.tsv").write("\n".join)
def stringify(elem_pair):
    final_list = []
    for elem in elem_pair[:2]:        
        curr = []
        for obj in elem:
            if isinstance(obj, str):
                curr.append(obj)
            elif obj == []:
                curr.append(" ")
            elif isinstance(obj, list):
                curr.append(", ".join(obj))
        final_list.extend(curr)
    final_list.append(str(round(elem_pair[-1], 3)))
    return "\t".join(final_list)

false_negatives = [stringify(elem_pair) for elem_pair in sorted_entities if tuple([el[0] for el in elem_pair[:2]]) not in aml_mappings]

In [275]:
[i for i,elem in enumerate(false_negatives) if (elem.split("\t")[0], elem.split("\t")[5]) in gt_mappings]

[18,
 44,
 49,
 54,
 57,
 63,
 67,
 94,
 106,
 107,
 136,
 150,
 188,
 196,
 208,
 224,
 228,
 244,
 245,
 247,
 264,
 314,
 330,
 332,
 338,
 367,
 381,
 382,
 388,
 392,
 440,
 448,
 459,
 470,
 481,
 490,
 492,
 552,
 596,
 604,
 677,
 688,
 698,
 702,
 703,
 756,
 775,
 790,
 815,
 853,
 925,
 938,
 1089,
 1120,
 1388,
 1479,
 1781,
 1851,
 1981,
 1993,
 2013,
 2148,
 2322,
 2888,
 3166,
 4626,
 4627,
 5364,
 6297,
 8151,
 11253,
 18435,
 29693,
 53291,
 61541]

In [253]:
open("false_negatives.tsv","w+").write("\n".join(false_negatives[:10000]))

3163687

In [255]:
false_negatives

['edas#ActivePaper\tedas#ActivePaper -> edas#Paper -> edas#Document\tedas#RatedPapers\tedas#hasRating -> edas#ReviewRating\t \tekaw#Paper\tekaw#Paper -> ekaw#Document\tekaw#Submitted_Paper, ekaw#Regular_Paper, ekaw#Demo_Paper, ekaw#Poster_Paper, ekaw#Industrial_Paper, ekaw#Conference_Paper\tekaw#hasReview -> ekaw#Review, ekaw#reviewOfPaper -> ekaw#Review, ekaw#hasReviewer -> ekaw#Possible_Reviewer, ekaw#reviewerOfPaper -> ekaw#Possible_Reviewer, ekaw#coversTopic -> ekaw#Research_Topic, ekaw#topicCoveredBy -> ekaw#Research_Topic\t \t1.348',
 'cmt#Paper\tcmt#Paper -> cmt#Document\tcmt#PaperAbstract, cmt#PaperFullVersion\tcmt#readByReviewer -> cmt#Reviewer, cmt#markConflictOfInterest -> cmt#Reviewer, cmt#readPaper -> cmt#Reviewer, cmt#assignedTo -> cmt#Reviewer, cmt#hasBeenAssigned -> cmt#Reviewer, cmt#readByMeta-Reviewer -> cmt#Meta-Reviewer\tcmt#paperID -> cmt#unsignedLong, cmt#title -> cmt#string\tedas#ActivePaper\tedas#ActivePaper -> edas#Paper -> edas#Document\tedas#RatedPapers\tedas

In [246]:
gt_mappings = [el[:2] for el in gt_mappings]
open("gt_mappings_fn.tsv", "w+").write("\n".join([elem for i,elem in enumerate(false_negatives) if (elem.split("\t")[0], elem.split("\t")[5]) in gt_mappings]))

22695

In [149]:
neighbours_dicts = {}
for key in neighbours_dicts_conf:
    neighbours_dicts[key] = neighbours_dicts_conf[key]
for key in neighbours_dicts_german:
    neighbours_dicts[key] = neighbours_dicts_german[key]
max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
max_pathlen = np.max(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in neighbours_dicts.values()]), axis=0)
max_types = np.max([len([nbr_type for nbr_type in elem if nbr_type]) for elem in neighbours_dicts.values()])

In [80]:
f = open("Input/data_conf_oaei_german_aml.pkl", "wb")
pickle.dump([data_conf, data_german, aml_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment], f)


In [187]:
data_conf, data_german, _, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_german_aml_thresh.pkl", "rb"))
pickle.dump([data_conf, data_german, data_logmap, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment], open("data_conf_oaei_german_logmap_thresh.pkl", "wb"))


In [150]:
f = open("Input/data_conf_oaei_german_bagofnbrs.pkl", "wb")
# pickle.dump([data_conf, data_german, emb_indexer_german, emb_indexer_inv_german, emb_vals_german, emb_indexer_conf, emb_indexer_inv_conf, emb_vals_conf, neighbours_dicts_german, neighbours_dicts_conf, max_paths_german, max_pathlen_german, max_types_german, max_paths_conf, max_pathlen_conf, max_types_conf, ontologies_in_alignment], f)
pickle.dump([data_conf, data_german, emb_indexer_german, emb_indexer_inv_german, emb_vals_german, emb_indexer_conf, emb_indexer_inv_conf, emb_vals_conf, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment], f)

In [111]:
data_conf, data_german, aml_data, emb_indexer, emb_indexer_inv, emb_vals, _, _, _, _, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_german_aml.pkl", "rb"))
f = open("Input/data_conf_oaei_german_aml_bagofnbrs.pkl", "wb")
pickle.dump([data_conf, data_german, aml_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment], f)


In [130]:
def path_to_root(elem, ont_mappings, curr = [], rootpath=[]):
    curr.append(elem)
    if elem not in ont_mappings or not ont_mappings[elem]:
        rootpath.append(curr)
        return
    for node in ont_mappings[elem]:
        curr_orig = deepcopy(curr)
        _ = path_to_root(node, ont_mappings, curr, rootpath)
        curr = curr_orig
    return rootpath

def get_one_hop_neighbours(ont, K=1):
    ont_obj = Ontology("conference_ontologies/" + ont + ".owl")
    triples = ont_obj.get_triples()
    entities = [(a,b) for (a,b,c,d) in triples]
    neighbours_dict = {elem: [[] for i in range(4)] for elem in list(set(flatten(entities)))}
    print (ont)
    for (e1, e2, p, d) in triples:
        if e1==e2:
            continue
        if d == "Object Property":
            neighbours_dict[e1][2].append(e2)
            neighbours_dict[e2][2].append(e1)
        elif d == "Datatype Property":
            neighbours_dict[e1][3].append(e2)
            neighbours_dict[e2][3].append(e1)
        elif d == "Subclass":
            neighbours_dict[e2][1].append(e1)
        else:
            print ("Error wrong value of d: ", d)
    
    rootpath_dict = ont_obj.parents_dict
    rootpath_dict_new = {}
    for elem in rootpath_dict:
        rootpath_dict_new[elem] = path_to_root(elem, rootpath_dict, [], [])
    ont = ont.split("/")[-1].split(".")[0]

    
    for entity in neighbours_dict:
        neighbours_dict[entity][1] = [neighbours_dict[entity][1]]
        neighbours_dict[entity][2] = [neighbours_dict[entity][2]]
        neighbours_dict[entity][3] = [neighbours_dict[entity][3]]
        if entity in rootpath_dict_new and len(rootpath_dict_new[entity]) > 0:
            neighbours_dict[entity][0].extend(rootpath_dict_new[entity])
        else:
            continue
    
#     prop_triples = ont_obj.get_triples(subclass_of=False)
#     neighbours_dict_props = {c: [c] for a,b,c in prop_triples}
#     for e1, e2, p in prop_triples:
#         neighbours_dict_props[p].extend([e1, e2])

    #neighbours_dict = {**neighbours_dict, **neighbours_dict_props}
    
    # for elem in ont_obj.get_entities() + ont_obj.get_object_properties() + ont_obj.get_data_properties():
    #     if elem not in neighbours_dict:
    #         neighbours_dict[elem] = [elem]

#     neighbours_dict = {elem: [key[:1] + sorted(list(set(key[1:]))) for key in neighbours_dict[elem]]
#                        for elem in neighbours_dict}
#     neighbours_dict = {el: neighbours_dict[el][:23] for el in neighbours_dict if len( neighbours_dict[el]) > 2}
#     ont = ont.split("/")[-1].split(".")[0]
    neighbours_dict = {ont + "#" + el: [OrderedSet([tuple([ont + "#" + node for node in path]) for path in nbr_type])
                                        for nbr_type in neighbours_dict[el]] 
                       for el in neighbours_dict}
    neighbours_dict = {el: [[list(path) for path in nbr_type] for nbr_type in neighbours_dict[el]]
                       for el in neighbours_dict}
    return neighbours_dict

# neighbours_dicts = {ont.split("/")[-1].split(".")[0]: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
neighbours_dicts = {}
for ont in list(set(flatten(ontologies_in_alignment))):
    neighbours_dicts = {**neighbours_dicts, **get_one_hop_neighbours(ont)}
max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
max_pathlen = np.max(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in neighbours_dicts.values()]), axis=0)
neighbours_dicts_lenpadded = {elem: [[path + ["<UNK>" for i in range(max_pathlen -len(path))] for path in nbr_type]
                                for nbr_type in neighbours_dicts[elem]] for elem in neighbours_dicts}
neighbours_dicts_pathpadded = {elem: [nbr_type + [["<UNK>" for j in range(max_pathlen)] for i in range(max_paths - len(nbr_type))]
                                for k,nbr_type in enumerate(neighbours_dicts_lenpadded[elem])] for elem in neighbours_dicts_lenpadded}
neighbours_dicts_pathpadded = {elem: np.array(neighbours_dicts_pathpadded[elem]) for elem in neighbours_dicts_pathpadded}
# # # ontologies_in_alignment_rev = [[el.split("/")[1].split(".")[0] for el in ont] for ont in ontologies_in_alignment]
# # f = open("data_aml_uniqpath.pkl", "wb")
# # pickle.dump([data, aml_data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts_pathpadded, ontologies_in_alignment], f)
# # # # # neighbours_dicts

sigkdd
edas
list index out of range
confOf
conference
cmt
iasted
ekaw


In [131]:
max_paths, max_pathlen

(2, 38)

In [190]:
sorted(Counter(flatten([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])).items())


[(0, 9808),
 (1, 4062),
 (2, 257),
 (3, 151),
 (4, 93),
 (5, 56),
 (6, 43),
 (7, 39),
 (8, 17),
 (9, 15),
 (10, 9),
 (11, 6),
 (12, 7),
 (13, 7),
 (15, 5),
 (16, 3),
 (17, 3),
 (18, 1),
 (19, 2),
 (20, 4),
 (21, 6),
 (22, 4),
 (23, 1),
 (24, 2),
 (26, 3),
 (38, 1),
 (42, 1),
 (48, 1),
 (49, 1)]

In [132]:
_, data_conf, emb_indexer_conf, emb_indexer_inv_conf, emb_vals_conf, gt_mappings_conf, _, ontologies_in_alignment_conf = pickle.load(open("../Input/data_aml.pkl", "rb"))
pickle.dump([aml_data, data_conf, emb_indexer_conf, emb_indexer_inv_conf, emb_vals_conf, gt_mappings_conf, neighbours_dicts_pathpadded, ontologies_in_alignment_conf], open("Input/data_conf_oaei_aml_bagofnbrs.pkl", "wb"))

In [189]:
# max_paths
def count_non_unk(elem):
    return len([l for l in elem if list(l)!=["<UNK>" for i in range(6)]])
sorted(Counter(flatten([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])).items())


[(0, 9808),
 (1, 4062),
 (2, 257),
 (3, 151),
 (4, 93),
 (5, 56),
 (6, 43),
 (7, 39),
 (8, 17),
 (9, 15),
 (10, 9),
 (11, 6),
 (12, 7),
 (13, 7),
 (15, 5),
 (16, 3),
 (17, 3),
 (18, 1),
 (19, 2),
 (20, 4),
 (21, 6),
 (22, 4),
 (23, 1),
 (24, 2),
 (26, 3),
 (38, 1),
 (42, 1),
 (48, 1),
 (49, 1)]

In [26]:
output = """
Output_att12_1_0.6075_conf_oaei_german_aml.txt:Final Results: [0.72243346 0.62295082 0.66901408 0.64059339 0.70007369]

Output_att18_1_0.6075_conf_oaei_german_aml.txt:Final Results: [0.66013072 0.66229508 0.66121113 0.66186107 0.66056246]

Output_att18_3_0.6075_conf_oaei_german_aml.txt:Final Results: [0.73895582 0.60327869 0.66425993 0.62627638 0.70714835]

Output_att18_6_0.6075_conf_oaei_german_aml.txt:Final Results: [0.73451327 0.5442623  0.6252354  0.57399723 0.68651778]

Output_att3_1_0.6075_conf_oaei_german_aml.txt:Final Results: [0.71698113 0.62295082 0.66666667 0.63973064 0.6959707 ]

Output_att3_3_0.6075_conf_oaei_german_aml.txt:Final Results: [0.732      0.6        0.65945946 0.62244898 0.70114943]

Output_att3_4_0.6075_conf_oaei_german_aml.txt:Final Results: [0.72623574 0.62622951 0.67253521 0.64396494 0.70375829]

Output_att5_1_0.6075_conf_oaei_german_aml.txt:Final Results: [0.76237624 0.50491803 0.60749507 0.54149086 0.6918239 ]

Output_att5_3_0.6075_conf_oaei_german_aml.txt:Final Results: [0.75732218 0.59344262 0.66544118 0.62028787 0.71768438]

Output_att5_4_0.6075_conf_oaei_german_aml.txt:Final Results: [0.724      0.59344262 0.65225225 0.61564626 0.69348659]

Output_att7_1_0.6075_conf_oaei_german_aml.txt:Final Results: [0.7188755  0.58688525 0.64620939 0.609258   0.68793236]

Output_att7_3_0.6075_conf_oaei_german_aml.txt:Final Results: [0.75565611 0.54754098 0.63498099 0.57945871 0.70227082]

Output_att7_3_0.6075_conf_oaei_german_aml.txt:Final Results: [0.72265625 0.60655738 0.65953654 0.62669377 0.69601204]

Output_att7_4_0.6075_conf_oaei_german_aml.txt:Final Results: [0.61490683 0.64918033 0.63157895 0.64202335 0.62146893]

Output_att7_6_0.6075_conf_oaei_german_aml.txt:Final Results: [0.72       0.59016393 0.64864865 0.6122449  0.68965517]

Output_att12_1_0.612_conf_oaei_german_aml.txt:Final Results: [0.70454545 0.60983607 0.65377856 0.62668464 0.68332109]

Output_att18_1_0.612_conf_oaei_german_aml.txt:Final Results: [0.69960474 0.58032787 0.6344086  0.60081466 0.67198178]

Output_att18_3_0.612_conf_oaei_german_aml.txt:Final Results: [0.70212766 0.64918033 0.6746167  0.65912117 0.69085834]

Output_att18_6_0.612_conf_oaei_german_aml.txt:Final Results: [0.73895582 0.60327869 0.66425993 0.62627638 0.70714835]

Output_att3_1_0.612_conf_oaei_german_aml.txt:Final Results: [0.77       0.50491803 0.60990099 0.54225352 0.69683258]

Output_att3_3_0.612_conf_oaei_german_aml.txt:Final Results: [0.75117371 0.52459016 0.61776062 0.55826936 0.69144339]

Output_att3_4_0.612_conf_oaei_german_aml.txt:Final Results: [0.72373541 0.60983607 0.66192171 0.62965471 0.69767442]

Output_att5_1_0.612_conf_oaei_german_aml.txt:Final Results: [0.76168224 0.53442623 0.62813102 0.56834031 0.70198105]

Output_att5_3_0.612_conf_oaei_german_aml.txt:Final Results: [0.69056604 0.6        0.64210526 0.61616162 0.67032967]

Output_att5_4_0.612_conf_oaei_german_aml.txt:Final Results: [0.68503937 0.5704918  0.62254025 0.59023066 0.65859198]

Output_att7_1_0.612_conf_oaei_german_aml.txt:Final Results: [0.77777778 0.55081967 0.64491363 0.58495822 0.71856287]

Output_att7_3_0.612_conf_oaei_german_aml.txt:Final Results: [0.76605505 0.54754098 0.63862333 0.58066759 0.70943076]

Output_att7_3_0.612_conf_oaei_german_aml.txt:Final Results: [0.70866142 0.59016393 0.64400716 0.61058345 0.68130204]

Output_att7_4_0.612_conf_oaei_german_aml.txt:Final Results: [0.75980392 0.50819672 0.60903733 0.54424157 0.69134701]

Output_att7_6_0.612_conf_oaei_german_aml.txt:Final Results: [0.67142857 0.61639344 0.64273504 0.62666667 0.65964912]

Output_att12_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.74248927 0.56721311 0.64312268 0.59532003 0.69927243]

Output_att13_4_0.735_conf_oaei_german_aml.txt:Final Results: [0.73390558 0.56065574 0.63568773 0.58843772 0.69118836]

Output_att13_5_0.735_conf_oaei_german_aml.txt:Final Results: [0.73251029 0.58360656 0.64963504 0.60833903 0.69694597]

Output_att13_8_0.735_conf_oaei_german_aml.txt:Final Results: [0.71272727 0.64262295 0.67586207 0.65551839 0.6975089 ]

Output_att18_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.7        0.59672131 0.64424779 0.61486486 0.67657993]

Output_att18_3_0.735_conf_oaei_german_aml.txt:Final Results: [0.68265683 0.60655738 0.64236111 0.620389   0.66594672]

Output_att18_6_0.735_conf_oaei_german_aml.txt:Final Results: [0.73662551 0.58688525 0.65328467 0.61175666 0.70086139]

Output_att21_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.6962963  0.61639344 0.65391304 0.63087248 0.67870036]

Output_att21_5_0.735_conf_oaei_german_aml.txt:Final Results: [0.69144981 0.60983607 0.64808362 0.62458026 0.67342505]

Output_att3_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.72961373 0.55737705 0.63197026 0.58499656 0.68714632]

Output_att3_3_0.735_conf_oaei_german_aml.txt:Final Results: [0.68904594 0.63934426 0.66326531 0.64870259 0.67849687]

Output_att3_4_0.735_conf_oaei_german_aml.txt:Final Results: [0.74152542 0.57377049 0.64695009 0.60096154 0.70056045]

Output_att5_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.7398374  0.59672131 0.66061706 0.6207367  0.70597362]

Output_att5_3_0.735_conf_oaei_german_aml.txt:Final Results: [0.76635514 0.53770492 0.63198459 0.57182706 0.70628768]

Output_att5_4_0.735_conf_oaei_german_aml.txt:Final Results: [0.73221757 0.57377049 0.64338235 0.59972584 0.69389374]

Output_att7_1_0.735_conf_oaei_german_aml.txt:Final Results: [0.66542751 0.58688525 0.62369338 0.60107455 0.6480811 ]

Output_att7_3_0.735_conf_oaei_german_aml.txt:Final Results: [0.7265625  0.60983607 0.6631016  0.6300813  0.69977427]

Output_att7_3_0.735_conf_oaei_german_aml.txt:Final Results: [0.69548872 0.60655738 0.64798599 0.62247645 0.67567568]

Output_att7_6_0.735_conf_oaei_german_aml.txt:Final Results: [0.70869565 0.53442623 0.60934579 0.56206897 0.66530612]
"""
def parse_line(line):
    
    max_paths, max_pathlen, threshold = line[10:].split("_")[:3]
    a = "VeeAlign (max_path={}, max_pathlen={}, threshold={})(self+self german+aml two-step)".format(max_paths, max_pathlen, threshold)
    content = a + "\t" + "\t".join(line.split("[")[1].split("]")[0].split())
    return content
    
open("results.tsv", "w+").write("\n".join([parse_line(l) for l in output.split("\n") if l]))

6827

In [195]:
pickle.load(open("Input/da")

array([1., 2., 3.])

In [200]:
def check_best_performance():
    output_file = "Results/Output_att*conf_oaei_german_aml.txt"
    results_lines = [[l for l in open(file).read().split("\n") if "Final Results:" in l] for file in glob.glob(output_file)]
    results_lines = [line[0] for line in results_lines if line]
    results_lines = [line.split("[")[1].split("]")[0].split(" ") for line in results_lines]
    results_lines = [float([value for value in line if value][2]) for line in results_lines]
    return max(results_lines)

check_best_performance()

0.68292683

In [114]:
c = torch.randn((10,4,22,6,512))
b = torch.randn((10,4,1,6,512))
d = b * c


In [122]:
e = torch.bmm(c.permute(0,1,3,4,2).reshape(-1, 22, 1), b.permute(0,1,3,4,2).reshape(-1, 1, 1)).squeeze(-1).reshape(-1,4,6,512,22).permute(0,1,4,2,3)


In [123]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load("~/", mmap='r')

(tensor([[[[-2.6975e-02,  1.1427e+00,  4.1698e-01,  ..., -8.3755e+00,
            -4.7765e-01, -2.1633e-01],
           [ 2.8020e-01, -6.6962e-01,  1.4016e+00,  ...,  7.5675e-04,
             2.2802e-01, -1.7932e+00],
           [-1.1675e-01,  2.9058e-02,  1.4350e-01,  ..., -1.8113e+00,
            -9.6346e-01,  2.6943e+00],
           [ 9.3339e-02,  5.5481e-01,  5.7394e-01,  ...,  6.9919e-01,
            -3.4617e-01,  2.7778e-01],
           [-1.4753e-01, -4.3234e-01,  1.7809e+00,  ..., -1.4585e+00,
            -7.7736e-01,  1.1985e-01],
           [-5.5448e-01,  8.2052e-01,  1.4681e-03,  ..., -3.1119e-02,
            -1.6508e-01, -6.0103e-01]],
 
          [[ 1.1321e-01,  1.9631e-01,  1.0589e+00,  ...,  4.4958e+00,
             4.9575e-02,  2.7624e-01],
           [-2.5041e-01,  1.0904e+00,  1.4865e+00,  ..., -3.6319e-01,
            -4.8349e-01,  6.8312e-01],
           [-2.0786e+00,  5.2171e-03, -1.3269e-02,  ...,  1.5515e-01,
             1.1037e+00, -2.8357e-01],
           [-1.6

In [126]:
data_conf, data_german, aml_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_german_aml_thresh.pkl", "rb"))
data_conf, data_german, logmap_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_german_logmap_thresh.pkl", "rb"))
pickle.dump([data_conf, data_german, aml_data, logmap_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment], open("Input/data_conf_oaei_german_logmap_aml_thresh.pkl", "wb"))

In [127]:
ctimes, dtimes = [], []
for i in range(100):
    a = torch.randn((10,4,22,6,512))
    b = torch.randn((10,4,1,6,512))
    t = time.time()
    c = a * b
    ctimes.append(time.time()-t)
    t = time.time()
    d = torch.bmm(c.permute(0,1,3,4,2).reshape(-1, 22, 1), b.permute(0,1,3,4,2).reshape(-1, 1, 1)).squeeze(-1).reshape(-1,4,6,512,22).permute(0,1,4,2,3)
    dtimes.append(time.time()-t)

In [130]:
a = torch.randn((10,4,22,6,512))
b = torch.randn((10,4,22,6,512))
c = a * torch.sum(b,dim=2).unsqueeze(2)

In [12]:
# AML test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
outputs = {}
# all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))
ontologies_in_alignment = [tuple(pair) for pair in ontologies_in_alignment]
gt_mappings = [el[:2] for el in gt_mappings]

test_onto = ontologies_in_alignment
for ont_pair in test_onto:
    a, b, c = ont_pair[0], ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
    java_command = "java -jar AML_v3.1/AgreementMakerLight.jar -s conference_ontologies/" + a + ".owl" + \
                        " -t conference_ontologies/" + b + ".owl -o AML-test-results/" + c + ".rdf -a"
    process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
print (test_onto)
pred_aml = load_alignments("AML-test-results/")
pred_aml = [tuple([el.split("/")[-1] for el in key][:2]) for key in pred_aml]
outputs[ont_pair] = pred_aml
tp = len([elem for elem in pred_aml if data[elem]])
fn = len([key for key in gt_mappings if key not in set(pred_aml) and is_test(test_onto, key)])
fp = len([elem for elem in pred_aml if not data[elem]])

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1score = 2 * precision * recall / (precision + recall)
f2score = 5 * precision * recall / (4 * precision + recall)
f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
print (precision, recall, f1score, f2score, f0_5score)

metrics = [precision, recall, f1score, f2score, f0_5score]
results.append(metrics)

_ = [os.remove(f) for f in glob.glob('AML-test-results/*')]

print ("Final Results:", np.mean(results, axis=0))

[('confOf', 'sigkdd'), ('iasted', 'sigkdd'), ('cmt', 'ekaw'), ('confOf', 'iasted'), ('conference', 'edas'), ('cmt', 'sigkdd'), ('ekaw', 'sigkdd'), ('conference', 'confOf'), ('conference', 'sigkdd'), ('confOf', 'edas'), ('cmt', 'conference'), ('edas', 'iasted'), ('conference', 'iasted'), ('edas', 'sigkdd'), ('ekaw', 'iasted'), ('cmt', 'edas'), ('edas', 'ekaw'), ('cmt', 'confOf'), ('confOf', 'ekaw'), ('conference', 'ekaw'), ('cmt', 'iasted')]
0.8026905829596412 0.5868852459016394 0.678030303030303 0.6202356202356203 0.747702589807853
Final Results: [0.80269058 0.58688525 0.6780303  0.62023562 0.74770259]


In [10]:
# AML test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
outputs = {}
# all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))
ontologies_in_alignment = [tuple(pair) for pair in ontologies_in_alignment]
gt_mappings = [el[:2] for el in gt_mappings]
for i in list(range(0, len(ontologies_in_alignment), 1)):
    test_onto = ontologies_in_alignment[i:i+1]
    for ont_pair in test_onto:
        a, b, c = ont_pair[0], ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
        java_command = "java -jar AML_v3.1/AgreementMakerLight.jar -s conference_ontologies/" + a + ".owl" + \
                            " -t conference_ontologies/" + b + ".owl -o AML-test-results/" + c + ".rdf -a"
        process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
    print (test_onto)
    pred_aml = load_alignments("AML-test-results/")
    pred_aml = [tuple([el.split("/")[-1] for el in key][:2]) for key in pred_aml]
    outputs[ont_pair] = pred_aml
    tp = len([elem for elem in pred_aml if data[elem]])
    fn = len([key for key in gt_mappings if key not in set(pred_aml) and is_test(test_onto, key)])
    fp = len([elem for elem in pred_aml if not data[elem]])

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
    _ = [os.remove(f) for f in glob.glob('AML-test-results/*')]
    
print ("Final Results:", np.mean(results, axis=0))

[('confOf', 'sigkdd')]
1.0 0.7142857142857143 0.8333333333333333 0.7575757575757576 0.9259259259259259
[('iasted', 'sigkdd')]
0.8666666666666667 0.8666666666666667 0.8666666666666667 0.8666666666666667 0.8666666666666667
[('cmt', 'ekaw')]
0.6666666666666666 0.5454545454545454 0.6 0.5660377358490566 0.6382978723404255
[('confOf', 'iasted')]
0.8 0.4444444444444444 0.5714285714285714 0.4878048780487804 0.6896551724137931
[('conference', 'edas')]
0.75 0.5294117647058824 0.6206896551724139 0.5625 0.6923076923076923
[('cmt', 'sigkdd')]
0.9 0.75 0.8181818181818182 0.7758620689655173 0.8653846153846154
[('ekaw', 'sigkdd')]
0.8 0.7272727272727273 0.761904761904762 0.7407407407407407 0.7843137254901961
[('conference', 'confOf')]
0.7272727272727273 0.5333333333333333 0.6153846153846153 0.5633802816901409 0.6779661016949153
[('conference', 'sigkdd')]
0.8181818181818182 0.6 0.6923076923076923 0.6338028169014084 0.7627118644067796
[('confOf', 'edas')]
0.9 0.47368421052631576 0.6206896551724138 0.523

In [9]:
data_log

[('conference#Information_for_participants', 'ekaw#Programme_Brochure', '1.0'),
 ('conference#Person', 'ekaw#Person', '1.0'),
 ('conference#Tutorial', 'ekaw#Tutorial', '1.0'),
 ('conference#Review', 'ekaw#Review', '1.0'),
 ('conference#has_a_review', 'ekaw#hasReview', '1.0'),
 ('conference#Workshop', 'ekaw#Workshop', '1.0'),
 ('conference#Late_paid_applicant', 'ekaw#Late-Registered_Participant', '1.0'),
 ('conference#Early_paid_applicant',
  'ekaw#Early-Registered_Participant',
  '1.0'),
 ('conference#Organization', 'ekaw#Organisation', '1.0'),
 ('conference#Track-workshop_chair', 'ekaw#Workshop_Chair', '1.0'),
 ('conference#Abstract', 'ekaw#Abstract', '1.0'),
 ('conference#Conference_proceedings', 'ekaw#Conference_Proceedings', '1.0'),
 ('conference#Conference_volume', 'ekaw#Conference', '1.0'),
 ('conference#Rejected_contribution', 'ekaw#Rejected_Paper', '1.0'),
 ('conference#Poster', 'ekaw#Poster_Paper', '1.0'),
 ('conference#Track', 'ekaw#Track', '1.0'),
 ('conference#Topic', 'ekaw

In [122]:

# LogMap test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
prefix = "/data/Vivek/IBM/IBM-Internship/conference_ontologies/"
for i in list(range(0, len(ontologies_in_alignment), 3)):
    test_onto = ontologies_in_alignment[i:i+3]
    tp_tot, fn_tot, fp_tot = [], [], []
    for ont_pair in test_onto:
        a, b, c = prefix + ont_pair[0], prefix + ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
        !mkdir $c
        java_command = "java -jar logmap-matcher/target/logmap-matcher-4.0.jar MATCHER file:" +  a + ".owl file:" + b + ".owl " + \
                        "/data/Vivek/IBM/IBM-Internship/" + c + "/ false"
        process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        
        pred_aml = [l.strip().split("\t")[:2] for l in open(c + "/logmap2_mappings.tsv", "r").read().split("\n")[:-1]]
        pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
        tp = [elem for elem in pred_aml if data[elem]]
        fn = [key for key in gt_mappings if key not in set(pred_aml) and is_test(test_onto, key)]
        fp = [elem for elem in pred_aml if not data[elem]]
        
        tp_tot.extend(tp)
        fn_tot.extend(fn)
        fp_tot.extend(fp)
        
        !rm -rf $c
   
    precision = len(tp_tot)/(len(tp_tot)+len(fp_tot))
    recall = len(tp_tot)/(len(tp_tot)+len(fn_tot))
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (test_onto, precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
    
    
print ("Final Results:", np.mean(results, axis=0))

[('cmt', 'ekaw')] 0.75 0.5454545454545454 0.631578947368421 0.5769230769230769 0.6976744186046512
[('cmt', 'sigkdd')] 1.0 0.9166666666666666 0.9565217391304348 0.9322033898305083 0.9821428571428572
[('conference', 'sigkdd')] 0.7857142857142857 0.7333333333333333 0.7586206896551724 0.7432432432432431 0.7746478873239436
[('edas', 'iasted')] 0.875 0.3684210526315789 0.5185185185185185 0.41666666666666663 0.6862745098039215
[('ekaw', 'iasted')] 0.7 0.7 0.7 0.7 0.7
[('cmt', 'confOf')] 0.8333333333333334 0.3125 0.45454545454545453 0.35714285714285715 0.625
[('cmt', 'iasted')] 0.8 1.0 0.888888888888889 0.9523809523809523 0.8333333333333334
Final Results: [0.82057823 0.65376794 0.70123918 0.66836574 0.75701043]


In [179]:

# LogMap test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
prefix = "/data/Vivek/IBM/IBM-Internship/conference_ontologies/"
for ont_pair in ontologies_in_alignment:
    a, b, c = prefix + ont_pair[0], prefix + ont_pair[1], "LogMap-conference-results/" + ont_pair[0] + "-" + ont_pair[1]
    !rm -rf $c
    !mkdir $c
    java_command = "java -jar logmap-matcher/target/logmap-matcher-4.0.jar MATCHER file:" +  a + ".owl file:" + b + ".owl " + \
                    "/data/Vivek/IBM/IBM-Internship/" + c + "/ false"
    process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
        
    pred_aml = [l.strip().split("\t") for l in open(c + "/logmap2_mappings.tsv", "r").read().split("\n")[:-1]]
    pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
    results.extend(pred_aml)

In [300]:
data_conf, data_german, aml_data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts, max_paths, max_pathlen, max_types, ontologies_in_alignment = pickle.load(open("Input/data_conf_oaei_german_aml_thresh.pkl", "rb"))
aml_data_corrected = dict(aml_data)
for elem in aml_data:
    if elem in corrected_aml:
        aml_data_corrected[elem] = int(corrected_aml[elem])


In [284]:
fn = [l.split("\t")[:5] for l in open("validated_fn.tsv", "r").read().split("\n")[1:1200]]
fn = [[False if el =="FALSE" else True if el=="TRUE" else el for el in elem] for elem in fn]
corrected_aml = {tuple(elem[:2]): sum(elem[2:]) >= 2 for elem in fn}

In [290]:
fp = [l.split("\t")[:3] for l in open("validated_fp.tsv", "r").read().split("\n")[1:]]
fp = [[False if el =="FALSE" else True if el=="TRUE" else el for el in elem] for elem in fp]
for elem in fp:
    corrected_aml[tuple(elem[:2])] = elem[2]

In [296]:
[elem for elem in aml_data_corrected.items() if aml_data_corrected[elem[0]] != aml_data_corrected[elem[0]]

[['cmt#Paper', 'edas#Paper', True],
 ['cmt#Paper', 'sigkdd#Paper', True],
 ['edas#Paper', 'sigkdd#Paper', True],
 ['edas#Paper', 'ekaw#Paper', True],
 ['cmt#Paper', 'ekaw#Paper', True],
 ['cmt#Author', 'sigkdd#Author', True],
 ['cmt#Review', 'ekaw#Review', True],
 ['cmt#Review', 'iasted#Review', True],
 ['cmt#Person', 'edas#Person', True],
 ['cmt#Reviewer', 'iasted#Reviewer', True],
 ['ekaw#Paper', 'sigkdd#Paper', True],
 ['iasted#Speaker', 'sigkdd#Speaker', True],
 ['ekaw#Review', 'iasted#Review', True],
 ['iasted#Registration_fee', 'sigkdd#Registration_fee', True],
 ['conference#Person', 'edas#Person', True],
 ['cmt#Reviewer', 'conference#Reviewer', True],
 ['cmt#Person', 'conference#Person', True],
 ['cmt#Review', 'conference#Review', True],
 ['confOf#Author', 'sigkdd#Author', True],
 ['confOf#Social_event', 'ekaw#Social_Event', True],
 ['conference#Reviewer', 'iasted#Reviewer', True],
 ['conference#Review', 'ekaw#Review', True],
 ['confOf#Person', 'sigkdd#Person', True],
 ['edas#Ac

In [295]:
corrected_aml.items()

dict_items([(('edas#ActivePaper', 'ekaw#Paper'), False), (('conference#Conference_part', 'edas#Conference'), False), (('cmt#Paper', 'edas#ActivePaper'), False), (('cmt#Reviewer', 'ekaw#Review'), False), (('cmt#Chairman', 'edas#ConferenceChair'), False), (('cmt#Co-author', 'sigkdd#Author'), False), (('conference#Presentation', 'ekaw#Individual_Presentation'), False), (('cmt#Reviewer', 'sigkdd#Author'), False), (('cmt#Reviewer', 'ekaw#Possible_Reviewer'), False), (('cmt#Chairman', 'edas#SessionChair'), False), (('conference#Conference_document', 'edas#Conference'), False), (('conference#Conference_document', 'edas#ConferenceEvent'), False), (('cmt#Chairman', 'sigkdd#Speaker'), False), (('iasted#Building', 'sigkdd#Hotel'), False), (('cmt#Conference', 'conference#Conference_volume'), False), (('iasted#Hotel_room', 'sigkdd#Hotel'), False), (('iasted#Memeber_registration_fee', 'sigkdd#Registration_fee'), False), (('conference#Paper', 'ekaw#Paper'), False), (('ekaw#Possible_Reviewer', 'iasted

In [83]:
import requests

url = "https://montanaflynn-spellcheck.p.rapidapi.com/check/"

headers = {
    'x-rapidapi-host': "montanaflynn-spellcheck.p.rapidapi.com",
    'x-rapidapi-key': "9965b01207msh06291e57d6f2c55p1a6a16jsn0fb016da4a62"
    }

# inp_spellchecked = []
for concept in inp[731:]:
    querystring = {"text": concept}
    response = requests.request("GET", url, headers=headers, params=querystring).json()
    if response["suggestion"] != concept:
        resolved = str(concept)
        for word in response["corrections"]:
            if not re.search("[A-Z][A-Z]+", concept):
                resolved = resolved.replace(word, response["corrections"][word][0])
        
        inp_spellchecked.append(resolved)
        print (concept, resolved)
    else:
        inp_spellchecked.append(concept)




registeered applicant registered applicant
technically organised by technically organized by
ngo no
sponzorship sponsorship


In [77]:
querystring = {"text": "technically Organised By"}
response = requests.request("GET", url, headers=headers, params=querystring)
response.json()

{'original': 'technically Organised By',
 'suggestion': 'technically Organized By',
 'corrections': {'Organised': ['Organized',
   'Organist',
   'Organism',
   'Organizes',
   'Disorganize',
   'Organize',
   'Agonized']}}

In [298]:
int(False)

0

In [78]:
fn_spellchecked, fp_spellchecked = [dict(el) for el in pickle.load(open("test_v2.pkl", "rb"))]
fn_baseline, fp_baseline = [dict(el) for el in pickle.load(open("test_best.pkl", "rb"))]
fn_unhas, fp_unhas = [dict(el) for el in pickle.load(open("test_unhas.pkl", "rb"))]
fn_resolved, fp_resolved = [dict(el) for el in pickle.load(open("test_resolved.pkl", "rb"))]

fn_dict, fp_dict = {}, {}
def create_comparison_file(file, idx):
    fn, fp = [dict(el) for el in pickle.load(open(file, "rb"))]
    
    for key in fn:
        if key in fn_dict:
            fn_dict[key][idx] = fn[key]
        else:
            fn_dict[key] = ["N/A" for i in range(4)]
            fn_dict[key][idx] = fn[key]
    
    for key in fp:
        if key in fp_dict:
            fp_dict[key][idx] = fp[key]
        else:
            fp_dict[key] = ["N/A" for i in range(4)]
            fp_dict[key][idx] = fp[key]
    

create_comparison_file("test_best.pkl", 0)
create_comparison_file("test_unhas.pkl", 1)
create_comparison_file("test_v2.pkl", 2)
create_comparison_file("test_resolved.pkl", 3)

open("fn - comparison.tsv", "w+").write("\n".join(["\t".join([str(el) for el in flatten(el)]) for el in fn_dict.items()]))
open("fp - comparison.tsv", "w+").write("\n".join(["\t".join([str(el) for el in flatten(el)]) for el in fp_dict.items()]))

7796

In [14]:
ontologies_in_alignment = pickle.load(open("data_path.pkl", "rb"))[-1]
ontologies_in_alignment

[['confOf', 'sigkdd'],
 ['iasted', 'sigkdd'],
 ['cmt', 'ekaw'],
 ['confOf', 'iasted'],
 ['conference', 'edas'],
 ['cmt', 'sigkdd'],
 ['ekaw', 'sigkdd'],
 ['conference', 'confOf'],
 ['conference', 'sigkdd'],
 ['confOf', 'edas'],
 ['cmt', 'conference'],
 ['edas', 'iasted'],
 ['conference', 'iasted'],
 ['edas', 'sigkdd'],
 ['ekaw', 'iasted'],
 ['cmt', 'edas'],
 ['edas', 'ekaw'],
 ['cmt', 'confOf'],
 ['confOf', 'ekaw'],
 ['conference', 'ekaw'],
 ['cmt', 'iasted']]

In [72]:
d = {('confOf#Organization', 'sigkdd#Organizator'): (1,2,3,4),
 ('iasted#Document', 'sigkdd#Document'): (5,6,78,8)}
[[str(el) for el in flatten(el)] for el in d.items()]

[['confOf#Organization', 'sigkdd#Organizator', '1', '2', '3', '4'],
 ['iasted#Document', 'sigkdd#Document', '5', '6', '78', '8']]

In [34]:
abbreviations_dict = {}
final_dict = {}

for mapping in all_mappings:
    mapping = tuple([el.split("#")[1] for el in mapping])
    is_abb = re.search("[A-Z][A-Z]+", mapping[0])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[1].split("_")])
        if is_abb.group() in abbreviation:
            
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[1].split("_")[start:end])
            print ("left", mapping, abbreviation, fullform)
            
            rest_first = " ".join([el for el in mapping[0].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[1].split("_")[:start] + mapping[1].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

    is_abb = re.search("[A-Z][A-Z]+", mapping[1])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[0].split("_")])
        
        if is_abb.group() in abbreviation:
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[0].split("_")[start:end])
            print ("right", mapping, abbreviation, fullform)

            rest_first = " ".join([el for el in mapping[1].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[0].split("_")[:start] + mapping[0].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

keys = [el for el in list(set(flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict]))) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))

scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)

resolved_dict = {key: scored_dict[key][0] for key in scored_dict}
filtered_dict = {key: " ".join(resolved_dict[key][0].split("_")) for key in resolved_dict if resolved_dict[key][-1] > 0.9}
inp_resolved = []
for concept in inp:
    for key in filtered_dict:
        concept = concept.replace(key, filtered_dict[key])
    inp_resolved.append(concept)
inp_resolved

left ('Chair_PC', 'Program_Chair') PC Program_Chair
left ('Chair_PC', 'Program_Committee') PC Program_Committee
left ('Chair_PC', 'Program_Committee_member') PCM Program_Committee
left ('Member_PC', 'Program_Chair') PC Program_Chair
left ('Member_PC', 'Program_Committee') PC Program_Committee
left ('Member_PC', 'Program_Committee_member') PCM Program_Committee
left ('Chair_PC', 'Presenter_city') PC Presenter_city
left ('Member_PC', 'Presenter_city') PC Presenter_city
left ('OC_Member', 'Organizing_Committee_member') OCM Organizing_Committee
left ('OC_Member', 'Organizing_Committee') OC Organizing_Committee
left ('PC_Member', 'Program_Chair') PC Program_Chair
left ('PC_Member', 'Program_Committee') PC Program_Committee
left ('PC_Member', 'Program_Committee_member') PCM Program_Committee
left ('OC_Chair', 'Organizing_Committee_member') OCM Organizing_Committee
left ('OC_Chair', 'Organizing_Committee') OC Organizing_Committee
left ('PC_Chair', 'Program_Chair') PC Program_Chair
left ('PC_C

In [48]:
keys = [el for el in list(set(flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict]))) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))


In [27]:
cos_sim(*extractUSEEmbeddings(["Conference Banquet", "Dinner Banquet"]))

0.8085169792175293

In [159]:
len(pickle.load(open("../Input/data_german_dataset.pkl","rb"))[0])

130000

In [53]:
scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)


In [61]:
inp_case_handled = []
for concept in inp:
    final_list = []
    for word in concept.split(" "):
        if not re.search("[A-Z][A-Z]+", concept):
            final_list.append(word.lower())
        else:
            final_list.append(word)
    case_resolved = " ".join(final_list)
    inp_case_handled.append(case_resolved)
    
inp_case_handled

['pay',
 'rejected by',
 'Registration SIGMOD Member',
 'is connected with',
 'NGO',
 'overhead projector',
 'name of conference',
 'call for participation',
 'coffee break',
 'scientifically organised by',
 'volunteer',
 'publisher',
 'regular',
 'add program committee member',
 'contact email',
 'part of event',
 'dinner banquet',
 'social program',
 'decision',
 'is sent by',
 'was a committee chair of',
 'organisation',
 'paper',
 'assign external reviewer',
 'was a member of',
 'has cost amount',
 'session chair',
 'single level conference',
 'hotel',
 'deadline hotel reservation',
 'country',
 'important dates',
 'paper',
 'has parts',
 'event',
 'paper due on',
 'is the 1th part of',
 'has surname',
 'wireless communications topic',
 'two level conference',
 'has first name',
 'bid',
 'is designed for',
 'author',
 'double hotel room',
 'review form',
 'presentation',
 'subject area',
 'positive integer',
 'deadline abstract submission',
 'has a degree',
 'is present in',
 'rela

In [23]:
Ontology("conference_ontologies/conference.owl").triples

[('Conference_proceedings', 'string', 'has_a_name'),
 ('Committee_member', 'Committee', 'was_a_member_of'),
 ('Review', 'Conference_document', 'subclass_of'),
 ('Review_preference', 'Reviewer', 'belongs_to_reviewers'),
 ('Information_for_participants', 'Conference_document', 'subclass_of'),
 ('Regular_author', 'Conference_contributor', 'subclass_of'),
 ('Late_paid_applicant', 'Paid_applicant', 'subclass_of'),
 ('Active_conference_participant', 'Presentation', 'gives_presentations'),
 ('Invited_speaker', 'Conference_contributor', 'subclass_of'),
 ('Program_committee', 'Conference_volume', 'was_a_program_committee_of'),
 ('Topic', 'Review_preference', 'has_been_assigned_a_review_reference'),
 ('Conference_volume', 'Important_dates', 'has_important_dates'),
 ('Rejected_contribution', 'Reviewed_contribution', 'subclass_of'),
 ('Contribution_co-author', 'Regular_author', 'subclass_of'),
 ('Active_conference_participant', 'Conference_contributor', 'subclass_of'),
 ('Presentation', 'Abstract'

In [276]:
aml_data = pickle.load(open("Input/data_conf_oaei_german_aml_thresh.pkl", "rb"))[2]
# Out of 19, 13 are properties
# Total: Out of 305, 46 are properties
[elem for elem in fn if aml_data[elem[0]]]

[(('conference#has_an_email', 'sigkdd#E-mail'), 0.7713571766026859),
 (('conference#has_the_first_name', 'confOf#hasFirstName'), 0.880357117961549),
 (('conference#Conference_contribution', 'confOf#Contribution'),
  0.8868231393924292),
 (('conference#Conference_fees', 'sigkdd#Fee'), 0.6188696994468148),
 (('edas#hasName', 'sigkdd#Name_of_conference'), 0.4805131363962989),
 (('cmt#hasConferenceMember', 'edas#hasMember'), 0.6716822236329376),
 (('edas#startDate', 'sigkdd#Start_of_conference'), 0.5281862044760653),
 (('conference#has_the_first_name', 'edas#hasFirstName'), 0.880357117961549),
 (('cmt#memberOfConference', 'edas#isMemberOf'), 0.5384340162535509),
 (('conference#has_a_track-workshop-tutorial_topic', 'confOf#hasTopic'),
  0.2714889827757321),
 (('cmt#submitPaper', 'sigkdd#submit'), 0.7581631096558908),
 (('conference#has_the_last_name', 'confOf#hasSurname'), 0.87604244145975),
 (('cmt#ProgramCommitteeChair', 'sigkdd#Program_Chair'), 0.8825911780203144),
 (('conference#Late_pa

In [148]:
fp

[(('conference#Conference', 'confOf#Conference'), 0.9973421982532529),
 (('edas#Sponsorship', 'sigkdd#Sponzor'), 0.968656566134342),
 (('confOf#Organization', 'sigkdd#Organizator'), 0.998551220616579),
 (('edas#ConferenceSession', 'ekaw#Conference_Session'), 0.9999999884580107),
 (('conference#Conference', 'ekaw#Conference'), 0.9973162966528599),
 (('conference#Presentation', 'iasted#Video_presentation'),
  0.9121530492200467),
 (('edas#Presenter', 'ekaw#Presenter'), 0.9999999981909232),
 (('iasted#Sponzorship', 'sigkdd#Sponzor'), 0.9635374504545844),
 (('conference#Presentation', 'iasted#Presentation'), 0.9999998994128296),
 (('iasted#Memeber_registration_fee', 'sigkdd#Registration_Non-Member'),
  0.9158709829332377),
 (('iasted#Hotel_room', 'sigkdd#Hotel'), 0.9344941446887469),
 (('conference#Conference', 'edas#Conference'), 0.9999998180349513),
 (('iasted#Document', 'sigkdd#Document'), 0.9999999307574674),
 (('cmt#Document', 'iasted#Document'), 0.9999999162323577),
 (('conference#Ca

In [147]:
s = set(props)
len([elem for elem in gt_mappings if tuple(elem[:2]) not in s])

259