In [1]:
# Construction of dataset

import os, itertools, time, pickle, sys, glob, requests
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import networkx as nx
import matplotlib.pyplot as plt
from orderedset import OrderedSet
from copy import deepcopy
%matplotlib inline  

In [35]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list (one level deep)."""
    return [item for sublist in l for item in sublist]

class Ontology():
    """Read-only view of an RDF/XML (OWL) ontology parsed with ``xml.dom.minidom``.

    On construction the file is parsed once and the following are cached:
    ``mapping_dict`` (class ID -> rdfs:label), ``subclasses`` (raw DOM pairs),
    ``object_properties`` / ``data_properties`` (DOM elements), ``triples`` and
    ``classes`` (string identifiers). ``parents_dict`` maps a class to the
    list of its direct superclasses and is rebuilt by ``get_subclasses``.
    """

    def __init__(self, ontology):
        """Parse ``ontology`` (anything ``minidom.parse`` accepts) and build the caches."""
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        # Must run first: extract_ID() consults mapping_dict for coded identifiers.
        self.construct_mapping_dict()

        self.parents_dict = {}
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def construct_mapping_dict(self):
        """Build ``mapping_dict`` (raw class ID -> first rdfs:label text) and its inverse."""
        self.mapping_dict = {}
        for el in self.root.getElementsByTagName("owl:Class"):
            labels = self.get_child_node(el, "rdfs:label")
            if labels:
                self.mapping_dict[self.extract_ID(el, False)] = labels[0].firstChild.nodeValue
        self.mapping_dict_inv = {label: key for key, label in self.mapping_dict.items()}
        return

    def get_child_node(self, element, tag):
        """Return the direct child elements of ``element`` whose tag name equals ``tag``."""
        return [e for e in element.childNodes if isinstance(e, minidom.Element) and e.tagName == tag]

    def has_attribute_value(self, element, attribute, value):
        """True when the part of ``attribute`` after the last '#' equals ``value``."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self, rootpath=False):
        """Return subclass relations as (subclass, superclass, "subclass_of", "Subclass") tuples.

        Also refreshes ``parents_dict`` as a side effect of ``get_subclasses``.
        """
        # NOTE(review): ``rootpath`` is accepted but not forwarded; the lookup always
        # runs with rootpath=False. Kept as-is because downstream identifiers depend
        # on it - confirm whether forwarding was intended.
        subclasses = self.get_subclasses(rootpath=False)
        return [(b,a,c,d) for (a,b,c,d) in subclasses]

    def parse_triples(self, union_flag=0, subclass_of=True, rootpath=False):
        """Collect (domain, range, property, kind) tuples for every property with a domain and range.

        With ``union_flag == 0`` every domain/range combination yields its own
        tuple; otherwise the domains and the ranges are each joined with "###".
        When ``subclass_of`` is true the subclass tuples are appended.
        Duplicates are removed, so the order of the result is unspecified.
        """
        obj_props = [(prop, "Object Property") for prop in self.object_properties]
        data_props = [(prop, "Datatype Property") for prop in self.data_properties]
        props = obj_props + data_props
        all_triples = []
        for prop, prop_type in props:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            if not domain_children or not range_children:
                continue
            # No ID on the domain/range element itself: look for classes nested inside it
            # (e.g. a union expression).
            if not domain_prop:
                domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null([self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                prop_id = self.extract_ID(prop)
                if union_flag == 0:
                    all_triples.extend([(dom, rng, prop_id, prop_type) for dom, rng in itertools.product(domain_prop, range_prop)])
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id, prop_type))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples(rootpath))
        return list(set(all_triples))

    def get_triples(self, union_flag=0, subclass_of=True, rootpath=False):
        """Recompute and return the triples (see ``parse_triples``)."""
        return self.parse_triples(union_flag, subclass_of, rootpath)

    def parse_subclasses(self, union_flag=0):
        """Scan every rdfs:subClassOf element and return raw DOM tuples.

        Each tuple is (element_a, element_b, relation, kind). For plain subclass
        links the tuple is (superclass element, subclass element, "subclass_of",
        "Subclass"). For ``owl:Restriction`` with ``owl:someValuesFrom`` it is
        (restricted class element, value element, property ID, property kind).
        ``union_flag`` is unused.
        """
        subclasses = self.root.getElementsByTagName("rdfs:subClassOf")
        subclass_pairs = []
        for el in subclasses:
            inline_subclasses = self.extract_ID(el)
            if inline_subclasses:
                # <rdfs:subClassOf rdf:resource="..."/> names the superclass directly.
                subclass_pairs.append((el, el.parentNode, "subclass_of", "Subclass"))
                continue

            level1_class = self.get_child_node(el, "owl:Class")
            if level1_class:
                # Superclass given as a nested <owl:Class>; anonymous class
                # expressions (no ID) are skipped.
                if self.extract_ID(level1_class[0]):
                    subclass_pairs.append((level1_class[0], el.parentNode, "subclass_of", "Subclass"))
                continue

            restriction = el.getElementsByTagName("owl:Restriction")
            if not restriction:
                continue
            prop = self.get_child_node(restriction[0], "owl:onProperty")
            some_vals = self.get_child_node(restriction[0], "owl:someValuesFrom")

            if not prop or not some_vals:
                continue
            try:
                # First assume an object property, referenced either by ID or as a nested element.
                if self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                    subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop[0]), "Object Property"))
                elif self.extract_ID(prop[0]) and not self.extract_ID(some_vals[0]):
                    class_vals = self.get_child_node(some_vals[0], "owl:Class")
                    subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop[0]), "Object Property"))
                elif not self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                    prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                    subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop_vals[0]), "Object Property"))
                else:
                    prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                    class_vals = self.get_child_node(some_vals[0], "owl:Class")
                    subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop_vals[0]), "Object Property"))
            except Exception:
                # Best-effort fallback: the expected nested element was missing, so
                # retry treating the property as a nested datatype property.
                try:
                    if not self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                        prop_vals = self.get_child_node(prop[0], "owl:DatatypeProperty")
                        subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop_vals[0]), "Datatype Property"))
                    elif not self.extract_ID(prop[0]) and not self.extract_ID(some_vals[0]):
                        prop_vals = self.get_child_node(prop[0], "owl:DatatypeProperty")
                        class_vals = self.get_child_node(some_vals[0], "owl:Class")
                        subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop_vals[0]), "Datatype Property"))
                except Exception as e:
                    print (e)
                    continue
        return subclass_pairs

    def extract_ns(self):
        '''
        Extracts namespace of an ontology

        Returns the default ``xmlns`` of the rdf:RDF element when it ends with
        "#"; otherwise the value of the first entity declared in the DOCTYPE.
        '''
        ns = self.ontology_obj.getElementsByTagName("rdf:RDF")[0].getAttribute("xmlns")
        # endswith() also covers a missing attribute (empty string), where
        # indexing ns[-1] would raise IndexError.
        if ns.endswith("#"):
            return ns
        return self.ontology_obj.doctype.entities.item(0).firstChild.nodeValue

    def get_subclasses(self, rootpath=False):
        """Return the parsed subclass tuples with DOM elements replaced by identifiers.

        Rebuilds ``parents_dict`` (class -> list of direct superclasses) as a
        side effect. Tuples touching "Thing" or having an empty field are
        dropped from the result. With ``rootpath`` true, coded IDs are not
        translated to labels.
        """
        subclasses = [(self.extract_ID(a, not rootpath), self.extract_ID(b, not rootpath), c, d) for (a,b,c,d) in self.subclasses]
        self.parents_dict = {}
        for (a,b,c,d) in subclasses:
            if c == "subclass_of" and a!="Thing" and b!="Thing":
                self.parents_dict.setdefault(b, []).append(a)
        return [el for el in subclasses if el[0] and el[1] and el[2] and el[0]!="Thing" and el[1]!="Thing"]

    def filter_null(self, data):
        """Return the truthy items of ``data``."""
        return [el for el in data if el]

    def extract_ID(self, element, check_coded = True):
        """Return the identifier of ``element`` from rdf:ID, rdf:resource or rdf:about.

        Only the part after the last "#" is used. When ``check_coded`` is true
        and the ID contains an underscore and at least three digits, it is
        looked up in ``mapping_dict`` (KeyError if absent). Otherwise the
        "UNDEFINED_" and "DO_" markers are stripped from the ID.
        """
        element_id = element.getAttribute("rdf:ID") or element.getAttribute("rdf:resource") or element.getAttribute("rdf:about")
        element_id = element_id.split("#")[-1]
        if len(list(filter(str.isdigit, element_id))) >= 3 and "_" in element_id and check_coded:
            return self.mapping_dict[element_id]
        return element_id.replace("UNDEFINED_", "").replace("DO_", "")

    def parse_classes(self):
        """Return the unique non-empty identifiers of all owl:Class elements plus the
        first three fields of every triple (so property IDs and "subclass_of" are included)."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        subclass_classes = list(set(itertools.chain.from_iterable(el[:-1] for el in self.triples)))
        return list(set(self.filter_null(class_elems + subclass_classes)))

    def get_classes(self):
        """Return the list cached by ``parse_classes``."""
        return self.classes

    def get_entities(self):
        """Return the unique non-empty identifiers of all owl:Class elements."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def _parse_properties(self, kind):
        """Return top-level property elements of ``kind`` ("ObjectProperty" or "DatatypeProperty").

        Includes owl:<kind> elements plus owl:FunctionalProperty and
        owl:InverseFunctionalProperty elements carrying an rdf:type child
        whose rdf:resource ends in ``kind``.
        """
        def typed_as_kind(tag):
            return [el for el in self.get_child_node(self.root, tag)
                    if any(self.has_attribute_value(type_el, "rdf:resource", kind)
                           for type_el in self.get_child_node(el, "rdf:type"))]

        direct = self.get_child_node(self.root, 'owl:' + kind)
        return direct + typed_as_kind('owl:FunctionalProperty') + typed_as_kind('owl:InverseFunctionalProperty')

    def parse_data_properties(self):
        """Return the DOM elements of all top-level datatype properties."""
        return self._parse_properties("DatatypeProperty")

    def parse_object_properties(self):
        """Return the DOM elements of all top-level object properties."""
        return self._parse_properties("ObjectProperty")

    def get_object_properties(self):
        """Return the unique non-empty identifiers of the object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return the unique non-empty identifiers of the datatype properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))


In [36]:
# Folder that load_alignments() scans for the reference-alignment .txt file;
# the source/target file names found in it are resolved relative to this folder.
alignment_folder = "german_datasets/mapping freizeit/"

# Filled as a side effect of load_alignments(): one (source path, target path)
# tuple per alignment section.
ontologies_in_alignment = []

# Load reference alignments 
def load_alignments(folder):
    """Parse the reference-alignment text file found in ``folder``.

    The file is split into sections on a dashed separator line. Each section
    names its ontologies on " + Source: " / " + Target: " lines and lists
    correspondences on lines starting with " -" in the form ``a <-> b``.

    Returns one list per section, each holding ``[source_entity, target_entity]``
    pairs prefixed with the lower-cased ontology name and "#".

    Side effect: appends ``(folder + source_file, folder + target_file)`` to the
    module-level ``ontologies_in_alignment`` for every section.
    Raises IndexError when the folder contains no ``.txt`` file.
    """
    gt = []
    # sorted() makes the choice deterministic when several .txt files are present.
    path = [folder + l for l in sorted(os.listdir(folder)) if l.endswith(".txt")][0]
    # Context manager so the handle is released even if reading fails.
    with open(path) as alignment_file:
        content = alignment_file.read()
    mappings = [section.strip() for section in content.split("--------------------------------------------------------") if section.strip()]
    for mapping in mappings:
        rows = mapping.split("\n")
        src = [line.split(":")[-1].strip() for line in rows if line.startswith(" + Source: ")][0]
        targ = [line.split(":")[-1].strip() for line in rows if line.startswith(" + Target: ")][0]
        ontologies_in_alignment.append((folder + src, folder + targ))
        # Ontology prefix: file name without extension, dots -> underscores, lower-cased.
        src = src.rsplit(".",1)[0].replace(".", "_").lower()
        targ = targ.rsplit(".",1)[0].replace(".", "_").lower()
        # Per correspondence line: drop the leading "-", split on "<->", keep the text
        # before the first ":", and normalise ",-" and "." to "_".
        lines = [["_".join(row.strip().split(":")[0].replace(",-", "_").split(".")) for row in line.split("-",1)[1].strip().split("<->")]
                 for line in rows if line.startswith(" -")]
        lines = [[src + "#" + line[0], targ + "#" + line[1]] for line in lines]
        gt.append(lines)
    return gt

# Extracting USE embeddings

def extractUSEEmbeddings(words):
    """Load the multilingual Universal Sentence Encoder (v3) from TF Hub, apply it
    to ``words`` and return the result converted with ``.numpy()``.

    The model is loaded on every call.
    """
    encoder_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed"
    encoder = hub.load(encoder_url)
    return encoder(words).numpy()

def cos_sim(a,b):
    """Cosine similarity of ``a`` and ``b``, computed as one minus SciPy's cosine distance."""
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance


# Parse the reference alignments (one list of pairs per section); this call also
# appends the corresponding ontology file paths to ontologies_in_alignment.
reference_alignments = load_alignments(alignment_folder)


In [49]:
reference_alignments

[[['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Artistik',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Artistik'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Kerzen',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Basteln_Kerzen'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Malen',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Basteln_Malen'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Papier',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Basteln_Papier'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Perlen',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Basteln_Perlen'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Serviettentechnik',
   'google_freizeit#World_Deutsch_Online-Shops_Freizeit_Basteln_Serviettentechnik'],
  ['dmoz_freizeit#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_Stempeltechnik',
   'google_frei

In [18]:
# Combinatorial mapping generation
# For every ontology pair, enumerate all class x class, object-property x
# object-property and datatype-property x datatype-property candidates.
all_mappings = []
for l in ontologies_in_alignment:
    ont1 = Ontology(l[0])
    ont2 = Ontology(l[1])
    
    ent1 = ont1.get_classes()
    ent2 = ont2.get_classes()
    
    obj1 = ont1.get_object_properties()
    obj2 = ont2.get_object_properties()
    
    data1 = ont1.get_data_properties()
    data2 = ont2.get_data_properties()

    mappings = list(itertools.product(ent1, ent2)) + list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))
    
    # Same prefix convention as load_alignments(): file name without extension,
    # dots -> underscores, lower-cased.
    pre1 = l[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    pre2 = l[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    print (pre1, pre2)
    all_mappings.extend([(pre1 + "#" + el[0], pre2 + "#" + el[1]) for el in mappings])
    

# Every candidate starts as a negative; reference pairs are flipped to True below.
data = {mapping: False for mapping in all_mappings}

# load_alignments() returns one list of [source, target] pairs per alignment
# section. Flatten the sections into hashable (source, target) tuples; a
# tuple of lists cannot be put in a set. Entries that are already flat pairs
# (e.g. when this cell is re-run) are kept as they are.
reference_pairs = []
for alignment in reference_alignments:
    if alignment and isinstance(alignment[0], str):
        reference_pairs.append(tuple(alignment[:2]))
    else:
        reference_pairs.extend(tuple(pair[:2]) for pair in alignment)
reference_alignments = reference_pairs

s = set(all_mappings)
for mapping in set(reference_alignments):
    if mapping in s:
        data[mapping] = True
    else:
        # Retry with the ",-" -> "_" normalisation before reporting the pair as unmatched.
        mapping = tuple([el.replace(",-", "_") for el in mapping])
        if mapping in s:
            data[mapping] = True
        else:
            print (mapping)

dmoz_freizeit google_freizeit


In [48]:
def write_results(final_list):
    """Render ``final_list`` as an Alignment-format RDF/XML document and return it as a string.

    ``final_list`` is an iterable of ``(entity1, entity2, score)`` where each
    entity is ``"<ontology prefix>#<name>"``; the prefix is replaced by the
    namespace obtained from ``Ontology(...).extract_ns()``.

    Reads the module-level ``ont_name1`` / ``ont_name2`` (paths or URLs of the
    two ontologies); both files are parsed to obtain their namespaces.
    """
    from xml.sax.saxutils import escape

    def xml_safe(value):
        # Names read from ontologies/alignment files may contain &, <, > or quotes,
        # which would otherwise make the generated document malformed.
        return escape(str(value), {"'": "&apos;", '"': "&quot;"})

    ont_name_parsed1 = Ontology(ont_name1).extract_ns()
    ont_name_parsed2 = Ontology(ont_name2).extract_ns()
    # Local paths are written as file:// locations; http(s) URLs are kept as given.
    ont_name1_pre = ont_name1 if (ont_name1.startswith("http://") or ont_name1.startswith("https://")) else "file://" + ont_name1
    ont_name2_pre = ont_name2 if (ont_name2.startswith("http://") or ont_name2.startswith("https://")) else "file://" + ont_name2
    header = \
    """<?xml version='1.0' encoding='utf-8' standalone='no'?>
<rdf:RDF xmlns='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'
         xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'
         xmlns:xsd='http://www.w3.org/2001/XMLSchema#'
         xmlns:align='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'>
<Alignment>
  <xml>yes</xml>
  <level>0</level>
  <type>**</type>
  <onto1>
    <Ontology rdf:about="{}">
      <location>{}</location>
    </Ontology>
  </onto1>
  <onto2>
    <Ontology rdf:about="{}">
      <location>{}</location>
    </Ontology>
  </onto2>""".format(xml_safe(ont_name_parsed1.split("#")[0]), xml_safe(ont_name1_pre),
                     xml_safe(ont_name_parsed2.split("#")[0]), xml_safe(ont_name2_pre))
    cell_template = """
  <map>
    <Cell>
      <entity1 rdf:resource='{}'/>
      <entity2 rdf:resource='{}'/>
      <relation>=</relation>
      <measure rdf:datatype='http://www.w3.org/2001/XMLSchema#float'>{}</measure>
    </Cell>
  </map>"""
    # Collect the pieces and join once instead of growing a string in the loop.
    parts = [header]
    for (a,b,score) in final_list:
        # Everything after the first "#" is the entity name (re-joined in case it contains "#").
        entity1 = ont_name_parsed1 + "#".join(a.split("#")[1:])
        entity2 = ont_name_parsed2 + "#".join(b.split("#")[1:])
        parts.append(cell_template.format(xml_safe(entity1), xml_safe(entity2), xml_safe(score)))
    parts.append("""
</Alignment>
</rdf:RDF>""")
    return "".join(parts)


# Load reference alignments 
def load_alignments(folder):
    """Parse the reference-alignment text file found in ``folder``.

    The file is split into sections on a dashed separator line. Each section
    names its ontologies on " + Source: " / " + Target: " lines and lists
    correspondences on lines starting with " -" in the form ``a <-> b``.

    Returns one list per section, each holding
    ``[source_entity, target_entity, 1.0]`` triples (the constant 1.0 is the
    score attached to every reference correspondence), with the entities
    prefixed by the lower-cased ontology name and "#".

    Side effect: appends ``(folder + source_file, folder + target_file)`` to the
    module-level ``ontologies_in_alignment`` for every section.
    Raises IndexError when the folder contains no ``.txt`` file.
    """
    gt = []
    # sorted() makes the choice deterministic when several .txt files are present.
    path = [folder + l for l in sorted(os.listdir(folder)) if l.endswith(".txt")][0]
    # Context manager so the handle is released even if reading fails.
    with open(path) as alignment_file:
        content = alignment_file.read()
    mappings = [section.strip() for section in content.split("--------------------------------------------------------") if section.strip()]
    for mapping in mappings:
        rows = mapping.split("\n")
        src = [line.split(":")[-1].strip() for line in rows if line.startswith(" + Source: ")][0]
        targ = [line.split(":")[-1].strip() for line in rows if line.startswith(" + Target: ")][0]
        ontologies_in_alignment.append((folder + src, folder + targ))
        # Ontology prefix: file name without extension, dots -> underscores, lower-cased.
        src = src.rsplit(".",1)[0].replace(".", "_").lower()
        targ = targ.rsplit(".",1)[0].replace(".", "_").lower()
        # Per correspondence line: drop the leading "-", split on "<->", keep the text
        # before the first ":", and normalise ",-" and "." to "_".
        lines = [["_".join(row.strip().split(":")[0].replace(",-", "_").split(".")) for row in line.split("-",1)[1].strip().split("<->")]
                 for line in rows if line.startswith(" -")]
        lines = [[src + "#" + line[0], targ + "#" + line[1], 1.0] for line in lines]
        gt.append(lines)
    return gt

alignment_folder = "german_datasets/mapping webdirectory/"
ontologies_in_alignment = []

final_list = load_alignments(alignment_folder)

# The pairs collected by load_alignments() are replaced by the absolute
# locations of the ontology files.
# NOTE(review): final_list[i] is assumed to be the section for pair i, i.e. the
# sections of the alignment file must appear in this order - confirm.
ontologies_in_alignment = [('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/dmoz.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/google.owl'),
 ('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/dmoz.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/web.owl'),
 ('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/dmoz.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/yahoo.small.owl'),
 ('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/google.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/web.owl'),
 ('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/google.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/yahoo.small.owl'),
 ('/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/web.owl',
  '/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/yahoo.small.owl')]

for i,ont_pair in enumerate(ontologies_in_alignment):
    # write_results() reads ont_name1 / ont_name2 from module scope.
    ont_name1, ont_name2 = ont_pair[0], ont_pair[1]
    rdf = write_results(final_list[i])
    # Output name: "<onto1>-<onto2>.rdf", with inner dots of each file name turned into "_".
    c = "_".join(ont_name1.split("/")[-1].split(".")[:-1]) + "-" + "_".join(ont_name2.split("/")[-1].split(".")[:-1]) + ".rdf"
    # Context manager so the file is closed even if the write fails.
    with open("/data/Vivek/IBM/VeeAlign/datasets/web-directory/alignments/" + c, "w+") as f:
        f.write(rdf)
    
    
    

In [2]:


def camel_case_split(identifier):
    """Split ``identifier`` at camel-case boundaries and return the pieces.

    A boundary lies between a lower-case and an upper-case letter, or before
    the last capital of an upper-case run that is followed by a lower-case
    letter ("HTTPResponse" -> "HTTP", "Response"). An empty string gives [].
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for match in re.finditer(boundary_pattern, identifier):
        pieces.append(match.group(0))
    return pieces

# Abbreviation resolution preprocessing
def parse(word):
    """Turn an identifier into space-separated tokens, splitting first on
    camel-case boundaries and then on underscores."""
    tokens = []
    for chunk in camel_case_split(word):
        tokens.extend(chunk.split("_"))
    return " ".join(tokens)

abbreviations_dict = {}
# abbreviation -> list of (candidate full form, rest of abbreviated name, rest of full name)
final_dict = {}


def _record_abbreviation(side, abbr_name, full_name, mapping, final_dict):
    """Record in ``final_dict`` a candidate expansion for the first run of two or
    more capitals in ``abbr_name``, when that run matches consecutive initials of
    the underscore-separated tokens of ``full_name``."""
    is_abb = re.search("[A-Z][A-Z]+", abbr_name)
    if not is_abb:
        return
    short = is_abb.group()
    # Initials and slices are both taken over the non-empty tokens so that the
    # positions found in the initials string index the same token list.
    tokens = [el for el in full_name.split("_") if el]
    abbreviation = "".join([el[0].upper() for el in tokens])
    if short not in abbreviation:
        return
    start = abbreviation.find(short)
    end = start + len(short)
    fullform = "_".join(tokens[start:end])
    print (side, mapping, abbreviation, fullform)

    # Context left on each side once the abbreviation / its expansion is removed.
    rest_first = " ".join([el for el in abbr_name.replace(short, "").split("_") if el]).lower()
    rest_second = " ".join(tokens[:start] + tokens[end:])
    final_dict.setdefault(short, []).append((fullform, rest_first, rest_second))


for mapping in all_mappings:
    mapping = tuple([el.split("#")[1] for el in mapping])
    # Try both directions: abbreviation in the first name, then in the second.
    _record_abbreviation("left", mapping[0], mapping[1], mapping, final_dict)
    _record_abbreviation("right", mapping[1], mapping[0], mapping, final_dict)

# Embed every distinct non-empty context string once.
keys = list({part for entries in final_dict.values() for tup in entries for part in tup[1:] if part})
abb_embeds = dict(zip(keys, extractUSEEmbeddings([parse(el) for el in keys]))) if keys else {}

# Score each candidate by the similarity of the two contexts (0 when either is empty).
scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)

# Keep the best-scoring expansion per abbreviation, and only when its score exceeds 0.9.
resolved_dict = {key: scored_dict[key][0] for key in scored_dict}
filtered_dict = {key: " ".join(resolved_dict[key][0].split("_")) for key in resolved_dict if resolved_dict[key][-1] > 0.9}


NameError: name 'all_mappings' is not defined

In [4]:

# Abbreviation -> expansion table; set to empty in this cell.
filtered_dict = {}
def camel_case_split(identifier):
    """Split ``identifier`` at camel-case boundaries and return the pieces.

    A boundary lies between a lower-case and an upper-case letter, or before
    the last capital of an upper-case run that is followed by a lower-case
    letter ("HTTPResponse" -> "HTTP", "Response"). An empty string gives [].
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for match in re.finditer(boundary_pattern, identifier):
        pieces.append(match.group(0))
    return pieces

# Abbreviation resolution preprocessing
def parse(word):
    """Turn an identifier into space-separated tokens, splitting first on
    camel-case boundaries and then on underscores."""
    tokens = []
    for chunk in camel_case_split(word):
        tokens.extend(chunk.split("_"))
    return " ".join(tokens)

# Extracting USE embeddings

def extractUSEEmbeddings(words):
    """Load the multilingual Universal Sentence Encoder (v3) from TF Hub, apply it
    to ``words`` and return the result converted with ``.numpy()``.

    The model is loaded on every call.
    """
    encoder_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed"
    encoder = hub.load(encoder_url)
    return encoder(words).numpy()

def cos_sim(a,b):
    """Cosine similarity of ``a`` and ``b``, computed as one minus SciPy's cosine distance."""
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

# Gather every class, property and triple component of each ontology, keyed as
# "<ontology prefix>#<element>", and remember the parsed ontology per prefix.
extracted_elems = []
mapping_ont = {}
for ont_name in set(flatten(ontologies_in_alignment)):
    ont = Ontology(ont_name)
    entities = ont.get_classes()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples())))
    ont_name = ont_name.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    mapping_ont[ont_name] = ont
    for elem in entities + props + triples:
        extracted_elems.append(ont_name + "#" + elem)

extracted_elems = list(set(extracted_elems))

# Text fed to the encoder: the element's label when the ontology's mapping_dict
# has one, otherwise the element name itself, tokenised by parse().
inp = []
for word in extracted_elems:
    pieces = word.split("#")
    ont_name, elem = pieces[0], pieces[1]
    label = mapping_ont[ont_name].mapping_dict.get(elem, elem)
    inp.append(parse(label))

print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

# Index 0 is reserved for the "<UNK>" placeholder, which gets an all-zero vector.
extracted_elems = ["<UNK>"] + extracted_elems

embeds = np.array(extractUSEEmbeddings(inp))
unk_vector = np.zeros(embeds.shape[1],)
embeds = np.array([unk_vector] + list(embeds))
embeddings = dict(zip(extracted_elems, embeds))


# Lookup tables between element keys and their row position.
emb_vals = list(embeddings.values())
emb_indexer = {key: i for i, key in enumerate(embeddings)}
emb_indexer_inv = dict(enumerate(embeddings))


Total number of extracted unique classes and properties from entire RA set:  3032


In [65]:
def path_to_root(elem, ont_mappings, curr = None, rootpath = None):
    """Enumerate every path from ``elem`` up to a root of ``ont_mappings``.

    ``ont_mappings`` maps a node to the list of its parents; a node that is
    missing or has no parents ends a path. ``curr`` is the path walked so far
    (``elem`` is appended to it) and ``rootpath`` collects the finished paths.

    Returns ``rootpath``: a list of paths, each a list starting at the first
    element of ``curr`` and ending at a root.
    """
    # Fresh lists per call: mutable default arguments would be shared between
    # calls and leak paths from one call into the next.
    if curr is None:
        curr = []
    if rootpath is None:
        rootpath = []
    curr.append(elem)
    if elem not in ont_mappings or not ont_mappings[elem]:
        rootpath.append(curr)
        return rootpath
    for node in ont_mappings[elem]:
        # Each parent extends its own copy of the path so sibling branches stay independent.
        path_to_root(node, ont_mappings, list(curr), rootpath)
    return rootpath

def get_one_hop_neighbours(ont, K=1):
    """Build the neighbourhood of every entity that occurs in a triple of the ontology at path ``ont``.

    Returns a dict keyed by "<ontology prefix>#<entity>". Each value is a list
    of four neighbour groups, every group being a list of paths (lists of
    prefixed names):
      0 - paths from the entity up to a root, from the ontology's ``parents_dict``;
      1 - for "Subclass" triples, the first entity, filed under the second;
      2 - entities linked by an object property (recorded in both directions);
      3 - entities linked by a datatype property (recorded in both directions).
    Groups 1-3 hold single-element paths. ``K`` is not used.
    Prints ``ont`` and any unexpected triple kind.
    """
    ont_obj = Ontology(ont)
    triples = ont_obj.get_triples(rootpath=True)
    endpoints = [(first, second) for (first, second, _prop, _kind) in triples]
    neighbours_dict = {}
    for elem in set(flatten(endpoints)):
        neighbours_dict[elem] = [[], [], [], []]
    print (ont)

    # Property triples are symmetric: each endpoint becomes a neighbour of the other.
    symmetric_groups = {"Object Property": 2, "Datatype Property": 3}
    for (e1, e2, p, d) in triples:
        if e1 == e2:
            continue
        if d in symmetric_groups:
            group = symmetric_groups[d]
            neighbours_dict[e1][group].append([e2])
            neighbours_dict[e2][group].append([e1])
        elif d == "Subclass":
            neighbours_dict[e2][1].append([e1])
        else:
            print ("Error wrong value of d: ", d)

    parents = ont_obj.parents_dict
    root_paths = {}
    for elem in parents:
        root_paths[elem] = path_to_root(elem, parents, [], [])

    for entity, groups in neighbours_dict.items():
        if entity in root_paths and len(root_paths[entity]) > 0:
            groups[0].extend(root_paths[entity])

    # Prefix: file name without extension, dots -> underscores, lower-cased.
    prefix = ont.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() + "#"
    prefixed = {}
    for entity, groups in neighbours_dict.items():
        prefixed[prefix + entity] = [[[prefix + node for node in path] for path in group]
                                     for group in groups]
    return prefixed

# neighbours_dicts = {ont.split("/")[-1].split(".")[0]: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
# Merge the neighbourhoods of all ontologies into one table (keys carry the ontology prefix).
neighbours_dicts = {}
for ont in set(flatten(ontologies_in_alignment)):
    neighbours_dicts.update(get_one_hop_neighbours(ont))

# Largest number of non-empty neighbour groups any element has.
max_types = np.max([sum(1 for nbr_type in elem if nbr_type) for elem in neighbours_dicts.values()])
# Largest number of paths found in a single neighbour group.
max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
# Length of the longest path in any group.
max_pathlen = np.max([len(path)
                      for elem in neighbours_dicts.values()
                      for nbr_type in elem
                      for path in nbr_type], axis=0)

# max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
# max_pathlen = np.max(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in neighbours_dicts.values()]), axis=0)
# neighbours_dicts_lenpadded = {elem: [[path + ["<UNK>" for i in range(max_pathlen -len(path))] for path in nbr_type]
#                                 for nbr_type in neighbours_dicts[elem]] for elem in neighbours_dicts}
# neighbours_dicts_pathpadded = {elem: [nbr_type + [["<UNK>" for j in range(max_pathlen)] for i in range(max_paths - len(nbr_type))]
#                                 for k,nbr_type in enumerate(neighbours_dicts_lenpadded[elem])] for elem in neighbours_dicts_lenpadded}
# neighbours_dicts_pathpadded = {elem: np.array(neighbours_dicts_pathpadded[elem]) for elem in neighbours_dicts_pathpadded}
# data_items = np.array(list(data.items()))

# data_shuffled_t = [elem for elem in data_items if elem[1]]
# data_shuffled_f = [elem for elem in data_items if not elem[1]]
# np.random.shuffle(data_items)
# data_shuffled_f = data_shuffled_f[:150000-len(data_shuffled_t)]

# indices = np.random.permutation(len(data_shuffled_t) + len(data_shuffled_f[:130000-len(data_shuffled_t)]))

# data_shuffled = data_shuffled_t + data_shuffled_f
# indices = np.random.permutation(len(data_shuffled))

# indices = np.random.permutation(len(data_shuffled))

# data = OrderedDict(data_items)

# ontologies_in_alignment_rev = [[el.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() for el in ont] for ont in ontologies_in_alignment]

# f = open("Input/data_freizeit.pkl", "wb")
# pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts], f)
# f = open("data_aml_uniqpath.pkl", "wb")
# pickle.dump([data, aml_data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts_pathpadded, ontologies_in_alignment], f)
# # # # # neighbours_dicts

/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/yahoo.small.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/google.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/dmoz.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/web.owl


In [69]:
max_paths

1

In [68]:
def path_to_root(elem, ont_mappings, curr = None, rootpath = None):
    """Enumerate every path from ``elem`` up to a root of ``ont_mappings``.

    ``ont_mappings`` maps a node to the list of its parents; a node that is
    missing or has no parents ends a path. ``curr`` is the path walked so far
    (``elem`` is appended to it) and ``rootpath`` collects the finished paths.

    Returns ``rootpath``: a list of paths, each a list starting at the first
    element of ``curr`` and ending at a root.
    """
    # Fresh lists per call: mutable default arguments would be shared between
    # calls and leak paths from one call into the next.
    if curr is None:
        curr = []
    if rootpath is None:
        rootpath = []
    curr.append(elem)
    if elem not in ont_mappings or not ont_mappings[elem]:
        rootpath.append(curr)
        return rootpath
    for node in ont_mappings[elem]:
        # Each parent extends its own copy of the path so sibling branches stay independent.
        path_to_root(node, ont_mappings, list(curr), rootpath)
    return rootpath

def get_one_hop_neighbours(ont, K=1):
    """Build the typed neighbourhood of every entity in one ontology file.

    Args:
        ont: path of the ontology file; passed to ``Ontology`` and also used
            to derive the name prefix (file name without its last extension,
            remaining dots replaced by underscores, lower-cased).
        K: unused; kept so existing callers keep working.

    Returns:
        dict mapping ``"<ont_name>#<entity>"`` to a list of four neighbour
        types, each a list of paths (lists of prefixed entity names):

        * index 0 - one path per result of ``path_to_root`` for the entity
          (empty when the entity is not in ``parents_dict``);
        * index 1 - a single path holding every ``e1`` of a ``"Subclass"``
          triple whose ``e2`` is the entity (presumably its direct
          subclasses - confirm against ``Ontology.get_triples``);
        * index 2 - a single path with all "Object Property" neighbours;
        * index 3 - a single path with all "Datatype Property" neighbours.
    """
    ont_obj = Ontology(ont)
    # Triples are unpacked as (entity1, entity2, property, kind).
    triples = ont_obj.get_triples(rootpath=True)
    entity_names = set(flatten([(e1, e2) for (e1, e2, _p, _d) in triples]))
    neighbours_dict = {name: [[] for _ in range(4)] for name in entity_names}
    print (ont)
    for (e1, e2, p, d) in triples:
        if e1 == e2:
            # Self-loops carry no neighbourhood information.
            continue
        if d == "Object Property":
            neighbours_dict[e1][2].append(e2)
            neighbours_dict[e2][2].append(e1)
        elif d == "Datatype Property":
            neighbours_dict[e1][3].append(e2)
            neighbours_dict[e2][3].append(e1)
        elif d == "Subclass":
            neighbours_dict[e2][1].append(e1)
        else:
            print ("Error wrong value of d: ", d)

    parents = ont_obj.parents_dict
    root_paths = {elem: path_to_root(elem, parents, [], []) for elem in parents}
    prefix = ont.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() + "#"

    result = {}
    for entity, (_, children, object_nbrs, datatype_nbrs) in neighbours_dict.items():
        paths_by_type = [
            # `or []` also covers a None result from path_to_root, which would
            # otherwise make a len() check on it raise TypeError.
            root_paths.get(entity) or [],
            [children],
            [object_nbrs],
            [datatype_nbrs],
        ]
        result[prefix + entity] = [[[prefix + node for node in path] for path in nbr_type]
                                   for nbr_type in paths_by_type]
    return result

# neighbours_dicts = {ont.split("/")[-1].split(".")[0]: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
# Merge the per-ontology neighbourhoods into a single lookup table.
neighbours_dicts = {}
for ont in list(set(flatten(ontologies_in_alignment))):
    neighbours_dicts.update(get_one_hop_neighbours(ont))

# Largest number of non-empty neighbour types found on any entity.
max_types = np.max([sum(1 for nbr_type in elem if nbr_type)
                    for elem in neighbours_dicts.values()])
# Largest number of paths inside any single neighbour type.
max_paths = np.max([[len(nbr_type) for nbr_type in elem]
                    for elem in neighbours_dicts.values()])
# Longest individual path over all entities and neighbour types.
max_pathlen = np.max([len(path)
                      for elem in neighbours_dicts.values()
                      for nbr_type in elem
                      for path in nbr_type], axis=0)

# max_paths = np.max([[len(nbr_type) for nbr_type in elem] for elem in neighbours_dicts.values()])
# max_pathlen = np.max(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in neighbours_dicts.values()]), axis=0)
# neighbours_dicts_lenpadded = {elem: [[path + ["<UNK>" for i in range(max_pathlen -len(path))] for path in nbr_type]
#                                 for nbr_type in neighbours_dicts[elem]] for elem in neighbours_dicts}
# neighbours_dicts_pathpadded = {elem: [nbr_type + [["<UNK>" for j in range(max_pathlen)] for i in range(max_paths - len(nbr_type))]
#                                 for k,nbr_type in enumerate(neighbours_dicts_lenpadded[elem])] for elem in neighbours_dicts_lenpadded}
# neighbours_dicts_pathpadded = {elem: np.array(neighbours_dicts_pathpadded[elem]) for elem in neighbours_dicts_pathpadded}
# data_items = np.array(list(data.items()))

# data_shuffled_t = [elem for elem in data_items if elem[1]]
# data_shuffled_f = [elem for elem in data_items if not elem[1]]
# np.random.shuffle(data_items)
# data_shuffled_f = data_shuffled_f[:150000-len(data_shuffled_t)]

# indices = np.random.permutation(len(data_shuffled_t) + len(data_shuffled_f[:130000-len(data_shuffled_t)]))

# data_shuffled = data_shuffled_t + data_shuffled_f
# indices = np.random.permutation(len(data_shuffled))

# indices = np.random.permutation(len(data_shuffled))

# data = OrderedDict(data_items)

# ontologies_in_alignment_rev = [[el.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() for el in ont] for ont in ontologies_in_alignment]

# f = open("Input/data_freizeit_bagofnbrs.pkl", "wb")
# pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts], f)
# f = open("data_aml_uniqpath.pkl", "wb")
# pickle.dump([data, aml_data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts_pathpadded, ontologies_in_alignment], f)
# # # # # neighbours_dicts

/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/yahoo.small.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/google.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/dmoz.owl
/data/Vivek/IBM/VeeAlign/datasets/web-directory/ontologies/web.owl


In [77]:
# Combine the first element of the freizeit pickle with the full webdir pickle
# and store both in one file. Context managers make sure every handle is
# closed (and the written file flushed) before the cell finishes.
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open("Input/data_freizeit.pkl", "rb") as fh:
    leb_data = pickle.load(fh)[0]
with open("Input/data_webdir.pkl", "rb") as fh:
    data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts = pickle.load(fh)
with open("Input/data_fre_webdir_bagofnbrs.pkl", "wb") as fh:
    pickle.dump([leb_data, data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts], fh)

In [19]:
# Reload the webdir/leb dataset, discard its stored neighbourhoods and save it
# again with the `neighbours_dicts` currently held in the kernel.
# Context managers guarantee the handles are closed and the output is flushed.
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open("../Input/data_webdir_leb.pkl", "rb") as fh:
    leb_data, data, emb_indexer, emb_indexer_inv, emb_vals, _ = pickle.load(fh)
with open("../Input/data_webdir_leb_bagofnbrs.pkl", "wb") as fh:
    pickle.dump([leb_data, data, emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts], fh)
# pickle.dump([pickle.load(open("Input/data_freizeit.pkl", "rb"))[0],
#              pickle.load(open("../Input/data_webdir.pkl", "rb"))[0],
#              emb_indexer, emb_indexer_inv, emb_vals, neighbours_dicts], f)

In [59]:
neighbours_dicts

{'yahoo_small#Firmen_Sport_Motorsport': [[['yahoo_small#Firmen_Sport_Motorsport',
    'yahoo_small#Firmen_Sport',
    'yahoo_small#Firmen']],
  [['yahoo_small#Firmen_Sport_Motorsport_Fanartikel'],
   ['yahoo_small#Firmen_Sport_Motorsport_Kartsport']],
  [],
  []],
 'yahoo_small#Firmen_Gesundheit_Alternativmedizin_Atemarbeit_Bildung-und-Ausbildung': [[['yahoo_small#Firmen_Gesundheit_Alternativmedizin_Atemarbeit_Bildung-und-Ausbildung',
    'yahoo_small#Firmen_Gesundheit_Alternativmedizin_Atemarbeit',
    'yahoo_small#Firmen_Gesundheit_Alternativmedizin',
    'yahoo_small#Firmen_Gesundheit',
    'yahoo_small#Firmen']],
  [],
  [],
  []],
 'yahoo_small#Firmen_Sport_Radsport_Mountain-Biking': [[['yahoo_small#Firmen_Sport_Radsport_Mountain-Biking',
    'yahoo_small#Firmen_Sport_Radsport',
    'yahoo_small#Firmen_Sport',
    'yahoo_small#Firmen']],
  [['yahoo_small#Firmen_Sport_Radsport_Mountain-Biking_Komponenten'],
   ['yahoo_small#Firmen_Sport_Radsport_Mountain-Biking_Hersteller']],
  [],

In [62]:
np.max([len([nbr_type for nbr_type in elem if flatten(nbr_type)]) for elem in neighbours_dicts.values()])

2

In [27]:
# max_paths
sorted(Counter(flatten([[len(nbr_type) for nbr_type in elem] for elem in pickle.load(open("../Input/data_webdir_fre.pkl", "rb"))[-1].values()])).items())


[(0, 8658),
 (1, 3486),
 (2, 148),
 (3, 99),
 (4, 69),
 (5, 33),
 (6, 32),
 (7, 37),
 (8, 12),
 (9, 11),
 (10, 6),
 (11, 6),
 (12, 8),
 (13, 7),
 (15, 4),
 (16, 3),
 (17, 3),
 (18, 1),
 (19, 2),
 (20, 4),
 (21, 5),
 (22, 3),
 (23, 1),
 (24, 2),
 (26, 3),
 (38, 1),
 (41, 1),
 (42, 1),
 (48, 1),
 (49, 1)]

In [28]:
sorted(Counter(flatten([flatten([[len(path) for path in nbr_type] for nbr_type in elem]) for elem in pickle.load(open("../Input/data_webdir_fre.pkl", "rb"))[-1].values()])).items())

[(1, 3156),
 (2, 25),
 (3, 426),
 (4, 1243),
 (5, 1054),
 (6, 329),
 (7, 53),
 (8, 26)]

In [None]:
# Scratch cell: prototype of selecting, per neighbour type, the path that best
# matches a node and weighting its positions, run on random indices.
# Embedding width is the second dimension of the embedding matrix.
embedding_dim = np.array(emb_vals).shape[1]
        
# Embedding table initialised from emb_vals and frozen (no gradient updates).
name_embedding = nn.Embedding(len(emb_vals), embedding_dim)
name_embedding.load_state_dict({'weight': torch.from_numpy(np.array(emb_vals))})
name_embedding.weight.requires_grad = False

# Created here but not used in the rest of this cell.
cosine_sim_layer = nn.CosineSimilarity(dim=1)
output = nn.Linear(2*embedding_dim, 300)
# Path length is read from axis 3 of the stacked, padded neighbour arrays.
max_pathlen = np.array(list(neighbours_dicts_pathpadded.values())).shape[3]
# Learnable per-position weights, initialised uniformly to 1/max_pathlen.
v = nn.Parameter(torch.DoubleTensor([1/(max_pathlen) for i in range(max_pathlen)]))

# Random embedding indices standing in for a batch of 10 entity pairs.
# NOTE(review): 22 paths and path length 6 are hard-coded here while
# max_pathlen comes from the data; the gather below presumably needs
# max_pathlen <= 6 - confirm.
nodes = torch.randint(0, len(emb_vals), size=(10,2))
features= torch.randint(0, len(emb_vals), size=(10,2,4,22,6))

nodes = nodes.permute(1,0) # 2 * batch_size
features = features.permute(1,0,2,3,4) # 2 * batch_size * 4 * max_paths * max_pathlen
for i in range(2):
    node_emb = name_embedding(nodes[i]) # batch_size * embedding_dim
    feature_emb = name_embedding(features[i]) #  batch_size * 4 * max_paths * max_pathlen * embedding_dim

    # Score of each path: elementwise product with the node embedding, summed
    # over the embedding axis and then over the path positions
    # -> batch_size * 4 * max_paths.
    path_weights = torch.sum(torch.sum(node_emb[:, None, None, None, :] * feature_emb, dim=-1), dim=-1)
    # Index of the highest-scoring path per neighbour type, with three
    # trailing singleton axes appended so it can be expanded for gather.
    best_path_indices = torch.max(path_weights, dim=2)[1][(..., ) + (None, ) * 3]
    best_path_indices = best_path_indices.expand(-1, -1, -1, max_pathlen,  embedding_dim)
    best_path = torch.gather(feature_emb, 2, best_path_indices).squeeze(2) # batch_size * 4 * max_pathlen * embedding_dim
    # Another way: 
    # path_weights = masked_softmax(path_weights)
    # best_path = torch.sum(path_weights.unsqueeze(-1) * feature_emb, dim=2)

    # Dot product of the node with every position of the chosen path
    # -> batch_size * 4 * max_pathlen * 1.
    node_weights = torch.sum(node_emb[:, None, None, :] * best_path, dim=-1).unsqueeze(-1)
    # Combine the positions with weights v; the result is not stored.
    torch.matmul(v, node_weights * best_path)

In [None]:
def masked_softmax(inp):
    """Softmax over the last axis that pushes exact-zero entries towards 0.

    The input is cast to double; every entry equal to 0 gets -9999 added
    before the softmax (a finite stand-in for -inf), all other entries are
    left untouched.
    """
    scores = inp.double()
    penalty = torch.where(scores == 0,
                          torch.full_like(scores, -9999.0),
                          torch.zeros_like(scores))
    return torch.softmax(scores + penalty, dim=-1)
a = torch.randn((10,4,22,6,512))
b = torch.randn((10,4,22,6,512))

attended_path = torch.bmm(a.reshape(-1, 1, 512), b.reshape(-1, 512, 1))
attended_path = attended_path.reshape(-1, 4, 22, 6)
path_weights = masked_softmax(torch.sum(attended_path, dim=-1))
path_weights.shape
# best_path = torch.sum(path_weights[:, :, :, None, None] * a, dim=2)

In [None]:
indices = d[(..., ) + (None, ) * 3].expand(-1, -1, -1, 6, 512)
e = torch.gather(b, 2, indices).squeeze(2)

f = torch.sum(a[:,None,None,:] *e,dim=-1).unsqueeze(-1)

g = (f*e)

h = torch.sum((v[None,None,:,None] * g), dim=2)

In [None]:
c = torch.randn((10,4,22,6,512))
b = torch.randn((10,4,1,6,512))
d = b * c


In [None]:
e = torch.bmm(c.permute(0,1,3,4,2).reshape(-1, 22, 1), b.permute(0,1,3,4,2).reshape(-1, 1, 1)).squeeze(-1).reshape(-1,4,6,512,22).permute(0,1,4,2,3)


In [None]:
fo

In [24]:
len(data)

3318178

In [None]:
ctimes, dtimes = [], []
for i in range(100):
    a = torch.randn((10,4,22,6,512))
    b = torch.randn((10,4,1,6,512))
    t = time.time()
    c = a * b
    ctimes.append(time.time()-t)
    t = time.time()
    d = torch.bmm(c.permute(0,1,3,4,2).reshape(-1, 22, 1), b.permute(0,1,3,4,2).reshape(-1, 1, 1)).squeeze(-1).reshape(-1,4,6,512,22).permute(0,1,4,2,3)
    dtimes.append(time.time()-t)

In [None]:
a = torch.randn((10,4,22,6,512))
b = torch.randn((10,4,22,6,512))
c = a * torch.sum(b,dim=2).unsqueeze(2)

In [23]:
import shutil

# Run the LogMap matcher on each ontology pair and read back its mappings.
arr = [('german_datasets_copy/lebensmittel/Google.Lebensmittel.owl',
  'german_datasets_copy/lebensmittel/web.Lebensmittel.owl')]
for ont_pair in arr:
    a, b = ont_pair[0], ont_pair[1]
    # Output directory name: "<ont1>-<ont2>", each being the lower-cased file
    # name without its last extension and with remaining dots as underscores.
    c = "-".join(path.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() for path in (a, b))
    # Start from an empty output directory (pure Python instead of `!rm -rf`,
    # so no shell interpolation of the directory name takes place).
    shutil.rmtree(c, ignore_errors=True)
    os.mkdir(c)
    # Argument list instead of a whitespace-split string, so paths containing
    # spaces survive. LogMap writes into the directory created above, which is
    # also where the mappings are read from below.
    java_command = ["java", "-jar", "logmap-matcher/target/logmap-matcher-4.0.jar", "MATCHER",
                    "file:" + os.path.abspath(a),
                    "file:" + os.path.abspath(b),
                    os.path.abspath(c) + "/",
                    "false"]
    process = subprocess.Popen(java_command, stdout=subprocess.PIPE)
    output, error = process.communicate()

# Parse the TSV written for the last pair: drop the final column of each row,
# keep only the last "/"-segment of every cell, skip rows starting "Optional".
with open(os.path.abspath(c + "/") + "/logmap2_mappings.tsv", "r") as fh:
    pred_logmap = [[el.split("/")[-1] for el in l.split("\t")[:-1]]
                   for l in fh.read().split("\n")[:-1] if not l.startswith("Optional")]


KeyboardInterrupt: 

In [None]:
# Normalise each predicted entity to "<ontology>#<entity>": the part before
# "#" loses ".v2" and its last extension, gets remaining dots replaced by
# underscores and is lower-cased; the part after "#" is kept unchanged.
gt_mappings = [
    tuple(
        el.split("#")[0].replace(".v2", "").rsplit(".",1)[0].replace(".", "_").lower() + "#" + el.split("#")[1]
        for el in elem
    )
    for elem in pred_logmap
]


In [None]:
# Label every candidate pair of the lebensmittel dataset with whether the
# mappings in `gt_mappings` contain it.
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open("../Input/data_lebensmittel.pkl", "rb") as fh:
    data_orig = pickle.load(fh)[0]

# Every pair starts out as "not predicted".
data_logmap = dict.fromkeys(data_orig, False)
s = set(data_logmap)
gt_mappings = [tuple(pair) for pair in gt_mappings]
for mapping in gt_mappings:
    if mapping not in s:
        # Retry with ",-" replaced by "_" in both entity names.
        mapping = tuple([el.replace(",-", "_") for el in mapping])
    if mapping in s:
        data_logmap[mapping] = True
    else:
        # Still unknown after the retry: report it.
        print (mapping)

In [None]:
all_metrics = []
def return_test_data(data, i):
    """Return the i-th fifth of ``data`` as a test split.

    Truthy-labelled and falsy-labelled keys are sliced separately, each
    between ``int(0.2*i*len)`` and ``int((0.2*i + 0.2)*len)``, so both groups
    contribute their i-th fifth.

    Args:
        data: mapping ``key -> label``.
        i: fold index.

    Returns:
        dict with the selected truthy keys mapped to ``True`` followed by the
        selected falsy keys mapped to ``False``.
    """
    positives = [key for key, label in data.items() if label]
    negatives = [key for key, label in data.items() if not label]

    def fold(keys):
        # Same float arithmetic for both groups.
        start = int((0.2*i)*len(keys))
        stop = int((0.2*i + 0.2)*len(keys))
        return keys[start:stop]

    test_data = dict.fromkeys(fold(positives), True)
    test_data.update(dict.fromkeys(fold(negatives), False))
    return test_data

# Score the LogMap labels against the reference labels on each of 5 folds.
for i in range(5):
    test_gt = return_test_data(data_orig, i)
    test_logmap = {elem: data_logmap[elem] for elem in test_gt}

    # Confusion counts over the pairs of this fold.
    tp = sum(1 for elem, truth in test_gt.items() if truth and test_logmap[elem])
    fp = sum(1 for elem, predicted in test_logmap.items() if not test_gt[elem] and predicted)
    fn = sum(1 for elem, predicted in test_logmap.items() if test_gt[elem] and not predicted)

    try:
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1score = 2 * precision * recall / (precision + recall)
        f2score = 5 * precision * recall / (4 * precision + recall)
        f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    except Exception as e:
        # A failing fold (e.g. a zero denominator) is reported and skipped.
        print (e)
    else:
        all_metrics.append((precision, recall, f1score, f2score, f0_5score))
    

In [None]:
np.mean(all_metrics, axis=0)

In [None]:
# AML test
def is_test(test_onto, key):
    """Tell whether the ontology pair of ``key`` is part of ``test_onto``.

    The text before the first "#" of each element of ``key`` is collected
    into a tuple, which is then looked up in ``test_onto``.
    """
    ontology_pair = tuple(el.partition("#")[0] for el in key)
    return ontology_pair in test_onto

results = []

# Evaluate in groups of three ontology pairs: run the matcher on each pair of
# the group, load the alignments it produced and score them against `data`.
# NOTE(review): range() needs an integer, so the length of
# ontologies_in_alignment is used; the slices are taken from all_ont_pairs -
# confirm that both lists are meant to have the same length.
for i in list(range(0, len(ontologies_in_alignment), 3)):
    test_onto = all_ont_pairs[i:i+3]
    for ont_pair in test_onto:
        print (ont_pair)
        a, b, c = ont_pair[0], ont_pair[1], ont_pair[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() + "-" + ont_pair[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
        # Parenthesised so the command really spans both lines.
        java_command = ("java -jar logmap-matcher/target/logmap-matcher-4.0.jar MATCHER file:" +  os.path.abspath(a) +
                        " file:" + os.path.abspath(b) + " " + "/data/Vivek/IBM/IBM-Internship/" + c + "/ false")
        process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
    print (os.listdir("AML-test-results/"))
    pred_aml = load_alignments("AML-test-results/")
    pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
    # Built once per group instead of once per ground-truth mapping.
    pred_aml_set = set(pred_aml)
    tp = len([elem for elem in pred_aml if data[elem]])
    fn = len([key for key in gt_mappings if key not in pred_aml_set and is_test(test_onto, key)])
    fp = len([elem for elem in pred_aml if not data[elem]])

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
    # Clear the result directory so the next group starts from scratch.
    _ = [os.remove(f) for f in glob.glob('AML-test-results/*')]
    
print ("Final Results:", np.mean(results, axis=0))

In [None]:
ontologies_in_alignment = [[el.split("/")[1].split(".")[0] for el in ont] for ont in ontologies_in_alignment][:-1] + [["human", "mouse"]]

In [75]:
output = """Output_final_13_1_webdir.txt:Final Results: [0.66664399 0.74498814 0.69136694 0.71957614 0.67328138]
Output_final_13_1_webdir_weighted.txt:Final Results: [0.61923369 0.76637851 0.67834514 0.7260592  0.64057194]
Output_final_13_3_webdir.txt:Final Results: [0.69178365 0.71548658 0.69442704 0.70488248 0.69045627]
Output_final_13_3_webdir_weighted.txt:Final Results: [0.70111349 0.75680556 0.71981218 0.73942017 0.70668741]
Output_final_1_1_webdir.txt:Final Results: [0.71539116 0.72501039 0.71067633 0.71704871 0.71102774]
Output_final_1_1_webdir_bon.txt:Final Results: [0.65799171 0.75885972 0.6958328  0.72958184 0.67083676]
Output_final_1_1_webdir_weighted.txt:Final Results: [0.68192072 0.75885972 0.70611614 0.73386037 0.68837317]
Output_final_1_1_webdir_weighted_bon.txt:Final Results: [0.69260204 0.67363954 0.67010887 0.66968792 0.67949354]
Output_final_1_3_webdir.txt:Final Results: [0.73954041 0.70366916 0.71031395 0.70440561 0.72399055]
Output_final_1_3_webdir_bon.txt:Final Results: [0.70432329 0.73252918 0.70500715 0.71794664 0.70149352]
Output_final_1_3_webdir_weighted.txt:Final Results: [0.63646319 0.73341375 0.67481788 0.70700188 0.64977715]
Output_final_1_3_webdir_weighted_bon.txt:Final Results: [0.7488171  0.71749159 0.72552426 0.719112   0.73732727]
Output_final_1_4_webdir.txt:Final Results: [0.67336301 0.74004798 0.69548654 0.71951622 0.67939369]
Output_final_1_4_webdir_bon.txt:Final Results: [0.6704195  0.74845134 0.69555314 0.7235984  0.67727037]
Output_final_1_4_webdir_weighted.txt:Final Results: [0.68965328 0.75885972 0.70612997 0.7318908  0.6927182 ]
Output_final_1_4_webdir_weighted_bon.txt:Final Results: [0.71560108 0.72794553 0.71457211 0.72109892 0.71312993]
Output_final_1_5_webdir.txt:Final Results: [0.69810348 0.72501039 0.7016606  0.71327073 0.6970218 ]
Output_final_1_5_webdir_bon.txt:Final Results: [0.74569268 0.74098169 0.73020264 0.73414531 0.73532877]
Output_final_1_5_webdir_weighted.txt:Final Results: [0.65795473 0.7429867  0.68598468 0.71588604 0.66614149]
Output_final_1_5_webdir_weighted_bon.txt:Final Results: [0.67382226 0.75045635 0.69791373 0.72560499 0.6802158 ]
Output_final_1_6_webdir.txt:Final Results: [0.71017638 0.67845907 0.6869898  0.68042753 0.69873592]
Output_final_1_6_webdir_bon.txt:Final Results: [0.66528696 0.75885972 0.70077043 0.73213035 0.67733433]
Output_final_1_6_webdir_weighted.txt:Final Results: [0.66545145 0.75885972 0.69591438 0.72905709 0.67411696]
Output_final_1_8_webdir.txt:Final Results: [0.73303028 0.72300537 0.72027802 0.72029176 0.72573344]
Output_final_1_8_webdir_weighted.txt:Final Results: [0.70687943 0.74093255 0.71530383 0.72877063 0.70791838]
Output_final_21_1_webdir.txt:Final Results: [0.73162433 0.73541876 0.72495837 0.72928237 0.72667262]
Output_final_21_1_webdir_weighted.txt:Final Results: [0.71563702 0.7429867  0.71093105 0.72542717 0.70896494]
Output_final_2_1_webdir.txt:Final Results: [0.63841476 0.75797515 0.68341049 0.72342256 0.65370972]
Output_final_2_1_webdir_weighted.txt:Final Results: [0.66433378 0.73634889 0.69161519 0.71603179 0.67340157]
Output_final_2_3_webdir.txt:Final Results: [0.71807996 0.72794553 0.71437478 0.72049388 0.71433186]
Output_final_2_3_webdir_weighted.txt:Final Results: [0.70309715 0.71202337 0.70642841 0.70950835 0.70416188]
Output_final_2_4_webdir.txt:Final Results: [0.71061499 0.71290794 0.70367273 0.70739367 0.7056517 ]
Output_final_2_4_webdir_weighted.txt:Final Results: [0.64613196 0.75045635 0.68578223 0.72074163 0.65960965]
Output_final_2_5_webdir.txt:Final Results: [0.67780457 0.75045635 0.70085353 0.72713465 0.68391962]
Output_final_2_5_webdir_weighted.txt:Final Results: [0.75565471 0.71572246 0.72736789 0.71875671 0.74186132]
Output_final_2_6_webdir.txt:Final Results: [0.68542019 0.70820366 0.69229283 0.70076055 0.68705838]
Output_final_2_6_webdir_weighted.txt:Final Results: [0.717723   0.71954217 0.71048535 0.71410422 0.71261312]
Output_final_2_8_webdir.txt:Final Results: [0.75237837 0.67910775 0.70398784 0.68710826 0.72897029]
Output_final_2_8_webdir_weighted.txt:Final Results: [0.70272109 0.70073401 0.6870482  0.69224326 0.69200491]
Output_final_3_1_webdir.txt:Final Results: [0.6189404  0.78313609 0.68306081 0.73680162 0.64167934]
Output_final_3_1_webdir_weighted.txt:Final Results: [0.74344207 0.67564097 0.70267421 0.68518956 0.72500608]
Output_final_3_3_webdir.txt:Final Results: [0.72221488 0.71837258 0.71484529 0.71566915 0.71791653]
Output_final_3_3_webdir_weighted.txt:Final Results: [0.63692078 0.75134092 0.67998589 0.71824124 0.65157935]
Output_final_3_4_webdir.txt:Final Results: [0.59023045 0.77478187 0.66276982 0.722844   0.61619681]
Output_final_3_4_webdir_weighted.txt:Final Results: [0.73103741 0.74093255 0.72688032 0.73324825 0.72694816]
Output_final_3_5_webdir.txt:Final Results: [0.68603148 0.67845907 0.67148393 0.67332386 0.67724053]
Output_final_3_5_webdir_weighted.txt:Final Results: [0.69571813 0.70450458 0.69188145 0.69759884 0.69196549]
Output_final_3_6_webdir.txt:Final Results: [0.61420104 0.76726308 0.6779114  0.72733855 0.63746367]
Output_final_3_6_webdir_weighted.txt:Final Results: [0.70598594 0.72066262 0.70204567 0.71039786 0.70163117]
Output_final_3_8_webdir.txt:Final Results: [0.73175158 0.71207252 0.70916067 0.70855368 0.71857465]
Output_final_3_8_webdir_weighted.txt:Final Results: [0.68862849 0.73252918 0.69944156 0.7165458  0.6901991 ]
Output_final_4_1_webdir.txt:Final Results: [0.71850264 0.71660703 0.71173094 0.71323039 0.71438695]
Output_final_4_1_webdir_weighted.txt:Final Results: [0.69665006 0.72995054 0.70159665 0.71556503 0.69584966]
Output_final_4_3_webdir.txt:Final Results: [0.64467729 0.75885972 0.68674035 0.72511507 0.6588176 ]
Output_final_4_3_webdir_weighted.txt:Final Results: [0.73480469 0.72501039 0.72351671 0.72316387 0.72837805]
Output_final_4_4_webdir.txt:Final Results: [0.6756888  0.72042674 0.69149201 0.70697323 0.68064441]
Output_final_4_4_webdir_weighted.txt:Final Results: [0.73456919 0.6912102  0.70038558 0.69267264 0.71673411]
Output_final_4_5_webdir.txt:Final Results: [0.73503767 0.7073191  0.71135558 0.70686413 0.72280126]
Output_final_4_5_webdir_weighted.txt:Final Results: [0.74351447 0.70708322 0.71668835 0.70913438 0.73033409]
Output_final_4_6_webdir.txt:Final Results: [0.7656637  0.70820366 0.7267697  0.71366071 0.74705547]
Output_final_4_6_webdir_weighted.txt:Final Results: [0.69524973 0.75974428 0.71432935 0.73799134 0.69987743]
Output_final_4_8_webdir.txt:Final Results: [0.71765041 0.69233065 0.69366223 0.69094085 0.70418173]
Output_final_6_1_webdir.txt:Final Results: [0.72800969 0.74093255 0.72633862 0.73333372 0.7250932 ]
Output_final_6_1_webdir_weighted.txt:Final Results: [0.7303179  0.71202337 0.71217147 0.71014687 0.72056062]
Output_final_6_3_webdir.txt:Final Results: [0.67771227 0.74181711 0.6983708  0.72163184 0.68315491]
Output_final_6_3_webdir_weighted.txt:Final Results: [0.73299279 0.71954217 0.71917943 0.71795997 0.72540357]
Output_final_6_4_webdir.txt:Final Results: [0.71960548 0.701615   0.70296877 0.70056183 0.71077577]
Output_final_6_4_webdir_weighted.txt:Final Results: [0.65101245 0.75134092 0.68840451 0.72213331 0.66360067]
Output_final_6_5_webdir.txt:Final Results: [0.71992372 0.69615036 0.69667559 0.69439272 0.70677614]
Output_final_6_5_webdir_weighted.txt:Final Results: [0.7075706  0.71754073 0.69683545 0.70577788 0.69873427]
Output_final_6_6_webdir_weighted.txt:Final Results: [0.68916592 0.71866118 0.68740805 0.70246463 0.68376454]
Output_final_9_1_webdir.txt:Final Results: [0.67798164 0.75885972 0.70769016 0.73499079 0.68786924]
Output_final_9_1_webdir_weighted.txt:Final Results: [0.62736455 0.74293756 0.67302271 0.71090772 0.64357965]
Output_final_9_3_webdir.txt:Final Results: [0.70251177 0.68280684 0.67622971 0.67681753 0.68703867]
Output_final_9_3_webdir_weighted.txt:Final Results: [0.72759681 0.71572246 0.71031673 0.71098705 0.71766385]
Output_final_9_4_webdir.txt:Final Results: [0.74775692 0.72501039 0.72945717 0.72540365 0.73840191]
Output_final_9_4_webdir_weighted.txt:Final Results: [0.73259419 0.67733862 0.6935813  0.68195576 0.71304527

"""

sorted([float(l.split(":")[2].strip().split()[2]) if l.startswith("Output") else 0 for l in output.split("\n")])

[0,
 0,
 0.66276982,
 0.67010887,
 0.67148393,
 0.67302271,
 0.67481788,
 0.67622971,
 0.6779114,
 0.67834514,
 0.67998589,
 0.68306081,
 0.68341049,
 0.68578223,
 0.68598468,
 0.68674035,
 0.6869898,
 0.6870482,
 0.68740805,
 0.68840451,
 0.69136694,
 0.69149201,
 0.69161519,
 0.69188145,
 0.69229283,
 0.6935813,
 0.69366223,
 0.69442704,
 0.69548654,
 0.69555314,
 0.6958328,
 0.69591438,
 0.69667559,
 0.69683545,
 0.69791373,
 0.6983708,
 0.69944156,
 0.70038558,
 0.70077043,
 0.70085353,
 0.70159665,
 0.7016606,
 0.70204567,
 0.70267421,
 0.70296877,
 0.70367273,
 0.70398784,
 0.70500715,
 0.70611614,
 0.70612997,
 0.70642841,
 0.70769016,
 0.70916067,
 0.71031395,
 0.71031673,
 0.71048535,
 0.71067633,
 0.71093105,
 0.71135558,
 0.71173094,
 0.71217147,
 0.71432935,
 0.71437478,
 0.71457211,
 0.71484529,
 0.71530383,
 0.71668835,
 0.71917943,
 0.71981218,
 0.72027802,
 0.72351671,
 0.72495837,
 0.72552426,
 0.72633862,
 0.7267697,
 0.72688032,
 0.72736789,
 0.72945717,
 0.73020264]

In [None]:
# Persist the current dataset. The context manager closes the file, which
# guarantees the pickle is completely flushed to disk.
with open("data_unhas.pkl", "wb") as f:
    pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings, neighbours_dicts, ontologies_in_alignment], f)


In [None]:
def count_non_unk(elem):
    """Count the items of ``elem`` that differ from the "<UNK>" placeholder."""
    return sum(1 for item in elem if item != "<UNK>")
neighbours_dicts = {ont: {el: neighbours_dicts[ont][el][:int(sys.argv[1])] for el in neighbours_dicts[ont]
       if count_non_unk(neighbours_dicts[ont][el]) > int(sys.argv[2])} for ont in neighbours_dicts}

In [7]:
[1/3 for i in range(3)]

[0.3333333333333333, 0.3333333333333333, 0.3333333333333333]

In [64]:
max_paths

1

In [None]:
import requests

# Spell-check concept names through the RapidAPI "spellcheck" service.
url = "https://montanaflynn-spellcheck.p.rapidapi.com/check/"

headers = {
    'x-rapidapi-host': "montanaflynn-spellcheck.p.rapidapi.com",
    # Credentials must not live in the notebook: the key is read from the
    # RAPIDAPI_KEY environment variable (KeyError if it is not set).
    'x-rapidapi-key': os.environ["RAPIDAPI_KEY"]
    }

# Resumes an earlier run: results are appended to the existing
# `inp_spellchecked` list, starting at concept 731.
# inp_spellchecked = []
for concept in inp[731:]:
    querystring = {"text": concept}
    response = requests.request("GET", url, headers=headers, params=querystring).json()
    if response["suggestion"] != concept:
        resolved = str(concept)
        # Concepts containing a run of two or more capitals are left
        # untouched. The check depends only on the concept, so it is done
        # once rather than for every correction.
        if not re.search("[A-Z][A-Z]+", concept):
            for word in response["corrections"]:
                # Replace each flagged word with its first suggested correction.
                resolved = resolved.replace(word, response["corrections"][word][0])

        inp_spellchecked.append(resolved)
        print (concept, resolved)
    else:
        inp_spellchecked.append(concept)




In [None]:
querystring = {"text": "technically Organised By"}
response = requests.request("GET", url, headers=headers, params=querystring)
response.json()

In [None]:
inp_spellchecked[730], inp[731]

In [None]:
fn_spellchecked, fp_spellchecked = [dict(el) for el in pickle.load(open("test_v2.pkl", "rb"))]
fn_baseline, fp_baseline = [dict(el) for el in pickle.load(open("test_best.pkl", "rb"))]
fn_unhas, fp_unhas = [dict(el) for el in pickle.load(open("test_unhas.pkl", "rb"))]
fn_resolved, fp_resolved = [dict(el) for el in pickle.load(open("test_resolved.pkl", "rb"))]

# key -> one value per compared run; "N/A" marks runs without that key.
fn_dict, fp_dict = {}, {}
def create_comparison_file(file, idx):
    """Merge one run's false negatives/positives into the comparison tables.

    Args:
        file: path of a pickle holding two collections (false negatives, then
            false positives), each convertible with ``dict()``.
        idx: column (0-3) of the run in ``fn_dict`` / ``fp_dict``.

    Side effects:
        Updates the module-level ``fn_dict`` and ``fp_dict`` in place. Keys
        seen for the first time get a row of four "N/A" placeholders before
        column ``idx`` is filled in.
    """
    # NOTE: pickle.load executes arbitrary code; only use on trusted files.
    with open(file, "rb") as fh:
        fn, fp = [dict(el) for el in pickle.load(fh)]

    for table, run_values in ((fn_dict, fn), (fp_dict, fp)):
        for key, value in run_values.items():
            table.setdefault(key, ["N/A" for _ in range(4)])[idx] = value
    

# Column order of the comparison tables: baseline, unhas, spell-checked (v2), resolved.
create_comparison_file("test_best.pkl", 0)
create_comparison_file("test_unhas.pkl", 1)
create_comparison_file("test_v2.pkl", 2)
create_comparison_file("test_resolved.pkl", 3)

# Write one tab-separated line per key: the flattened (key, per-run values)
# item of the table. `with` closes (and therefore flushes) each file, which
# the previous open(...).write(...) one-liners never did.
for _tsv_path, _table in (("fn - comparison.tsv", fn_dict), ("fp - comparison.tsv", fp_dict)):
    with open(_tsv_path, "w+") as _tsv_file:
        _tsv_file.write("\n".join(["\t".join([str(el) for el in flatten(row)]) for row in _table.items()]))

In [None]:
# Reload only the last element stored in data_path.pkl (the list of ontologies
# in the alignment). The context manager closes the file once it has been
# read; pickle.load(open(...)) left the handle open.
with open("data_path.pkl", "rb") as f:
    ontologies_in_alignment = pickle.load(f)[-1]
ontologies_in_alignment

In [None]:
# Toy example of how a (key pair -> values) table is flattened into rows of
# strings, as done when the comparison TSV files are written.
d = {
    ('confOf#Organization', 'sigkdd#Organizator'): (1,2,3,4),
    ('iasted#Document', 'sigkdd#Document'): (5,6,78,8),
}
[list(map(str, flatten(item))) for item in d.items()]

In [None]:
abbreviations_dict = {}
# acronym -> list of (fullform, rest of the acronym-side name, rest of the full-side name)
final_dict = {}


def _abbreviation_candidate(abbr_name, full_name):
    """Try to expand the acronym contained in ``abbr_name`` with words of ``full_name``.

    An acronym is the first run of two or more capital letters in
    ``abbr_name``. It matches when it occurs inside the string of upper-cased
    initials of the "_"-separated words of ``full_name``.

    Returns:
        None when ``abbr_name`` holds no acronym or the acronym does not occur
        in the initials; otherwise a tuple
        ``(acronym, initials, fullform, rest_abbr, rest_full)`` where
        ``fullform`` is the "_"-joined matched words, ``rest_abbr`` is the
        lower-cased remainder of ``abbr_name`` without the acronym and
        ``rest_full`` is the space-joined unmatched words of ``full_name``.
    """
    is_abb = re.search("[A-Z][A-Z]+", abbr_name)
    if not is_abb:
        return None
    acronym = is_abb.group()

    # Drop empty segments (leading, trailing or doubled underscores): taking
    # word[0] of an empty string raised IndexError, and skipping them keeps
    # positions in `initials` aligned with positions in `words`.
    words = [word for word in full_name.split("_") if word]
    initials = "".join([word[0].upper() for word in words])

    start = initials.find(acronym)
    if start == -1:
        return None
    # One initial per word, so character offsets are also word offsets.
    end = start + len(acronym)

    fullform = "_".join(words[start:end])
    rest_abbr = " ".join([el for el in abbr_name.replace(acronym, "").split("_") if el]).lower()
    rest_full = " ".join(words[:start] + words[end:])
    return acronym, initials, fullform, rest_abbr, rest_full


for mapping in all_mappings:
    # Keep only the entity name after the "#" of each side of the mapping.
    mapping = tuple([el.split("#")[1] for el in mapping])

    # The acronym may sit on either side of the mapping, so try both
    # directions with the same logic ("left": acronym in the first entity).
    for side, abbr_name, full_name in (("left", mapping[0], mapping[1]),
                                       ("right", mapping[1], mapping[0])):
        candidate = _abbreviation_candidate(abbr_name, full_name)
        if candidate is None:
            continue
        acronym, abbreviation, fullform, rest_first, rest_second = candidate
        print (side, mapping, abbreviation, fullform)
        final_dict.setdefault(acronym, []).append((fullform, rest_first, rest_second))

# Unique, non-empty context phrases (everything after the fullform in each
# candidate tuple of final_dict), mapped to the output of extractUSEEmbeddings.
keys = [
    phrase
    for phrase in set(
        text
        for candidates in final_dict.values()
        for candidate in candidates
        for text in candidate[1:]
    )
    if phrase
]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))

# For every acronym, score each candidate by cos_sim between the embeddings of
# its two context phrases (0 when either phrase is empty), drop duplicate
# tuples and rank the candidates from highest to lowest score.
scored_dict = {}
for abbr, candidates in final_dict.items():
    sim_list = []
    for tup in candidates:
        if tup[1] and tup[2]:
            similarity = cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])
        else:
            similarity = 0
        sim_list.append((tup[0], tup[1], tup[2], similarity))
    scored_dict[abbr] = sorted(set(sim_list), key=itemgetter(-1), reverse=True)

# Best-scoring candidate per acronym.
resolved_dict = {abbr: ranked[0] for abbr, ranked in scored_dict.items()}
# Keep only expansions whose score exceeds 0.9; underscores in the fullform
# become spaces.
filtered_dict = {
    abbr: best[0].replace("_", " ")
    for abbr, best in resolved_dict.items()
    if best[-1] > 0.9
}
# Substitute every retained acronym by its expansion in each concept label.
inp_resolved = []
for concept in inp:
    for key, expansion in filtered_dict.items():
        concept = concept.replace(key, expansion)
    inp_resolved.append(concept)
inp_resolved

In [None]:
# Same computation as in the abbreviation-resolution cell: unique, non-empty
# context phrases of final_dict mapped to the output of extractUSEEmbeddings.
keys = [
    phrase
    for phrase in set(
        text
        for candidates in final_dict.values()
        for candidate in candidates
        for text in candidate[1:]
    )
    if phrase
]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))


In [None]:
# Pass the two results that extractUSEEmbeddings returns for these two labels
# to cos_sim and display the score.
cos_sim(*extractUSEEmbeddings(["Conference Banquet", "Dinner Banquet"]))

In [None]:
# Display `fn`.
# NOTE(review): in the nearby cells `fn` is only assigned as a local inside
# create_comparison_file — confirm a module-level `fn` is defined elsewhere
# before relying on this cell after a kernel restart.
fn

In [None]:
# Same computation as in the abbreviation-resolution cell: score each candidate
# of every acronym by cos_sim between its two context-phrase embeddings (0 when
# either phrase is empty), drop duplicates and rank from highest to lowest.
scored_dict = {}
for abbr, candidates in final_dict.items():
    sim_list = []
    for tup in candidates:
        if tup[1] and tup[2]:
            similarity = cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])
        else:
            similarity = 0
        sim_list.append((tup[0], tup[1], tup[2], similarity))
    scored_dict[abbr] = sorted(set(sim_list), key=itemgetter(-1), reverse=True)


In [None]:
def _lowercase_non_acronyms(concept):
    """Lower-case every space-separated word of ``concept`` except acronyms.

    A word counts as an acronym when it contains two or more consecutive
    capital letters; such words are kept verbatim so that e.g. "PC Member"
    becomes "PC member".
    """
    return " ".join(
        # The acronym test is applied to the individual word. It used to be
        # applied to the whole concept, so a single acronym anywhere in the
        # label disabled lower-casing for all of its words.
        word if re.search("[A-Z][A-Z]+", word) else word.lower()
        for word in concept.split(" ")
    )


inp_case_handled = [_lowercase_non_acronyms(concept) for concept in inp]

inp_case_handled

In [None]:
# Parse the conference ontology and display its `triples` attribute, which
# Ontology.__init__ fills from parse_triples().
Ontology("conference_ontologies/conference.owl").triples

In [None]:
cos_sim2