In [2]:
flatten = lambda l: [item for sublist in l for item in sublist]

class Ontology():
    def __init__(self, ontology):
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        self.construct_mapping_dict()
        
        self.parents_dict = {}
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()        
    
    def totext(self, content):
        return " ".join(t.nodeValue for t in content.childNodes if t.nodeType == t.TEXT_NODE)

    def construct_mapping_dict(self):
        self.mapping_dict = {self.extract_ID(el, False): self.get_child_node(el, "rdfs:label")[0].firstChild.nodeValue for el in self.root.getElementsByTagName("owl:Class") if self.get_child_node(el, "rdfs:label")}
        self.mapping_dict_inv = {self.mapping_dict[key]: key for key in self.mapping_dict}
        return
        
    def get_child_node(self, element, tag):
        return [e for e in element._get_childNodes() if type(e)==minidom.Element and e._get_tagName() == tag]
        
    def has_attribute_value(self, element, attribute, value):
        return True if element.getAttribute(attribute).split("#")[-1] == value else False
    
    def get_subclass_triples(self, rootpath=False):
        subclasses = self.get_subclasses(rootpath)
        
        return [(b,a,c) for (a,b,c) in subclasses]
    
    def parse_triples(self, union_flag=0, subclass_of=True, rootpath=False):
        obj_props = self.object_properties
        data_props = self.data_properties
        props = obj_props + data_props
        all_triples = []
        for prop in props:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            if not domain_children or not range_children:
                continue
            if not domain_prop:
                domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null([self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                if union_flag == 0:
                    all_triples.extend([(el[0], el[1], self.extract_ID(prop)) for el in list(itertools.product(domain_prop, range_prop))])
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), self.extract_ID(prop)))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples(rootpath))
        return list(set(all_triples))
    
    def get_triples(self, union_flag=0, subclass_of=True, rootpath=False):
        return self.parse_triples(union_flag, subclass_of, rootpath)

    def parse_subclasses(self, union_flag=0):
        subclasses = self.root.getElementsByTagName("rdfs:subClassOf")
        subclass_pairs = []
        for el in subclasses:
            inline_subclasses = self.extract_ID(el)
            if inline_subclasses:
                subclass_pairs.append((el, el.parentNode, "subclass_of"))
            else:
                level1_class = self.get_child_node(el, "owl:Class")
                if not level1_class:
                    
                    restriction = el.getElementsByTagName("owl:Restriction")
                    if not restriction:
                        continue
                    prop = self.get_child_node(restriction[0], "owl:onProperty")
                    some_vals = self.get_child_node(restriction[0], "owl:someValuesFrom")
                    if not some_vals:
                        some_vals = self.get_child_node(restriction[0], "owl:cardinality")
                    if not prop or not some_vals:
                        continue
#                     print(self.extract_ID(el), "**", self.extract_ID(some_vals[0]), "**", self.extract_ID(prop[0]))
                    try:
                        if self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                            subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop[0])))
                        elif self.extract_ID(prop[0]) and not self.extract_ID(some_vals[0]):
                            class_vals = self.get_child_node(some_vals[0], "owl:Class")
                            subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop[0])))
                        elif not self.extract_ID(prop[0]) and self.extract_ID(some_vals[0]):
                            prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                            subclass_pairs.append((el.parentNode, some_vals[0], self.extract_ID(prop_vals[0])))
                        else:
                            prop_vals = self.get_child_node(prop[0], "owl:ObjectProperty")
                            class_vals = self.get_child_node(some_vals[0], "owl:Class")
                            subclass_pairs.append((el.parentNode, class_vals[0], self.extract_ID(prop_vals[0])))
                    except Exception as e:
                        print ("error", e)
                        continue
                else:
                    if self.extract_ID(level1_class[0]):
                        subclass_pairs.append((level1_class[0], el.parentNode, "subclass_of"))
                    else:
#                         level2classes = level1_class[0].getElementsByTagName("owl:Class")
#                         subclass_pairs.extend([(elem, el.parentNode, "subclass_of") for elem in level2classes if self.extract_ID(elem)])
                        continue
        return subclass_pairs
        
    def get_subclasses(self, rootpath=False):
        subclasses = [(self.extract_ID(a, not rootpath), self.extract_ID(b, not rootpath), c)
                      for (a,b,c) in self.subclasses]
        self.parents_dict = {}
        for (a,b,c) in subclasses:
            if c == "subclass_of" and a!="Thing" and b!="Thing":
                if b not in self.parents_dict:
                    self.parents_dict[b] = [a]
                else:
                    self.parents_dict[b].append(a)
        return [el for el in subclasses if el[0] and el[1] and el[2] and el[0]!="Thing" and el[1]!="Thing"]
    
    def filter_null(self, data):
        return [el for el in data if el]
    
    def extract_ID(self, element, check_coded = True):
        element_id = element.getAttribute("rdf:ID") or element.getAttribute("rdf:resource") or element.getAttribute("rdf:about")
        label = self.get_child_node(element, 'rdfs:label')
        element_id = element_id.split("#")[-1]
        if len(list(filter(str.isdigit, element_id))) >= 3 and "_" in element_id and check_coded:
            return self.mapping_dict[element_id]
#         print (element_id)
#         if check_coded and element_id in self.mapping_dict:
#             return self.mapping_dict[element_id]
        return element_id.replace("UNDEFINED_", "").replace("DO_", "")
    
    def parse_classes(self):
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        subclass_classes = list(set(flatten([el[:-1] for el in self.triples])))
        return list(set(self.filter_null(class_elems + subclass_classes)))
    
    def get_classes(self):
        return self.classes
    
    def get_entities(self):
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def parse_data_properties(self):
        data_properties = [el for el in self.get_child_node(self.root, 'owl:DatatypeProperty')]
        fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_data_properties = [el for el in fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        inv_fn_data_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_data_properties = [el for el in inv_fn_data_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "DatatypeProperty")]]
        return data_properties + fn_data_properties + inv_fn_data_properties
        
    def parse_object_properties(self):
        obj_properties = [el for el in self.get_child_node(self.root, 'owl:ObjectProperty')]
        fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:FunctionalProperty') if el]
        fn_obj_properties = [el for el in fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        inv_fn_obj_properties = [el for el in self.get_child_node(self.root, 'owl:InverseFunctionalProperty') if el]
        inv_fn_obj_properties = [el for el in inv_fn_obj_properties if type(el)==minidom.Element and 
            [el for el in self.get_child_node(el, "rdf:type") if 
             self.has_attribute_value(el, "rdf:resource", "ObjectProperty")]]
        return obj_properties + fn_obj_properties + inv_fn_obj_properties
    
    def get_object_properties(self):
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))
    
    def get_data_properties(self):
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))


In [3]:
USE_folder = "/home/vlead/USE_multilingual"
alignment_folder = "german_datasets/"

ontologies_in_alignment = []

# Load reference alignments 
def load_alignments(folder):
    gt = []
    for subdir in os.listdir(folder):
        path = folder + subdir + "/"
        path += [l for l in os.listdir(path) if l.endswith(".txt")][0]
        mappings = [content.strip() for content in open(path).read().split("--------------------------------------------------------") if content.strip()]
        for mapping in mappings:
            src = [line.split(":")[-1].strip() for line in mapping.split("\n") if line.startswith(" + Source: ")][0]
            targ = [line.split(":")[-1].strip() for line in mapping.split("\n") if line.startswith(" + Target: ")][0]
            ontologies_in_alignment.append((folder + subdir + "/"+src, folder + subdir + "/"+targ))
            src = src.rsplit(".",1)[0].replace(".", "_").lower()
            targ = targ.rsplit(".",1)[0].replace(".", "_").lower()
            lines = [["_".join(row.strip().split(":")[0].split(".")) for row in line.split("-",1)[1].strip().split("<->")]
                     for line in mapping.split("\n") if line.startswith(" -")]
            lines = [[src + "#" + line[0], targ + "#" + line[1]] for line in lines]
            gt.extend(lines)
    return gt



# Extracting USE embeddings

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

# model = SentenceTransformer('bert-large-nli-mean-tokens')
# def extractUSEEmbeddings(words):
#     return model.encode(words)

# cos_sim(*model.encode(["My brother plays guitar", "The sun is shining"]))

def cos_sim(a,b):
    return 1 - spatial.distance.cosine(a, b)


reference_alignments = load_alignments(alignment_folder)

# ra_anatomy_coded = load_alignments("../Anatomy/Alignments/")
# ra_anatomy = []
# ont1 = Ontology("../Anatomy/Ontologies/mouse.owl")
# ont2 = Ontology("../Anatomy/Ontologies/human.owl")
# for elem in ra_anatomy_coded:
#     pre1, pre2 = elem[0].split("#")[0].split(".")[0].split("/")[-1], elem[1].split("#")[0].split(".")[0].split("/")[-1]
#     elem1, elem2 = elem[0].split("#")[-1], elem[1].split("#")[-1]
#     ra_anatomy.append(( pre1 + "#" + ont1.mapping_dict[elem1], pre2 + "#" + ont2.mapping_dict[elem2]))

# gt_mappings = [tuple([elem.split("/")[-1] for elem in el]) for el in reference_alignments]
# gt_mappings.extend(ra_anatomy)

# ontologies_in_alignment = pickle.load(open("../data_generic.pkl", "rb"))[-1][:-1]
# ontologies_in_alignment += [["../Anatomy/Ontologies/human.owl", "../Anatomy/Ontologies/mouse.owl"]]

In [29]:
ontologies_in_alignment

[('german_datasets/mapping lebensmittel/Google.Lebensmittel.owl',
  'german_datasets/mapping lebensmittel/web.Lebensmittel.owl'),
 ('german_datasets/mapping freizeit/dmoz.Freizeit.owl',
  'german_datasets/mapping freizeit/Google.Freizeit.owl'),
 ('german_datasets/mapping webdirectory/dmoz.owl',
  'german_datasets/mapping webdirectory/google.owl'),
 ('german_datasets/mapping webdirectory/dmoz.owl',
  'german_datasets/mapping webdirectory/web.owl'),
 ('german_datasets/mapping webdirectory/dmoz.owl',
  'german_datasets/mapping webdirectory/yahoo.small.owl'),
 ('german_datasets/mapping webdirectory/google.owl',
  'german_datasets/mapping webdirectory/web.owl'),
 ('german_datasets/mapping webdirectory/google.owl',
  'german_datasets/mapping webdirectory/yahoo.small.owl'),
 ('german_datasets/mapping webdirectory/web.owl',
  'german_datasets/mapping webdirectory/yahoo.small.owl')]

In [5]:
# Combinatorial mapping generation
all_mappings = []
for l in ontologies_in_alignment:
    ont1 = Ontology(l[0])
    ont2 = Ontology(l[1])
    
    ent1 = ont1.get_classes()
    ent2 = ont2.get_classes()
    
    obj1 = ont1.get_object_properties()
    obj2 = ont2.get_object_properties()
    
    data1 = ont1.get_data_properties()
    data2 = ont2.get_data_properties()

    mappings = list(itertools.product(ent1, ent2)) + list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))
    
    pre1 = l[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    pre2 = l[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    print (pre1, pre2)
    all_mappings.extend([(pre1 + "#" + el[0], pre2 + "#" + el[1]) for el in mappings])
    

data = {mapping: False for mapping in all_mappings}
reference_alignments = [tuple(alignment) for alignment in reference_alignments]
s = set(all_mappings)
for mapping in set(reference_alignments):
    if mapping in s:
        data[mapping] = True
    else:
        mapping = tuple([el.replace(",-", "_") for el in mapping])
        if mapping in s:
            data[mapping] = True
        else:
            print (mapping)

google_lebensmittel web_lebensmittel
dmoz_freizeit google_freizeit
dmoz google
dmoz web
dmoz yahoo_small
google web
google yahoo_small
web yahoo_small


In [41]:
a[1], b[1]

(array([-3.51269264e-03,  3.38228121e-02,  4.22673002e-02, -2.78660022e-02,
        -3.13101038e-02,  1.66269522e-02, -2.53552590e-02,  1.10926665e-02,
         1.88992620e-02, -2.40844507e-02,  7.01542571e-02,  3.90431397e-02,
        -4.66191508e-02,  4.95024435e-02, -8.07325095e-02, -6.51722625e-02,
         1.84831601e-02, -5.36775216e-02,  5.63435107e-02,  1.96375754e-02,
        -9.62680206e-03, -1.21781193e-02,  7.82673582e-02, -1.17588174e-02,
         9.98896733e-03,  6.25176579e-02, -3.90938073e-02,  5.55893686e-03,
         2.52319220e-02,  3.91568104e-03,  1.78878773e-02, -3.94871905e-02,
        -3.31385322e-02,  5.19221425e-02,  1.92756392e-02,  5.00136353e-02,
         4.35547829e-02,  1.00794295e-02,  4.25346009e-02, -4.08579148e-02,
        -7.51301646e-02, -7.75257424e-02, -3.18838842e-02, -7.21048340e-02,
        -2.33702697e-02,  1.40265683e-02, -4.98610027e-02, -1.04602333e-02,
         6.31297454e-02,  4.95878942e-02, -4.71273661e-02, -2.52356809e-02,
        -6.3

In [6]:


def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]

# Abbrevation resolution preprocessing
def parse(word):
    return " ".join(flatten([el.split("_") for el in camel_case_split(word)]))

abbreviations_dict = {}
final_dict = {}

for mapping in all_mappings:
    mapping = tuple([el.split("#")[1] for el in mapping])
    is_abb = re.search("[A-Z][A-Z]+", mapping[0])
    if is_abb:
        
        abbreviation = "".join([el[0].upper() for el in mapping[1].split("_") if el])
        if is_abb.group() in abbreviation:
            
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[1].split("_")[start:end])
            print ("left", mapping, abbreviation, fullform)
            
            rest_first = " ".join([el for el in mapping[0].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[1].split("_")[:start] + mapping[1].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

    is_abb = re.search("[A-Z][A-Z]+", mapping[1])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[0].split("_") if el])
        
        if is_abb.group() in abbreviation:
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[0].split("_")[start:end])
            print ("right", mapping, abbreviation, fullform)

            rest_first = " ".join([el for el in mapping[1].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[0].split("_")[:start] + mapping[0].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

keys = [el for el in list(set(flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict]))) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings([parse(el) for el in keys])))

scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)

resolved_dict = {key: scored_dict[key][0] for key in scored_dict}
filtered_dict = {key: " ".join(resolved_dict[key][0].split("_")) for key in resolved_dict if resolved_dict[key][-1] > 0.9}


right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Regionale_Spezialitaeten', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTRS Essen_und
right ('Top_World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile_und_Zubehoer_Schmierstoffe', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOFAEUZS Ersatzteile_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Landwirtschaftliche_Erzeugnisse', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTLE Essen_und
right ('Top_World_Deutsch_Online-Shops_Schmuck_Edel_und_Schmucksteine_Diamanten', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOSEUSD Edel_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Getraenke_Wein', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTGW Essen_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Getraenke_Spirituosen_Wodka', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfa

right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Getraenke_Wein_Europaeisch_Franzoesisch', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTGWEF Essen_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Fleisch_und_Wurst', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTFUW Essen_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Delikatessen', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTD Essen_und
right ('Top_World_Deutsch_Online-Shops_Schmuck_Edel_und_Schmucksteine', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOSEUS Edel_und
right ('Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Getraenke_Wein_Europaeisch_Deutsch_Wuerttemberg', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge') TWDOEUTGWEDW Essen_und
right ('Top_World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile_und_Zubehoer', 'World_Deutsch_Online-Shops_Fahrzeuge_Autos_EU-Importfahrzeuge'

right ('Top_World_Deutsch_Online-Shops_Schmuck_Modeschmuck_Perlen', 'Verzeichnis_Einkaufen-Sparen_Buecher_Musik-Filme_Musik_MP3-Files') TWDOSMP Modeschmuck_Perlen
right ('Top_World_Deutsch_Online-Shops_Haus_und_Garten_Haushaltsbedarf_Haushaltsgeraete_Kaffeemaschinen', 'Firmen_Computer_Hardware_Beamer-und-Projektoren_HK-Computer-Systemhaus') TWDOHUGHHK Haushaltsgeraete_Kaffeemaschinen
right ('Top_World_Deutsch_Online-Shops_Bestattungswesen', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') TWDOB Online-Shops_Bestattungswesen
right ('Top_World_Deutsch_Online-Shops_Freizeit_Waffen_Hieb_und_Stichwaffen', 'Firmen_Kraftfahrzeuge_Ersatzteile-und-Zubehoer_US-Fahrzeuge') TWDOFWHUS und_Stichwaffen
right ('Top_World_Deutsch_Online-Shops_Publikationen_Comics', 'Firmen_Computer_Hardware_PCs') TWDOPC Publikationen_Comics
right ('Top_World_Deutsch_Online-Shops_Computer_Hardware_Komplettsysteme_Notebooks', 'Firmen_Computer_Hardware_Beamer-und-Projektoren_HK-Computer-Systemhaus') TWDOCHKN Hardware

right ('Top_World_Deutsch_Online-Shops_Religion_und_Spiritualitaet_Esoterik_Edelsteine', 'Firmen_Kraftfahrzeuge_Ersatzteile-und-Zubehoer_US-Fahrzeuge') TWDORUSEE und_Spiritualitaet
right ('Top_World_Deutsch_Online-Shops_Bekleidung_Trachten', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') TWDOBT Online-Shops_Bekleidung
right ('Top_World_Deutsch_Online-Shops_Religion_und_Spiritualitaet_Esoterik_Geomantie', 'Firmen_Kraftfahrzeuge_Ersatzteile-und-Zubehoer_US-Fahrzeuge') TWDORUSEG und_Spiritualitaet
right ('Top_World_Deutsch_Online-Shops_Bekleidung_Sport_Druck_und_Stickservice', 'Firmen_Kraftfahrzeuge_Ersatzteile-und-Zubehoer_US-Fahrzeuge') TWDOBSDUS und_Stickservice
right ('Top_World_Deutsch_Online-Shops_Bekleidung_Sport_Druck_und_Stickservice', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') TWDOBSDUS Online-Shops_Bekleidung
right ('Top_World_Deutsch_Online-Shops_Bekleidung_Unterwaesche', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') TWDOBU Online-Shops_Bekleidung
rig

right ('World_Deutsch_Online-Shops_Schmuck_Modeschmuck_Perlen', 'Verzeichnis_Einkaufen-Sparen_Buecher_Musik-Filme_Musik_MP3-Files') WDOSMP Modeschmuck_Perlen
right ('World_Deutsch_Online-Shops_Fahrzeuge_Motorraeder_Pflegemittel', 'Verzeichnis_Einkaufen-Sparen_Buecher_Musik-Filme_Musik_MP3-Files') WDOFMP Motorraeder_Pflegemittel
right ('World_Deutsch_Online-Shops_Schreibwaren-und-Buerobedarf_Materialien-und-Zubehoer_Papierwaren', 'Verzeichnis_Einkaufen-Sparen_Buecher_Musik-Filme_Musik_MP3-Files') WDOSMP Materialien-und-Zubehoer_Papierwaren
right ('World_Deutsch_Online-Shops_Bekleidung_Schuhe_Komfort', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBSK Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Kostueme_Karneval-Fasching', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBKK Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Schuhe_Zubehoer', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBSZ Online-Shops_Bekleidun

right ('World_Deutsch_Online-Shops_Computer_Hardware_Komponenten', 'Firmen_Computer_Hardware_Beamer-und-Projektoren_HK-Computer-Systemhaus') WDOCHK Hardware_Komponenten
right ('World_Deutsch_Online-Shops_Bekleidung_Unterwaesche_Socken-und-Struempfe', 'Firmen_Kraftfahrzeuge_Ersatzteile-und-Zubehoer_US-Fahrzeuge') WDOBUS Unterwaesche_Socken-und-Struempfe
right ('World_Deutsch_Online-Shops_Bekleidung_Unterwaesche_Socken-und-Struempfe', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBUS Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Leder', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBL Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Schuhe_Komfort_Hausschuhe', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBSKH Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Sondergroessen', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBS Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shop

right ('World_Deutsch_Online-Shops_Bekleidung_Sondergroessen_Herren', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBSH Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Bekleidung_Naturtextilien', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBN Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Haus-und-Garten_Haushaltsbedarf_Haushaltsgeraete_Kaffeemaschinen', 'Firmen_Computer_Hardware_Beamer-und-Projektoren_HK-Computer-Systemhaus') WDOHHHK Haushaltsgeraete_Kaffeemaschinen
right ('World_Deutsch_Online-Shops_Bekleidung_Damen', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBD Online-Shops_Bekleidung
right ('World_Deutsch_Online-Shops_Unterhaltung_Bild-und-Tontraeger_Musik_Genres_Musik-anderer-Kulturen', 'Firmen_Musik_Labels_Bertelsmann-Music-Group-Entertainment-BMG') WDOUBMGM Bild-und-Tontraeger_Musik_Genres
right ('World_Deutsch_Online-Shops_Bekleidung_Nischenprodukte', 'Firmen_Koerperpflege_Hygieneartikel-fuer-Frauen_OB') WDOBN Online

In [7]:
extracted_elems = []
mapping_ont = {}
for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology(ont_name)
    entities = ont.get_classes()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples())))
    ont_name = ont_name.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    print (ont_name)
    mapping_ont[ont_name] = ont
    extracted_elems.extend([ont_name + "#" + elem for elem in entities + props + triples])

extracted_elems = list(set(extracted_elems))
inp = []
for word in extracted_elems:
    ont_name = word.split("#")[0]
    elem = word.split("#")[1]
    inp.append(parse(elem))

print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

extracted_elems = ["<UNK>"] + extracted_elems

embeds = np.array(extractUSEEmbeddings(inp))
embeds = np.array([np.zeros(embeds.shape[1],)] + list(embeds))
# embeds = np.array([np.zeros(512,)] + list(extractUSEEmbeddings(inp_spellchecked)))
embeddings = dict(zip(extracted_elems, embeds))


emb_vals = list(embeddings.values())
emb_indexer = {key: i for i, key in enumerate(list(embeddings.keys()))}
emb_indexer_inv = {i: key for i, key in enumerate(list(embeddings.keys()))}


dmoz
web
google
yahoo_small
google_freizeit
google_lebensmittel
dmoz_freizeit
web_lebensmittel
Total number of extracted unique classes and properties from entire RA set:  3282


In [30]:
inp

['World Deutsch Online-Shops Sport Pferdesport Reitbedarf Westernreiten',
 'Verzeichnis Einkaufen-Sparen Haus-Garten Haustierbedarf Hund-Katze',
 'Firmen Bekleidung Damenbekleidung Spezialgroessen',
 'World Deutsch Online-Shops Essen-und-Trinken Getraenke Wein Europaeisch Schweizer',
 'Verzeichnis Einkaufen-Sparen Freizeit-Hobby Sammeln Figuren',
 'Top World Deutsch Online-Shops Freizeit Waffen Schusswaffen Luft und Gasdruck',
 'Top World Deutsch Online-Shops Haustiere',
 'Firmen Koerperpflege Kosmetik',
 'Verzeichnis Einkaufen-Sparen Sportartikel Rugby',
 'Top World Deutsch Online-Shops Secondhand',
 'Firmen Buecher Buchhandel Hochschulbuchhandlungen',
 'World Deutsch Online-Shops Freizeit Zaubern',
 'World Deutsch Online-Shops Schreibwaren-und-Buerobedarf Materialien-und-Zubehoer Papierwaren',
 'Firmen Haus-und-Garten Garten-und-Pflanzen Zimmer-und-Dekorationspflanzen',
 'World Deutsch Online-Shops Kunsthandwerk Regionale-Spezialitaeten Amerikanisch',
 'Verzeichnis Einkaufen-Sparen H

In [15]:

def path_to_root(elem, ont_mappings, path_store, i):
    if elem in path_store:
        return path_store[elem]
    if elem not in ont_mappings or not ont_mappings[elem]:
        path_store[elem] = []
        return []
    output = flatten([[e] + path_to_root(e, ont_mappings, path_store, i+1) for e in ont_mappings[elem]])
    path_store[elem] = output
    return output

def get_one_hop_neighbours(ont, K=1):

    ont_obj = Ontology(ont)
    triples = ont_obj.get_triples(rootpath=True)
#     print (triples)
    entities = [(a,b) for (a,b,c) in triples]
#     neighbours_dict = {ont_obj.mapping_dict.get(elem, elem): [ont_obj.mapping_dict.get(elem, elem)]
#                        for elem in list(set(flatten(entities)))}
#     for e1, e2 in entities:
#         neighbours_dict[ont_obj.mapping_dict.get(e1, e1)].append(ont_obj.mapping_dict.get(e2, e2))
#         neighbours_dict[ont_obj.mapping_dict.get(e2, e2)].append(ont_obj.mapping_dict.get(e1, e1))
    
#     rootpath_dict = {elem: list(OrderedSet(ont_obj.parents_dict[elem])) for elem in ont_obj.parents_dict}
    
#     rootpath_dict_new = {}
#     path_store = {}
#     for elem in rootpath_dict:
#         rootpath_dict_new[elem] = path_to_root(elem, rootpath_dict, path_store, 0)
#     rootpath_dict_new = {ont_obj.mapping_dict.get(elem, elem): 
#                      [ont_obj.mapping_dict.get(e, e) for e in rootpath_dict[elem]]
#                     for elem in rootpath_dict_new}
#     ont = ont.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
#     print (ont)
#     for entity in neighbours_dict:
#         if entity in rootpath_dict_new and len(rootpath_dict_new[entity]) > 0:
#             neighbours_dict[entity].extend(rootpath_dict_new[entity])
#         else:
#             continue
    
    neighbours_dict = {elem: [elem] for elem in list(set(flatten(entities)))}
    for e1, e2 in entities:
        neighbours_dict[e1].append(e2)
        neighbours_dict[e2].append(e1)
    
    rootpath_dict = {elem: list(OrderedSet(ont_obj.parents_dict[elem])) for elem in ont_obj.parents_dict}
    
    path_store = {}
    for elem in rootpath_dict:
        rootpath_dict[elem] = path_to_root(elem, rootpath_dict, path_store, 0)

#     print (ont)
    ont = ont.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    print (ont)
    for entity in neighbours_dict:
        if entity in rootpath_dict and len(rootpath_dict[entity]) > 0:
            neighbours_dict[entity].extend(rootpath_dict[entity])
        else:
            continue
#     prop_triples = ont_obj.get_triples(subclass_of=False)
#     neighbours_dict_props = {c: [c] for a,b,c in prop_triples}
#     for e1, e2, p in prop_triples:
#         neighbours_dict_props[p].extend([e1, e2])

    #neighbours_dict = {**neighbours_dict, **neighbours_dict_props}
    
    # for elem in ont_obj.get_entities() + ont_obj.get_object_properties() + ont_obj.get_data_properties():
    #     if elem not in neighbours_dict:
    #         neighbours_dict[elem] = [elem]

    neighbours_dict = {el: neighbours_dict[el][:1] + sorted(list(set(neighbours_dict[el][1:])))
                       for el in neighbours_dict}
#     neighbours_dict = {el: neighbours_dict[el][:23] for el in neighbours_dict if len( neighbours_dict[el]) > 2}
#     ont = ont.split("/")[-1].split(".")[0]
    neighbours_dict = {ont + "#" + el: [ont + "#" + e for e in neighbours_dict[el]] for el in neighbours_dict}
    return neighbours_dict

neighbours_dicts = {ont.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower(): get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
max_neighbours = np.max(flatten([[len(el[e]) for e in el] for el in neighbours_dicts.values()]))
neighbours_lens = {ont: {key: len(neighbours_dicts[ont][key]) for key in neighbours_dicts[ont]}
                   for ont in neighbours_dicts}
neighbours_dicts = {ont: {key: neighbours_dicts[ont][key] + ["<UNK>" for i in range(max_neighbours -len(neighbours_dicts[ont][key]))]
              for key in neighbours_dicts[ont]} for ont in neighbours_dicts}

data_items = data.items()
data_shuffled_t = [elem for elem in data_items if elem[1]]
data_shuffled_f = [elem for elem in data_items if not elem[1]]

false_indices = np.random.permutation(len(data_shuffled_f))
# indices = np.random.permutation(len(data_shuffled_t) + len(data_shuffled_f[:130000-len(data_shuffled_t)]))

data_shuffled = data_shuffled_t + data_shuffled_f

# indices = np.random.arange(len(data_shuffled))

data = OrderedDict(list(np.array(data_shuffled)[indices]))

ontologies_in_alignment_rev = [[el.split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() for el in ont] for ont in ontologies_in_alignment]
f = open("data_german_dataset_phrase.pkl", "wb")
pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, reference_alignments, neighbours_dicts, ontologies_in_alignment_rev], f)


dmoz
web
google
yahoo_small
google_freizeit
google_lebensmittel
dmoz_freizeit
web_lebensmittel


In [34]:
data, emb_indexer, emb_indexer_inv, emb_vals, reference_alignments, neighbours_dicts, ontologies_in_alignment_rev = pickle.load(open("Input/data_german_dataset.pkl", "rb"))

In [None]:
data.keys()

In [None]:
data, emb_indexer, emb_indexer_inv, emb_vals, reference_alignments, neighbours_dicts, ontologies_in_alignment_rev = pickle.load(open("data_german_dataset.pkl", "rb"))

In [None]:
glob.glob("german_datasets/*/*.owl")

In [29]:
# AML test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
# ontologies_in_alignment
for ont_pair in ontologies_in_alignment_new:
    t = time.time()
    a, b, c = ont_pair[0], ont_pair[1], ont_pair[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() + "-" + ont_pair[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    java_command = "java -jar AML_v3.1/AgreementMakerLight.jar -s " + a + " -t " + b + " -o AML-test-results/" + c + ".rdf -a"
    process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    print ("Took {} seconds for {}".format(time.time()-t, ont_pair))
pred_aml = load_alignments("AML-test-results/")
# pred_aml
# all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))
# # all_ont_pairs = [["conference", "confOf"]]
# for i in list(range(0, len(ontologies_in_alignment), 3)):
#     test_onto = all_ont_pairs[i+1:i+3]
#     for ont_pair in test_onto:
#         a, b, c = ont_pair[0], ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
#         java_command = "java -jar AML_v3.1/AgreementMakerLight.jar -s conference_ontologies/" + a + ".owl" + \
#                             " -t conference_ontologies/" + b + ".owl -o AML-test-results/" + c + ".rdf -a"
#         process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
#         output, error = process.communicate()
#     print (os.listdir("AML-test-results/"))
#     pred_aml = load_alignments("AML-test-results/")
#     pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
#     tp = len([elem for elem in pred_aml if data[elem]])
#     fn = len([key for key in gt_mappings if key not in set(pred_aml) and is_test(test_onto, key)])
#     fp = len([elem for elem in pred_aml if not data[elem]])

#     precision = tp/(tp+fp)
#     recall = tp/(tp+fn)
#     f1score = 2 * precision * recall / (precision + recall)
#     f2score = 5 * precision * recall / (4 * precision + recall)
#     f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
#     print (precision, recall, f1score, f2score, f0_5score)
    
#     metrics = [precision, recall, f1score, f2score, f0_5score]
#     results.append(metrics)
    
#     _ = [os.remove(f) for f in glob.glob('AML-test-results/*')]
    
# print ("Final Results:", np.mean(results, axis=0))

Took 23236.648596525192 seconds for ('german_datasets_copy/lebensmittel/Google.Lebensmittel.owl', 'german_datasets_copy/lebensmittel/web.Lebensmittel.owl')
Took 867.720184803009 seconds for ('german_datasets_copy/freizeit/dmoz.Freizeit.owl', 'german_datasets_copy/freizeit/Google.Freizeit.owl')


KeyboardInterrupt: 

In [31]:
c

'dmoz-google'

In [None]:

# AML test
def is_test(test_onto, key):
    return tuple([el.split("#")[0] for el in key]) in test_onto

results = []
prefix = "/data/Vivek/IBM/IBM-Internship/conference_ontologies/"
for i in list(range(0, len(ontologies_in_alignment), 3)):
    test_onto = ontologies_in_alignment[i:i+3]
    tp_tot, fn_tot, fp_tot = [], [], []
    for ont_pair in test_onto:
        a, b, c = prefix + ont_pair[0], prefix + ont_pair[1], ont_pair[0] + "-" + ont_pair[1]
        !mkdir $c
        java_command = "java -jar logmap-matcher/target/logmap-matcher-4.0.jar MATCHER file:" +  a + ".owl file:" + b + ".owl " + \
                        "/data/Vivek/IBM/IBM-Internship/" + c + "/ false"
        process = subprocess.Popen(java_command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        
        pred_aml = [l.strip().split("\t")[:2] for l in open(c + "/logmap2_mappings.tsv", "r").read().split("\n")[:-1]]
        pred_aml = [tuple([el.split("/")[-1] for el in key]) for key in pred_aml]
        tp = [elem for elem in pred_aml if data[elem]]
        fn = [key for key in gt_mappings if key not in set(pred_aml) and is_test([tuple(ont_pair)], key)]
        fp = [elem for elem in pred_aml if not data[elem]]
        
        tp_tot.extend(tp)
        fn_tot.extend(fn)
        fp_tot.extend(fp)
        
        !rm -rf $c
   
    precision = len(tp_tot)/(len(tp_tot)+len(fp_tot))
    recall = len(tp_tot)/(len(tp_tot)+len(fn_tot))
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
    
    
print ("Final Results:", np.mean(results, axis=0))

In [18]:
ontologies_in_alignment_new = \
[('german_datasets_copy/lebensmittel/Google.Lebensmittel.owl',
	'german_datasets_copy/lebensmittel/web.Lebensmittel.owl'),
 ('german_datasets_copy/freizeit/dmoz.Freizeit.owl',
	'german_datasets_copy/freizeit/Google.Freizeit.owl'),
 ('german_datasets_copy/webdirectory/dmoz.owl',
	'german_datasets_copy/webdirectory/google.owl'),
 ('german_datasets_copy/webdirectory/dmoz.owl',
	'german_datasets_copy/webdirectory/web.owl'),
 ('german_datasets_copy/webdirectory/dmoz.owl',
	'german_datasets_copy/webdirectory/yahoo.small.owl'),
 ('german_datasets_copy/webdirectory/google.owl',
	'german_datasets_copy/webdirectory/web.owl'),
 ('german_datasets_copy/webdirectory/google.owl',
	'german_datasets_copy/webdirectory/yahoo.small.owl'),
 ('german_datasets_copy/webdirectory/web.owl',
	'german_datasets_copy/webdirectory/yahoo.small.owl')]



In [27]:
max_neighbours

51

In [23]:
all_pred = []
for ont_pair in ontologies_in_alignment_new:
    ont_name1 = ont_pair[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    ont_name2 = ont_pair[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    c = ont_pair[0].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower() + "-" + ont_pair[1].split("/")[-1].rsplit(".",1)[0].replace(".", "_").lower()
    c = "/data/Vivek/IBM/IBM-Internship/" + c
    pred_aml = [l.strip().split("\t")[:2] for l in open(c + "/logmap2_mappings.tsv", "r").read().split("\n")[:-1]]
    pred_aml = [tuple([el.split("#")[-1] for el in key]) for key in pred_aml]
    pred_aml = [tuple((ont_name1 + "#" + key[0], ont_name2 + "#" + key[1])) for key in pred_aml]
    all_pred.extend(pred_aml)


logmap_results = OrderedDict()
for key in data:
    logmap_results[key] = False
for key in all_pred:
    if key not in logmap_results:
        print (key)
    else:
        logmap_results[key] = True

('google_lebensmittel#World_Deutsch_Online-Shops_Essen-und-Trinken_Getraenke_Wein_Suedafrikanisch_http://www.hanseaten-select.de/', 'web_lebensmittel#Verzeichnis_Einkaufen-Sparen_Nahrungs-Genussmittel_Spezialitaeten_http://www.hanseaten-select.de')
('google_lebensmittel#World_Deutsch_Online-Shops_Essen-und-Trinken_Getraenke_Kaffee-und-Tee_Kaffee_http://www.coffea-store.de/', 'web_lebensmittel#Verzeichnis_Einkaufen-Sparen_Nahrungs-Genussmittel_Naturkost_http://www.oneworld.de/scripts/shop.prg/dritteweltpartner')
('google_lebensmittel#World_Deutsch_Online-Shops_Essen-und-Trinken_Getraenke_Kaffee-und-Tee_Kaffee_http://www.coffea-store.de/', 'web_lebensmittel#Verzeichnis_Einkaufen-Sparen_Nahrungs-Genussmittel_Getraenke_Tee-Kaffee_http://www.cappuccino-laden.de')
('google_lebensmittel#World_Deutsch_Online-Shops_Essen-und-Trinken_Getraenke_Kaffee-und-Tee_Kaffee_http://www.coffea-store.de/', 'web_lebensmittel#Verzeichnis_Einkaufen-Sparen_Nahrungs-Genussmittel_Getraenke_Tee-Kaffee_http://www.g

('dmoz#Top_World_Deutsch_Online-Shops_Gesundheit_Ernaehrung_Nahrungsergaenzung_http://www.vitasavia.de/', 'google#World_Deutsch_Online-Shops_Gesundheit_Ernaehrung_Nahrungsergaenzung_http://www.vitasavia.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Honig_http://www.honig-seite.de/', 'google#World_Deutsch_Online-Shops_Essen-und-Trinken_Honig_http://www.honig-seite.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Publikationen_Buecher_Antiquarisch_http://www.antiquariat-ludwig.de/', 'google#World_Deutsch_Online-Shops_Publikationen_Buecher_Antiquariate_http://www.antiquariat-ludwig.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile_und_Zubehoer_Tuning_http://www.bifu-design.ch/', 'google#World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile-und-Zubehoer_Tuning_http://www.bifu-design.ch/')
('dmoz#Top_World_Deutsch_Online-Shops_Sport_Pferdesport_http://www.turnierbedarf-schaefer.de', 'google#World_Deutsch_Online-Shops_Sport_Pferdesport_http://www.turnierbeda

('dmoz#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_http://www.creativelaedle.de/', 'google#World_Deutsch_Online-Shops_Freizeit_Basteln_http://www.creativelaedle.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Tabak_Zigarren_https://ssl.kundenserver.de/s74664755.einsundeinsshop.de/', 'google#World_Deutsch_Online-Shops_Tabak_Zigarren_https://ssl.kundenserver.de/s74664755.einsundeinsshop.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Haus_und_Garten_Moebel_Lampen_und_Leuchten_http://www.lud.de/', 'google#World_Deutsch_Online-Shops_Haus-und-Garten_Moebel_Lampen-und-Leuchten_http://www.lud.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Haus_und_Garten_Dekorationsartikel_http://www.design-muss-sein.de/', 'google#World_Deutsch_Online-Shops_Haus-und-Garten_Dekorationsartikel_http://www.design-muss-sein.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile_und_Zubehoer_Tuning_http://www.autopeinelt.de', 'google#World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile-und-Zubehoer_Tuni

('dmoz#Top_World_Deutsch_Online-Shops_Kinder_Babys_http://www.sling.ch/', 'google#World_Deutsch_Online-Shops_Kinder_Babys_http://www.sling.ch/')
('dmoz#Top_World_Deutsch_Online-Shops_Essen_und_Trinken_Getraenke_Spirituosen_Whisky_http://www.world-of-whisky.ch/', 'google#World_Deutsch_Online-Shops_Essen-und-Trinken_Getraenke_Spirituosen_Whisky_http://www.world-of-whisky.ch/')
('dmoz#Top_World_Deutsch_Online-Shops_Haus_und_Garten_Pflanzen_Bonsai_http://www.niedbonsai.de', 'google#World_Deutsch_Online-Shops_Haus-und-Garten_Pflanzen_Bonsai_http://www.niedbonsai.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Sport_Motorsport_http://www.simpson-deutschland.de', 'google#World_Deutsch_Online-Shops_Sport_Motorsport_http://www.simpson-deutschland.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Freizeit_Basteln_http://www.bastelonlineshop.de/', 'google#World_Deutsch_Online-Shops_Freizeit_Basteln_http://www.bastelonlineshop.de/')
('dmoz#Top_World_Deutsch_Online-Shops_Fahrzeuge_Autos_Ersatzteile_und_Zubeh

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [24]:
data_items = list(data.items())
logmap_items = list(logmap_results.items())
results = []
for i in range(6):
    
    test_data = dict(data_items[int(0.15*i*len(data)):int((0.15*i + 0.1)*len(data))])
    logmap_data = dict(logmap_items[int(0.15*i*len(logmap_results)):int((0.15*i + 0.1)*len(logmap_results))])
    
    s = set(all_pred)
    tp_tot = [elem for elem in logmap_data if test_data[elem] and logmap_data[elem]]
    fp_tot = [elem for elem in logmap_data if not test_data[elem] and logmap_data[elem]]
#     print (len(tp_tot), len(fp_tot))
    fn_tot = [elem for elem in test_data if test_data[elem] and not logmap_data[elem]]
    precision = len(tp_tot)/(len(tp_tot)+len(fp_tot))
    recall = len(tp_tot)/(len(tp_tot)+len(fn_tot))
    f1score = 2 * precision * recall / (precision + recall)
    f2score = 5 * precision * recall / (4 * precision + recall)
    f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
    print (precision, recall, f1score, f2score, f0_5score)
    
    metrics = [precision, recall, f1score, f2score, f0_5score]
    results.append(metrics)
    
print ("Final Results:", np.mean(results, axis=0))
print (len(all_pred))

0.7764705882352941 0.616822429906542 0.6875 0.6432748538011696 0.738255033557047
0.7692307692307693 0.5357142857142857 0.6315789473684211 0.5703422053231938 0.7075471698113207
0.7692307692307693 0.5106382978723404 0.6138107416879794 0.5474452554744526 0.6984866123399301
0.8106508875739645 0.5956521739130435 0.6867167919799498 0.6290174471992656 0.7560706401766006
0.8445945945945946 0.5841121495327103 0.6906077348066298 0.6225099601593624 0.7754342431761786
0.7596899224806202 0.47115384615384615 0.5816023738872403 0.509885535900104 0.6767955801104972
Final Results: [0.78831126 0.55234886 0.6486361  0.58707921 0.72543155]
39611


In [20]:
logmap_results = OrderedDict()
for key in data:
    logmap_results[key] = False
for key in all_pred:
    if key not in logmap_results:
        print (key)
    else:
        logmap_results[key] = True

In [21]:
len(data), len(logmap_results)

(3308848, 3308848)

In [None]:
inp

In [None]:
glob

In [None]:
fn_spellchecked, fp_spellchecked = [dict(el) for el in pickle.load(open("test_v2.pkl", "rb"))]
fn_baseline, fp_baseline = [dict(el) for el in pickle.load(open("test_best.pkl", "rb"))]
fn_unhas, fp_unhas = [dict(el) for el in pickle.load(open("test_unhas.pkl", "rb"))]
fn_resolved, fp_resolved = [dict(el) for el in pickle.load(open("test_resolved.pkl", "rb"))]

fn_dict, fp_dict = {}, {}
def create_comparison_file(file, idx):
    fn, fp = [dict(el) for el in pickle.load(open(file, "rb"))]
    
    for key in fn:
        if key in fn_dict:
            fn_dict[key][idx] = fn[key]
        else:
            fn_dict[key] = ["N/A" for i in range(4)]
            fn_dict[key][idx] = fn[key]
    
    for key in fp:
        if key in fp_dict:
            fp_dict[key][idx] = fp[key]
        else:
            fp_dict[key] = ["N/A" for i in range(4)]
            fp_dict[key][idx] = fp[key]
    

create_comparison_file("test_best.pkl", 0)
create_comparison_file("test_unhas.pkl", 1)
create_comparison_file("test_v2.pkl", 2)
create_comparison_file("test_resolved.pkl", 3)

open("fn - comparison.tsv", "w+").write("\n".join(["\t".join([str(el) for el in flatten(el)]) for el in fn_dict.items()]))
open("fp - comparison.tsv", "w+").write("\n".join(["\t".join([str(el) for el in flatten(el)]) for el in fp_dict.items()]))

In [None]:
ontologies_in_alignment = pickle.load(open("data_path.pkl", "rb"))[-1]
ontologies_in_alignment

In [None]:
d = {('confOf#Organization', 'sigkdd#Organizator'): (1,2,3,4),
 ('iasted#Document', 'sigkdd#Document'): (5,6,78,8)}
[[str(el) for el in flatten(el)] for el in d.items()]

In [None]:
abbreviations_dict = {}
final_dict = {}

for mapping in all_mappings:
    mapping = tuple([el.split("#")[1] for el in mapping])
    is_abb = re.search("[A-Z][A-Z]+", mapping[0])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[1].split("_")])
        if is_abb.group() in abbreviation:
            
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[1].split("_")[start:end])
            print ("left", mapping, abbreviation, fullform)
            
            rest_first = " ".join([el for el in mapping[0].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[1].split("_")[:start] + mapping[1].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

    is_abb = re.search("[A-Z][A-Z]+", mapping[1])
    if is_abb:
        abbreviation = "".join([el[0].upper() for el in mapping[0].split("_")])
        
        if is_abb.group() in abbreviation:
            start = abbreviation.find(is_abb.group())
            end = start + len(is_abb.group())
            fullform = "_".join(mapping[0].split("_")[start:end])
            print ("right", mapping, abbreviation, fullform)

            rest_first = " ".join([el for el in mapping[1].replace(is_abb.group(), "").split("_") if el]).lower()
            rest_second = " ".join(mapping[0].split("_")[:start] + mapping[0].split("_")[end:])
            if is_abb.group() not in final_dict:
                final_dict[is_abb.group()] = [(fullform, rest_first, rest_second)]
            else:
                final_dict[is_abb.group()].append((fullform, rest_first, rest_second))

keys = [el for el in list(set(flatten([flatten([tup[1:] for tup in final_dict[key]]) for key in final_dict]))) if el]
abb_embeds = dict(zip(keys, extractUSEEmbeddings(keys)))

scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)

resolved_dict = {key: scored_dict[key][0] for key in scored_dict}
filtered_dict = {key: " ".join(resolved_dict[key][0].split("_")) for key in resolved_dict if resolved_dict[key][-1] > 0.9}
inp_resolved = []
for concept in inp:
    for key in filtered_dict:
        concept = concept.replace(key, filtered_dict[key])
    inp_resolved.append(concept)
inp_resolved

In [None]:
len()

In [None]:
cos_sim(*extractUSEEmbeddings(["Conference Banquet", "Dinner Banquet"]))

In [None]:
np.array(data.items())

In [None]:
scored_dict = {}
for abbr in final_dict:
    sim_list = [(tup[0], tup[1], tup[2], cos_sim(abb_embeds[tup[1]], abb_embeds[tup[2]])) if tup[1] and tup[2]
                else (tup[0], tup[1], tup[2], 0) for tup in final_dict[abbr]]
    scored_dict[abbr] = sorted(list(set(sim_list)), key=lambda x:x[-1], reverse=True)


In [None]:
inp_case_handled = []
for concept in inp:
    final_list = []
    for word in concept.split(" "):
        if not re.search("[A-Z][A-Z]+", concept):
            final_list.append(word.lower())
        else:
            final_list.append(word)
    case_resolved = " ".join(final_list)
    inp_case_handled.append(case_resolved)
    
inp_case_handled

In [17]:
indices

array([ 92749,  29890,  49076, ..., 121868,  56566,  66092])

In [None]:
Ontology("conference_ontologies/conference.owl").mapping_dict

In [None]:
# from transformers import XLNetTokenizer, XLNetModel
# import torch
# import scipy
# import torch.nn as nn
# import torch.nn.functional as F

# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

import time

t = time.time()
input_ids = torch.tensor(tokenizer.encode("fastigial nucleus", add_special_tokens=True)).unsqueeze(0)
outputs = model(input_ids)
last_hidden_states = outputs[0].mean(1)

print (t-time.time())
# input_ids = torch.tensor(tokenizer.encode("femur", add_special_tokens=True)).unsqueeze(0) 

# outputs1 = model(input_ids)
# last_hidden_states1 = outputs1[0].mean(1)

# cos = nn.CosineSimilarity(dim=1, eps=1e-6)
# cos(last_hidden_states, last_hidden_states1)

In [None]:
Ontology("german_datasets/mapping freizeit/Google.Freizeit.owl").mapping_dict

In [None]:
def load_german_mappings(file):
    mappings = [content.strip() for content in open(file).read().split("--------------------------------------------------------") if content.strip()]
    for mapping in mappings:
        src = [line.split(":")[-1].strip() for line in mapping.split("\n") if line.startswith(" + Source: ")][0]
        targ = [line.split(":")[-1].strip() for line in mapping.split("\n") if line.startswith(" + Target: ")][0]
        
        lines = [[row.strip().split(":")[0].split(".")[-1] for row in line.split("-",1)[1].strip().split("<->")]
                 for line in mapping.split("\n") if line.startswith(" -")]
        lines = [[src.rsplit(".",1)[0].replace(".", "_").lower() + "#" + line[0],
                  targ.rsplit(".",1)[0].replace(".", "_").lower() + "#" + line[1]] for line in lines]
    return lines
    
len(load_german_mappings("german_datasets/mapping lebensmittel/mapping.txt"))

In [None]:
import tensorflow_text

In [None]:
open("Output_a")