In [232]:
# Construction of dataset

import os, itertools, time, pickle
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Local directory the Universal Sentence Encoder model is loaded from
# (extractUSEEmbeddings downloads it there when loading fails).
USE_folder = "/home/vlead/USE"
# Directory holding the reference-alignment files passed to load_alignments.
alignment_folder = "reference-alignment/"

# Load reference alignments 
def load_alignments(folder):
    """Collect every (entity1, entity2) reference mapping found in ``folder``.

    Each file in ``folder`` is parsed as XML. The i-th ``entity1`` element is
    paired with the i-th ``entity2`` element of the same document, and the
    ``rdf:resource`` attribute of both is recorded.

    Args:
        folder: Directory containing the alignment files. A trailing path
            separator is accepted but not required.

    Returns:
        A list of ``(entity1_resource, entity2_resource)`` string tuples, with
        files visited in sorted file-name order.
    """
    alignments = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(folder)):
        # os.path.join works whether or not ``folder`` ends with a separator.
        doc = minidom.parse(os.path.join(folder, file_name))
        pairs = zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2'))
        alignments.extend(
            (a.getAttribute('rdf:resource'), b.getAttribute('rdf:resource'))
            for (a, b) in pairs
        )
    return alignments
        
# All reference (entity1, entity2) resource pairs read from alignment_folder.
reference_alignments = load_alignments(alignment_folder)

In [14]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

class Ontology():
    """View over an ontology file parsed with ``xml.dom.minidom``.

    On construction the file is parsed once and the subclass links, object
    properties, data properties, triples and class names are extracted and
    cached on the instance. Identifiers are the part after the last ``#`` of
    an element's ``rdf:ID`` / ``rdf:resource`` / ``rdf:about`` attribute.
    """

    def __init__(self, ontology):
        """Parse ``ontology`` (anything ``minidom.parse`` accepts) and cache its contents."""
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        # Order matters: triples need the properties and subclasses, classes need the triples.
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def get_child_node(self, element, tag):
        """Return the direct child elements of ``element`` whose tag name is ``tag``."""
        return [
            child for child in element.childNodes
            if isinstance(child, minidom.Element) and child.tagName == tag
        ]

    def has_attribute_value(self, element, attribute, value):
        """Return True when the text after the last ``#`` of ``attribute`` equals ``value``."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self):
        """Return the subclass pairs as ``(a, b, "subclass_of")`` triples."""
        return [(a, b, "subclass_of") for (a, b) in self.get_subclasses()]

    def _resolve_ids(self, children):
        """Return the IDs carried by ``children``.

        When none of the elements carries an ID itself, fall back to the IDs of
        the ``owl:Class`` descendants of the first element.
        """
        ids = self.filter_null([self.extract_ID(el) for el in children])
        if not ids:
            nested = children[0].getElementsByTagName("owl:Class")
            ids = self.filter_null([self.extract_ID(el) for el in nested])
        return ids

    def parse_triples(self, union_flag=0, subclass_of=True):
        """Build the de-duplicated list of ``(domain, range, property)`` triples.

        Args:
            union_flag: ``0`` emits one triple per (domain, range) combination;
                any other value emits a single triple whose domain and range
                are the ``"###"``-joined IDs.
            subclass_of: When true, the subclass triples are appended as well.

        Returns:
            A list of unique triples (order is not defined, it comes from a set).
        """
        all_triples = []
        for prop in self.object_properties + self.data_properties:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            # A property without both a domain and a range yields no triple.
            if not domain_children or not range_children:
                continue
            domain_prop = self._resolve_ids(domain_children)
            range_prop = self._resolve_ids(range_children)
            if not (domain_prop and range_prop):
                continue
            prop_id = self.extract_ID(prop)
            if union_flag == 0:
                all_triples.extend(
                    (dom, rng, prop_id)
                    for dom, rng in itertools.product(domain_prop, range_prop)
                )
            else:
                all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return list(set(all_triples))

    def get_triples(self, union_flag=0, subclass_of=True):
        """Recompute and return the triples; see ``parse_triples`` for the arguments."""
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        """Return ``(referenced_element, enclosing_element)`` pairs for every ``rdfs:subClassOf``.

        ``union_flag`` is accepted but not used.
        """
        subclass_pairs = []
        for el in self.root.getElementsByTagName("rdfs:subClassOf"):
            if self.extract_ID(el):
                # The subClassOf element itself names the other class.
                subclass_pairs.append((el, el.parentNode))
                continue
            level1_class = self.get_child_node(el, "owl:Class")
            if not level1_class:
                continue
            if self.extract_ID(level1_class[0]):
                subclass_pairs.append((level1_class[0], el.parentNode))
            else:
                # Anonymous owl:Class: take every identified owl:Class beneath it.
                level2classes = level1_class[0].getElementsByTagName("owl:Class")
                subclass_pairs.extend(
                    (elem, el.parentNode) for elem in level2classes if self.extract_ID(elem)
                )
        return subclass_pairs

    def get_subclasses(self):
        """Return the cached subclass pairs as ID tuples."""
        return [(self.extract_ID(a), self.extract_ID(b)) for (a, b) in self.subclasses]

    def filter_null(self, data):
        """Return the truthy items of ``data``."""
        return [el for el in data if el]

    def extract_ID(self, element):
        """Return the text after the last ``#`` of the element's ``rdf:ID``,
        ``rdf:resource`` or ``rdf:about`` attribute (first non-empty one wins)."""
        element_id = (
            element.getAttribute("rdf:ID")
            or element.getAttribute("rdf:resource")
            or element.getAttribute("rdf:about")
        )
        return element_id.split("#")[-1]

    def parse_classes(self):
        """Return the unique non-empty names of all ``owl:Class`` elements plus
        every domain/range entry of the cached triples."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        triple_entities = [entity for triple in self.triples for entity in triple[:-1]]
        return list(set(self.filter_null(class_elems + triple_entities)))

    def get_classes(self):
        """Return the cached class names."""
        return self.classes

    def get_entities(self):
        """Return the unique non-empty IDs of all ``owl:Class`` elements."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def _parse_properties(self, kind):
        """Return the top-level property elements of ``kind``.

        ``kind`` is ``"DatatypeProperty"`` or ``"ObjectProperty"``. The result is
        the ``owl:<kind>`` children of the root, followed by the
        ``owl:FunctionalProperty`` and then ``owl:InverseFunctionalProperty``
        children that have an ``rdf:type`` child pointing at ``kind``.
        """
        properties = self.get_child_node(self.root, "owl:" + kind)
        for tag in ("owl:FunctionalProperty", "owl:InverseFunctionalProperty"):
            properties.extend(
                el for el in self.get_child_node(self.root, tag)
                if any(
                    self.has_attribute_value(type_el, "rdf:resource", kind)
                    for type_el in self.get_child_node(el, "rdf:type")
                )
            )
        return properties

    def parse_data_properties(self):
        """Return the datatype-property elements declared directly under the root."""
        return self._parse_properties("DatatypeProperty")

    def parse_object_properties(self):
        """Return the object-property elements declared directly under the root."""
        return self._parse_properties("ObjectProperty")

    def get_object_properties(self):
        """Return the unique non-empty IDs of the cached object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return the unique non-empty IDs of the cached data properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))




In [37]:
# Extracting USE embeddings

# One list of name parts per reference-alignment file: the file name up to its
# first "." split on "-" (e.g. "a-b.rdf" -> ["a", "b"]).
ontologies_in_alignment = [l.split(".")[0].split("-") for l in os.listdir("reference-alignment/")]

def extractUSEEmbeddings(words):
    """Embed ``words`` with the model stored in ``USE_folder`` and return a NumPy array.

    If loading the model from ``USE_folder`` raises, the folder is created and
    the universal-sentence-encoder-large/5 archive is downloaded and unpacked
    into it before loading is retried.

    NOTE(review): the ``!`` lines are IPython shell magics, so this function
    only runs inside IPython/Jupyter, and any exception (not just a missing
    model) triggers the download.
    """
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        # Fallback: fetch the model into USE_folder, then load it again.
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def cos_sim(a,b):
    """Return the cosine similarity of ``a`` and ``b`` (one minus SciPy's cosine distance)."""
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

def camel_case_split(identifier):
    """Split ``identifier`` at camel-case boundaries and lower-case each piece.

    A boundary is a lower-case letter followed by an upper-case one, or an
    upper-case letter followed by an upper-case + lower-case pair (so an
    acronym stays together). Underscores are not treated as boundaries.
    """
    pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for match in re.finditer(pattern, identifier):
        pieces.append(match.group(0).lower())
    return pieces

def parse(word):
    """Tokenise ``word``: split on camel-case boundaries, then on underscores."""
    tokens = []
    for chunk in camel_case_split(word):
        tokens.extend(chunk.split("_"))
    return tokens
    

# "ontology#name" strings for everything that needs an embedding.
extracted_elems = []

for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    # Flattening the triples yields every domain, range and property name they mention.
    triples = list(set(flatten(ont.get_triples())))
    extracted_elems.extend([ont_name + "#" + elem for elem in entities + props + triples])

extracted_elems = list(set(extracted_elems))

# Space-separated lower-cased tokens of each name (the part after "#").
inp = [" ".join(parse(word.split("#")[1])) for word in extracted_elems]
# token_pattern keeps every whitespace-delimited token, including one-character ones.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
X = vectorizer.fit_transform(inp)
# NOTE(review): newer scikit-learn releases expose get_feature_names_out instead of
# get_feature_names — confirm against the installed version.
word2idx_tfidf = {word: i for (i, word)  in enumerate(vectorizer.get_feature_names())}
# Keyed by the bare name after "#": the same name extracted from two ontologies
# keeps only the row index of the later occurrence.
entity2idx_tfidf = {word.split("#")[1]: i for (i, word)  in enumerate(extracted_elems)}


print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

# "<UNK>" is placed first in both lists, so it is the first key of `embeddings`.
inp = ["<UNK>"] + inp
extracted_elems = ["<UNK>"] + extracted_elems

embeds = extractUSEEmbeddings(inp)
# Maps each "ontology#name" (and "<UNK>") to its embedding vector.
embeddings = dict(zip(extracted_elems, embeds))    

Total number of extracted unique classes and properties from entire RA set:  834


In [4]:
# Type storage

# Maps a bare entity/property name to {"type": "entity"} or {"type": "property"}.
types_dict = {}

def get_tfidf_score(word, phrase):
    """Return the sum of the TF-IDF weights of every token of ``phrase`` in the row of ``phrase``.

    NOTE(review): the ``word`` argument is never read — the comprehension
    variable of the same name takes its place — so the result depends only on
    ``phrase``. Confirm whether a per-word score was intended.
    """
    return np.sum([X[entity2idx_tfidf[phrase]][:,word2idx_tfidf[word]][0,0] for word in parse(phrase)])
    
for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()

    # Keys are not prefixed with the ontology name; a name that is both an entity
    # and a property (in any ontology) ends up typed by whichever write comes last.
    for entity in entities:
        types_dict[entity] = {"type": "entity"}
    for prop in props:
        types_dict[prop] = {"type": "property"}


In [5]:
# Combinatorial mapping generation

# Every candidate ("ont1#name", "ont2#name") pair for each aligned ontology pair.
all_mappings = []
for l in ontologies_in_alignment:
    ont1 = Ontology("conference_ontologies/" + l[0] + ".owl")
    ont2 = Ontology("conference_ontologies/" + l[1] + ".owl")
    
    ent1 = ont1.get_entities()
    ent2 = ont2.get_entities()
    
    obj1 = ont1.get_object_properties()
    obj2 = ont2.get_object_properties()
    
    data1 = ont1.get_data_properties()
    data2 = ont2.get_data_properties()
    
    # Candidates are only formed within a kind: entity x entity, object property x
    # object property, data property x data property.
    mappings = list(itertools.product(ent1, ent2)) + list(itertools.product(obj1, obj2)) + list(itertools.product(data1, data2))
    
    all_mappings.extend([(l[0] + "#" + el[0], l[1] + "#" + el[1]) for el in mappings])
    

In [6]:
# Reference alignments reduced to the text after the last "/" of each resource,
# which is the form used for the candidate mappings.
gt_mappings = [tuple([elem.split("/")[-1] for elem in el]) for el in reference_alignments]

# Label every candidate mapping: True when it is a reference alignment.
# Membership is tested against a set built once; testing against the list
# would scan all reference alignments for each candidate.
gt_mappings_set = set(gt_mappings)
data = {(mapping[0], mapping[1]): mapping in gt_mappings_set for mapping in all_mappings}


In [706]:
ind_test, inp_test1, inp_test2 = None, None, None
def greedy_matching():
    """Score the held-out pairs with ``model`` and sweep a decision threshold.

    Reads the module-level ``test_data_t`` / ``test_data_f`` (positive and
    negative test pairs), ``batch_size`` / ``batch_size_f``, ``model``,
    ``emb_indexer_inv``, ``gt_mappings`` and ``test_onto``.

    Every scored pair gets ``1 - o[1] / (o[0] + o[1])`` where ``o`` is the
    model output row. Thresholds from just below the minimum to just above the
    maximum score are then tried in steps of 0.01; the metrics of the threshold
    with the best F1 are printed and appended to ``all_metrics`` as
    ``(threshold, [precision, recall, f1, f2, f0.5])``.

    Side effects: shuffles ``test_data_t`` and ``test_data_f`` in place and
    rebinds the global ``batch_size``.

    NOTE(review): the number of batches is derived from the positives only, so
    at most ``num_batches * batch_size_f`` negatives are ever scored.
    """
    global batch_size, test_data_t, test_data_f, model, optimizer, emb_indexer_inv, gt_mappings, all_metrics
    all_results = OrderedDict()
    with torch.no_grad():
        batch_size = min(batch_size, len(test_data_t))
        num_batches = int(ceil(len(test_data_t)/batch_size))

        np.random.shuffle(test_data_t)
        np.random.shuffle(test_data_f)

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx+1) * batch_size

            batch_start_f = batch_idx * batch_size_f
            batch_end_f = (batch_idx+1) * batch_size_f

            pos_elems = np.array(test_data_t)[batch_start:batch_end]
            neg_elems = np.array(test_data_f)[batch_start_f:batch_end_f]

            inputs = np.array([generate_data(elem) for elem in list(pos_elems) + list(neg_elems)])
            targets = np.array([1] * len(pos_elems) + [0] * len(neg_elems))

            # Mix positives and negatives, then switch to (side, batch, neighbours).
            perm = np.random.permutation(inputs.shape[0])
            inputs, targets = inputs[perm].transpose(1,0,2), targets[perm]
            # Unsorted copy: used below to recover which pair each output row belongs to.
            inputs_elem = inputs.copy()

            # Per side, order the batch by decreasing number of non-zero indices.
            nonzero_elems = np.count_nonzero(inputs, axis=-1)
            order = np.flip(np.argsort(nonzero_elems, axis=-1), axis=-1)
            # Index with the array itself: wrapping it in a list (arr[[idx]]) is the
            # deprecated non-tuple-sequence form and adds an axis on current NumPy.
            inputs = np.stack((inputs[0][order[0]], inputs[1][order[1]]), axis=0)

            # Inverse permutation per side: position of each original row after sorting.
            rev_indices = torch.LongTensor(np.argsort(order, axis=-1))
            inputs = torch.LongTensor(inputs)

            outputs = model(inputs, rev_indices)
            outputs /= torch.sum(outputs, dim=1).view(-1, 1)
            scores = [(1-el[1].item()) for el in outputs]

            labels = [bool(t) for t in targets]
            for idx, pred_elem in enumerate(scores):
                # The first index of each side is the element itself.
                ent1 = emb_indexer_inv[inputs_elem[0][idx][0]]
                ent2 = emb_indexer_inv[inputs_elem[1][idx][0]]
                if (ent1, ent2) in all_results:
                    print ("Error: ", ent1, ent2, "already present")
                all_results[(ent1, ent2)] = (pred_elem, labels[idx])

        optimum_metrics, opt_threshold = [-1000 for i in range(5)], -1000
        all_scores = [el[0] for el in all_results.values()]
        low_threshold = np.min(all_scores) - 0.01
        high_threshold = np.max(all_scores) + 0.01
        for threshold in np.arange(low_threshold, high_threshold, 0.01):
            res = [key for key in all_results if all_results[key][0] > threshold]
            # Built once per threshold instead of once per ground-truth key.
            res_set = set(res)
            fn_list = [key for key in gt_mappings if key not in res_set and not is_valid(test_onto, key)]
            fp_list = [elem for elem in res if not all_results[elem][1]]
            tp_list = [elem for elem in res if all_results[elem][1]]

            tp, fn, fp = len(tp_list), len(fn_list), len(fp_list)

            try:
                precision = tp/(tp+fp)
                recall = tp/(tp+fn)
                f1score = 2 * precision * recall / (precision + recall)
                f2score = 5 * precision * recall / (4 * precision + recall)
                f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
            except ZeroDivisionError as e:
                # No predictions / no true positives at this threshold: skip it.
                print (e)
                continue
            print ("Threshold: ", threshold, precision, recall, f1score, f2score, f0_5score)

            if f1score > optimum_metrics[2]:
                optimum_metrics = [precision, recall, f1score, f2score, f0_5score]
                opt_threshold = threshold

        print ("Precision: {} Recall: {} F1-Score: {} F2-Score: {} F0.5-Score: {}".format(*optimum_metrics))
        all_metrics.append((opt_threshold, optimum_metrics))
    
def write(elem):
    """Append ``elem`` to the file ``Logs`` in the current directory.

    A list or tuple is written one item per line; anything else is written as
    ``str(elem)``. Every call starts on a new line.
    """
    if isinstance(elem, (list, tuple)):
        string = "\n".join([str(s) for s in elem])
    else:
        string = str(elem)
    # The context manager closes the file even when the write fails.
    with open("Logs", "a+") as f:
        f.write("\n"+string)
    
# Debug handles: the last `inputs` / per-side embeddings seen by SiameseNetwork.forward.
inputs3, results3 = None, None

class SiameseNetwork(nn.Module):
    """Embeds both sides of a candidate pair and scores them with a bilinear layer.

    The embedding table is sized from the module-level ``embeddings`` and
    initialised from the module-level ``emb_vals``; ``dropout`` is also read
    from module scope.
    """

    def __init__(self, embedding_dim, hidden_dim):
        """
        Args:
            embedding_dim: Width of the embedding vectors in ``emb_vals``.
            hidden_dim: Input width of the bilinear layer. ``forward`` feeds it
                rows of width ``embedding_dim``, so the two must be equal.
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.name_embedding = nn.Embedding(len(embeddings), self.embedding_dim)
        self.name_embedding.load_state_dict({'weight': torch.from_numpy(np.array(emb_vals))})

        # Stored only; no dropout layer is applied in forward.
        self.dropout = dropout

        self.layer1 = nn.Bilinear(self.hidden_dim, self.hidden_dim, 2)

    def forward(self, inputs, rev_indices):
        """Return per-row log-probabilities over the two classes.

        Args:
            inputs: Index tensor whose first axis selects the side (0 or 1) of the pair.
            rev_indices: For each side, the row order applied to the embedded
                batch before scoring (the caller passes the inverse of its
                length sort, so both sides line up again).

        Also stores ``inputs`` and the per-side embeddings in the module-level
        ``inputs3`` / ``results3`` for inspection.
        """
        results = []
        for i in range(2):
            x = self.name_embedding(inputs[i])
            x = x[rev_indices[i],:,:]
            results.append(x.reshape(-1, self.embedding_dim))
        global inputs3, results3
        results3 = results
        inputs3 = inputs
        x = self.layer1(results[0], results[1])
        # x is 2-D (rows, 2); name the class axis explicitly instead of relying on
        # the deprecated implicit-dimension choice.
        x = F.log_softmax(x, dim=1)
        return x
# Name -> row index, row index -> name, and the embedding rows themselves,
# all in the key order of `embeddings`.
emb_indexer = {name: position for position, name in enumerate(embeddings)}
emb_indexer_inv = dict(enumerate(embeddings))
emb_vals = list(embeddings.values())


def get_one_hop_neighbours(ont, K=1):
    """Map every "ont#name" of ontology ``ont`` to its neighbour list.

    Neighbours are gathered from the triples, but the final list is cut down
    to its first item, so each value is just ``["ont#name"]`` (the element
    itself). ``K`` is accepted but not used.
    """
    ontology = Ontology("conference_ontologies/" + ont + ".owl")

    # Entities: linked to whatever shares a triple with them (subclass links included).
    pairs = [(head, tail) for (head, tail, _) in ontology.get_triples()]
    neighbours_dict = {}
    for node in list(set(flatten(pairs))):
        neighbours_dict[node] = [node]
    for head, tail in pairs:
        neighbours_dict[head].append(tail)
        neighbours_dict[tail].append(head)

    # Properties: linked to their domain and range; these entries override the
    # entity entries when a name appears in both.
    props_neighbours = {}
    for head, tail, prop in ontology.get_triples(subclass_of=False):
        props_neighbours.setdefault(prop, [prop]).extend([head, tail])
    neighbours_dict.update(props_neighbours)

    # Anything that never appeared in a triple only has itself.
    isolated = ontology.get_entities() + ontology.get_object_properties() + ontology.get_data_properties()
    for name in isolated:
        neighbours_dict.setdefault(name, [name])

    result = {}
    for name, linked in neighbours_dict.items():
        ordered = linked[:1] + sorted(list(set(linked[1:])))
        kept = ordered[:1]  # only the element itself survives
        result[ont + "#" + name] = [ont + "#" + item for item in kept]
    return result

def is_valid(test_onto, key):
    """Return True when the ontology names of ``key`` (text before "#") are not in ``test_onto``."""
    ontology_pair = tuple(name.split("#")[0] for name in key)
    return ontology_pair not in test_onto

def generate_data(elem_tuple):
    """Return one row of embedding indices per element of ``elem_tuple``.

    Each "ont#name" element is looked up in ``neighbours_dicts`` under its
    ontology name and its neighbour list is translated through ``emb_indexer``.
    """
    rows = []
    for elem in elem_tuple:
        ont_name = elem.split("#")[0]
        rows.append([emb_indexer[el] for el in neighbours_dicts[ont_name][elem]])
    return np.array(rows)

def generate_input(elems, target):
    """Return ``(inputs, targets)``: the index arrays of every pair in ``elems``
    and an equally long array filled with ``target``."""
    encoded = [generate_data(pair) for pair in elems]
    labels = [target] * len(elems)
    return np.array(encoded), np.array(labels)

# Ontology name -> {"ont#name": neighbour list} for every ontology in the alignments.
neighbours_dicts = {ont: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
# Longest neighbour list over all ontologies; every list is padded to this length below.
max_neighbours = np.max(flatten([[len(el[e]) for e in el] for el in neighbours_dicts.values()]))
# Unpadded list lengths, recorded before padding.
neighbours_lens = {ont: {key: len(neighbours_dicts[ont][key]) for key in neighbours_dicts[ont]}
                   for ont in neighbours_dicts}
# Right-pad every neighbour list with "<UNK>" up to max_neighbours.
neighbours_dicts = {ont: {key: neighbours_dicts[ont][key] + ["<UNK>" for i in range(max_neighbours -len(neighbours_dicts[ont][key]))]
              for key in neighbours_dicts[ont]} for ont in neighbours_dicts}

# np.random.shuffle works in place, so it has to be given the very list that is
# used afterwards; shuffling a temporary list(...) copy leaves `data` unshuffled.
data_items = list(data.items())
np.random.shuffle(data_items)
data = OrderedDict(data_items)

print ("Number of entities:", len(data))
# Distinct (ontology1, ontology2) name pairs occurring among the candidate mappings.
all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))

# One (best_threshold, [precision, recall, f1, f2, f0.5]) entry per fold,
# appended by greedy_matching.
all_metrics = []

# Each fold holds out (up to) three ontology pairs for testing and trains a
# fresh model on the remaining pairs.
for i in range(0, len(all_ont_pairs), 3):

    test_onto = all_ont_pairs[i:i+3]

    train_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) not in test_onto}
    test_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) in test_onto}

    torch.set_default_dtype(torch.float64)

    train_test_split = 0.9

    # Balance the classes: keep as many negatives as there are positives.
    train_data_t = [key for key in train_data if data[key]]
    train_data_f = [key for key in train_data if not data[key]][:len(train_data_t)]
    np.random.shuffle(train_data_f)

    lr = 0.001
    num_epochs = 50
    weight_decay = 0.001
    batch_size = 10
    dropout = 0.3
    batch_size = min(batch_size, len(train_data_t))
    num_batches = int(ceil((2 * len(train_data_t))/batch_size))
    # Also read by greedy_matching as the per-batch number of negatives.
    batch_size_f = int(ceil((len(train_data_t) + len(train_data_f))/num_batches))

    model = SiameseNetwork(512, 512)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        inputs_pos, targets_pos = generate_input(train_data_t, 1)
        inputs_neg, targets_neg = generate_input(train_data_f, 0)

        # Fresh random order of the combined positives and negatives every epoch.
        indices = np.random.permutation(len(inputs_pos) + len(inputs_neg))

        inputs = np.array(list(inputs_pos) + list(inputs_neg))[indices]
        targets = np.array(list(targets_pos) + list(targets_neg))[indices]

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx+1) * batch_size

            # (batch, side, neighbours) -> (side, batch, neighbours)
            inp = inputs[batch_start:batch_end].transpose(1,0,2)
            # Per side, order the batch by decreasing number of non-zero indices.
            nonzero_elems = np.count_nonzero(inp, axis=-1)
            indices = np.flip(np.argsort(nonzero_elems, axis=-1), axis=-1)
            seq_lens = np.flip(np.sort(nonzero_elems, axis=-1), axis=-1)
            # Index with the array itself: wrapping it in a list (arr[[idx]]) is the
            # deprecated non-tuple-sequence form and adds an axis on current NumPy.
            inp_elems = np.stack((inp[0][indices[0]], inp[1][indices[1]]), axis=0)

            inp_elems = torch.LongTensor(inp_elems)
            targ_elems = torch.LongTensor(targets[batch_start:batch_end])
            optimizer.zero_grad()

            # Inverse of the sort: original row -> its position after sorting, so
            # the model can put both sides back in the order of targ_elems.
            d1 = {elem:i for i,elem in enumerate(indices[0])}
            d2 = {elem:i for i,elem in enumerate(indices[1])}
            rev_indices = np.stack(([d1[k] for k in range(inp.shape[1])],
                                    [d2[k] for k in range(inp.shape[1])]))

            rev_indices = torch.LongTensor(rev_indices)

            outputs = model(inp_elems, rev_indices)
            loss = F.nll_loss(outputs, targ_elems)
            loss.backward()
            optimizer.step()

            if batch_idx%10 == 0:
                print ("Epoch: {} Idx: {} Loss: {}".format(epoch, batch_idx, loss.item()))

    model.eval()


    test_data_t = [key for key in test_data if data[key]]
    test_data_f = [key for key in test_data if not data[key]]

    greedy_matching()

# Metrics averaged over folds, and the threshold of the fold with the best F1.
print ("Final Results: ", np.mean([el[1] for el in all_metrics], axis=0))
print ("Best threshold: ", all_metrics[np.argmax([el[1][2] for el in all_metrics])][0])

Number of entities: 122893




Epoch: 0 Idx: 0 Loss: 0.6995322843552064
Epoch: 0 Idx: 10 Loss: 0.4984583117822264
Epoch: 0 Idx: 20 Loss: 0.3565040250272042
Epoch: 0 Idx: 30 Loss: 0.17558499186878196
Epoch: 0 Idx: 40 Loss: 0.31425531926467626
Epoch: 0 Idx: 50 Loss: 0.13205946715479894
Epoch: 1 Idx: 0 Loss: 0.1576966455572074
Epoch: 1 Idx: 10 Loss: 0.07575139088663561
Epoch: 1 Idx: 20 Loss: 0.022100954998390522
Epoch: 1 Idx: 30 Loss: 0.10250761907074893
Epoch: 1 Idx: 40 Loss: 0.024376242431523733
Epoch: 1 Idx: 50 Loss: 0.03412695335826166
Epoch: 2 Idx: 0 Loss: 0.07155500629691999
Epoch: 2 Idx: 10 Loss: 0.033245050683720995
Epoch: 2 Idx: 20 Loss: 0.024096971425307444
Epoch: 2 Idx: 30 Loss: 0.05224747541646224
Epoch: 2 Idx: 40 Loss: 0.07171383987828263
Epoch: 2 Idx: 50 Loss: 0.07072736022394449
Epoch: 3 Idx: 0 Loss: 0.035033985718335395
Epoch: 3 Idx: 10 Loss: 0.014988281172227863
Epoch: 3 Idx: 20 Loss: 0.018623797971562356
Epoch: 3 Idx: 30 Loss: 0.07328571794390831
Epoch: 3 Idx: 40 Loss: 0.0172257183468625
Epoch: 3 Idx:

Epoch: 31 Idx: 20 Loss: 0.028832651815761123
Epoch: 31 Idx: 30 Loss: 0.009620555757664526
Epoch: 31 Idx: 40 Loss: 0.016745852527870794
Epoch: 31 Idx: 50 Loss: 0.024360664785691176
Epoch: 32 Idx: 0 Loss: 0.014604038098866495
Epoch: 32 Idx: 10 Loss: 0.014016997563245866
Epoch: 32 Idx: 20 Loss: 0.0367201100386665
Epoch: 32 Idx: 30 Loss: 0.028769078576625577
Epoch: 32 Idx: 40 Loss: 0.021866606565948295
Epoch: 32 Idx: 50 Loss: 0.009851235277384168
Epoch: 33 Idx: 0 Loss: 0.019502246220139102
Epoch: 33 Idx: 10 Loss: 0.02612806435529107
Epoch: 33 Idx: 20 Loss: 0.011256972072089458
Epoch: 33 Idx: 30 Loss: 0.01931910207230119
Epoch: 33 Idx: 40 Loss: 0.014318169731261496
Epoch: 33 Idx: 50 Loss: 0.010964401101939681
Epoch: 34 Idx: 0 Loss: 0.015152058879062926
Epoch: 34 Idx: 10 Loss: 0.021331247251740577
Epoch: 34 Idx: 20 Loss: 0.029711163484113227
Epoch: 34 Idx: 30 Loss: 0.019202555801760234
Epoch: 34 Idx: 40 Loss: 0.02938994466140462
Epoch: 34 Idx: 50 Loss: 0.012740507459861317
Epoch: 35 Idx: 0 L



0.8088235294117646 0.5326876513317191
Threshold:  0.380473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.390473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.400473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.41047378679887303 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.420473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.430473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.440473786798873 0.4782608695652174 0.9777777777777777 0.6423357664233577 0.8088235294117646 0.5326876513317191
Threshold:  0.450473786798873 0.4782608695652174 0.9777777777777777 0.642335766423357

Epoch: 1 Idx: 30 Loss: 0.06024295851917487
Epoch: 1 Idx: 40 Loss: 0.09596288798722909
Epoch: 1 Idx: 50 Loss: 0.0294364489481798
Epoch: 2 Idx: 0 Loss: 0.05617008393498096
Epoch: 2 Idx: 10 Loss: 0.0843099256187258
Epoch: 2 Idx: 20 Loss: 0.01625119671731211
Epoch: 2 Idx: 30 Loss: 0.09670680753303365
Epoch: 2 Idx: 40 Loss: 0.020690355368063212
Epoch: 2 Idx: 50 Loss: 0.09138146257698765
Epoch: 3 Idx: 0 Loss: 0.02512272425126414
Epoch: 3 Idx: 10 Loss: 0.02428728103354829
Epoch: 3 Idx: 20 Loss: 0.07124470906019306
Epoch: 3 Idx: 30 Loss: 0.014494064321882185
Epoch: 3 Idx: 40 Loss: 0.08171772277325826
Epoch: 3 Idx: 50 Loss: 0.04662982286958075
Epoch: 4 Idx: 0 Loss: 0.012505484914602186
Epoch: 4 Idx: 10 Loss: 0.02562597995331236
Epoch: 4 Idx: 20 Loss: 0.0429470626714928
Epoch: 4 Idx: 30 Loss: 0.030116591074091908
Epoch: 4 Idx: 40 Loss: 0.02761399273601193
Epoch: 4 Idx: 50 Loss: 0.02304995025910463
Epoch: 5 Idx: 0 Loss: 0.017446388652275137
Epoch: 5 Idx: 10 Loss: 0.015106443622756196
Epoch: 5 Idx

KeyboardInterrupt: 

In [648]:
results3, inputs3

([tensor([[ 0.0032, -0.0120, -0.0256,  ...,  0.0150,  0.0228,  0.0096],
          [-0.0372,  0.0105,  0.0198,  ...,  0.0073, -0.0330,  0.0013],
          [-0.0492,  0.0126,  0.0293,  ...,  0.0084, -0.0416,  0.0049],
          ...,
          [-0.0437,  0.0118,  0.0254,  ...,  0.0073, -0.0382,  0.0037],
          [-0.0437,  0.0118,  0.0254,  ...,  0.0073, -0.0382,  0.0037],
          [-0.0437,  0.0118,  0.0254,  ...,  0.0073, -0.0382,  0.0037]]),
  tensor([[ 0.0042, -0.0068, -0.0204,  ...,  0.0087,  0.0235,  0.0017],
          [-0.0475,  0.0136,  0.0255,  ...,  0.0089, -0.0422,  0.0017],
          [-0.0284,  0.0080,  0.0144,  ...,  0.0054, -0.0248,  0.0010],
          ...,
          [-0.0284,  0.0076,  0.0137,  ...,  0.0053, -0.0242,  0.0009],
          [-0.0369,  0.0091,  0.0161,  ...,  0.0094, -0.0307, -0.0003],
          [-0.0378,  0.0103,  0.0195,  ...,  0.0078, -0.0332,  0.0005]])],
 tensor([[[415],
          [ 41],
          [587],
          [102],
          [723],
          [145],

In [699]:
results3, inputs3

([tensor([[-0.0073,  0.0086, -0.0216,  ...,  0.0091,  0.0191,  0.0037],
          [-0.0330,  0.0077,  0.0163,  ...,  0.0117, -0.0306,  0.0014],
          [-0.0438,  0.0085,  0.0255,  ...,  0.0136, -0.0399,  0.0059],
          ...,
          [-0.0394,  0.0073,  0.0218,  ...,  0.0124, -0.0365,  0.0032],
          [-0.0394,  0.0073,  0.0218,  ...,  0.0124, -0.0365,  0.0032],
          [-0.0394,  0.0073,  0.0218,  ...,  0.0124, -0.0365,  0.0032]]),
  tensor([[ 0.0026,  0.0071, -0.0202,  ...,  0.0056,  0.0269,  0.0004],
          [-0.0414,  0.0088,  0.0200,  ...,  0.0141, -0.0383,  0.0016],
          [-0.0220,  0.0047,  0.0094,  ...,  0.0066, -0.0202,  0.0005],
          ...,
          [-0.0274,  0.0066,  0.0135,  ...,  0.0094, -0.0249,  0.0008],
          [-0.0338,  0.0076,  0.0153,  ...,  0.0121, -0.0306,  0.0014],
          [-0.0336,  0.0069,  0.0170,  ...,  0.0108, -0.0317,  0.0010]])],
 tensor([[[433],
          [433],
          [ 41],
          [587],
          [102],
          [723],

In [677]:

# Debug cell: redo the length sort and its inverse permutation on the batch
# captured in `inputs3` by SiameseNetwork.forward.
nonzero_elems = np.count_nonzero(inputs3.numpy(), axis=-1)
indices = np.flip(np.argsort(nonzero_elems, axis=-1), axis=-1)
seq_lens = np.flip(np.sort(nonzero_elems, axis=-1), axis=-1)
# Index with the array itself: arr[[idx]] is the deprecated non-tuple-sequence
# form (it raised a FutureWarning here and adds an axis on current NumPy).
inputs = np.stack((inputs3.numpy()[0][indices[0]], inputs3.numpy()[1][indices[1]]), axis=0)

# Original row -> its position after sorting, per side.
d1 = {elem:i for i,elem in enumerate(indices[0])}
d2 = {elem:i for i,elem in enumerate(indices[1])}
rev_indices = np.stack(([d1[k] for k in range(inputs3.numpy().shape[1])], [d2[k] for k in range(inputs3.numpy().shape[1])]))
rev_indices, indices

  after removing the cwd from sys.path.


(array([[19,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18,  1,  0],
        [19,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18,  1,  0]]),
 array([[19, 18,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
         15, 16, 17,  0],
        [19, 18,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
         15, 16, 17,  0]]))

In [691]:
# Scratch copy of a two-row index array (same values as the `rev_indices`
# output shown in an earlier cell) for manual experiments.
b = [[19,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,  1,  0],
 [19,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,  1,  0]]

In [697]:
name_embedding(inputs3[0]), name_embedding(inputs3[0])

tensor([[[ 0.0502,  0.0084,  0.0664,  ...,  0.0754, -0.0262, -0.0038]],

        [[ 0.0502,  0.0084,  0.0664,  ...,  0.0754, -0.0262, -0.0038]],

        [[-0.0577,  0.0921,  0.0310,  ...,  0.0930, -0.0749, -0.0654]],

        ...,

        [[ 0.0502,  0.0084,  0.0664,  ...,  0.0754, -0.0262, -0.0038]],

        [[ 0.0502,  0.0084,  0.0664,  ...,  0.0754, -0.0262, -0.0038]],

        [[-0.0298, -0.0007,  0.0153,  ...,  0.0390, -0.0074, -0.0011]]],
       grad_fn=<EmbeddingBackward>)

In [599]:
# Scratch experiment: these dicts map position -> element, i.e. they reproduce
# `indices` rather than invert it, and len(inputs) counts the leading axis of
# `inputs`, so only that many entries per row are returned.
d1 = {i:elem for i,elem in enumerate(indices[0])}
d2 = {i:elem for i,elem in enumerate(indices[1])}
np.stack(([d1[k] for k in range(len(inputs))], [d2[k] for k in range(len(inputs))]))

array([[19, 18],
       [19, 18]])

In [608]:
inputs

array([[[415],
        [ 41],
        [587],
        [102],
        [723],
        [145],
        [661],
        [699],
        [624],
        [146],
        [433],
        [433],
        [433],
        [433],
        [433],
        [433],
        [433],
        [433],
        [433],
        [433]],

       [[320],
        [590],
        [585],
        [ 45],
        [389],
        [ 35],
        [553],
        [118],
        [707],
        [446],
        [552],
        [370],
        [382],
        [413],
        [463],
        [553],
        [118],
        [607],
        [473],
        [558]]])

In [605]:
inputs.shape[1]

20

In [330]:
a.shape

torch.Size([10, 9, 600])

In [358]:
ht

tensor([[[-0.0042,  0.0046,  0.0312,  ..., -0.0403, -0.0290,  0.0218],
         [-0.0244,  0.0178,  0.0141,  ..., -0.0362, -0.0058,  0.0050],
         [-0.0244,  0.0178,  0.0141,  ..., -0.0362, -0.0058,  0.0050],
         ...,
         [ 0.0005,  0.0105,  0.0303,  ..., -0.0308,  0.0008,  0.0163],
         [-0.0048,  0.0180,  0.0217,  ..., -0.0327, -0.0089,  0.0021],
         [-0.0125,  0.0143,  0.0126,  ..., -0.0325, -0.0004, -0.0023]],

        [[-0.0314, -0.0136, -0.0008,  ..., -0.0333,  0.0325,  0.0191],
         [-0.0254, -0.0057,  0.0208,  ..., -0.0210,  0.0378,  0.0223],
         [-0.0254, -0.0057,  0.0208,  ..., -0.0210,  0.0378,  0.0223],
         ...,
         [-0.0154, -0.0258,  0.0088,  ..., -0.0134,  0.0296,  0.0125],
         [-0.0154, -0.0117,  0.0062,  ..., -0.0159,  0.0435,  0.0102],
         [-0.0195, -0.0164,  0.0113,  ..., -0.0198,  0.0297,  0.0169]],

        [[ 0.0105,  0.0355,  0.0199,  ..., -0.0124,  0.0177, -0.0182],
         [ 0.0083,  0.0385,  0.0171,  ..., -0

Number of entities: 122893




Epoch: 0 Idx: 0 Loss: 0.6901825471972951
Epoch: 0 Idx: 10 Loss: 0.632783273945459
Epoch: 0 Idx: 20 Loss: 0.15516196685204944
Epoch: 0 Idx: 30 Loss: 0.008385121021466376
Epoch: 0 Idx: 40 Loss: 0.010705372808332718
Epoch: 0 Idx: 50 Loss: 0.07782922478323444
Epoch: 1 Idx: 0 Loss: 0.005234113552719269
Epoch: 1 Idx: 10 Loss: 0.007897245050926375
Epoch: 1 Idx: 20 Loss: 0.24377740624334546
Epoch: 1 Idx: 30 Loss: 0.009755866724708052
Epoch: 1 Idx: 40 Loss: 0.01610127987127922
Epoch: 1 Idx: 50 Loss: 0.028530565349582204
Epoch: 2 Idx: 0 Loss: 0.07858914300204341
Epoch: 2 Idx: 10 Loss: 0.018341311132211126
Epoch: 2 Idx: 20 Loss: 0.007653223799473571
Epoch: 2 Idx: 30 Loss: 0.02810791131378626
Epoch: 2 Idx: 40 Loss: 0.009804850449572128
Epoch: 2 Idx: 50 Loss: 0.012913974919637786
Epoch: 3 Idx: 0 Loss: 0.005839805199661826
Epoch: 3 Idx: 10 Loss: 0.006620761441466526
Epoch: 3 Idx: 20 Loss: 0.014172022322806722
Epoch: 3 Idx: 30 Loss: 0.21837544494253414
Epoch: 3 Idx: 40 Loss: 0.002747977490588878
Epoc



OrderedDict([(('edas#Excursion', 'sigkdd#Deadline_Abstract_Submission'), (0.9980584859041326, False)), (('conference#Passive_conference_participant', 'iasted#Double_hotel_room'), (0.9976114447907444, True)), (('cmt#Document', 'confOf#Administrative_event'), (0.9976142246316437, False)), (('conference#Extended_abstract', 'iasted#Renting'), (0.9976101137314057, True)), (('conference#Tutorial', 'iasted#Tutorial'), (0.9976100609006393, False)), (('conference#Submitted_contribution', 'iasted#Submission'), (0.9989115704333449, True)), (('cmt#hasBeenAssigned', 'confOf#reviewes'), (0.9990762717052015, False)), (('cmt#Rejection', 'confOf#Banquet'), (0.9993375543007785, False)), (('cmt#writePaper', 'confOf#writes'), (0.9994793807604053, True)), (('edas#NonAcademicEvent', 'sigkdd#Registration_SIGMOD_Member'), (0.9996559659126631, True))])
0.5 0.10869565217391304 0.17857142857142855 0.12886597938144329 0.29069767441860467
0.5555555555555556 0.10638297872340426 0.17857142857142855 0.126903553299492

KeyError: 8

In [468]:
s1

array([[[418],
        [326],
        [618],
        [346],
        [269],
        [203],
        [228],
        [730],
        [228],
        [203]],

       [[431],
        [ 93],
        [ 26],
        [128],
        [450],
        [702],
        [702],
        [666],
        [779],
        [756]]])

Number of entities: 122893




Epoch: 0 Idx: 0 Loss: 0.7027855014170932
Epoch: 0 Idx: 10 Loss: 0.6013261592834631
Epoch: 0 Idx: 20 Loss: 0.35400320300819466
Epoch: 0 Idx: 30 Loss: 0.3823155482411284
Epoch: 0 Idx: 40 Loss: 0.3744361424747733
Epoch: 0 Idx: 50 Loss: 0.1124086340666749
Epoch: 1 Idx: 0 Loss: 0.12089360428376206
Epoch: 1 Idx: 10 Loss: 0.20777004326621898
Epoch: 1 Idx: 20 Loss: 0.08943225002757184
Epoch: 1 Idx: 30 Loss: 0.02610880004636081
Epoch: 1 Idx: 40 Loss: 0.06452785238501882
Epoch: 1 Idx: 50 Loss: 0.023307785929949566
Epoch: 2 Idx: 0 Loss: 0.05670621042313041
Epoch: 2 Idx: 10 Loss: 0.03542058039128783
Epoch: 2 Idx: 20 Loss: 0.07095501000160956
Epoch: 2 Idx: 30 Loss: 0.05312747238236454
Epoch: 2 Idx: 40 Loss: 0.09959019381580898
Epoch: 2 Idx: 50 Loss: 0.030048140021503543
Epoch: 3 Idx: 0 Loss: 0.03198925152940667
Epoch: 3 Idx: 10 Loss: 0.017471486865092856
Epoch: 3 Idx: 20 Loss: 0.014144193504768468
Epoch: 3 Idx: 30 Loss: 0.029508805925761737
Epoch: 3 Idx: 40 Loss: 0.017466878833406897
Epoch: 3 Idx: 

Epoch: 31 Idx: 30 Loss: 0.0397896236370402
Epoch: 31 Idx: 40 Loss: 0.022719579535720562
Epoch: 31 Idx: 50 Loss: 0.021956767885878338
Epoch: 32 Idx: 0 Loss: 0.01776921784619671
Epoch: 32 Idx: 10 Loss: 0.02054673303948875
Epoch: 32 Idx: 20 Loss: 0.01074549028708992
Epoch: 32 Idx: 30 Loss: 0.02473211921955725
Epoch: 32 Idx: 40 Loss: 0.08724528106464473
Epoch: 32 Idx: 50 Loss: 0.05892685950496586
Epoch: 33 Idx: 0 Loss: 0.010621584231940078
Epoch: 33 Idx: 10 Loss: 0.09598599952968237
Epoch: 33 Idx: 20 Loss: 0.017069134297129262
Epoch: 33 Idx: 30 Loss: 0.011680918976127462
Epoch: 33 Idx: 40 Loss: 0.009401815295945902
Epoch: 33 Idx: 50 Loss: 0.02187846514908301
Epoch: 34 Idx: 0 Loss: 0.009828264219607162
Epoch: 34 Idx: 10 Loss: 0.008503830907682768
Epoch: 34 Idx: 20 Loss: 0.01155266458943546
Epoch: 34 Idx: 30 Loss: 0.00928531409678759
Epoch: 34 Idx: 40 Loss: 0.021686218888978787
Epoch: 34 Idx: 50 Loss: 0.012529057448528802
Epoch: 35 Idx: 0 Loss: 0.00963220160675227
Epoch: 35 Idx: 10 Loss: 0.0

0.46808510638297873 0.9777777777777777 0.6330935251798561 0.802919708029197 0.5225653206650831
OrderedDict([(('conference#Review', 'iasted#Transport_vehicle'), (0.9545049777715447, False)), (('cmt#hasAuthor', 'confOf#writtenBy'), (0.981768357971644, True)), (('conference#Rejected_contribution', 'iasted#Form'), (0.9545027373382383, False)), (('conference#Conference_document', 'iasted#Document'), (0.9937995569249196, True)), (('conference#Conference_proceedings', 'iasted#Publication'), (0.9545064525477154, True)), (('conference#is_given_by', 'iasted#is_made_from'), (0.9545049698971669, False)), (('edas#OperatingTopicsystems', 'sigkdd#Deadline_Abstract_Submission'), (0.9545049698559426, False)), (('cmt#hasBeenAssigned', 'confOf#expertOn'), (0.9545050756352315, False)), (('conference#Topic', 'iasted#Worker_non_speaker'), (0.9545046166667072, False)), (('conference#Camera_ready_contribution', 'iasted#Final_manuscript'), (0.9545049698559428, True)), (('edas#ComputerNetworksAapplicationsTopic

Epoch: 0 Idx: 0 Loss: 0.6988837529863161
Epoch: 0 Idx: 10 Loss: 0.5197235571984484
Epoch: 0 Idx: 20 Loss: 0.3867295582367691
Epoch: 0 Idx: 30 Loss: 0.23607041158758965
Epoch: 0 Idx: 40 Loss: 0.3781697610293449
Epoch: 0 Idx: 50 Loss: 0.14206135714808416
Epoch: 1 Idx: 0 Loss: 0.07375528090946594
Epoch: 1 Idx: 10 Loss: 0.10239096099035543
Epoch: 1 Idx: 20 Loss: 0.01387859842681175
Epoch: 1 Idx: 30 Loss: 0.03869628635401241
Epoch: 1 Idx: 40 Loss: 0.07781380618633355
Epoch: 1 Idx: 50 Loss: 0.15426434110410214
Epoch: 2 Idx: 0 Loss: 0.038201628276355704
Epoch: 2 Idx: 10 Loss: 0.03728378135452573
Epoch: 2 Idx: 20 Loss: 0.027765935993762737
Epoch: 2 Idx: 30 Loss: 0.07508786984246238
Epoch: 2 Idx: 40 Loss: 0.0648527731375801
Epoch: 2 Idx: 50 Loss: 0.03328702035849977
Epoch: 3 Idx: 0 Loss: 0.03819008111528788
Epoch: 3 Idx: 10 Loss: 0.01980458040325587
Epoch: 3 Idx: 20 Loss: 0.027009807358938525
Epoch: 3 Idx: 30 Loss: 0.04281869563299839
Epoch: 3 Idx: 40 Loss: 0.061787014880789706
Epoch: 3 Idx: 50

Epoch: 31 Idx: 20 Loss: 0.009280524060671086
Epoch: 31 Idx: 30 Loss: 0.028927628887155915
Epoch: 31 Idx: 40 Loss: 0.03209319294735528
Epoch: 31 Idx: 50 Loss: 0.04301458936024678
Epoch: 32 Idx: 0 Loss: 0.021847629923194466
Epoch: 32 Idx: 10 Loss: 0.018360452738317795
Epoch: 32 Idx: 20 Loss: 0.012898848265851179
Epoch: 32 Idx: 30 Loss: 0.01163490257958426
Epoch: 32 Idx: 40 Loss: 0.012059715779454848
Epoch: 32 Idx: 50 Loss: 0.024675089931710876
Epoch: 33 Idx: 0 Loss: 0.010361999689748276
Epoch: 33 Idx: 10 Loss: 0.009941899376541528
Epoch: 33 Idx: 20 Loss: 0.01813415059314938
Epoch: 33 Idx: 30 Loss: 0.015564919218156902
Epoch: 33 Idx: 40 Loss: 0.005528458750208154
Epoch: 33 Idx: 50 Loss: 0.020920874219696985
Epoch: 34 Idx: 0 Loss: 0.018616167822000033
Epoch: 34 Idx: 10 Loss: 0.010920458244099306
Epoch: 34 Idx: 20 Loss: 0.03235070577619071
Epoch: 34 Idx: 30 Loss: 0.010265474280415781
Epoch: 34 Idx: 40 Loss: 0.02101201538205909
Epoch: 34 Idx: 50 Loss: 0.020975064060765835
Epoch: 35 Idx: 0 Lo

KeyboardInterrupt: 

In [555]:
# Smoke-test the metrics report template by filling its five slots with 1..5.
print("Precision: {} Recall: {} F1-Score: {} F2-Score: {} F0.5-Score: {}".format(1, 2, 3, 4, 5))

Precision: 1 Recall: 2 F1-Score: 3 F2-Score: 4 F0.5-Score: 5
