In [1]:
# Construction of dataset

import os, itertools, time, pickle, sys
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from nltk.corpus import wordnet
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline  


# Directory from which the Universal Sentence Encoder model is loaded (and into
# which it is downloaded on first use by extractUSEEmbeddings).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
USE_folder = "/home/vlead/USE"
# Folder holding the reference alignment files; the trailing slash is part of the value.
alignment_folder = "reference-alignment/"

# Load reference alignments 
def load_alignments(folder):
    """Collect the entity pairs stored in every alignment file of ``folder``.

    Each file is parsed as XML; the i-th ``entity1`` element is paired with
    the i-th ``entity2`` element and their ``rdf:resource`` attributes form
    one alignment.

    Args:
        folder: Directory containing the alignment files. A trailing path
            separator is optional.

    Returns:
        A list of ``(entity1_uri, entity2_uri)`` string tuples, ordered by
        file name and then by position inside each file.
    """
    alignments = []
    # Sorted so the result does not depend on the file system's listing order.
    for filename in sorted(os.listdir(folder)):
        doc = minidom.parse(os.path.join(folder, filename))
        pairs = zip(doc.getElementsByTagName('entity1'), doc.getElementsByTagName('entity2'))
        alignments.extend(
            (left.getAttribute('rdf:resource'), right.getAttribute('rdf:resource'))
            for left, right in pairs
        )
    return alignments
        
# All (entity1, entity2) URI pairs found in the reference alignment folder.
reference_alignments = load_alignments(alignment_folder)

In [2]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat

class Ontology():
    """Reader for an OWL ontology serialised as RDF/XML.

    The file is parsed once with ``xml.dom.minidom``. Sub-class links,
    object/datatype properties, (domain, range, property) triples and class
    names are extracted from the DOM at construction time.

    All ``get_*`` methods and ``parse_triples`` return sorted lists so that
    results are reproducible from run to run.
    """

    def __init__(self, ontology):
        """Parse the OWL file at path ``ontology`` and cache what is derived from it."""
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        # Order matters: triples need the sub-class pairs and both property
        # lists, and the class list is derived from the triples.
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def get_child_node(self, element, tag):
        """Return the direct child elements of ``element`` whose tag name equals ``tag``."""
        return [child for child in element.childNodes
                if isinstance(child, minidom.Element) and child.tagName == tag]

    def has_attribute_value(self, element, attribute, value):
        """Tell whether the part of ``attribute`` after the last ``#`` equals ``value``."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self):
        """Return ``(subclass, superclass, "subclass_of")`` triples for every sub-class link."""
        return [(child, parent, "subclass_of") for (parent, child) in self.get_subclasses()]

    def _resolve_class_ids(self, nodes):
        """Return the identifiers carried by ``nodes``.

        When none of the nodes carries an identifier itself, fall back to the
        ``owl:Class`` descendants of the first node.
        """
        ids = self.filter_null([self.extract_ID(el) for el in nodes])
        if not ids:
            nested = nodes[0].getElementsByTagName("owl:Class")
            ids = self.filter_null([self.extract_ID(el) for el in nested])
        return ids

    def parse_triples(self, union_flag=0, subclass_of=True):
        """Build ``(domain, range, property)`` triples from the parsed properties.

        Args:
            union_flag: ``0`` emits one triple per (domain, range) combination;
                any other value emits a single triple per property whose
                domain and range are the ``"###"``-joined identifier lists.
            subclass_of: When true, ``"subclass_of"`` triples are added too.

        Returns:
            A sorted list of unique triples. Properties without both an
            ``rdfs:domain`` and an ``rdfs:range`` child are skipped.
        """
        all_triples = []
        for prop in self.object_properties + self.data_properties:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            if not domain_children or not range_children:
                continue
            domain_prop = self._resolve_class_ids(domain_children)
            range_prop = self._resolve_class_ids(range_children)
            if not (domain_prop and range_prop):
                continue
            prop_id = self.extract_ID(prop)
            if union_flag == 0:
                all_triples.extend(
                    (dom, rng, prop_id) for dom, rng in itertools.product(domain_prop, range_prop)
                )
            else:
                all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return sorted(set(all_triples))

    def get_triples(self, union_flag=0, subclass_of=True, include_inv=True):
        """Re-derive the triples; see ``parse_triples``.

        ``include_inv`` is accepted for interface compatibility but is not used.
        """
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        """Return ``(superclass_element, subclass_element)`` DOM pairs.

        For each ``rdfs:subClassOf`` element the superclass is, in order of
        preference: the element itself when it carries an identifier, its
        first direct ``owl:Class`` child when that carries one, otherwise
        every identified ``owl:Class`` descendant of that child. The subclass
        is always the parent node of the ``rdfs:subClassOf`` element.

        ``union_flag`` is accepted for interface compatibility but is not used.
        """
        subclass_pairs = []
        for el in self.root.getElementsByTagName("rdfs:subClassOf"):
            if self.extract_ID(el):
                subclass_pairs.append((el, el.parentNode))
                continue
            level1_class = self.get_child_node(el, "owl:Class")
            if not level1_class:
                continue
            if self.extract_ID(level1_class[0]):
                subclass_pairs.append((level1_class[0], el.parentNode))
            else:
                level2classes = level1_class[0].getElementsByTagName("owl:Class")
                subclass_pairs.extend(
                    (elem, el.parentNode) for elem in level2classes if self.extract_ID(elem)
                )
        return subclass_pairs

    def get_subclasses(self):
        """Return ``(superclass_id, subclass_id)`` pairs for the cached sub-class links."""
        return [(self.extract_ID(a), self.extract_ID(b)) for (a, b) in self.subclasses]

    def filter_null(self, data):
        """Drop falsy entries (such as empty identifiers) from ``data``."""
        return [el for el in data if el]

    def extract_ID(self, element):
        """Return the identifier of ``element`` with any prefix up to the last ``#`` removed.

        The first non-empty attribute among ``rdf:ID``, ``rdf:resource`` and
        ``rdf:about`` is used; the result is ``""`` when none is set.
        """
        element_id = (element.getAttribute("rdf:ID")
                      or element.getAttribute("rdf:resource")
                      or element.getAttribute("rdf:about"))
        return element_id.split("#")[-1]

    def parse_classes(self):
        """Return the sorted names of all ``owl:Class`` elements plus every triple endpoint."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        # The last slot of a triple is the property name, not a class.
        triple_endpoints = [name for triple in self.triples for name in triple[:-1]]
        return sorted(set(self.filter_null(class_elems + triple_endpoints)))

    def get_classes(self):
        """Return the class list cached at construction time."""
        return self.classes

    def get_entities(self):
        """Return the sorted, unique identifiers of all ``owl:Class`` elements."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return sorted(set(self.filter_null(entities)))

    def _parse_properties(self, kind):
        """Collect the top-level property elements of type ``kind``.

        That is every direct ``owl:<kind>`` child of the root, followed by the
        ``owl:FunctionalProperty`` and then ``owl:InverseFunctionalProperty``
        children that have an ``rdf:type`` child pointing at ``kind``.
        """
        def is_of_kind(el):
            return any(self.has_attribute_value(type_el, "rdf:resource", kind)
                       for type_el in self.get_child_node(el, "rdf:type"))

        properties = self.get_child_node(self.root, "owl:" + kind)
        for tag in ("owl:FunctionalProperty", "owl:InverseFunctionalProperty"):
            properties = properties + [el for el in self.get_child_node(self.root, tag)
                                       if is_of_kind(el)]
        return properties

    def parse_data_properties(self):
        """Return the DOM elements of all datatype properties."""
        return self._parse_properties("DatatypeProperty")

    def parse_object_properties(self):
        """Return the DOM elements of all object properties."""
        return self._parse_properties("ObjectProperty")

    def get_object_properties(self):
        """Return the sorted, unique identifiers of the object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return sorted(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return the sorted, unique identifiers of the datatype properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return sorted(set(self.filter_null(data_props)))




In [3]:
# Extracting USE embeddings

# One entry per reference-alignment file: the file name up to its first "."
# split on "-", i.e. the names of the ontologies that file aligns.
ontologies_in_alignment = [
    alignment_file.split(".")[0].split("-")
    for alignment_file in os.listdir("reference-alignment/")
]

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def cos_sim(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b`` (1 minus SciPy's cosine distance)."""
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

def camel_case_split(identifier):
    """Split ``identifier`` at camel-case boundaries and lower-case each piece.

    A boundary lies between a lower-case letter and an upper-case letter, and
    between an upper-case run and a following capitalised word.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for match in re.finditer(boundary_pattern, identifier):
        pieces.append(match.group(0).lower())
    return pieces

def parse(word):
    """Tokenise an identifier: split on camel-case boundaries, then on underscores."""
    return [
        token
        for camel_piece in camel_case_split(word)
        for token in camel_piece.split("_")
    ]
    

# Gather every class, property and triple member of each ontology that takes
# part in a reference alignment, prefixed with "<ontology>#".
extracted_elems = []

for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")
    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()
    triples = list(set(flatten(ont.get_triples())))
    extracted_elems.extend([ont_name + "#" + elem for elem in entities + props + triples])

# Sorted so that row indices (and hence the embedding index) are reproducible.
extracted_elems = sorted(set(extracted_elems))

# TF-IDF over the tokenised names; tokens are whitespace-separated.
inp = [" ".join(parse(word.split("#")[1])) for word in extracted_elems]
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
X = vectorizer.fit_transform(inp)
# vocabulary_ already maps term -> column index; get_feature_names() was
# removed in scikit-learn 1.2.
word2idx_tfidf = dict(vectorizer.vocabulary_)
entity2idx_tfidf = {word.split("#")[1]: i for (i, word)  in enumerate(extracted_elems)}


print ("Total number of extracted unique classes and properties from entire RA set: ", len(extracted_elems))

# Index 0 is reserved for the unknown token in both parallel lists.
inp = ["<UNK>"] + inp
extracted_elems = ["<UNK>"] + extracted_elems

embeds = extractUSEEmbeddings(inp)
embeddings = dict(zip(extracted_elems, embeds))

Total number of extracted unique classes and properties from entire RA set:  834


In [4]:
# Type storage

# Maps a bare class/property name to {"type": "entity"} or {"type": "property"}.
types_dict = {}

def get_tfidf_score(word, phrase):
    """Return the summed TF-IDF weight of every token of ``phrase`` in its own row of ``X``.

    NOTE(review): the ``word`` argument is never read — the comprehension in
    the original rebinds that name — so the score depends on ``phrase`` only.
    Confirm whether a per-word score was intended.
    """
    return np.sum([
        X[entity2idx_tfidf[phrase]][:, word2idx_tfidf[token]][0, 0]
        for token in parse(phrase)
    ])

for ont_name in list(set(flatten(ontologies_in_alignment))):
    ont = Ontology("conference_ontologies/" + ont_name + ".owl")

    entities = ont.get_entities()
    props = ont.get_object_properties() + ont.get_data_properties()

    # Classes first, then properties, so a name that is both ends up a "property".
    types_dict.update({entity: {"type": "entity"} for entity in entities})
    types_dict.update({prop: {"type": "property"} for prop in props})


In [5]:
# Combinatorial mapping generation

# Candidate mappings for every aligned ontology pair: class x class,
# object property x object property, datatype property x datatype property.
all_mappings = []
for l in ontologies_in_alignment:
    ont1 = Ontology("conference_ontologies/" + l[0] + ".owl")
    ont2 = Ontology("conference_ontologies/" + l[1] + ".owl")

    ent1, ent2 = ont1.get_entities(), ont2.get_entities()
    obj1, obj2 = ont1.get_object_properties(), ont2.get_object_properties()
    data1, data2 = ont1.get_data_properties(), ont2.get_data_properties()

    mappings = [
        pair
        for left_names, right_names in ((ent1, ent2), (obj1, obj2), (data1, data2))
        for pair in itertools.product(left_names, right_names)
    ]

    # Qualify both sides with their ontology name, e.g. "cmt#Paper".
    all_mappings.extend([(l[0] + "#" + el[0], l[1] + "#" + el[1]) for el in mappings])
    

In [6]:
# Ground-truth pairs, with each URI reduced to the part after its last "/".
gt_mappings = [tuple([elem.split("/")[-1] for elem in el]) for el in reference_alignments]

# Label every candidate mapping. A set makes each membership test O(1)
# instead of a scan of the whole ground-truth list per candidate.
_gt_mapping_set = set(gt_mappings)
data = {(mapping[0], mapping[1]): mapping in _gt_mapping_set for mapping in all_mappings}


In [53]:

def greedy_matching():
    """Score the test pairs with ``model`` and pick the threshold with the best F1.

    Reads the module-level ``test_data_t`` / ``test_data_f`` (shuffled in
    place), ``model``, ``optimizer``, ``emb_indexer_inv``, ``gt_mappings``,
    ``generate_data``, ``is_test`` and ``test_onto``. Overwrites the global
    ``batch_size`` and appends ``(best_threshold, metrics)`` to ``all_metrics``.

    Returns:
        OrderedDict mapping ``(ent1, ent2)`` to ``(model_output, is_true_match)``,
        sorted by key in descending order.
    """
    global batch_size, test_data_t, test_data_f, model, optimizer, emb_indexer_inv, gt_mappings, all_metrics
    all_results = OrderedDict()
    with torch.no_grad():
        all_pred = []  # never read in this function
        # The global batch size is capped at the number of positive test pairs.
        batch_size = min(batch_size, len(test_data_t))
        num_batches = int(ceil(len(test_data_t)/batch_size))
        # Negatives are spread over the same number of batches as the positives.
        batch_size_f = int(ceil(len(test_data_f)/num_batches))
        
        np.random.shuffle(test_data_t)
        np.random.shuffle(test_data_f)

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx+1) * batch_size

            batch_start_f = batch_idx * batch_size_f
            batch_end_f = (batch_idx+1) * batch_size_f
            
            pos_elems = np.array(test_data_t)[batch_start:batch_end]
            neg_elems = np.array(test_data_f)[batch_start_f:batch_end_f]
            optimizer.zero_grad()

            inputs = np.array([generate_data(elem) for elem in list(pos_elems) + list(neg_elems)])     
            targets = np.array([1 for i in range(len(pos_elems))] + [0 for i in range(len(neg_elems))])
            
            # Shuffle positives and negatives together within the batch.
            indices = np.random.permutation(inputs.shape[0])
            inputs, targets = inputs[indices].transpose(1,0,2), targets[indices]
            # NOTE(review): this second transpose undoes the one on the previous
            # line, while the indexing below (inputs[0], inputs[1],
            # inputs_elem[0][idx][0]) reads as if axis 0 selected the side of
            # the pair — confirm against the shape generate_data returns.
            inputs = inputs.transpose(1,0,2)
            inputs_elem = inputs.copy()
            
            # Non-zero ids per sequence are used as sequence lengths
            # (presumably 0 is the padding id — TODO confirm).
            nonzero_elems = np.count_nonzero(inputs, axis=-1)
            # Order each row by decreasing length.
            indices = np.flip(np.argsort(nonzero_elems, axis=-1), axis=-1)
            seq_lens = np.flip(np.sort(nonzero_elems, axis=-1), axis=-1)
            inputs = np.stack((inputs[0][[indices[0]]], inputs[1][[indices[1]]]), axis=0)
            
            # Inverse permutations, passed to the model alongside the reordered inputs.
            d1 = {elem:i for i,elem in enumerate(indices[0])}
            d2 = {elem:i for i,elem in enumerate(indices[1])}
            rev_indices = np.stack(([d1[k] for k in range(inputs_elem.shape[1])], [d2[k] for k in range(inputs_elem.shape[1])]))

            rev_indices = torch.LongTensor(rev_indices)
            inputs = torch.LongTensor(inputs)
            seq_lens = torch.LongTensor(seq_lens.copy())
            targets = torch.DoubleTensor(targets)

            outputs = model(inputs, seq_lens, rev_indices)
            
            targets = [True if el.item() else False for el in targets]
            for idx, pred_elem in enumerate(outputs):
                # The first id of each sequence is looked up as the entity name.
                ent1 = emb_indexer_inv[inputs_elem[0][idx][0]]
                ent2 = emb_indexer_inv[inputs_elem[1][idx][0]]
                if (ent1, ent2) in all_results:
                    print ("Error: ", ent1, ent2, "already present")
                all_results[(ent1, ent2)] = (pred_elem, targets[idx])
        
        all_results = OrderedDict(sorted(all_results.items(), key=lambda x: x[0], reverse=True))
        # NOTE(review): filtered_results is built below but not read by the
        # threshold search, which iterates over all_results.
        filtered_results = dict()
        
        # Keep the first pair encountered for every left-hand entity ...
        entities_to_assign = set([el[0] for el in list(all_results.keys())])
        for pair in all_results:
            if pair[0] in entities_to_assign:
                filtered_results[pair] = all_results[pair]
                entities_to_assign.remove(pair[0])
                
        # ... and the first pair encountered for every right-hand entity.
        entities_to_assign = set([el[1] for el in list(all_results.keys())])
        for pair in all_results:
            if pair[1] in entities_to_assign:
                filtered_results[pair] = all_results[pair]
                entities_to_assign.remove(pair[1])        

        filtered_results = OrderedDict(sorted(filtered_results.items(), key=lambda x: x[1][0], reverse=True))
        
        # Sweep thresholds in steps of 0.01 across the observed output range.
        optimum_metrics, opt_threshold = [-1000 for i in range(5)], -1000
        low_threshold = np.min([el[0] for el in all_results.values()]) - 0.01
        high_threshold = np.max([el[0] for el in all_results.values()]) + 0.01
        for j,threshold in enumerate(np.arange(low_threshold, high_threshold, 0.01)):
            res = []
            for i,key in enumerate(all_results):
                if all_results[key][0] > threshold:
                    res.append(key)
            # False negatives: ground-truth pairs accepted by is_test that were not predicted.
            fn_list = [key for key in gt_mappings if key not in set(res) and is_test(test_onto, key)]
            fp_list = [elem for elem in res if not all_results[elem][1]]
            tp_list = [elem for elem in res if all_results[elem][1]]
            
            tp, fn, fp = len(tp_list), len(fn_list), len(fp_list)
            
            
            try:
                precision = tp/(tp+fp)
                recall = tp/(tp+fn)
                f1score = 2 * precision * recall / (precision + recall)
                f2score = 5 * precision * recall / (4 * precision + recall)
                f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
            except Exception as e:
                # A zero denominator: report it and skip this threshold.
                print (e)
                continue
            print ("Threshold: ", threshold, precision, recall, f1score, f2score, f0_5score)

            if f1score > optimum_metrics[2]:
                optimum_metrics = [precision, recall, f1score, f2score, f0_5score]
                opt_threshold = threshold
        
        print ("Precision: {} Recall: {} F1-Score: {} F2-Score: {} F0.5-Score: {}".format(*optimum_metrics))
        all_metrics.append((opt_threshold, optimum_metrics))
    return all_results


dict_keys(['cmt#acceptPaper', 'edas#AcceptRating', 'iasted#Author', 'edas#MealEvent', 'confOf#Administrator', 'cmt#SubjectArea', 'edas#ReviewRating', 'cmt#setMaxPapers', 'conference#Publisher', 'iasted#is_used_by', 'iasted#need', 'iasted#Speaker_lecture', 'cmt#addProgramCommitteeMember', 'sigkdd#Abstract', 'confOf#Paper', 'confOf#Member', 'confOf#hasFirstName', 'cmt#rejectPaper', 'confOf#hasPhone', 'edas#string', 'iasted#Departure_tax', 'conference#has_an_organizing_committee', 'cmt#Meta-Review', 'iasted#Listener', 'conference#has_a_commtitee', 'iasted#prepare', 'confOf#Workshop', 'cmt#readPaper', 'edas#DiningPlace', 'cmt#ConferenceChair', 'cmt#acceptsHardcopySubmissions', 'conference#is_a_date_of_camera_ready_paper_submission', 'edas#isReviewHistoryOf', 'edas#hasCostCurrency', 'ekaw#University', 'iasted#Form', 'edas#ReviewForm', 'confOf#maxChoice', 'ekaw#Individual_Presentation', 'sigkdd#Best_Applications_Paper_Award', 'conference#Call_for_paper', 'iasted#Presenter_house', 'confOf#has

In [54]:
# Module-level slots for debugging values.
ind_test, inp_test1, inp_test2 = None, None, None

def write(elem):
    """Append ``elem`` to the file ``Logs`` in the current working directory.

    A list or tuple is written one item per line; anything else is written as
    ``str(elem)``. Every call starts with a newline.
    """
    if isinstance(elem, (list, tuple)):
        string = "\n".join(str(s) for s in elem)
    else:
        string = str(elem)
    # The context manager closes the file even if the write fails.
    with open("Logs", "a+") as f:
        f.write("\n"+string)

# Filled by SiameseNetwork.forward with its latest inputs and per-side encodings.
inputs3, results3 = None, None

class SiameseNetwork(nn.Module):
    """Siamese LSTM encoder that scores a pair of id sequences by cosine similarity.

    Both sides of a pair share one embedding table (initialised from the
    module-level ``emb_vals``) and one LSTM.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers):
        """Build the shared embedding, the LSTM and the similarity layer.

        Args:
            embedding_dim: Width of the rows of ``emb_vals``.
            hidden_dim: LSTM hidden size.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__() 
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Size the table from the array that is loaded into it, so the two
        # can never disagree.
        self.name_embedding = nn.Embedding(len(emb_vals), self.embedding_dim)
        self.name_embedding.load_state_dict({'weight': torch.from_numpy(np.array(emb_vals))})

        # Stored from the module-level `dropout`; not applied in forward().
        self.dropout = dropout
        
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, self.num_layers, batch_first=True)
        self.cosine_sim_layer = nn.CosineSimilarity(dim=1)
        # Not used by forward(); kept so existing state dicts still load.
        self.layer1 = nn.Bilinear(self.hidden_dim, self.hidden_dim, 2)

    def forward(self, inputs, seq_lens, rev_indices):
        """Encode both sides of every pair and return their cosine similarity.

        Args:
            inputs: Indexable by side (0 or 1); each side is a batch of
                padded id sequences ordered by decreasing length.
            seq_lens: Per-side CPU tensor of sequence lengths, in that order.
            rev_indices: Per-side index tensor that puts the encodings back
                into the caller's order.

        Returns:
            One cosine similarity per pair.
        """
        global inputs3, results3
        results = []
        for i in range(2):
            x = self.name_embedding(inputs[i])
            packed_inp = pack_padded_sequence(x, seq_lens[i].numpy(), batch_first=True)
            # Run the LSTM on the packed batch so padding steps do not leak
            # into the final hidden state.
            _, (ht, _) = self.lstm(packed_inp)
            # Final hidden state of the top layer, for any num_layers.
            x = ht[-1:].permute(1,0,2)
            x = x[rev_indices[i],:,:]
            results.append(x.reshape(-1, self.hidden_dim))
        # Kept for debugging from the notebook.
        results3 = results
        inputs3 = inputs
        return self.cosine_sim_layer(results[0], results[1])

In [361]:
import os, itertools, time, pickle
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Load the preprocessed dataset.
# NOTE: pickle can execute arbitrary code while loading — only open a data.pkl
# that you produced yourself.
with open("data.pkl", "rb") as f:
    data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings  = pickle.load(f)

ontologies_in_alignment = [l.split(".")[0].split("-") for l in os.listdir("reference-alignment/")]
flatten = lambda l: [item for sublist in l for item in sublist]

ind_test, inp_test1, inp_test2 = None, None, None

class Ontology():
    """Reader for an OWL ontology serialised as RDF/XML.

    The file is parsed once with ``xml.dom.minidom``. Sub-class links,
    object/datatype properties, (domain, range, property) triples and class
    names are extracted from the DOM at construction time.

    All ``get_*`` methods and ``parse_triples`` return sorted lists so that
    results are reproducible from run to run.
    """

    def __init__(self, ontology):
        """Parse the OWL file at path ``ontology`` and cache what is derived from it."""
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        # Order matters: triples need the sub-class pairs and both property
        # lists, and the class list is derived from the triples.
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def get_child_node(self, element, tag):
        """Return the direct child elements of ``element`` whose tag name equals ``tag``."""
        return [child for child in element.childNodes
                if isinstance(child, minidom.Element) and child.tagName == tag]

    def has_attribute_value(self, element, attribute, value):
        """Tell whether the part of ``attribute`` after the last ``#`` equals ``value``."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self):
        """Return ``(subclass, superclass, "subclass_of")`` triples for every sub-class link."""
        return [(child, parent, "subclass_of") for (parent, child) in self.get_subclasses()]

    def _resolve_class_ids(self, nodes):
        """Return the identifiers carried by ``nodes``.

        When none of the nodes carries an identifier itself, fall back to the
        ``owl:Class`` descendants of the first node.
        """
        ids = self.filter_null([self.extract_ID(el) for el in nodes])
        if not ids:
            nested = nodes[0].getElementsByTagName("owl:Class")
            ids = self.filter_null([self.extract_ID(el) for el in nested])
        return ids

    def parse_triples(self, union_flag=0, subclass_of=True):
        """Build ``(domain, range, property)`` triples from the parsed properties.

        Args:
            union_flag: ``0`` emits one triple per (domain, range) combination;
                any other value emits a single triple per property whose
                domain and range are the ``"###"``-joined identifier lists.
            subclass_of: When true, ``"subclass_of"`` triples are added too.

        Returns:
            A sorted list of unique triples. Properties without both an
            ``rdfs:domain`` and an ``rdfs:range`` child are skipped.
        """
        all_triples = []
        for prop in self.object_properties + self.data_properties:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            if not domain_children or not range_children:
                continue
            domain_prop = self._resolve_class_ids(domain_children)
            range_prop = self._resolve_class_ids(range_children)
            if not (domain_prop and range_prop):
                continue
            prop_id = self.extract_ID(prop)
            if union_flag == 0:
                all_triples.extend(
                    (dom, rng, prop_id) for dom, rng in itertools.product(domain_prop, range_prop)
                )
            else:
                all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return sorted(set(all_triples))

    def get_triples(self, union_flag=0, subclass_of=True, include_inv=True):
        """Re-derive the triples; see ``parse_triples``.

        ``include_inv`` is accepted for interface compatibility but is not used.
        """
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        """Return ``(superclass_element, subclass_element)`` DOM pairs.

        For each ``rdfs:subClassOf`` element the superclass is, in order of
        preference: the element itself when it carries an identifier, its
        first direct ``owl:Class`` child when that carries one, otherwise
        every identified ``owl:Class`` descendant of that child. The subclass
        is always the parent node of the ``rdfs:subClassOf`` element.

        ``union_flag`` is accepted for interface compatibility but is not used.
        """
        subclass_pairs = []
        for el in self.root.getElementsByTagName("rdfs:subClassOf"):
            if self.extract_ID(el):
                subclass_pairs.append((el, el.parentNode))
                continue
            level1_class = self.get_child_node(el, "owl:Class")
            if not level1_class:
                continue
            if self.extract_ID(level1_class[0]):
                subclass_pairs.append((level1_class[0], el.parentNode))
            else:
                level2classes = level1_class[0].getElementsByTagName("owl:Class")
                subclass_pairs.extend(
                    (elem, el.parentNode) for elem in level2classes if self.extract_ID(elem)
                )
        return subclass_pairs

    def get_subclasses(self):
        """Return ``(superclass_id, subclass_id)`` pairs for the cached sub-class links."""
        return [(self.extract_ID(a), self.extract_ID(b)) for (a, b) in self.subclasses]

    def filter_null(self, data):
        """Drop falsy entries (such as empty identifiers) from ``data``."""
        return [el for el in data if el]

    def extract_ID(self, element):
        """Return the identifier of ``element`` with any prefix up to the last ``#`` removed.

        The first non-empty attribute among ``rdf:ID``, ``rdf:resource`` and
        ``rdf:about`` is used; the result is ``""`` when none is set.
        """
        element_id = (element.getAttribute("rdf:ID")
                      or element.getAttribute("rdf:resource")
                      or element.getAttribute("rdf:about"))
        return element_id.split("#")[-1]

    def parse_classes(self):
        """Return the sorted names of all ``owl:Class`` elements plus every triple endpoint."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        # The last slot of a triple is the property name, not a class.
        triple_endpoints = [name for triple in self.triples for name in triple[:-1]]
        return sorted(set(self.filter_null(class_elems + triple_endpoints)))

    def get_classes(self):
        """Return the class list cached at construction time."""
        return self.classes

    def get_entities(self):
        """Return the sorted, unique identifiers of all ``owl:Class`` elements."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return sorted(set(self.filter_null(entities)))

    def _parse_properties(self, kind):
        """Collect the top-level property elements of type ``kind``.

        That is every direct ``owl:<kind>`` child of the root, followed by the
        ``owl:FunctionalProperty`` and then ``owl:InverseFunctionalProperty``
        children that have an ``rdf:type`` child pointing at ``kind``.
        """
        def is_of_kind(el):
            return any(self.has_attribute_value(type_el, "rdf:resource", kind)
                       for type_el in self.get_child_node(el, "rdf:type"))

        properties = self.get_child_node(self.root, "owl:" + kind)
        for tag in ("owl:FunctionalProperty", "owl:InverseFunctionalProperty"):
            properties = properties + [el for el in self.get_child_node(self.root, tag)
                                       if is_of_kind(el)]
        return properties

    def parse_data_properties(self):
        """Return the DOM elements of all datatype properties."""
        return self._parse_properties("DatatypeProperty")

    def parse_object_properties(self):
        """Return the DOM elements of all object properties."""
        return self._parse_properties("ObjectProperty")

    def get_object_properties(self):
        """Return the sorted, unique identifiers of the object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return sorted(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return the sorted, unique identifiers of the datatype properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return sorted(set(self.filter_null(data_props)))


def greedy_matching():
    """Score the test pairs with ``model`` and pick the threshold with the best F1.

    Reads the module-level ``test_data_t`` / ``test_data_f`` (shuffled in
    place), ``model``, ``optimizer``, ``emb_indexer_inv``, ``gt_mappings``,
    ``generate_data``, ``is_test`` and ``test_onto``. Overwrites the global
    ``batch_size`` and appends ``(best_threshold, metrics)`` to ``all_metrics``.

    Recall is computed against the ground-truth pairs that were actually
    scored in this run, further filtered by ``is_test``.

    Returns:
        OrderedDict mapping ``(ent1, ent2)`` to ``(model_output, is_true_match)``
        in the order the pairs were scored.
    """
    global batch_size, test_data_t, test_data_f, model, optimizer, emb_indexer_inv, all_metrics
    all_results = OrderedDict()
    # Set for O(1) lookups; tuple() guards against list-typed pairs.
    gt_lookup = set(map(tuple, gt_mappings))
    gt_mappings_filt = []
    with torch.no_grad():
        # The global batch size is capped at the number of positive test pairs.
        batch_size = min(batch_size, len(test_data_t))
        num_batches = int(ceil(len(test_data_t)/batch_size))
        # Negatives are spread over the same number of batches as the positives.
        batch_size_f = int(ceil(len(test_data_f)/num_batches))

        np.random.shuffle(test_data_t)
        np.random.shuffle(test_data_f)
        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx+1) * batch_size

            batch_start_f = batch_idx * batch_size_f
            batch_end_f = (batch_idx+1) * batch_size_f

            pos_elems = np.array(test_data_t)[batch_start:batch_end]
            neg_elems = np.array(test_data_f)[batch_start_f:batch_end_f]
            # No gradients are computed under no_grad(); kept from the training loop.
            optimizer.zero_grad()

            inputs = np.array([generate_data(elem) for elem in list(pos_elems) + list(neg_elems)])
            targets = np.array([1 for i in range(len(pos_elems))] + [0 for i in range(len(neg_elems))])

            # Shuffle positives and negatives together within the batch.
            indices = np.random.permutation(inputs.shape[0])
            inputs, targets = inputs[indices].transpose(1,0,2), targets[indices]
            # Snapshot used below to decode entity names. Assumes generate_data
            # returns one row per side of the pair, so axis 0 is now the side
            # and axis 1 the batch position — TODO confirm.
            inputs_elem = inputs.copy()

            # Non-zero ids per sequence minus one, passed to the model as
            # sequence lengths (presumably the leading id is the node itself
            # and 0 is padding — TODO confirm).
            nonzero_elems = np.count_nonzero(inputs, axis=-1) - 1

            inputs = torch.LongTensor(inputs.transpose(1,0,2))
            seq_lens = torch.LongTensor(nonzero_elems.T)

            outputs = model(inputs, seq_lens)
            outputs = [el.item() for el in outputs]

            targets = [bool(el) for el in targets]
            for idx, pred_elem in enumerate(outputs):
                # The first id of each sequence is looked up as the entity name.
                ent1 = emb_indexer_inv[inputs_elem[0][idx][0]]
                ent2 = emb_indexer_inv[inputs_elem[1][idx][0]]
                pair = (ent1, ent2)
                if pair in all_results:
                    print ("Error: ", ent1, ent2, "already present")
                elif pair in gt_lookup:
                    # Ground-truth pair that really was part of this evaluation.
                    gt_mappings_filt.append(pair)
                all_results[pair] = (pred_elem, targets[idx])
        print ("Len of test data", len(test_data_t), "Filtered gt", len(gt_mappings_filt))

        optimum_metrics, opt_threshold = [-1000 for i in range(5)], -1000
        # Fixed sweep range in steps of 0.01.
        low_threshold, high_threshold = 0.9, 1.02
        for threshold in np.arange(low_threshold, high_threshold, 0.01):
            res = [key for key in all_results if all_results[key][0] > threshold]
            res_set = set(res)  # built once per threshold, not once per key
            fn_list = [key for key in gt_mappings_filt if key not in res_set and is_test(test_onto, key)]
            fp_list = [elem for elem in res if not all_results[elem][1]]
            tp_list = [elem for elem in res if all_results[elem][1]]

            tp, fn, fp = len(tp_list), len(fn_list), len(fp_list)

            try:
                precision = tp/(tp+fp)
                recall = tp/(tp+fn)
                f1score = 2 * precision * recall / (precision + recall)
                f2score = 5 * precision * recall / (4 * precision + recall)
                f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
            except ZeroDivisionError as e:
                # Nothing predicted or nothing correct: report and skip this threshold.
                print (e)
                continue
            print ("Threshold: ", threshold, precision, recall, f1score, f2score, f0_5score)

            if f1score > optimum_metrics[2]:
                optimum_metrics = [precision, recall, f1score, f2score, f0_5score]
                opt_threshold = threshold

        print ("Precision: {} Recall: {} F1-Score: {} F2-Score: {} F0.5-Score: {}".format(*optimum_metrics))
        all_metrics.append((opt_threshold, optimum_metrics))
    return all_results

def write(elem):
    """Append ``elem`` to the ``Logs`` file in the current working directory.

    Args:
        elem: A list or tuple is written one item per line (each item passed
            through ``str``); anything else is written as ``str(elem)``.

    The text is always preceded by a newline. Nothing is returned.
    """
    # Exact type check (not isinstance) keeps list/tuple subclasses on the
    # plain ``str(elem)`` path, as before.
    if type(elem) in (list, tuple):
        text = "\n".join(str(item) for item in elem)
    else:
        text = str(elem)
    # The context manager closes the handle even if the write raises.
    with open("Logs", "a+") as log_file:
        log_file.write("\n" + text)
    
# Module-level debug placeholders. The only visible references are the
# commented-out `global inputs3, results3` lines in SiameseNetwork.forward.
inputs3, results3 = None, None

class SiameseNetwork(nn.Module):
    """Scores a pair of entities by cosine similarity of [node ; attended-neighbour-context] vectors.

    Reads two module-level names at construction time: ``emb_vals`` (the
    pretrained embedding table) and ``dropout``.
    """
    def __init__(self, embedding_dim, hidden_dim, num_layers):
        """Build the frozen embedding table and the layers.

        Args:
            embedding_dim: Width of the embedding table. ``forward`` reshapes
                vectors to a hard-coded 512, so other values will not work there.
            hidden_dim: Hidden size for the LSTM / bilinear layers.
            num_layers: Number of LSTM layers.
        """
        super().__init__() 
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directions = 2
        
        # Load the pretrained vectors from the global `emb_vals` and freeze them.
        self.name_embedding = nn.Embedding(len(emb_vals), self.embedding_dim)
        self.name_embedding.load_state_dict({'weight': torch.from_numpy(np.array(emb_vals))})
        self.name_embedding.weight.requires_grad = False

        # NOTE(review): `dropout` is not a parameter; this reads a module-level
        # name and raises NameError if it is undefined. It is stored but not
        # applied anywhere in this class.
        self.dropout = dropout
        
        # `lstm` and `bilinear` are created but never used by `forward`.
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, self.num_layers, bidirectional=True, batch_first=True)
        self.cosine_sim_layer = nn.CosineSimilarity(dim=1)
        # 1024 = two concatenated 512-d vectors (node + neighbour), see `forward`.
        self.attn = nn.Linear(1024, 1)
        self.bilinear = nn.Bilinear(self.hidden_dim, self.hidden_dim, 1)

    def forward(self, inputs, seq_lens):
        """Return one cosine-similarity score per pair in the batch.

        Args:
            inputs: 3-D index tensor; after ``permute(1,0,2)`` the first axis
                selects the side of the pair (0 or 1). Presumably
                (batch, 2, 1 + max_neighbours) -- TODO confirm against caller.
            seq_lens: Neighbour counts, transposed here so that
                ``seq_lens[i][j]`` is the count for side ``i``, item ``j``.
        """
        results = []
        inputs = inputs.permute(1,0,2)
        seq_lens = seq_lens.T

        for i in range(2):
            x = self.name_embedding(inputs[i])
            # Position 0 along axis 1 is the entity itself; the rest are neighbours.
            node = x.permute(1,0,2)[:1].permute(1,0,2)
            neighbours = x.permute(1,0,2)[1:].permute(1,0,2)
            context = torch.DoubleTensor()
            
            for j,elem in enumerate(neighbours):
                curr_context = torch.DoubleTensor()
                # Only the first seq_lens[i][j] neighbours are used.
                for neighbour in elem[:seq_lens[i][j],:]:
                    # Scalar weight from the concatenated (node, neighbour) vector,
                    # then scale the neighbour vector by it.
                    attention = self.attn(torch.cat((node[j].reshape(512), neighbour.reshape(512))))
                    attention = attention * neighbour.reshape(512)
                    curr_context = torch.cat((curr_context, attention.unsqueeze(0)))
                # NOTE(review): when seq_lens[i][j] is 0, curr_context is empty and
                # this mean is taken over an empty tensor -- confirm that case
                # cannot occur or handle it explicitly.
                context = torch.cat((context, torch.mean(curr_context, dim=0).unsqueeze(0)))
            
            # Final representation per item: [node (512) ; context (512)].
            x = torch.cat((node.reshape(-1, 512), context.reshape(-1, 512)), dim=1)
            results.append(x)
        #global inputs3, results3
        #results3 = results
        #inputs3 = inputs
        #x = self.layer1(results[0], results[1])
        #x = F.log_softmax(x)
#         print (results[0].shape)
        x = self.cosine_sim_layer(results[0], results[1])
        return x


def get_one_hop_neighbours(ont, K=1):
    """Map every entity and property of ontology ``ont`` to its one-hop neighbourhood.

    Args:
        ont: Ontology name; the file ``conference_ontologies/<ont>.owl`` is parsed.
        K: Accepted but not used by this function.

    Returns:
        dict keyed by ``"<ont>#<name>"``. Each value is a list whose first item
        is the key itself, followed by its de-duplicated neighbours in sorted
        order, all prefixed with ``"<ont>#"``. For a property the neighbours
        are the domain/range entities of its triples.
    """
    ontology = Ontology("conference_ontologies/" + ont + ".owl")

    # Entity adjacency over all triples, linked in both directions.
    entity_pairs = [(head, tail) for (head, tail, _) in ontology.get_triples()]
    adjacency = {}
    for name in set(flatten(entity_pairs)):
        adjacency[name] = [name]
    for head, tail in entity_pairs:
        adjacency[head].append(tail)
        adjacency[tail].append(head)

    # Property adjacency: each property points at the entities it connects.
    property_adjacency = {}
    for head, tail, prop in ontology.get_triples(subclass_of=False):
        property_adjacency.setdefault(prop, [prop]).extend([head, tail])

    # Property entries win when a name appears in both maps.
    adjacency.update(property_adjacency)

    prefix = ont + "#"
    result = {}
    for name, linked in adjacency.items():
        # Keep the element itself first, then unique neighbours in sorted order.
        ordered = [linked[0]] + sorted(set(linked[1:]))
        result[prefix + name] = [prefix + neighbour for neighbour in ordered]
    return result

def is_test(test_onto, key):
    """Return True when the ontology pair of ``key`` is one of ``test_onto``.

    Args:
        test_onto: Container of ontology-name tuples.
        key: Iterable of ``"<ontology>#<entity>"`` strings.
    """
    # Keep only the part before the first '#' of every entity identifier.
    ontology_pair = tuple(entity.split("#")[0] for entity in key)
    return ontology_pair in test_onto

def generate_data(elem_tuple):
    """Encode each entity of ``elem_tuple`` as a row of embedding indices.

    For every ``"<ontology>#<entity>"`` string the neighbour list stored in the
    module-level ``neighbours_dicts`` is looked up, and each name is translated
    through the module-level ``emb_indexer``. Raises ``KeyError`` when an
    entity or neighbour name is missing from those tables.

    Returns:
        ``np.ndarray`` with one row per entity in ``elem_tuple``.
    """
    index_rows = []
    for entity in elem_tuple:
        ontology_name = entity.split("#")[0]
        neighbour_names = neighbours_dicts[ontology_name][entity]
        index_rows.append([emb_indexer[name] for name in neighbour_names])
    return np.array(index_rows)

def generate_input(elems, target):
    """Encode every pair in ``elems`` and label each one with ``target``.

    Pairs that ``generate_data`` cannot encode are skipped (best effort), and
    the kept/original counts are printed.

    Args:
        elems: Sized iterable of entity tuples accepted by ``generate_data``.
        target: Label attached to every pair that encodes successfully.

    Returns:
        ``(inputs, targets)`` as numpy arrays of equal length.
    """
    inputs, targets = [], []
    for elem in list(elems):
        try:
            encoded = generate_data(elem)
        except Exception:
            # Skip pairs that cannot be encoded, but do not swallow
            # KeyboardInterrupt / SystemExit the way a bare `except:` would.
            continue
        inputs.append(encoded)
        targets.append(target)
    print ("Filtered len: ", len(inputs), "Original len:", len(elems))
    return np.array(inputs), np.array(targets)

# One neighbourhood dictionary per ontology that appears in any alignment file name.
neighbours_dicts = {ont: get_one_hop_neighbours(ont) for ont in list(set(flatten(ontologies_in_alignment)))}
# Longest neighbour list (including the element itself) across all ontologies.
max_neighbours = np.max(flatten([[len(el[e]) for e in el] for el in neighbours_dicts.values()]))
# Unpadded list lengths, recorded before the padding step below.
neighbours_lens = {ont: {key: len(neighbours_dicts[ont][key]) for key in neighbours_dicts[ont]}
                   for ont in neighbours_dicts}
# Right-pad every list with "<UNK>" so all lists have length max_neighbours.
neighbours_dicts = {ont: {key: neighbours_dicts[ont][key] + ["<UNK>" for i in range(max_neighbours -len(neighbours_dicts[ont][key]))]
              for key in neighbours_dicts[ont]} for ont in neighbours_dicts}

# data_items = data.items()
# np.random.shuffle(list(data_items))
# data = OrderedDict(data_items)

# print ("Number of entities:", len(data))
# all_ont_pairs = list(set([tuple([el.split("#")[0] for el in l]) for l in data.keys()]))

# all_metrics = []

# for i in list(range(0, len(all_ont_pairs), 3)):
    
#     test_onto = all_ont_pairs[i:i+3]
    
#     train_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) not in test_onto}
#     test_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) in test_onto}

#     torch.set_default_dtype(torch.float64)
    
#     train_test_split = 0.9

#     train_data_t = [key for key in train_data if data[key]]
#     train_data_f = [key for key in train_data if not data[key]]
#     #train_data_f = train_data_f[:int(len(train_data_t))]
# #     [:int(0.1*(len(train_data) - len(train_data_t)) )]
# #     np.random.shuffle(train_data_f)
    
#     lr = 0.001
#     num_epochs = 50
#     weight_decay = 0.001
#     batch_size = 8
#     dropout = 0.3
#     batch_size = min(batch_size, len(train_data_t))
#     num_batches = int(ceil(len(train_data_t)/batch_size))
#     batch_size_f = int(ceil(len(train_data_f)/num_batches))
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
#     model = nn.DataParallel(SiameseNetwork(512, 250, 1)).to(device)

#     optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

#     for epoch in range(num_epochs):
#         inputs_pos, targets_pos = generate_input(train_data_t, 1)
#         inputs_neg, targets_neg = generate_input(train_data_f, 0)
#         indices_pos = np.random.permutation(len(inputs_pos))
#         indices_neg = np.random.permutation(len(inputs_neg))

#         inputs_pos, targets_pos = inputs_pos[indices_pos], targets_pos[indices_pos]
#         inputs_neg, targets_neg = inputs_neg[indices_neg], targets_neg[indices_neg]

# #        indices = np.random.permutation(len(inputs_pos) + len(inputs_neg))
        
# #        inputs = np.array(list(inputs_pos) + list(inputs_neg))[indices]
# #        targets = np.array(list(targets_pos) + list(targets_neg))[indices]

# #         inputs = np.array(list(inputs_pos) + list(inputs_neg))
# #         targets = np.array(list(targets_pos) + list(targets_neg))

#         for batch_idx in range(num_batches):
#             batch_start = batch_idx * batch_size
#             batch_end = (batch_idx+1) * batch_size
            
#             batch_start_f = batch_idx * batch_size_f
#             batch_end_f = (batch_idx+1) * batch_size_f

#             inputs = np.concatenate((inputs_pos[batch_start: batch_end], inputs_neg[batch_start_f: batch_end_f]))
#             targets = np.concatenate((targets_pos[batch_start: batch_end], targets_neg[batch_start_f: batch_end_f]))
            
#             inp = inputs.transpose(1,0,2)
#             nonzero_elems = np.count_nonzero(inp, axis=-1) - 1
            
#             inp_elems = torch.LongTensor(inputs)
#             targ_elems = torch.DoubleTensor(targets).to(device)
#             optimizer.zero_grad()

#             seq_lens = torch.LongTensor(nonzero_elems.T)
#             outputs = model(inp_elems, seq_lens)
#             loss = F.mse_loss(outputs, targ_elems)
#             loss.backward()
# #             break
#             optimizer.step()

#             if batch_idx%10 == 0:
#                 print ("Epoch: {} Idx: {} Loss: {}".format(epoch, batch_idx, loss.item()))

#     model.eval()

    
#     test_data_t = [key for key in test_data if data[key]]
#     test_data_f = [key for key in test_data if not data[key]]
    
#     res = greedy_matching()

# print ("Final Results: ", np.mean([el[1] for el in all_metrics], axis=0))
# print ("Best threshold: ", all_metrics[np.argmax([el[1][2] for el in all_metrics])][0])


ValueError: too many values to unpack (expected 5)

In [365]:
# Scratch check: sort tuples by (second, third) field with itemgetter.
# Note that `(1)` is the int 1, not a one-element tuple.
import operator
sorted([(1, "sinal", (1)), (1, "al", (1, 2)), (1, "al", (1, 5))], key=operator.itemgetter(1, 2))

[(1, 'al', (1, 2)), (1, 'al', (1, 5)), (1, 'sinal', 1)]

In [20]:
# Scratch check of summing over dim 0.
# NOTE(review): `curr_context` is not defined in this cell (its setup is
# commented out below), so the cell relies on leftover kernel state.
# curr_context = torch.DoubleTensor([])
# curr_context = torch.cat((curr_context, torch.randn(10).unsqueeze(0)))
torch.sum(curr_context, dim=0) 

tensor([ 1.5465,  1.0060, -1.9793,  1.6608,  0.2812,  1.3197, -1.7668, -4.4346,
        -0.4951, -0.3795])

In [31]:
len(gt_mappings)

305

In [52]:

def djikstra(n, adj_matrix, start_node):
    """Single-source shortest distances over a dense adjacency matrix.

    Args:
        n: Number of nodes; ``adj_matrix`` is indexed as ``adj_matrix[u][v]``.
        adj_matrix: Edge weights. An entry that is not ``> 0`` means "no edge".
        start_node: Index of the source node.

    Returns:
        List of length ``n`` with the distance from ``start_node`` to every
        node; unreachable nodes keep ``sys.maxsize``.
    """
    best = [sys.maxsize for _ in range(n)]
    settled = [False for _ in range(n)]
    best[start_node] = 0

    for _ in range(n):
        # Pick the unsettled node with the smallest tentative distance;
        # ties go to the highest index.
        pending = [idx for idx in range(n) if not settled[idx]]
        current = min(pending, key=lambda idx: (best[idx], -idx))
        settled[current] = True

        # Relax every edge leaving the node that was just settled.
        for other in range(n):
            if not adj_matrix[current][other] > 0:
                continue
            if settled[other]:
                continue
            candidate = best[current] + adj_matrix[current][other]
            if best[other] > candidate:
                best[other] = candidate

    return best

# Build an undirected, unit-weight graph from the triples of conference.owl and
# list every entity by its hop distance from "Conference_part".
all_triples = Ontology("conference_ontologies/conference.owl").get_triples()

# entity name -> matrix index, and the inverse lookup.
entities = {entity:i for i,entity in enumerate(list(set(flatten([(el[0], el[1]) for el in all_triples]))))}
entities_inv = {entities[entity]:entity for entity in entities}

adj_mat = np.zeros((len(entities), len(entities)))

# The third field of each triple is ignored; edges are symmetric with weight 1.
for (a,b,_) in all_triples:
    adj_mat[entities[a]][entities[b]] = 1
    adj_mat[entities[b]][entities[a]] = 1

src = entities["Conference_part"]
# (entity name, distance) pairs ordered by distance.
sorted([(entities_inv[i], entity) for i,entity in enumerate(djikstra(len(entities), adj_mat, src))], key=lambda x:x[1])

[('Conference_part', 0),
 ('Workshop', 1.0),
 ('string', 1.0),
 ('Tutorial', 1.0),
 ('Topic', 1.0),
 ('Track', 1.0),
 ('Track-workshop_chair', 1.0),
 ('Conference_volume', 1.0),
 ('Program_committee', 2.0),
 ('Person', 2.0),
 ('Conference', 2.0),
 ('Organizing_committee', 2.0),
 ('Steering_committee', 2.0),
 ('Important_dates', 2.0),
 ('Review_preference', 2.0),
 ('Conference_www', 2.0),
 ('Conference_contribution', 2.0),
 ('Committee', 2.0),
 ('Conference_proceedings', 2.0),
 ('Publisher', 2.0),
 ('Committee_member', 3.0),
 ('Submitted_contribution', 3.0),
 ('Written_contribution', 3.0),
 ('date', 3.0),
 ('Conference_contributor', 3.0),
 ('Poster', 3.0),
 ('int', 3.0),
 ('Co-chair', 3.0),
 ('Conference_document', 3.0),
 ('Conference_participant', 3.0),
 ('Chair', 3.0),
 ('Conference_applicant', 3.0),
 ('Reviewer', 3.0),
 ('Presentation', 3.0),
 ('Thing', 3.0),
 ('Review_expertise', 4.0),
 ('Invited_speaker', 4.0),
 ('Reviewed_contribution', 4.0),
 ('Invited_talk', 4.0),
 ('Registeered

In [46]:
class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> TransformerEncoder -> linear decoder.

    Requires a ``PositionalEncoding`` class to be defined elsewhere in the
    notebook; it is not defined in this cell.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        """
        Args:
            ntoken: Vocabulary size (embedding rows and decoder outputs).
            ninp: Embedding / model width.
            nhead: Number of attention heads per encoder layer.
            nhid: Feed-forward width inside each encoder layer.
            nlayers: Number of stacked encoder layers.
            dropout: Dropout passed to the positional encoder and encoder layers.
        """
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        """Uniformly initialise embedding/decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Run the model on ``src`` and return the decoder output.

        The mask is cached on ``self.src_mask`` and rebuilt whenever
        ``len(src)`` changes.
        """
        # Imported locally so the cell does not depend on a module-level
        # `import math`, which the visible import cells do not provide.
        import math

        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        # Scale embeddings by sqrt(model width) before adding positions.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

['Workshop',
 'string',
 'Tutorial',
 'Topic',
 'Track',
 'Track-workshop_chair',
 'Conference_volume']

In [115]:
# Scratch copy of the batching part of greedy_matching, run at cell level to
# inspect the first batch only (note the `break` at the end of the loop body).
# NOTE(review): relies on batch_size, test_data_t, test_data_f, optimizer and
# generate_data already existing in the kernel.
global batch_size, test_data_t, test_data_f, model, optimizer, emb_indexer_inv, gt_mappings, all_metrics
all_results = OrderedDict()
with torch.no_grad():
    all_pred = []
    batch_size = min(batch_size, len(test_data_t))
    num_batches = int(ceil(len(test_data_t)/batch_size))
    # Negatives are spread evenly over the same number of batches as positives.
    batch_size_f = int(ceil(len(test_data_f)/num_batches))

    # In-place shuffles: these mutate the kernel-level lists.
    np.random.shuffle(test_data_t)
    np.random.shuffle(test_data_f)

    for batch_idx in range(num_batches):
        batch_start = batch_idx * batch_size
        batch_end = (batch_idx+1) * batch_size

        batch_start_f = batch_idx * batch_size_f
        batch_end_f = (batch_idx+1) * batch_size_f

        pos_elems = np.array(test_data_t)[batch_start:batch_end]
        neg_elems = np.array(test_data_f)[batch_start_f:batch_end_f]
        optimizer.zero_grad()

        # Positives first, then negatives; targets follow the same order.
        inputs = np.array([generate_data(elem) for elem in list(pos_elems) + list(neg_elems)])     
        targets = np.array([1 for i in range(len(pos_elems))] + [0 for i in range(len(neg_elems))])

        # Stop after building the first batch.
        break

In [123]:
inputs

array([[[270],
        [354]],

       [[693],
        [166]],

       [[ 48],
        [433]],

       ...,

       [[582],
        [ 82]],

       [[434],
        [150]],

       [[612],
        [746]]])

In [24]:
# Persist the prepared dataset. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("data.pkl", "wb") as data_file:
    pickle.dump([data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings], data_file)

In [19]:
# Rebind `embeddings` as an OrderedDict built from its current items.
# NOTE(review): `embeddings` must already exist in the kernel.
embeddings = OrderedDict(embeddings.items())

In [37]:
# Scratch check: cosine similarity broadcasts a (3, 1, 3) tensor against a
# (3, 2, 3) tensor along dim 2, giving one score per (row, candidate).
a = torch.DoubleTensor([[[1,2,3]], [[4,5,6]], [[7,8,9]]])
b = torch.DoubleTensor([[[10,2,30], [5,6,7]], [[4,5,6], [5,6,7]], [[7,8,9], [5,6,7]]])
F.cosine_similarity(a,b,dim=2)

tensor([[0.8772, 0.9683],
        [1.0000, 0.9996],
        [1.0000, 0.9994]])

In [50]:
# Scratch check: a trailing size-1 axis broadcasts in element-wise
# multiplication, so (3373, 9, 1) * (3373, 9, 512) has shape (3373, 9, 512).
a = torch.randn(3373, 9, 1)
b = torch.randn(3373, 9, 512)
c = a * b
print(c.shape, torch.equal(c,b))

torch.Size([3373, 9, 512]) False


In [47]:
# Scratch check: reshape(-1, 512) on a (3373, 1, 512) tensor drops the
# singleton middle axis while keeping the row order.
t = torch.randn(3373, 1, 512)
print (t)
print (t.reshape(-1, 512))

tensor([[[-0.1377, -1.0327, -0.6323,  ..., -2.2149,  2.5233,  0.6922]],

        [[ 0.5923, -1.5585,  0.7242,  ..., -1.8788, -0.8722, -0.7855]],

        [[ 0.4043,  1.5124, -0.3650,  ..., -1.1102,  0.7800, -2.1955]],

        ...,

        [[ 2.7878, -0.3074,  0.4295,  ..., -0.0570,  1.4719,  0.1476]],

        [[-0.3127,  0.3369, -0.9186,  ..., -0.0083, -0.2215,  0.5971]],

        [[-2.0341, -0.4788, -1.1528,  ..., -0.3162,  1.2564,  1.4677]]])
tensor([[-0.1377, -1.0327, -0.6323,  ..., -2.2149,  2.5233,  0.6922],
        [ 0.5923, -1.5585,  0.7242,  ..., -1.8788, -0.8722, -0.7855],
        [ 0.4043,  1.5124, -0.3650,  ..., -1.1102,  0.7800, -2.1955],
        ...,
        [ 2.7878, -0.3074,  0.4295,  ..., -0.0570,  1.4719,  0.1476],
        [-0.3127,  0.3369, -0.9186,  ..., -0.0083, -0.2215,  0.5971],
        [-2.0341, -0.4788, -1.1528,  ..., -0.3162,  1.2564,  1.4677]])


In [38]:
# Scratch check of growing a tensor by repeated torch.cat along dim 0.
# NOTE(review): `c` is not initialised in this cell (the line below is
# commented out), so the result depends on how often the cell was re-run.
# c = torch.DoubleTensor()
c = torch.cat((c, torch.randn(3,4).unsqueeze(0)))
c.shape

torch.Size([3, 3, 4])

In [172]:
import os, itertools, time, pickle
import subprocess
from xml.dom import minidom
from collections import Counter, OrderedDict
from operator import itemgetter
from scipy import spatial
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re, sys
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from math import ceil, exp
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Load the prepared dataset written by the pickle.dump cell.
# SECURITY: pickle.load can execute arbitrary code; only load a data.pkl that
# was produced by this notebook or another trusted source.
with open("data.pkl", "rb") as f:
    data, emb_indexer, emb_indexer_inv, emb_vals, gt_mappings  = pickle.load(f)

# Each alignment file is named "<onto1>-<onto2>.<ext>"; keep the two ontology names.
ontologies_in_alignment = [l.split(".")[0].split("-") for l in os.listdir("reference-alignment/")]
# Flatten one level of nesting: [[a, b], [c]] -> [a, b, c].
flatten = lambda l: [item for sublist in l for item in sublist]

ind_test, inp_test1, inp_test2 = None, None, None

def write(statement):
    """Append ``str(statement)`` to the ``Logs`` file, surrounded by newlines.

    Args:
        statement: Any object; it is converted with ``str``.
    """
    # The context manager closes the handle even if the write raises.
    with open("Logs", "a+") as op_file:
        op_file.write("\n" + str(statement) + "\n")

class Ontology():
    """Reader for an OWL (RDF/XML) ontology file, built on ``xml.dom.minidom``.

    On construction the file is parsed once and the subclass relations,
    object/data properties, (domain, range, property) triples and class names
    are cached on the instance. Element identifiers are the fragment after the
    last ``#`` of ``rdf:ID`` / ``rdf:resource`` / ``rdf:about``.
    """

    def __init__(self, ontology):
        """
        Args:
            ontology: Path (or file object) accepted by ``minidom.parse``.
        """
        self.ontology = ontology
        self.ontology_obj = minidom.parse(ontology)
        self.root = self.ontology_obj.documentElement
        # Order matters: triples need the property lists and the subclass
        # pairs, and classes need the triples.
        self.subclasses = self.parse_subclasses()
        self.object_properties = self.parse_object_properties()
        self.data_properties = self.parse_data_properties()
        self.triples = self.parse_triples()
        self.classes = self.parse_classes()

    def get_child_node(self, element, tag):
        """Return the direct child elements of ``element`` whose tag name equals ``tag``."""
        # Public DOM attributes instead of minidom's private `_get_*` accessors.
        return [e for e in element.childNodes if isinstance(e, minidom.Element) and e.tagName == tag]

    def has_attribute_value(self, element, attribute, value):
        """Return True when the fragment after the last '#' of ``attribute`` equals ``value``."""
        return element.getAttribute(attribute).split("#")[-1] == value

    def get_subclass_triples(self):
        """Return ``(child, parent, "subclass_of")`` for every cached subclass pair."""
        return [(b,a,"subclass_of") for (a,b) in self.get_subclasses()]

    def parse_triples(self, union_flag=0, subclass_of=True):
        """Build de-duplicated ``(domain, range, property)`` triples.

        Args:
            union_flag: 0 emits one triple per (domain, range) combination;
                any other value emits a single triple per property with the
                domain and range names joined by ``"###"``.
            subclass_of: When True, the subclass triples are appended too.

        Returns:
            List of unique triples (order is not defined).
        """
        all_triples = []
        for prop in self.object_properties + self.data_properties:
            domain_children = self.get_child_node(prop, "rdfs:domain")
            range_children = self.get_child_node(prop, "rdfs:range")
            if not domain_children or not range_children:
                continue
            domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children])
            range_prop = self.filter_null([self.extract_ID(el) for el in range_children])
            # No inline identifier: fall back to the owl:Class elements nested
            # under the first domain/range child (e.g. a union of classes).
            if not domain_prop:
                domain_prop = self.filter_null([self.extract_ID(el) for el in domain_children[0].getElementsByTagName("owl:Class")])
            if not range_prop:
                range_prop = self.filter_null([self.extract_ID(el) for el in range_children[0].getElementsByTagName("owl:Class")])
            if domain_prop and range_prop:
                prop_id = self.extract_ID(prop)
                if union_flag == 0:
                    all_triples.extend([(dom, rng, prop_id) for (dom, rng) in itertools.product(domain_prop, range_prop)])
                else:
                    all_triples.append(("###".join(domain_prop), "###".join(range_prop), prop_id))
        if subclass_of:
            all_triples.extend(self.get_subclass_triples())
        return list(set(all_triples))

    def get_triples(self, union_flag=0, subclass_of=True, include_inv=True):
        """Recompute and return the triples; ``include_inv`` is accepted but unused."""
        return self.parse_triples(union_flag, subclass_of)

    def parse_subclasses(self, union_flag=0):
        """Collect ``(parent_element, child_element)`` pairs from ``rdfs:subClassOf`` nodes.

        ``union_flag`` is accepted but unused. The child is always the parent
        node of the ``rdfs:subClassOf`` element.
        """
        subclasses = self.root.getElementsByTagName("rdfs:subClassOf")
        subclass_pairs = []
        for el in subclasses:
            if self.extract_ID(el):
                # Parent referenced inline, e.g. rdf:resource="#Parent".
                subclass_pairs.append((el, el.parentNode))
                continue
            level1_class = self.get_child_node(el, "owl:Class")
            if not level1_class:
                continue
            if self.extract_ID(level1_class[0]):
                subclass_pairs.append((level1_class[0], el.parentNode))
            else:
                # Anonymous class: use every identified owl:Class nested inside it.
                level2classes = level1_class[0].getElementsByTagName("owl:Class")
                subclass_pairs.extend([(elem, el.parentNode) for elem in level2classes if self.extract_ID(elem)])
        return subclass_pairs

    def get_subclasses(self):
        """Return ``(parent_id, child_id)`` string pairs for the cached subclass elements."""
        return [(self.extract_ID(a), self.extract_ID(b)) for (a,b) in self.subclasses]

    def filter_null(self, data):
        """Return the truthy items of ``data``, preserving order."""
        return [el for el in data if el]

    def extract_ID(self, element):
        """Return the identifier of ``element`` ('' when it has none).

        Uses the first non-empty of rdf:ID, rdf:resource and rdf:about and
        keeps the part after the last '#'.
        """
        element_id = element.getAttribute("rdf:ID") or element.getAttribute("rdf:resource") or element.getAttribute("rdf:about")
        return element_id.split("#")[-1]

    def parse_classes(self):
        """Return unique names from all owl:Class elements plus every triple's first two fields."""
        class_elems = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        # itertools.chain keeps the class independent of a module-level `flatten`.
        triple_endpoints = list(set(itertools.chain.from_iterable(el[:-1] for el in self.triples)))
        return list(set(self.filter_null(class_elems + triple_endpoints)))

    def get_classes(self):
        """Return the cached class names."""
        return self.classes

    def get_entities(self):
        """Return unique, non-empty identifiers of all owl:Class elements."""
        entities = [self.extract_ID(el) for el in self.root.getElementsByTagName("owl:Class")]
        return list(set(self.filter_null(entities)))

    def _typed_properties(self, wrapper_tag, kind):
        """Return root-level ``wrapper_tag`` elements that have an rdf:type child pointing at ``kind``."""
        return [prop for prop in self.get_child_node(self.root, wrapper_tag)
                if any(self.has_attribute_value(type_el, "rdf:resource", kind)
                       for type_el in self.get_child_node(prop, "rdf:type"))]

    def _parse_properties(self, tag, kind):
        """Return direct ``tag`` elements plus (inverse) functional properties typed as ``kind``."""
        return (self.get_child_node(self.root, tag)
                + self._typed_properties('owl:FunctionalProperty', kind)
                + self._typed_properties('owl:InverseFunctionalProperty', kind))

    def parse_data_properties(self):
        """Return the elements that declare datatype properties."""
        return self._parse_properties('owl:DatatypeProperty', "DatatypeProperty")

    def parse_object_properties(self):
        """Return the elements that declare object properties."""
        return self._parse_properties('owl:ObjectProperty', "ObjectProperty")

    def get_object_properties(self):
        """Return unique, non-empty identifiers of the cached object properties."""
        obj_props = [self.extract_ID(el) for el in self.object_properties]
        return list(set(self.filter_null(obj_props)))

    def get_data_properties(self):
        """Return unique, non-empty identifiers of the cached data properties."""
        data_props = [self.extract_ID(el) for el in self.data_properties]
        return list(set(self.filter_null(data_props)))


def greedy_matching():
    """Score every test pair with the global ``model`` and sweep a similarity threshold.

    Reads module-level state: ``batch_size``, ``test_data_t``, ``test_data_f``,
    ``model``, ``device``, ``test_onto``, ``emb_indexer_inv``, ``gt_mappings``
    and ``all_metrics``.

    Side effects:
        * shuffles ``test_data_t`` / ``test_data_f`` in place and rebinds the
          global ``batch_size``;
        * prints the metrics for each threshold tried;
        * appends ``(best_threshold, [precision, recall, f1, f2, f0.5])`` to
          ``all_metrics`` when at least one threshold produced valid metrics.

    Returns:
        OrderedDict mapping ``(entity1, entity2)`` to ``(score, is_positive)``.
    """
    global batch_size, test_data_t, test_data_f, model, optimizer, emb_indexer_inv, gt_mappings, all_metrics
    all_results = OrderedDict()
    with torch.no_grad():
        batch_size = min(batch_size, len(test_data_t))
        num_batches = int(ceil(len(test_data_t)/batch_size))
        # Negatives are spread evenly over the same number of batches as positives.
        batch_size_f = int(ceil(len(test_data_f)/num_batches))

        np.random.shuffle(test_data_t)
        np.random.shuffle(test_data_f)

        inputs_pos, targets_pos = generate_input(test_data_t, 1)
        inputs_neg, targets_neg = generate_input(test_data_f, 0)

        indices_pos = np.random.permutation(len(inputs_pos))
        indices_neg = np.random.permutation(len(inputs_neg))

        inputs_pos, targets_pos = inputs_pos[indices_pos], targets_pos[indices_pos]
        inputs_neg, targets_neg = inputs_neg[indices_neg], targets_neg[indices_neg]

        # NOTE(review): `inputs_pos` holds embedding indices, not entity-name
        # pairs, so this membership test looks unlikely to match any
        # ground-truth mapping -- confirm the intended filter. Left unchanged.
        gt_mappings_filt = [el for el in gt_mappings if el in inputs_pos]

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx+1) * batch_size

            batch_start_f = batch_idx * batch_size_f
            batch_end_f = (batch_idx+1) * batch_size_f

            inputs = np.concatenate((inputs_pos[batch_start: batch_end], inputs_neg[batch_start_f: batch_end_f]))
            targets = np.concatenate((targets_pos[batch_start: batch_end], targets_neg[batch_start_f: batch_end_f]))

            inp = inputs.transpose(1,0,2)

            # Non-zero indices per row minus one (the entity itself) is passed
            # to the model as the neighbour count.
            nonzero_elems = np.count_nonzero(inp, axis=-1) - 1

            inp_elems = torch.LongTensor(inputs).to(device)
            seq_lens = torch.LongTensor(nonzero_elems.T).to(device)

            outputs = model(inp_elems, seq_lens)
            outputs = [el.item() for el in outputs]

            targets = [bool(el.item()) for el in targets]
            for idx, pred_elem in enumerate(outputs):
                # The first index on each side identifies the entity itself.
                ent1 = emb_indexer_inv[inp[0][idx][0]]
                ent2 = emb_indexer_inv[inp[1][idx][0]]
                if (ent1, ent2) in all_results:
                    print ("Error: ", ent1, ent2, "already present")
                all_results[(ent1, ent2)] = (pred_elem, targets[idx])

        optimum_metrics, opt_threshold = [-1000 for i in range(5)], -1000
        scores = [el[0] for el in all_results.values()]
        low_threshold = np.min(scores) - 0.02
        high_threshold = np.max(scores) + 0.02
        threshold = low_threshold
        step = 0.01
        while threshold < high_threshold:
            res = [key for key in all_results if all_results[key][0] > threshold]
            # Built once per threshold instead of once per ground-truth key.
            predicted = set(res)
            fn_list = [key for key in gt_mappings_filt if key not in predicted and is_test(test_onto, key)]
            fp_list = [elem for elem in res if not all_results[elem][1]]
            tp_list = [elem for elem in res if all_results[elem][1]]

            tp, fn, fp = len(tp_list), len(fn_list), len(fp_list)
            exception = False

            try:
                precision = tp/(tp+fp)
                recall = tp/(tp+fn)
                f1score = 2 * precision * recall / (precision + recall)
                f2score = 5 * precision * recall / (4 * precision + recall)
                f0_5score = 1.25 * precision * recall / (0.25 * precision + recall)
            except Exception as e:
                # Typically a division by zero when nothing is predicted or
                # matched; skip this threshold with the coarse step.
                print (e)
                exception = True
                step = 0.01
                threshold += step
                continue
            print ("Threshold: ", threshold, precision, recall, f1score, f2score, f0_5score)

            if f1score > optimum_metrics[2]:
                optimum_metrics = [precision, recall, f1score, f2score, f0_5score]
                opt_threshold = threshold

            # Use a much finer step once the threshold passes 0.98.
            if threshold > 0.98 and not exception:
                step = 0.0001
            else:
                step = 0.01
            print (step, threshold, exception)
            threshold += step 
        print ("Precision: {} Recall: {} F1-Score: {} F2-Score: {} F0.5-Score: {}".format(*optimum_metrics))
        # -1000 is the "no valid threshold found" sentinel set above.
        if optimum_metrics[2] != -1000:
            all_metrics.append((opt_threshold, optimum_metrics))
    return all_results

def write(elem):
    """Append ``elem`` to the ``Logs`` file in the current working directory.

    Args:
        elem: A list or tuple is written one item per line (each item passed
            through ``str``); anything else is written as ``str(elem)``.

    The text is always preceded by a newline. Nothing is returned.
    """
    # Exact type check (not isinstance) keeps list/tuple subclasses on the
    # plain ``str(elem)`` path, as before.
    if type(elem) in (list, tuple):
        text = "\n".join(str(item) for item in elem)
    else:
        text = str(elem)
    # The context manager closes the handle even if the write raises.
    with open("Logs", "a+") as log_file:
        log_file.write("\n" + text)
    
# Module-level debug placeholders. The only visible references are the
# commented-out `global inputs3, results3` lines in SiameseNetwork.forward.
inputs3, results3 = None, None

class SiameseNetwork(nn.Module):
    """Scores a pair of entities by cosine similarity of [node ; attended-neighbour-context] vectors.

    Reads three module-level names: ``emb_vals`` (the pretrained embedding
    table) and ``dropout`` at construction time, and ``device`` in ``forward``.
    """
    def __init__(self, embedding_dim, hidden_dim, num_layers):
        """Build the frozen embedding table and the layers.

        Args:
            embedding_dim: Width of the embedding table. ``forward`` reshapes
                vectors to a hard-coded 512, so other values will not work there.
            hidden_dim: Hidden size for the LSTM / bilinear layers.
            num_layers: Number of LSTM layers.
        """
        super().__init__() 
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directions = 2
        
        # Load the pretrained vectors from the global `emb_vals` and freeze them.
        self.name_embedding = nn.Embedding(len(emb_vals), self.embedding_dim)
        self.name_embedding.load_state_dict({'weight': torch.from_numpy(np.array(emb_vals))})
        self.name_embedding.weight.requires_grad = False

        # NOTE(review): `dropout` is not a parameter; this reads a module-level
        # name and raises NameError if it is undefined. It is stored but not
        # applied anywhere in this class.
        self.dropout = dropout
        
        # `lstm` and `bilinear` are created but never used by `forward`.
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, self.num_layers, bidirectional=True, batch_first=True)
        self.cosine_sim_layer = nn.CosineSimilarity(dim=1)
        # 1024 = two concatenated 512-d vectors (node + neighbour), see `forward`.
        self.attn = nn.Linear(1024, 1)
        self.bilinear = nn.Bilinear(self.hidden_dim, self.hidden_dim, 1)

    def forward(self, inputs, seq_lens):
        """Return one cosine-similarity score per pair in the batch.

        Args:
            inputs: 3-D index tensor; after ``permute(1,0,2)`` the first axis
                selects the side of the pair (0 or 1). Presumably
                (batch, 2, 1 + max_neighbours) -- TODO confirm against caller.
            seq_lens: Neighbour counts, transposed here so that
                ``seq_lens[i][j]`` is the count for side ``i``, item ``j``.
        """
        results = []
        inputs = inputs.permute(1,0,2)
        seq_lens = seq_lens.T
        #print ("input len: {} seq len: {}, rev len: {}".format(inputs.shape, seq_lens.shape, rev_indices.shape))
        for i in range(2):
            x = self.name_embedding(inputs[i])
            # Position 0 along axis 1 is the entity itself; the rest are neighbours.
            node = x.permute(1,0,2)[:1].permute(1,0,2)
            neighbours = x.permute(1,0,2)[1:].permute(1,0,2)
            # Accumulators are created on the module-level `device`.
            context = torch.DoubleTensor().to(device)
            
            for j,elem in enumerate(neighbours):
                curr_context = torch.DoubleTensor().to(device)
                # Only the first seq_lens[i][j] neighbours are used.
                for neighbour in elem[:seq_lens[i][j],:]:
                    # Scalar weight from the concatenated (node, neighbour) vector,
                    # then scale the neighbour vector by it.
                    attention = self.attn(torch.cat((node[j].reshape(512), neighbour.reshape(512))))
                    attention = attention * neighbour.reshape(512)
                    curr_context = torch.cat((curr_context, attention.unsqueeze(0)))
                # NOTE(review): when seq_lens[i][j] is 0, curr_context is empty and
                # this mean is taken over an empty tensor -- confirm that case
                # cannot occur or handle it explicitly.
                context = torch.cat((context, torch.mean(curr_context, dim=0).unsqueeze(0)))
            
            # Final representation per item: [node (512) ; context (512)].
            x = torch.cat((node.reshape(-1, 512), context.reshape(-1, 512)), dim=1)
            results.append(x)
        #global inputs3, results3
        #results3 = results
        #inputs3 = inputs
        #x = self.layer1(results[0], results[1])
        #x = F.log_softmax(x)
        x = self.cosine_sim_layer(results[0], results[1])
        return x


def get_one_hop_neighbours(ont, K=None):
    """Build the one-hop neighbourhood list of every element of an ontology.

    The ontology is read from "conference_ontologies/<ont>.owl". Each element
    that occurs in a triple maps to a list that starts with the element itself,
    followed by its distinct neighbours in sorted order:

    * for the first two members of every triple returned by `get_triples()`,
      each is a neighbour of the other;
    * for the third member of every triple returned by
      `get_triples(subclass_of=False)`, the first two members are its
      neighbours. If such a member also occurs as an entity, this
      property-style list replaces the entity-style one.

    Elements that occur in no triple get no entry at all.

    Args:
        ont: ontology name, i.e. the file stem under "conference_ontologies/".
        K: maximum length of each returned list, counting the element itself.
            Previously this argument was accepted but ignored and the limit was
            always taken from `sys.argv[1]`. It is now honoured; when left as
            `None` the limit is still read from `sys.argv[1]`, so existing call
            sites that omit it behave as before.

    Returns:
        dict mapping "<ont>#<element>" to a list of "<ont>#<neighbour>" strings
        of at most `K` entries, the first being the element itself.

    Raises:
        ValueError / IndexError: if `K` is None and `sys.argv[1]` is missing or
            is not an integer.
    """
    if K is None:
        K = int(sys.argv[1])

    ont_obj = Ontology("conference_ontologies/" + ont + ".owl")
    triples = ont_obj.get_triples()
    entities = [(a, b) for (a, b, c) in triples]

    # Every list starts with the element itself so slot 0 is always the node.
    neighbours_dict = {elem: [elem] for elem in set(flatten(entities))}
    for e1, e2 in entities:
        neighbours_dict[e1].append(e2)
        neighbours_dict[e2].append(e1)

    prop_triples = ont_obj.get_triples(subclass_of=False)
    neighbours_dict_props = {c: [c] for a, b, c in prop_triples}
    for e1, e2, p in prop_triples:
        neighbours_dict_props[p].extend([e1, e2])

    # Property entries win when a name appears in both tables.
    neighbours_dict = {**neighbours_dict, **neighbours_dict_props}

    prefixed = {}
    for el, hops in neighbours_dict.items():
        # Keep the node first, then its neighbours de-duplicated and sorted so
        # the truncation below is deterministic.
        ordered = hops[:1] + sorted(set(hops[1:]))
        prefixed[ont + "#" + el] = [ont + "#" + e for e in ordered[:K]]
    return prefixed

def generate_data(elem_tuple):
    """Translate a tuple of entity names into an array of embedding indices.

    For every entity, the ontology name is the part before the first "#"; the
    entity's neighbour list is fetched from `neighbours_dicts` and each name in
    it is mapped through `emb_indexer`. One row is produced per entity.
    A missing key in either lookup table propagates as KeyError.
    """
    rows = []
    for entity in elem_tuple:
        ont_name = entity.split("#")[0]
        neighbourhood = neighbours_dicts[ont_name][entity]
        rows.append([emb_indexer[name] for name in neighbourhood])
    return np.array(rows)

def generate_input(elems, target):
    """Encode every element with `generate_data`, skipping unknown ones.

    Args:
        elems: iterable of entity tuples accepted by `generate_data`. It is
            materialised once, so generators are supported too.
        target: label stored alongside every element that could be encoded.

    Returns:
        A pair `(inputs, targets)` of numpy arrays of equal length: the encoded
        elements and one copy of `target` for each of them.

    Elements for which `generate_data` raises KeyError (a name missing from the
    lookup tables) are dropped. Only KeyError is caught: the previous bare
    `except:` also swallowed KeyboardInterrupt and genuine programming errors,
    which made this long loop impossible to interrupt and hid real failures.
    """
    elems = list(elems)
    inputs, targets = [], []
    for elem in elems:
        try:
            encoded = generate_data(elem)
        except KeyError:
            # Entity (or one of its neighbours) is not in the lookup tables.
            continue
        inputs.append(encoded)
        targets.append(target)
    print ("Filtered len: ", len(inputs), "Original len:", len(elems))
    return np.array(inputs), np.array(targets)

# One neighbourhood table per ontology that takes part in any alignment.
neighbours_dicts = {ont: get_one_hop_neighbours(ont)
                    for ont in set(flatten(ontologies_in_alignment))}

# Length of the longest neighbourhood list across all ontologies.
max_neighbours = np.max([len(hood)
                         for table in neighbours_dicts.values()
                         for hood in table.values()])

# True (unpadded) length of every neighbourhood list, recorded before padding.
neighbours_lens = {ont: {key: len(hood) for key, hood in table.items()}
                   for ont, table in neighbours_dicts.items()}

# Right-pad every list with "<UNK>" so that all lists share the same length.
neighbours_dicts = {ont: {key: hood + ["<UNK>"] * int(max_neighbours - len(hood))
                          for key, hood in table.items()}
                    for ont, table in neighbours_dicts.items()}

print("Number of neighbours: " + sys.argv[1])

# Shuffle the labelled pairs. The items view has to be materialised into a list
# that we keep a reference to: `np.random.shuffle(list(view))` shuffles a
# temporary copy and discards it, leaving `data` in its original order.
data_items = list(data.items())
np.random.shuffle(data_items)
data = OrderedDict(data_items)

print ("Number of entities:", len(data))
# Distinct (ontology, ontology) combinations, taken from the part of each
# entity name that precedes the first "#".
all_ont_pairs = list({tuple(el.split("#")[0] for el in pair) for pair in data})

all_metrics = []

for i in range(0, len(all_ont_pairs), 3):
    # Hold out the next (up to) three ontology pairs as the test fold; every
    # other pair goes into the training fold.
    test_onto = all_ont_pairs[i:i+3]

    train_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) not in test_onto}
    test_data = {elem: data[elem] for elem in data if tuple([el.split("#")[0] for el in elem]) in test_onto}

    # forward() builds DoubleTensors, so new tensors default to float64 as well.
    torch.set_default_dtype(torch.float64)

    train_test_split = 0.9

    # Positive (aligned) and negative (non-aligned) training pairs.
    train_data_t = [key for key in train_data if data[key]]
    train_data_f = [key for key in train_data if not data[key]]

    # Hyper-parameters. They are deliberately left as notebook-level names:
    # code outside this cell may read them as globals -- TODO confirm which.
    lr = 0.001
    num_epochs = 50
    weight_decay = 0.001
    batch_size = 8
    dropout = 0.3
    batch_size = min(batch_size, len(train_data_t))
    num_batches = int(ceil(len(train_data_t)/batch_size))
    # Spread the negatives evenly over the same number of batches.
    batch_size_f = int(ceil(len(train_data_f)/num_batches))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = SiameseNetwork(512, 250, 1).to(device)
    # map_location lets a checkpoint that was saved from a CUDA run be loaded
    # on a CPU-only machine instead of raising a deserialisation error.
    model.load_state_dict(torch.load("/attention.pt", map_location=device))
    model.eval()

    test_data_t = [key for key in test_data if data[key]]
    test_data_f = [key for key in test_data if not data[key]]

    res = greedy_matching()
    # Only the first fold is evaluated.
    break

if all_metrics:
    print ("Final Results: " + str(np.mean([el[1] for el in all_metrics], axis=0)))
    print ("Best threshold: " + str(all_metrics[np.argmax([el[1][2] for el in all_metrics])][0]))
else:
    # np.argmax raises on an empty sequence; report the situation instead.
    print ("No metrics were collected; skipping the summary.")

Number of neighbours: -f
Number of entities: 122893
Filtered len:  261 Original len: 263
Filtered len:  108145 Original len: 112782
Epoch: 0 Idx: 0 Loss: 0.3495687231833145
Epoch: 0 Idx: 10 Loss: 0.04882450803129536


KeyboardInterrupt: 

In [144]:
# Scratch check of dot-product attention on random data:
# `a` holds one vector per example, `b` holds nine vectors per example.
a = torch.randn(3993, 1, 512)
b = torch.randn(3993, 9, 512)
# Batched dot product of every vector in `b` with the single vector in `a`.
mult = torch.matmul(b, a.transpose(1, 2)).squeeze()
# Normalise the scores and restore a trailing axis for broadcasting against `b`.
att_weights = masked_softmax(mult)[..., None]


In [149]:
# Scale every vector in `b` by its attention weight (broadcast over the last axis).
c = torch.mul(att_weights, b)

In [169]:
def masked_softmax(inp, dim=-1):
    """Softmax that gives (numerically) zero weight to entries equal to 0.

    Entries that are exactly 0 are treated as padding: 9999 is subtracted from
    them before the softmax, so their weight underflows to 0 whenever the same
    slice contains at least one non-zero entry. A finite offset is used rather
    than -inf, so a slice made only of zeros yields a uniform distribution
    instead of NaN.

    Args:
        inp: tensor of scores; it is converted to float64.
        dim: axis along which the softmax is taken. Defaults to the last axis,
            which was previously hard-coded. Pass the axis that enumerates the
            candidates -- a softmax over an axis of size 1 is always 1.

    Returns:
        float64 tensor of the same shape as `inp` whose entries sum to 1 along
        `dim`.
    """
    inp = inp.double()
    # 0 where the score is non-zero, -9999 where it is exactly 0.
    mask = ((inp != 0).double() - 1) * 9999
    return F.softmax(inp + mask, dim=dim)
masked_softmax(torch.DoubleTensor([[1,1,0], [0.5, 0,0 ], [0.2, 0,0.8 ]])).requires_grad

False

In [145]:
mult

torch.Size([3993, 9, 1])

In [175]:
# Shape before and after collapsing the neighbour axis of the weighted vectors.
print (torch.mul(att_weights, b).shape)
(att_weights * b).sum(dim=1).shape
# torch.cat((a.reshape(-1, 512), context.reshape(-1, 512)), dim=1).shape

torch.Size([3993, 9, 512])


torch.Size([3993, 512])

In [198]:
# Summarise every "Output_att*" log in the working directory as one TSV row:
# experiment label, the values of its "Final Results:" line, and a description.
final = []
for file in os.listdir("."):
    if not file.startswith("Output_att"):
        continue

    # Read each log once and close the handle promptly.
    with open(file) as log:
        log_lines = log.read().split("\n")

    # All digits of the file name, concatenated, give the neighbour count.
    neighbours = int(''.join(filter(str.isdigit, file)))
    # str() is required here: concatenating the int directly raises TypeError.
    intent = "USE + dot Attn + " + str(neighbours) + " neighbours + Cos Sim "
    # Convert only the text after the label; float() on the whole
    # "Best threshold: ..." line raises ValueError.
    threshold_line = [l for l in log_lines if "Best threshold:" in l][0]
    threshold = str(round(float(threshold_line.split("Best threshold:")[1]), 3))

    # The experiment variant is encoded in the file name; first match wins.
    if "ent_prop" in file:
        intent += "(avg, ent+prop)"
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, softmax, weighted average, concat with entity, both entities and props"
    elif "sum" in file:
        intent += "(sum)"
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, softmax, weighted sum, concat with entity"
    elif "unsoftmax" in file:
        intent += "(avg, no softmax)"
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, weighted average, concat with entity"
    elif "context" in file:
        intent += "(only context)"
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, softmax, weighted average, context is directly output"
    elif "normalize" in file:
        intent += "(avg, normalized)"
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, softmax, weighted average, normalized, concat with entity"
    else:
        desc = "Optimum threshold " + threshold + ", Dot product of node with neighbours, softmax, weighted average, concat with entity"

    # "Final Results: [v1 v2 ...]" -> ["v1", "v2", ...]
    results = [l for l in log_lines if "Final Results:" in l][0]
    results = results.split("[")[1].split("]")[0].strip().split()
    line = "\t".join([intent] + results + [desc])
    final.append(line)

# Order the rows by experiment label.
final = sorted(final, key=lambda x:x.split("\t")[0])
with open("results.tsv", "w+") as out:
    out.write("\n".join(final))

tensor([[-0.2673, -0.5345,  0.8018],
        [ 0.2673,  0.5345,  0.8018]])

In [370]:
# Instantiate several sentence-embedding models by checkpoint name.
# The recorded output of this cell shows large archives being downloaded, so
# running it requires network access (or a populated local cache).
from sentence_transformers import SentenceTransformer
# STS-b fine-tuned variants, currently disabled:
# roberta_large = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
# bert_large = SentenceTransformer("bert-large-nli-stsb-mean-tokens")
roberta_large_nli = SentenceTransformer('roberta-large-nli-mean-tokens')
bert_large_nli = SentenceTransformer('bert-large-nli-mean-tokens')
# NOTE(review): the two variables below are named `distilbert_large*` but load
# `distilbert-base-*` checkpoints. The recorded 404 error refers to
# 'distilbert-large-nli-mean-tokens', which does not appear in this code, so
# the saved output presumably predates this edit -- confirm by re-running.
distilbert_large_nli = SentenceTransformer('distilbert-base-nli-mean-tokens')

distilbert_large = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


  0%|          | 0.00/1.31G [00:00<?, ?B/s][A
  0%|          | 32.8k/1.31G [00:00<1:40:43, 217kB/s][A
  0%|          | 49.2k/1.31G [00:00<2:12:21, 165kB/s][A
  0%|          | 98.3k/1.31G [00:00<1:53:24, 193kB/s][A
  0%|          | 164k/1.31G [00:00<1:34:52, 230kB/s] [A
  0%|          | 246k/1.31G [00:00<1:18:51, 277kB/s][A
  0%|          | 344k/1.31G [00:00<1:05:33, 333kB/s][A
  0%|          | 475k/1.31G [00:01<53:39, 407kB/s]  [A
  0%|          | 623k/1.31G [00:01<44:26, 492kB/s][A
  0%|          | 819k/1.31G [00:01<36:17, 602kB/s][A
  0%|          | 1.03M/1.31G [00:01<29:08, 750kB/s][A
  0%|          | 1.28M/1.31G [00:01<23:26, 932kB/s][A
  0%|          | 1.49M/1.31G [00:01<19:55, 1.10MB/s][A
  0%|          | 1.65M/1.31G [00:01<18:50, 1.16MB/s][A
  0%|          | 1.97M/1.31G [00:01<15:39, 1.39MB/s][A
  0%|          | 2.34M/1.31G [00:02<12:56, 1.69MB/s][A
  0%|          | 2.70M/1.31G [00:02<11:06, 1.96MB/s][A
  0%|          | 2.96M/1.31G [00:02<10:58, 1.99MB/s][A
  

 32%|███▏      | 426M/1.31G [00:21<00:37, 23.5MB/s][A
 33%|███▎      | 430M/1.31G [00:22<00:38, 23.0MB/s][A
 33%|███▎      | 434M/1.31G [00:22<00:37, 23.2MB/s][A
 33%|███▎      | 438M/1.31G [00:22<00:37, 23.4MB/s][A
 34%|███▎      | 442M/1.31G [00:22<00:37, 23.4MB/s][A
 34%|███▍      | 446M/1.31G [00:22<00:36, 23.4MB/s][A
 34%|███▍      | 450M/1.31G [00:22<00:37, 23.3MB/s][A
 35%|███▍      | 453M/1.31G [00:23<00:36, 23.3MB/s][A
 35%|███▍      | 456M/1.31G [00:23<00:46, 18.6MB/s][A
 35%|███▌      | 460M/1.31G [00:23<00:43, 19.7MB/s][A
 35%|███▌      | 462M/1.31G [00:23<00:46, 18.3MB/s][A
 35%|███▌      | 465M/1.31G [00:23<00:46, 18.3MB/s][A
 36%|███▌      | 468M/1.31G [00:23<00:45, 18.4MB/s][A
 36%|███▌      | 471M/1.31G [00:24<00:45, 18.5MB/s][A
 36%|███▌      | 474M/1.31G [00:24<00:44, 18.6MB/s][A
 36%|███▋      | 477M/1.31G [00:24<00:44, 18.8MB/s][A
 37%|███▋      | 480M/1.31G [00:24<00:43, 19.0MB/s][A
 37%|███▋      | 483M/1.31G [00:24<00:43, 19.2MB/s][A
 37%|███▋ 

 72%|███████▏  | 938M/1.31G [00:44<00:16, 22.7MB/s][A
 72%|███████▏  | 942M/1.31G [00:45<00:16, 23.0MB/s][A
 72%|███████▏  | 946M/1.31G [00:45<00:15, 23.3MB/s][A
 72%|███████▏  | 950M/1.31G [00:45<00:15, 23.4MB/s][A
 73%|███████▎  | 954M/1.31G [00:45<00:15, 23.5MB/s][A
 73%|███████▎  | 958M/1.31G [00:45<00:14, 23.6MB/s][A
 73%|███████▎  | 962M/1.31G [00:45<00:14, 23.7MB/s][A
 74%|███████▎  | 966M/1.31G [00:46<00:14, 23.5MB/s][A
 74%|███████▍  | 970M/1.31G [00:46<00:14, 23.7MB/s][A
 74%|███████▍  | 974M/1.31G [00:46<00:14, 23.7MB/s][A
 75%|███████▍  | 978M/1.31G [00:46<00:14, 23.7MB/s][A
 75%|███████▍  | 982M/1.31G [00:46<00:14, 23.5MB/s][A
 75%|███████▌  | 986M/1.31G [00:46<00:13, 23.4MB/s][A
 75%|███████▌  | 990M/1.31G [00:47<00:13, 23.4MB/s][A
 76%|███████▌  | 994M/1.31G [00:47<00:13, 23.4MB/s][A
 76%|███████▌  | 998M/1.31G [00:47<00:13, 23.5MB/s][A
 76%|███████▋  | 1.00G/1.31G [00:47<00:13, 23.6MB/s][A
 77%|███████▋  | 1.01G/1.31G [00:47<00:13, 23.5MB/s][A
 77%|███

  5%|▍         | 60.1M/1.24G [00:06<00:58, 20.2MB/s][A
  5%|▍         | 62.2M/1.24G [00:06<01:02, 18.8MB/s][A
  5%|▌         | 65.1M/1.24G [00:06<00:59, 19.7MB/s][A
  5%|▌         | 68.3M/1.24G [00:06<00:52, 22.3MB/s][A
  6%|▌         | 70.7M/1.24G [00:06<00:55, 21.1MB/s][A
  6%|▌         | 72.9M/1.24G [00:07<01:01, 19.1MB/s][A
  6%|▌         | 76.1M/1.24G [00:07<00:55, 21.1MB/s][A
  6%|▋         | 78.5M/1.24G [00:07<00:55, 20.8MB/s][A
  6%|▋         | 80.7M/1.24G [00:07<00:58, 19.9MB/s][A
  7%|▋         | 83.8M/1.24G [00:07<00:57, 20.3MB/s][A
  7%|▋         | 87.0M/1.24G [00:07<00:50, 22.8MB/s][A
  7%|▋         | 89.4M/1.24G [00:07<00:53, 21.7MB/s][A
  7%|▋         | 91.7M/1.24G [00:07<01:00, 19.2MB/s][A
  8%|▊         | 95.0M/1.24G [00:08<00:57, 20.1MB/s][A
  8%|▊         | 98.4M/1.24G [00:08<00:50, 22.8MB/s][A
  8%|▊         | 101M/1.24G [00:08<00:56, 20.1MB/s] [A
  8%|▊         | 103M/1.24G [00:08<00:59, 19.1MB/s][A
  9%|▊         | 107M/1.24G [00:08<00:52, 21.6MB/

 43%|████▎     | 531M/1.24G [00:27<00:31, 22.5MB/s][A
 43%|████▎     | 533M/1.24G [00:28<00:33, 21.3MB/s][A
 43%|████▎     | 536M/1.24G [00:28<00:34, 20.6MB/s][A
 43%|████▎     | 540M/1.24G [00:28<00:33, 21.2MB/s][A
 44%|████▎     | 544M/1.24G [00:28<00:32, 21.6MB/s][A
 44%|████▍     | 548M/1.24G [00:28<00:32, 21.6MB/s][A
 44%|████▍     | 552M/1.24G [00:28<00:31, 21.8MB/s][A
 45%|████▍     | 556M/1.24G [00:29<00:31, 21.9MB/s][A
 45%|████▌     | 560M/1.24G [00:29<00:31, 22.0MB/s][A
 45%|████▌     | 564M/1.24G [00:29<00:30, 22.3MB/s][A
 46%|████▌     | 568M/1.24G [00:29<00:29, 22.6MB/s][A
 46%|████▌     | 572M/1.24G [00:29<00:29, 22.7MB/s][A
 46%|████▋     | 575M/1.24G [00:29<00:26, 25.3MB/s][A
 46%|████▋     | 578M/1.24G [00:30<00:26, 24.8MB/s][A
 47%|████▋     | 580M/1.24G [00:30<00:30, 21.4MB/s][A
 47%|████▋     | 584M/1.24G [00:30<00:31, 21.2MB/s][A
 47%|████▋     | 586M/1.24G [00:30<00:29, 22.1MB/s][A
 47%|████▋     | 589M/1.24G [00:30<00:28, 23.3MB/s][A
 48%|████▊

 82%|████████▏ | 1.02G/1.24G [00:50<00:10, 20.9MB/s][A
 83%|████████▎ | 1.03G/1.24G [00:51<00:10, 21.2MB/s][A
 83%|████████▎ | 1.03G/1.24G [00:51<00:08, 23.9MB/s][A
 83%|████████▎ | 1.03G/1.24G [00:51<00:10, 20.9MB/s][A
 83%|████████▎ | 1.04G/1.24G [00:51<00:09, 21.5MB/s][A
 84%|████████▎ | 1.04G/1.24G [00:51<00:09, 21.8MB/s][A
 84%|████████▍ | 1.04G/1.24G [00:51<00:08, 24.2MB/s][A
 84%|████████▍ | 1.05G/1.24G [00:51<00:09, 21.4MB/s][A
 84%|████████▍ | 1.05G/1.24G [00:51<00:08, 21.8MB/s][A
 85%|████████▍ | 1.05G/1.24G [00:52<00:08, 22.2MB/s][A
 85%|████████▍ | 1.06G/1.24G [00:52<00:07, 24.2MB/s][A
 85%|████████▌ | 1.06G/1.24G [00:52<00:08, 21.3MB/s][A
 85%|████████▌ | 1.06G/1.24G [00:52<00:08, 20.9MB/s][A
 86%|████████▌ | 1.06G/1.24G [00:52<00:08, 21.4MB/s][A
 86%|████████▌ | 1.07G/1.24G [00:52<00:08, 21.6MB/s][A
 86%|████████▌ | 1.07G/1.24G [00:53<00:07, 22.0MB/s][A
 86%|████████▋ | 1.08G/1.24G [00:53<00:07, 21.9MB/s][A
 87%|████████▋ | 1.08G/1.24G [00:53<00:07, 22.3M

HTTPError: 404 Client Error: Not Found for url: https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distilbert-large-nli-mean-tokens.zip

In [369]:
inp

array([[[149, 106, 359, ...,   0,   0,   0],
        [288, 406,   0, ...,   0,   0,   0],
        [787, 460, 755, ...,   0,   0,   0],
        ...,
        [132, 650,   0, ...,   0,   0,   0],
        [369, 506,   0, ...,   0,   0,   0],
        [755, 787, 254, ...,   0,   0,   0]],

       [[245, 377,   8, ...,   0,   0,   0],
        [ 31, 203,   0, ...,   0,   0,   0],
        [204, 677,   0, ...,   0,   0,   0],
        ...,
        [539, 785,   0, ...,   0,   0,   0],
        [519, 357, 134, ...,   0,   0,   0],
        [ 92, 524,   0, ...,   0,   0,   0]]])

tensor([[  0,   1,   2,  ..., 509, 510, 511],
        [  1,   2,   3,  ..., 510, 511, 512],
        [  2,   3,   4,  ..., 511, 512, 513],
        ...,
        [  7,   8,   9,  ..., 516, 517, 518],
        [  8,   9,  10,  ..., 517, 518, 519],
        [  9,  10,  11,  ..., 518, 519, 520]])

In [298]:
# Scratch check: dot product of the summed `b` vectors with every vector in `a`.
a = torch.randn(30, 24, 512)
b = torch.randn(30, 24, 512)

# (30, 1, 512) x (30, 512, 24) -> (30, 1, 24), then drop the singleton axis.
c = torch.bmm(b.sum(dim=1, keepdim=True), a.transpose(1, 2)).squeeze()


In [325]:
def masked_softmax(inp):
    """Softmax over the last axis that suppresses entries equal to 0.

    The input is converted to float64. Every entry that is exactly 0 is shifted
    down by 9999 (a finite stand-in for -inf) before the softmax, so it receives
    a weight that underflows to 0 when its row also contains a non-zero entry.
    """
    scores = inp.double()
    # -9999 where the score is exactly 0, 0 everywhere else.
    penalty = (scores != 0).double().sub(1).mul(9999)
    return torch.softmax(scores + penalty, dim=-1)

masked_softmax(c)

tensor([[6.9270e-131, 3.4944e-131, 7.1421e-145,  2.9580e-54,  8.2338e-89,
         3.9982e-186,  1.7342e-17, 8.9854e-167, 1.4923e-161, 6.5160e-152,
         1.0486e-120, 1.0063e-183, 2.0121e-107, 1.4307e-116,  9.9897e-01,
          4.1657e-71, 1.4005e-208,  1.0280e-03, 3.3086e-100, 5.4466e-202,
         1.4825e-168, 1.1698e-115, 2.1451e-122, 1.1619e-146],
        [4.0660e-114,  1.5300e-66,  2.3934e-65, 1.1469e-145,  1.0000e+00,
          1.0844e-56,  2.4817e-64,  4.3483e-39, 1.0502e-113,  3.1929e-95,
          3.2372e-52, 1.0226e-135, 6.1078e-122, 1.5710e-148,  4.8968e-42,
          2.2181e-93, 8.7135e-151, 2.4445e-169, 5.3399e-126,  1.1250e-95,
          2.9161e-55, 1.1800e-151,  5.8789e-78, 1.0597e-119],
        [ 1.0000e+00, 1.5970e-153,  5.6147e-78, 2.3138e-101, 4.1165e-161,
         5.4245e-163,  1.0112e-90,  3.9471e-51, 1.1739e-187, 2.9246e-147,
          1.0554e-75,  9.7374e-29,  3.5493e-71, 4.8693e-133,  2.9032e-43,
         1.2432e-169,  1.4704e-63, 8.1574e-107,  1.3173e-39,  

In [328]:
import torch

# Scratch check of a broadcasted pairwise product, summed over the `a` axis.
a = torch.randn(30, 24, 512)
b = torch.randn(30, 24, 512)

# (30, 1, 24, 512) * (30, 24, 1, 512) -> (30, 24, 24, 512); summing axis 2
# leaves one 512-vector per (example, row of b).
result = torch.sum(a[:, None, :, :] * b[:, :, None, :], dim=2)

In [357]:
# Scratch check: apply masked_softmax to scores that keep a trailing axis of
# size 1. The recorded output is all ones, since the softmax runs over that
# single-element last axis.
t = torch.randn((3993, 9, 1))
print (t, masked_softmax(t))

tensor([[[-0.1367],
         [ 0.4712],
         [-1.0098],
         ...,
         [ 0.3564],
         [ 0.4959],
         [-0.5302]],

        [[-0.5156],
         [-2.1806],
         [-1.0502],
         ...,
         [-0.2044],
         [ 0.5750],
         [-1.1287]],

        [[ 1.5069],
         [ 0.7505],
         [-1.8575],
         ...,
         [ 0.5256],
         [ 0.5765],
         [-0.1208]],

        ...,

        [[-0.3807],
         [ 0.2224],
         [ 1.4568],
         ...,
         [-1.8325],
         [-0.8720],
         [-0.7922]],

        [[ 0.9426],
         [ 0.6242],
         [-0.5395],
         ...,
         [ 0.7824],
         [-1.6038],
         [ 0.3325]],

        [[ 0.9734],
         [ 0.0363],
         [-0.0815],
         ...,
         [ 0.8168],
         [-0.2826],
         [ 0.3225]]]) tensor([[[1.],
         [1.],
         [1.],
         ...,
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         ...,
     

In [355]:
# Manual cross-check: multiply the first vector of a[0] element-wise with every
# vector of b[0] and add the products up.
np.sum([(a[0][0] * row).numpy() for row in b[0]], axis=0)

array([-6.35313728e-01, -3.98441866e+00,  4.83618786e+00, -8.96162044e-01,
        6.93504260e-01, -2.40044428e+00, -3.37686708e+00, -2.81878735e+00,
       -3.90242832e+00,  1.86741772e+00, -1.69376359e+00,  2.46347982e+00,
       -2.19717695e+00, -5.03819966e-01,  1.53739391e-02, -3.81705850e+00,
        7.12020371e+00, -9.91619450e-01, -1.19323884e+01, -2.02825630e+00,
        9.53331606e-02, -1.43964033e-01, -2.59777649e-01,  1.59757490e+00,
       -6.77884832e+00,  5.33016878e+00,  1.30698192e+00, -2.35665532e-01,
       -1.01886989e+00, -1.89639585e+00,  4.42655160e+00,  3.01185643e+00,
        1.21648539e+00, -3.22416581e+00, -3.48676405e+00, -2.78747781e+00,
        2.56421665e+00, -2.46496803e-01,  2.30988111e+00,  6.03988253e+00,
       -4.36566272e-01,  2.67983028e+00, -3.90873991e+00, -1.80390903e+00,
       -4.97648965e+00, -1.20563486e+00, -6.58429221e-01, -3.60586970e+00,
        2.68783845e+00, -7.84170619e-01,  3.42336524e+00, -8.30507887e+00,
        9.02814358e+00,  

In [360]:
gt_mappings

[('conference#Information_for_participants', 'ekaw#Programme_Brochure'),
 ('conference#Person', 'ekaw#Person'),
 ('conference#Tutorial', 'ekaw#Tutorial'),
 ('conference#Review', 'ekaw#Review'),
 ('conference#has_a_review', 'ekaw#hasReview'),
 ('conference#Workshop', 'ekaw#Workshop'),
 ('conference#Late_paid_applicant', 'ekaw#Late-Registered_Participant'),
 ('conference#Early_paid_applicant', 'ekaw#Early-Registered_Participant'),
 ('conference#Organization', 'ekaw#Organisation'),
 ('conference#Track-workshop_chair', 'ekaw#Workshop_Chair'),
 ('conference#Abstract', 'ekaw#Abstract'),
 ('conference#Conference_proceedings', 'ekaw#Conference_Proceedings'),
 ('conference#Conference_volume', 'ekaw#Conference'),
 ('conference#Rejected_contribution', 'ekaw#Rejected_Paper'),
 ('conference#Poster', 'ekaw#Poster_Paper'),
 ('conference#Track', 'ekaw#Track'),
 ('conference#Topic', 'ekaw#Research_Topic'),
 ('conference#Conference_www', 'ekaw#Web_Site'),
 ('conference#Invited_speaker', 'ekaw#Invited_Sp