In [133]:
import json
import glob

import spacy

import re

import networkx as nx

from rdflib import URIRef, BNode, Literal, Namespace
from rdflib.namespace import DCTERMS, RDFS

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoTokenizer
from transformers import AutoModel

import torch
import torch.nn as nn
from torch import optim
from torch_geometric.utils.convert import from_networkx
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# spaCy pipeline applied to the English document text further down.
nlp_en = spacy.load("en_core_web_sm")

# Hidden-state layers whose outputs are summed to form an embedding.
# [-1] uses only the last layer; other selections can be tried in experiments.
layers = [-1]

# Checkpoint shared by the encoder and its tokenizer. Keeping the name in a
# single constant guarantees both are always loaded from the same checkpoint.
# Other models could be tried here as well.
MODEL_NAME = 'bert-base-multilingual-cased'
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Load only the first JSON file found in ./data.
# - glob returns paths in arbitrary order, so they are sorted to make the
#   selected file the same on every run and every machine.
# - each file is opened in a `with` block so its handle is closed promptly,
#   and decoded explicitly as UTF-8 rather than with the platform default.
data = []
for json_path in sorted(glob.glob('./data/*.json'))[:1]:
    with open(json_path, encoding='utf-8') as json_file:
        data.append(json.load(json_file))

In [77]:
# Show the first triple of the first entry in the first loaded file, to
# illustrate the annotation layout used by the cells below.
data[0][0]['triples'][0]

{'sentence_id': 0,
 'predicate': {'boundaries': None,
  'surfaceform': None,
  'uri': 'http://www.wikidata.org/prop/direct/P31',
  'annotator': 'NoSubject-Triple-aligner'},
 'object': {'boundaries': [94, 109],
  'surfaceform': 'language family',
  'uri': 'http://www.wikidata.org/entity/Q25295',
  'annotator': 'Wikidata_Spotlight_Entity_Linker'},
 'dependency_path': None,
 'confidence': None,
 'subject': {'boundaries': [4, 27],
  'surfaceform': 'Austroasiatic languages',
  'uri': 'http://www.wikidata.org/entity/Q33199',
  'annotator': 'Wikidata_Spotlight_Entity_Linker'},
 'annotator': 'NoSubject-Triple-aligner'}

In [127]:
# General helper functions and settings.
# Target device for the tensors moved with `.to(device)` further down;
# change it if a GPU is available.
device = torch.device('cpu')
# Uncomment the next line to use a GPU. PyTorch addresses NVIDIA GPUs as
# 'cuda'; torch.device('gpu') is not a valid device string and raises an error.
# NOTE(review): the model itself is not moved to `device` in the cells shown
# here -- confirm that before switching away from the CPU.
#device = torch.device('cuda')

#the next two functions are used to extract the embeddings from tokens / sentences
def get_hidden_states(encoded, model, layers):
    """Run `model` on an encoded input and sum the selected hidden-state layers.

    Args:
        encoded: mapping of keyword arguments forwarded to the model
            (`model(**encoded)`).
        model: callable whose output exposes a `hidden_states` sequence of
            tensors, one entry per layer.
        layers: indices into `hidden_states` of the layers to sum, e.g. `[-1]`
            for the last layer only.

    Returns:
        The element-wise sum of the selected layers, with the leading axis
        removed when it has size 1.
    """
    # Feature extraction only: no gradients are required.
    with torch.no_grad():
        output = model(**encoded)
    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    summed = torch.stack([states[i] for i in layers]).sum(0)
    # Squeeze the leading axis only. A bare squeeze() would also collapse any
    # other size-1 axis, e.g. the token axis of a one-token input, leaving
    # callers that index rows as tokens with a tensor of the wrong rank.
    return summed.squeeze(0)

def get_words_vector(sent, tokenizer, model, layers):
    """Encode `sent` and return the hidden-state embeddings of the encoding.

    The text is encoded into PyTorch tensors with `tokenizer.encode_plus`
    and passed to `get_hidden_states` together with `model` and `layers`;
    that function's result is returned unchanged.
    """
    model_inputs = tokenizer.encode_plus(sent, return_tensors="pt")
    embeddings = get_hidden_states(model_inputs, model, layers)
    return embeddings

def get_idx(string_list, boundaries, token_offsets):
    """Look up the token index of every word of a surface form.

    For each position in `string_list`, the length of the remaining words
    joined by single spaces is subtracted from the end boundary
    (`boundaries[1]`); the result is used as a key into `token_offsets`,
    and the first element of the stored entry is collected.

    Args:
        string_list: the words of the surface form, in order.
        boundaries: indexable pair; only the end offset `boundaries[1]` is read.
        token_offsets: mapping from an offset to a tuple whose first element
            is the token index.

    Returns:
        A list with one token index per word (empty for an empty word list).

    Raises:
        KeyError: if a computed offset is not a key of `token_offsets`.
    """
    return [
        token_offsets[boundaries[1] - len(' '.join(string_list[start:]))][0]
        for start in range(len(string_list))
    ]

In [213]:
# Build, for each of the first five documents, a graph whose nodes are the
# spaCy tokens plus one node per aligned triple subject/object, together with
# one (1, hidden_size) embedding per node.
#   graphs[k]        -> networkx graph of document k
#   full_rels[k]     -> list of (subject node id, object node id, predicate URI)
#   dict_sentence[k] -> {node id: embedding tensor of shape (1, hidden_size)}


def _token_piece_rows(doc_spacy, piece_offsets):
    """Map every spaCy token index to the word-piece rows overlapping its characters.

    `piece_offsets` holds one (start, end) character span per word piece, in
    the same order as the rows of the sentence embedding matrix. Special
    tokens carry an empty span and are therefore never assigned to a token.
    A word piece that straddles several spaCy tokens is assigned to each of them.
    """
    char_to_token = {}
    for token in doc_spacy:
        for char_pos in range(token.idx, token.idx + len(token.text)):
            char_to_token[char_pos] = token.i

    rows_by_token = {token.i: [] for token in doc_spacy}
    for row, (start, end) in enumerate(piece_offsets):
        covered = {char_to_token[c] for c in range(start, end) if c in char_to_token}
        for token_i in covered:
            rows_by_token[token_i].append(row)
    return rows_by_token


def _entity_token_ids(entity, token_offsets):
    """Return the token indices of a triple's subject/object, or None if it cannot be aligned."""
    surfaceform = entity['surfaceform']
    boundaries = entity['boundaries']
    if not surfaceform or boundaries is None:
        return None
    try:
        ids = get_idx(surfaceform.split(), boundaries, token_offsets)
    except KeyError:
        # A word of the surface form does not start on a spaCy token boundary.
        return None
    return ids or None


def _mean_pool(vectors):
    """Average a non-empty list of (1, dim) tensors into one (1, dim) tensor on `device`."""
    return torch.cat(vectors, dim=0).mean(dim=0, keepdim=True).to(device)


dict_sentence = {}

graphs = []
full_rels = []

for enum_doc, doc in enumerate(data[0][:5]):
    g = nx.Graph()
    dict_embeddings = {}
    edge_list = []
    relations_list = []

    string = doc['text']
    triples = doc['triples']
    token_offsets = {}

    sent_embeddings = get_words_vector(string, tokenizer, model, layers)
    hidden_size = sent_embeddings.shape[-1]

    # Character span of every word piece (requires a fast tokenizer). Aligning
    # by character offsets skips the special tokens -- row 0 is [CLS], not the
    # first word -- and stays correct where spaCy and the model tokenizer split
    # the text differently.
    piece_offsets = tokenizer(string, return_offsets_mapping=True)['offset_mapping']
    if len(piece_offsets) != sent_embeddings.shape[0]:
        raise ValueError(
            'word-piece count (%d) does not match the embedding rows (%d) for document %d'
            % (len(piece_offsets), sent_embeddings.shape[0], enum_doc)
        )

    doc_spacy = nlp_en(string)
    piece_rows = _token_piece_rows(doc_spacy, piece_offsets)
    for token in doc_spacy:
        token_offsets[token.idx] = (token.i, token.text)

        rows = piece_rows[token.i]
        if rows:
            # Average the word pieces that make up this token.
            token_embeddings = sent_embeddings[rows].mean(dim=0, keepdim=True)
        else:
            # Tokens without any word piece (e.g. pure whitespace) get a zero vector.
            token_embeddings = sent_embeddings.new_zeros((1, hidden_size))

        dict_embeddings[token.i] = token_embeddings.to(device)
        g.add_node(token.i, label=token.text, type='token')
        # Dependency edge to the syntactic head (the root points at itself).
        edge_list.append((token.i, token.head.i))

    # Entity nodes are numbered right after the last token node.
    next_i = len(doc_spacy)

    for triple in triples:
        subj_ids = _entity_token_ids(triple['subject'], token_offsets)
        obj_ids = _entity_token_ids(triple['object'], token_offsets)
        if subj_ids is None or obj_ids is None:
            # Triples whose subject or object cannot be located in the text are
            # skipped without consuming node ids.
            continue

        subj_string = triple['subject']['surfaceform']
        obj_string = triple['object']['surfaceform']
        subj_boundaries = triple['subject']['boundaries']
        obj_boundaries = triple['object']['boundaries']

        subj_i = next_i
        obj_i = next_i + 1
        next_i += 2

        g.add_node(subj_i, label=subj_string, type='entity')
        g.add_node(obj_i, label=obj_string, type='entity')
        edge_list.extend((i, subj_i) for i in subj_ids)
        edge_list.extend((i, obj_i) for i in obj_ids)

        # An entity embedding is the mean of its token embeddings; the result
        # has the same (1, hidden_size) shape as a token embedding.
        dict_embeddings[subj_i] = _mean_pool([dict_embeddings[i] for i in subj_ids])
        dict_embeddings[obj_i] = _mean_pool([dict_embeddings[i] for i in obj_ids])

        rel_uri = triple['predicate']['uri']
        relations_list.append((subj_i, obj_i, rel_uri))

        ##-----these might be useful in future------
        # Surface text of the predicate, sliced from the *current* document:
        # its own span when annotated, otherwise the span covering both entities.
        rel_boundaries = triple['predicate']['boundaries']
        if rel_boundaries is None:
            span_start = min(obj_boundaries[0], subj_boundaries[0])
            span_end = max(obj_boundaries[1], subj_boundaries[1])
        else:
            span_start, span_end = rel_boundaries[0], rel_boundaries[1]
        predicate_string = string[span_start:span_end]
        ##----------------------------------------------

    g.add_edges_from(edge_list)
    graphs.append(g)
    full_rels.append(relations_list)
    dict_sentence[enum_doc] = dict_embeddings

In [178]:
# First (subject node id, object node id, predicate URI) tuple recorded for
# the first processed document.
full_rels[0][0]

(252, 253, 'http://www.wikidata.org/prop/direct/P31')

In [179]:
# Attributes of the object-entity node of the first relation of the first
# document. The node id is read from `full_rels` instead of being hard-coded,
# so the cell keeps working when the input data (and with it the node
# numbering) changes.
graphs[0].nodes[full_rels[0][0][1]]

{'label': 'language family', 'type': 'entity'}

torch.Size([1, 768])

tensor(0.)