In [103]:
import json
import glob
import pandas as pd

import spacy

import re

import networkx as nx

from rdflib import URIRef, BNode, Literal, Namespace
from rdflib.namespace import DCTERMS, RDFS

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoTokenizer
from transformers import AutoModel

import torch
import torch.nn as nn
from torch import optim
from torch_geometric.utils.convert import from_networkx
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# spaCy English pipeline; used further down to tokenize and dependency-parse each sentence.
nlp_en = spacy.load("en_core_web_sm")
# Indices of the hidden-state layers that get summed into the token embeddings.
# [-1] keeps only the last layer; change the list to experiment with other layers.
layers = [-1]

# Load the pretrained encoder; output_hidden_states=True exposes the per-layer states
# that the embedding helpers read.
# Other checkpoints could be tried as well, as long as the tokenizer below uses the same name.
model = AutoModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def _load_json(path):
    """Parse one JSON file and close its handle as soon as it has been read."""
    # Decode explicitly as UTF-8 so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Only the first matching file is loaded; note that glob order is filesystem-dependent.
data = [_load_json(path) for path in glob.glob('./data/*.json')[:1]]

#### TODO

- <del>split the texts into sentences</del>
- <del>filter the sentences that have 2+ entities AND have a relation among them</del>


- baseline prediction (matching the blanks)

In [93]:
# General helper functions.
# `device` is where the per-token embedding tensors are moved; change it when a GPU is available.
device = torch.device('cpu')
# Uncomment the next line to use a GPU (PyTorch calls this device type 'cuda'; 'gpu' is not accepted).
#device = torch.device('cuda')

def split_sentences(sample):
    """Cut the text of ``sample`` into its sentences.

    Args:
        sample: mapping with a ``'sentences_boundaries'`` entry (a sequence of
            ``(start, end)`` character offsets) and a ``'text'`` entry.

    Returns:
        A ``(sentences, sentence_boundaries)`` pair: the list of substrings
        ``text[start:end]`` and the boundaries object taken from ``sample``.
    """
    spans = sample['sentences_boundaries']
    full_text = sample["text"]
    pieces = [full_text[span[0]:span[1]] for span in spans]
    return pieces, spans

def get_relations(sample):
    """Collect, sentence by sentence, the triples whose entities have known offsets.

    Args:
        sample: one document; a mapping with ``'text'``, ``'sentences_boundaries'``
            and ``'triples'``. Every triple is a mapping with ``'sentence_id'`` and
            ``'subject'`` / ``'object'`` / ``'predicate'`` sub-mappings that each
            carry a ``'boundaries'`` entry.

    Returns:
        A list with one dict per sentence that owns at least one usable triple,
        with the keys ``'sentence'`` (the sentence string), ``'triples'`` (its
        triples, in their original order) and ``'boundaries'`` (the sentence
        boundaries).

    Note:
        A predicate without boundaries is given the boundaries of its sentence.
        This is written into the triple itself, so ``sample`` is modified in place.
    """
    sentences, sentence_boundaries = split_sentences(sample)

    # Group the usable triples by sentence in a single pass instead of
    # rescanning the whole triple list once per sentence.
    triples_by_sentence = {}
    for triple in sample['triples']:
        sentence_id = triple['sentence_id']
        # Some entities have no boundaries, which makes it impossible to
        # retrieve their tokens; such triples are left out for now.
        if triple['object']['boundaries'] is None or triple['subject']['boundaries'] is None:
            continue
        triples_by_sentence.setdefault(sentence_id, []).append(triple)

    sentence_list = []
    for i, sentence in enumerate(sentences):
        triples_to_get = triples_by_sentence.get(i)
        if not triples_to_get:
            continue

        for rel in triples_to_get:
            if rel['predicate']['boundaries'] is None:
                rel['predicate']['boundaries'] = sentence_boundaries[i]

        sentence_list.append({
            'sentence': sentence,
            'triples': triples_to_get,
            'boundaries': sentence_boundaries[i],
        })
    return sentence_list
    
    

#the next two functions are used to extract the embeddings from tokens / sentences
def get_hidden_states(encoded, model, layers):
    """Run ``model`` on ``encoded`` and combine the requested hidden layers.

    Args:
        encoded: keyword arguments forwarded to ``model``.
        model: callable whose result exposes a ``hidden_states`` sequence.
        layers: indices into ``hidden_states`` of the layers to combine.

    Returns:
        The element-wise sum of the selected layers, with every size-1
        dimension squeezed out. The forward pass runs without gradients.
    """
    with torch.no_grad():
        model_output = model(**encoded)

    all_states = model_output.hidden_states
    selected = [all_states[layer] for layer in layers]
    # Stack along a new leading axis, then collapse that axis by summing.
    summed = torch.stack(selected).sum(0)
    return summed.squeeze()

def get_words_vector(sent, tokenizer, model, layers):
    """Encode ``sent`` with ``tokenizer`` and return its combined hidden states.

    Args:
        sent: the sentence string to encode.
        tokenizer: object providing ``encode_plus``.
        model: model passed through to ``get_hidden_states``.
        layers: hidden-layer indices passed through to ``get_hidden_states``.

    Returns:
        Whatever ``get_hidden_states`` returns for the encoded sentence.
    """
    model_inputs = tokenizer.encode_plus(sent, return_tensors="pt")
    return get_hidden_states(model_inputs, model, layers)

def get_idx(string_list, boundaries, token_offsets):
    """Map every word of a surface form to the token that starts at its offset.

    For each position ``k`` the start offset of word ``k`` is obtained by
    subtracting the length of ``' '.join(string_list[k:])`` from the end
    offset ``boundaries[1]``.

    Args:
        string_list: the words of the surface form.
        boundaries: pair whose second item is the end offset of the surface form.
        token_offsets: mapping from a start offset to a tuple whose first item
            is the value to collect.

    Returns:
        A list with one collected value per word.

    Raises:
        KeyError: if a computed offset is not a key of ``token_offsets``.
    """
    return [
        token_offsets[boundaries[1] - len(' '.join(string_list[first:]))][0]
        for first, _ in enumerate(string_list)
    ]

In [95]:
# Flatten files -> documents -> sentence records into a single list.
data_sentences = []
for file in data:
    for doc in file:
        data_sentences.extend(get_relations(doc))

For the sentence graph, we need:
- <del>edge index</del>
- <del>edge labels / type</del>
- <del>node features</del>
- (maybe) edge features


- <del>dataframes: [id_sentence, sentence_graph, sentence_string] || [id_sentence, relation, e1_node, e2_node]</del>
- <del>max num of nodes, num of dependency relations (inside the graph), dimensionality (300), num of relations to predict</del>

In [142]:
# Build, for every sentence, a dependency graph whose nodes are the spaCy tokens
# (each carrying a BERT feature vector), plus one df_rel row per triple whose
# subject and object could be mapped onto tokens.
dict_sentences = {}
graphs = []
full_rels = []

df_sent_columns = ['id_sentence', 'sentence_graph', 'sentence_string']
df_rel_columns = ['id_sentence', 'relation_uri', 'relation_boundaries', 'e1_node', 'e2_node']

sent_rows = []
rel_rows = []
skipped_triples = 0


for enum_sent, sentence in enumerate(data_sentences[:20]):
    print(enum_sent, len(data_sentences), sep= ' / ', end='\r')
    g = nx.Graph()
    dict_embeddings = {}
    edge_list = []
    relations_list = []

    # NOTE(review): collapsing double spaces moves every later character to the left,
    # while the triple boundaries still refer to the untouched text -- confirm that the
    # sentences never contain double spaces, or drop the replace.
    sentence_string = sentence['sentence'].replace('  ', ' ')
    sentence_boundaries = sentence['boundaries']
    triples = sentence['triples']
    token_offsets = {}

    sent_embeddings = get_words_vector(sentence_string, tokenizer, model, layers)
    hidden_size = sent_embeddings.shape[-1]

    # Character span of every word piece, row-aligned with sent_embeddings
    # (special tokens are included and carry the empty span (0, 0)).
    piece_spans = tokenizer(sentence_string, return_offsets_mapping=True)['offset_mapping']
    assert len(piece_spans) == sent_embeddings.shape[0], "word pieces and embeddings are out of sync"

    doc_spacy = nlp_en(sentence_string)

    for token in doc_spacy:
        start_token = token.idx
        end_token = token.idx + len(token.text)
        token_offsets[token.idx] = (token.i, token.text)

        # Rows of sent_embeddings whose word piece overlaps this token's characters.
        # Aligning by character offsets keeps the two tokenizations in step even when
        # spaCy and the BERT tokenizer split the text differently.
        piece_rows = [
            row
            for row, (piece_start, piece_end) in enumerate(piece_spans)
            if piece_end > piece_start and piece_start < end_token and piece_end > start_token
        ]

        if piece_rows:
            # Average the word pieces of the token into a single (1, hidden_size) vector.
            token_embeddings = sent_embeddings[piece_rows].mean(0, keepdim=True).to(device)
        else:
            # Tokens without any word piece (e.g. pure whitespace) get random features.
            token_embeddings = torch.rand(1, hidden_size, device=device)

        dict_embeddings[token.i] = token_embeddings

        g.add_node(token.i, label=token.text, type='token', features=token_embeddings, boundaries=(start_token, end_token))
        edge_list.append((token.i, token.head.i, token.dep_))

    for edge in edge_list:
        g.add_edge(edge[0], edge[1], label=edge[2])
    sent_rows.append([enum_sent, g, sentence_string])

    # First node id that is not taken by a token (well defined even for an empty parse).
    next_i = len(doc_spacy)

    for triple in triples:
        try:
            # Every field is looked up inside the try so that incomplete triples are skipped;
            # get_idx raises KeyError when an entity offset is not the start of a token.
            subj_uri = triple['subject']['uri']
            subj_string = triple['subject']['surfaceform']
            subj_string_list = subj_string.split()
            # Triple boundaries are made relative to the start of the sentence.
            subj_boundaries = [x - sentence_boundaries[0] for x in triple['subject']['boundaries']]
            subj_ids = get_idx(subj_string_list, subj_boundaries, token_offsets)

            obj_uri = triple['object']['uri']
            obj_string = triple['object']['surfaceform']
            obj_string_list = obj_string.split()
            obj_boundaries = [x - sentence_boundaries[0] for x in triple['object']['boundaries']]
            obj_ids = get_idx(obj_string_list, obj_boundaries, token_offsets)

            ##-----these might be useful in future------
            rel_boundaries = triple['predicate']['boundaries']
            rel_surfaceform = triple['predicate']['surfaceform']
            rel_uri = triple['predicate']['uri']
            ##----------------------------------------------
        except KeyError:
            # Best effort: leave the triple out, but keep count so the loss is visible.
            skipped_triples += 1
            continue

        subj_i = int(next_i)
        obj_i = int(next_i+1)
        relations_list.append((subj_i, obj_i, rel_uri))

        rel_rows.append([enum_sent, rel_uri, rel_boundaries, subj_ids, obj_ids])

df_rel = pd.DataFrame(rel_rows, columns=df_rel_columns)
df_sent = pd.DataFrame(sent_rows, columns = df_sent_columns)

if skipped_triples:
    print(f"\n{skipped_triples} triple(s) skipped: missing field or entity offset not on a token start")

19 / 17557

In [143]:
# Sizes derived from the sentence graphs, the spaCy parser and the relation table.
args = {
    "max_num_of_nodes": max(len(graph.nodes) for graph in df_sent['sentence_graph']),
    "num_of_graph_relations": len(nlp_en.get_pipe("parser").labels),
    "num_of_dimension": 300,
    "num_of_relations_to_predict": len(set(df_rel['relation_uri'])),
}
args

{'max_num_of_nodes': 75,
 'num_of_graph_relations': 45,
 'num_of_dimension': 300,
 'num_of_relations_to_predict': 23}

In [139]:
############## This cell should be ignored
# Earlier draft of the sentence-graph cell (In [142]); kept for reference only.
# Running it rebinds df_sent / df_rel to new objects, and the recorded output of this
# cell ends with `KeyError: 307`.

dict_sentences = {}
graphs = []
full_rels = []

df_sent_columns = ['id_sentence', 'sentence_graph', 'sentence_string']
df_rel_columns = ['id_sentence', 'relation', 'e1_node', 'e2_node', 'boundaries']

df_sent = []
df_rel = []

for enum_sent, sentence in enumerate(data_sentences):
    g = nx.Graph()
    dict_embeddings = {}
    edge_list = []
    skipped_tokens = 0
    relations_list = []

    sentence_string = sentence['sentence']
    triples = sentence['triples']
    token_offsets = {}

    sent_embeddings = get_words_vector(sentence_string, tokenizer, model, layers)
    doc_spacy = nlp_en(sentence_string)

    for token in doc_spacy:
        token_offsets[token.idx] = (token.i, token.text)
        token_idx = tokenizer.encode(token.text, add_special_tokens=False)

        token_embeddings = []
        for enum_idx, token_id in enumerate(token_idx):
            token_embeddings.append(sent_embeddings[token.i+enum_idx+skipped_tokens])

        # NOTE(review): when token_idx is empty the loop above does not assign enum_idx,
        # so this adds the value left over from a previous token (or fails if there is none).
        skipped_tokens += enum_idx


        if len(token_embeddings) > 1:
            token_embeddings = torch.stack(token_embeddings).to(device)
            token_embeddings = torch.mean(token_embeddings, -2)

        else:
            # NOTE(review): also reached with an empty list, where [0] raises IndexError.
            token_embeddings = token_embeddings[0]

        token_embeddings = torch.reshape(token_embeddings, (1,768))

        dict_embeddings[token.i] = token_embeddings

        start_token = token.idx
        end_token = token.idx + len(token.text)

        g.add_node(token.i, label=token.text, type='token', features=token_embeddings, boundaries=(start_token, end_token))

        # Edges are only collected here; the g.add_edges_from call further down is
        # commented out, so g keeps nodes only.
        edge_list.append((token.i, token.head.i, token.dep_))

    row_sent = [enum_sent, g, sentence_string]
    df_sent.append(row_sent)


    next_i = token.i + 1

    for triple in triples:
        subj_uri = triple['subject']['uri']
        subj_string = triple['subject']['surfaceform']
        subj_string_list = subj_string.split()
        # Debug output: prints every triple.
        print(triple)
        # NOTE(review): the boundaries are passed on as stored in the triple, whereas
        # token_offsets is keyed by offsets inside sentence_string (the In [142] cell
        # subtracts the sentence start first) -- presumably the cause of the recorded
        # KeyError; confirm.
        subj_boundaries = triple['subject']['boundaries']
        subj_ids = get_idx(subj_string_list, subj_boundaries, token_offsets)

        obj_uri = triple['object']['uri']
        obj_string = triple['object']['surfaceform']
        obj_string_list = obj_string.split()
        obj_boundaries = triple['object']['boundaries']
        obj_ids = get_idx(obj_string_list, obj_boundaries, token_offsets)

        #embeddings_subj = []
        #embeddings_obj = []

        subj_i = int(next_i)
        obj_i = int(next_i+1)
        #g.add_node(subj_i, label=subj_string, type='entity')

        #for i in subj_ids:
            #embeddings_subj.append(dict_embeddings[i])
            #edge_list.append((i, subj_i))

        #g.add_node(obj_i, label=obj_string, type='entity')
        #for i in obj_ids:
        #    embeddings_obj.append(dict_embeddings[i])
        #    edge_list.append((i, obj_i))

        #if len(embeddings_subj) > 1:
        #    embeddings_subj = torch.stack(embeddings_subj).to(device)
        #    embeddings_subj = torch.mean(embeddings_subj, -2)
        #else:
        #    embeddings_subj = torch.stack(embeddings_subj).to(device)


        #if len(embeddings_obj) > 1:
        #    embeddings_obj = torch.stack(embeddings_obj).to(device)
        #    embeddings_obj = torch.mean(embeddings_obj, -2)
        #else:
        #    embeddings_obj = torch.stack(embeddings_obj).to(device) 


        #dict_embeddings[subj_i] = embeddings_subj
        #dict_embeddings[obj_i] = embeddings_obj

        #next_i += 2

        ##-----these might be useful in future------
        rel_boundaries = triple['predicate']['boundaries']
        rel_surfaceform = triple['predicate']['surfaceform']
        rel_uri = triple['predicate']['uri']
        relations_list.append((subj_i, obj_i, rel_uri))


        text = data[0][triple['sentence_id']]['text']
        ##----------------------------------------------

        # NOTE(review): four values per row, while df_rel_columns lists five names and
        # in a different order (rel_uri is missing) -- the DataFrame construction at the
        # end of the cell would presumably reject these rows; confirm.
        row_rel = [enum_sent,subj_ids, obj_ids, rel_boundaries]
        df_rel.append(row_rel)

        #predicate_string = data[0][0]['text'][min(obj_boundaries[0], subj_boundaries[0]):max(obj_boundaries[1],subj_boundaries[1])] if rel_boundaries == None else data[0][0]['text'][rel_boundaries[0]:rel_boundaries[1]]    

    #g.add_edges_from(edge_list)
    #graphs.append(g)
    #full_rels.append(relations_list)
    #dict_sentence[enum_doc] = dict_embeddings

df_rel = pd.DataFrame(df_rel, columns=df_rel_columns)
df_sent = pd.DataFrame(df_sent, columns = df_sent_columns)

{'sentence_id': 0, 'predicate': {'boundaries': [0, 225], 'surfaceform': None, 'uri': 'http://www.wikidata.org/prop/direct/P31', 'annotator': 'NoSubject-Triple-aligner'}, 'object': {'boundaries': [94, 109], 'surfaceform': 'language family', 'uri': 'http://www.wikidata.org/entity/Q25295', 'annotator': 'Wikidata_Spotlight_Entity_Linker'}, 'dependency_path': None, 'confidence': None, 'subject': {'boundaries': [4, 27], 'surfaceform': 'Austroasiatic languages', 'uri': 'http://www.wikidata.org/entity/Q33199', 'annotator': 'Wikidata_Spotlight_Entity_Linker'}, 'annotator': 'NoSubject-Triple-aligner'}
{'sentence_id': 0, 'predicate': {'boundaries': [0, 225], 'surfaceform': None, 'uri': 'http://www.wikidata.org/prop/direct/P31', 'annotator': 'Simple-Aligner'}, 'object': {'boundaries': [94, 109], 'surfaceform': 'language family', 'uri': 'http://www.wikidata.org/entity/Q25295', 'annotator': 'Wikidata_Spotlight_Entity_Linker'}, 'dependency_path': None, 'confidence': None, 'subject': {'boundaries': [4

KeyError: 307