In [1]:
import ast
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import spacy
import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import AutoModel
from transformers import AutoTokenizer


In [2]:
# Hidden-state layers that are summed into each token vector.
# Alternative setting: layers = [-4, -3, -2, -1]
layers = [-1]

# The model and its tokenizer are loaded from the same checkpoint name.
BERT_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

# spaCy pipeline used further down for tokens and dependency heads.
nlp = spacy.load("en_core_web_sm")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_hidden_states(encoded, model, layers):
    """Sum the requested hidden-state layers of ``model`` for one encoded input.

    Parameters
    ----------
    encoded : mapping
        Keyword arguments forwarded verbatim as ``model(**encoded)``.
    model : callable
        Called without gradient tracking; its result must expose a
        ``hidden_states`` attribute that can be indexed by layer.
    layers : iterable of int
        Indices into ``hidden_states``; the selected tensors are summed
        element-wise.

    Returns
    -------
    torch.Tensor
        The summed layers, with the leading dimension removed when it has
        size 1. All other dimensions are kept.
    """
    with torch.no_grad():
        output = model(**encoded)
    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    summed = torch.stack([states[i] for i in layers]).sum(0)

    # Only drop the leading dimension: a bare squeeze() would also remove
    # any other size-1 dimension (e.g. a sequence of length one), leaving
    # callers that index rows per token with a 1-D vector instead.
    return summed.squeeze(0)

def get_words_vector(sent, tokenizer, model, layers):
    """Encode ``sent`` and return its summed hidden states.

    ``sent`` is encoded with ``tokenizer.encode_plus(sent, return_tensors="pt")``
    and the encoding is handed, together with ``model`` and ``layers``, to
    ``get_hidden_states``, whose result is returned unchanged.
    """
    return get_hidden_states(
        tokenizer.encode_plus(sent, return_tensors="pt"),
        model,
        layers,
    )

In [4]:
# Read the corpus, one sentence per line, with the newline characters removed.
# The context manager closes the file handle once the lines are read.
with open("dev_clean_format.txt") as corpus_file:
    corpus = [line.replace('\n', '') for line in corpus_file]

# Relation table without a header row; it is indexed by position
# (row i, columns 0/1/2) when the graphs are built.
relations = pd.read_csv("dev_relations.tsv", delimiter='\t', header=None)
# fillna returns a new frame, so the result has to be assigned back for the
# '<NONE>' placeholder to actually replace the missing values.
relations = relations.fillna('<NONE>')

# Entity table; its first three columns are addressed by position.
entities = pd.read_csv('dev_entities.tsv', delimiter='\t', header=0)
column_sentence = entities.columns[0]
column_surface_form = entities.columns[1]
column_pos = entities.columns[2]


In [5]:
# Build one graph per corpus sentence:
#   * one node per spaCy token, carrying that token's BERT embedding(s),
#   * one edge per dependency arc (token -> head, labelled with the relation),
#   * two extra nodes for the subject and object entity spans, linked to their
#     member tokens with "in_entity" edges and to each other with the
#     relation label taken from `relations`.
list_of_networks = []
# (sentence index, error repr) for every sentence that could not be converted,
# so skipped sentences stay visible instead of vanishing silently.
failed_sentences = []

for sent_idx, sentence in enumerate(corpus):
    print(sent_idx, "/", len(corpus), end='\r')
    try:
        network = nx.Graph()

        # Row of the relation table that belongs to this sentence.
        specific_rel = relations.iloc[sent_idx]

        tokens = list(nlp(sentence))

        sent_embeddings = get_words_vector(sentence, tokenizer, model, layers)

        # The sentence is encoded with the tokenizer's default special tokens,
        # so row 0 of sent_embeddings belongs to the leading [CLS] token.
        # The first word piece of the first word therefore sits at row 1.
        id_token = 1
        edge_list = []
        for t in tokens:
            # Number of word pieces this spaCy token contributes.
            # NOTE(review): this assumes tokenizing each spaCy token on its own
            # yields the same pieces as tokenizing the whole sentence - confirm.
            n_pieces = len(tokenizer.encode(t.text, add_special_tokens=False))
            if n_pieces == 0:
                raise ValueError(
                    f"token {t.i} ({t.text!r}) produced no word pieces"
                )

            pieces = [sent_embeddings[id_token + k] for k in range(n_pieces)]
            id_token += n_pieces

            # Multi-piece tokens keep all their piece vectors stacked;
            # single-piece tokens keep the bare vector.
            token_embeddings = torch.stack(pieces) if n_pieces > 1 else pieces[0]

            edge_list.append((t.i, t.head.i, t.dep_))
            network.add_node(t.i, embedding=token_embeddings)

        for source, target, dep_label in edge_list:
            network.add_edge(source, target, label=dep_label)

        rel_label = specific_rel[0]
        # Spans are stored 1-based and inclusive; shift them to token indices.
        rel_subj = [i - 1 for i in ast.literal_eval(specific_rel[1])]
        rel_obj = [i - 1 for i in ast.literal_eval(specific_rel[2])]

        # Token nodes use ids 0 .. len(tokens) - 1, so the next two ids are
        # free for the entity nodes.
        nodesubj = len(tokens)
        nodeobj = len(tokens) + 1

        embeddings_subj = []
        embeddings_obj = []

        for n in range(rel_subj[0], rel_subj[1] + 1):
            # test this vs random initialization?
            embeddings_subj.append(network.nodes[n]['embedding'])
            network.add_edge(n, nodesubj, label="in_entity")

        for n in range(rel_obj[0], rel_obj[1] + 1):
            embeddings_obj.append(network.nodes[n]['embedding'])
            network.add_edge(n, nodeobj, label="in_entity")

        # NOTE(review): torch.stack needs equally shaped tensors, so a span
        # mixing single-piece and multi-piece tokens raises here and the
        # sentence ends up in failed_sentences.
        embeddings_subj = torch.stack(embeddings_subj)
        embeddings_obj = torch.stack(embeddings_obj)

        network.nodes[nodesubj]['embedding'] = embeddings_subj
        network.nodes[nodeobj]['embedding'] = embeddings_obj

        network.add_edge(nodesubj, nodeobj, label=rel_label)

        list_of_networks.append(network)

    except Exception as exc:
        # Best effort: skip the sentence but remember why. Catching Exception
        # rather than everything lets Ctrl-C / kernel interrupts through.
        failed_sentences.append((sent_idx, repr(exc)))

# Move past the '\r' progress line, then summarise the run.
print()
print(len(list_of_networks), "graphs built,", len(failed_sentences), "sentences skipped")

1713 / 1714

In [6]:
# Persist every graph that was built, not just `network`, which only holds
# the graph of the last sentence processed by the loop.
# The plain pickle module is used because nx.write_gpickle was removed in
# NetworkX 3.0; the protocol matches what write_gpickle used.
with open('dev_test_embeddings', 'wb') as out_file:
    pickle.dump(list_of_networks, out_file, pickle.HIGHEST_PROTOCOL)