In [262]:
import pandas as pd
import delphin.codecs.eds
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [68]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [514]:
train_data = pd.read_csv('./gnn/data/raw/gnn_data_dgl_train_small.csv')

In [517]:
len(train_data)

44671

In [325]:
edses = []
sentences = []
for index, row in train_data.iterrows():
    edses.append(delphin.codecs.eds.decode(row['eds']))
    sentences.append(row['sentence'])

In [25]:
edses[0].nodes

[<Node object (_1:proper_q<0:28>[?]) at 139893443332688>,
 <Node object (e10:compound<0:14>[e]) at 139893443332800>,
 <Node object (_2:proper_q<0:6>[?]) at 139893443332912>,
 <Node object (x9:named<0:6>[x]) at 139893443331904>,
 <Node object (x6:named<7:14>[x]) at 139893443332016>,
 <Node object (e17:measure<15:23>[e]) at 139893443333024>,
 <Node object (_3:udef_q<15:23>[?]) at 139893443333136>,
 <Node object (e22:card<15:17>[e]) at 139893443333248>,
 <Node object (x15:_year_n_1<18:23>[x]) at 139893443332128>,
 <Node object (e16:_old_a_1<24:28>[e]) at 139893443333360>,
 <Node object (e3:_join_v_1<34:38>[e]) at 139893443333472>,
 <Node object (_4:_the_q<39:42>[?]) at 139893443333584>,
 <Node object (x23:_board_n_of<43:48>[x]) at 139893443333696>,
 <Node object (e29:_as_p<49:51>[e]) at 139893443333808>,
 <Node object (_5:_a_q<52:53>[?]) at 139893443333920>,
 <Node object (e35:_nonexecutive/jj_u_unknown<54:66>[e]) at 139893443334032>,
 <Node object (x30:_director_n_of<67:75>[x]) at 139893

In [218]:
edses[0].nodes[0].lnk.data[0]

0

In [29]:
edses[0].nodes[8].properties

{'PERS': '3', 'NUM': 'pl', 'IND': '+'}

In [33]:
for i, c in zip(range(len(sentences[0])), sentences[0]):
    print(c + ' ' + str(i))

P 0
i 1
e 2
r 3
r 4
e 5
  6
V 7
i 8
n 9
k 10
e 11
n 12
, 13
  14
6 15
1 16
  17
y 18
e 19
a 20
r 21
s 22
  23
o 24
l 25
d 26
, 27
  28
w 29
i 30
l 31
l 32
  33
j 34
o 35
i 36
n 37
  38
t 39
h 40
e 41
  42
b 43
o 44
a 45
r 46
d 47
  48
a 49
s 50
  51
a 52
  53
n 54
o 55
n 56
e 57
x 58
e 59
c 60
u 61
t 62
i 63
v 64
e 65
  66
d 67
i 68
r 69
e 70
c 71
t 72
o 73
r 74
  75
N 76
o 77
v 78
. 79
  80
2 81
9 82
. 83


In [460]:
def bert_featurerise(sentences=None):
    # return last hidden layer of bert
    # if no sentence provided, build from training dataset

    longest_sentence = sentences[np.argmax([len(x) for x in sentences])]


    print('Loading tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # account for <CLS> and <SEP>
    padding_length = len(tokenizer.tokenize(longest_sentence)) + 2

    print('Loading BERT...')
    model = BertModel.from_pretrained('bert-base-uncased',
                                    output_hidden_states = True, # returns all hidden-states.
                                    ).to(device)
    model.eval()
    batch_size = 8

    tokens_list = []
    token_embeddings = []
    for idx in tqdm(range(0, len(sentences), batch_size), total=int(np.ceil(len(sentences)/batch_size))):
        batch = sentences[idx : min(len(sentences), idx+batch_size)]

        encoded = tokenizer.batch_encode_plus(batch,max_length=padding_length, padding='max_length', truncation=True)
        for ids in encoded['input_ids']:
            tks = tokenizer.convert_ids_to_tokens(ids)
            tokens_list.append(list(filter(lambda x: x != '[PAD]', tks)))

        encoded = {key:torch.LongTensor(value).to(device) for key, value in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded)
        lhs = outputs.last_hidden_state
        attention = encoded['attention_mask'].reshape((lhs.size()[0], lhs.size()[1], -1)).expand(-1, -1, 768)
        mask = (attention > 0)[:,:,0]

        # unbatch
        for i, m in enumerate(mask):
            token_embeddings.append(lhs[i,:][m])
    # <CLS> tokens <SEP>
    return tokens_list, token_embeddings

def eds_nodes_to_surface_char_level(eds, sentence):
    surface_per_node = []
    surface_per_node = {}
    for n in eds.nodes:
        if n.lnk.data:
            start = n.lnk.data[0]
            stop = n.lnk.data[1]
            # surface_per_node.append(sentence[start:stop])
            surface_per_node[n.id] = {'surface': sentence[start:stop]}
            surface_per_node[n.id]['start_char'] = start
            surface_per_node[n.id]['stop_char'] = stop
        else:
            # surface_per_node.append('')
            surface_per_node[n.id] = {'surface': ''}
    return dict(sorted(surface_per_node.items(), key=lambda x: int(x[1]['start_char'])))

# def get_node_span_embedding(eds, sentence, tokens, token_embeddings):
# def get_node_span_embedding(eds, sentence, tokens):
#     surface_per_node = eds_nodes_to_surface_char_level(eds, sentence)
#     span_embeddings = {}
#     for node_id, surface['surface'] in surface_per_node:
#         a, b = surface_string_to_token_index(surface, tokens, surface)
#         # embeddings = token_embeddings[a:b]
#         # span_embeddings.append([a,b])
#         span
#     return span_embeddings

# def surface_string_to_token_index(string_to_be_matched, tokens, original_string, a=0, b=1, match_started=False):
def surface_string_to_token_index(string_to_be_matched, tokens, original_string, a, b, match_started=False):
    # print(string_to_be_matched)
    # string_to_be_matched = string_to_be_matched.lower()
    if string_to_be_matched == '':
        return a, b
    else:
        if match_started:
            start_token = tokens[b]
            if not string_to_be_matched.startswith(start_token): #false match
                return surface_string_to_token_index(original_string, tokens, original_string, a+1, a+2, False)
            else:
                return surface_string_to_token_index(string_to_be_matched[len(start_token):].strip(), tokens,original_string, a, b+1, True)

        else:
            start_token = tokens[a]
            if string_to_be_matched.startswith(start_token):
                return surface_string_to_token_index(string_to_be_matched[len(start_token):].strip(), tokens, original_string, a, b, True)
            else:
                return surface_string_to_token_index(string_to_be_matched, tokens, original_string, a+1, b+1, False)
            
def match_node_spans_with_token_index(node_spans_dict, tokens):
    # node_spans = [x.lower() for x in node_spans]
    tokens = [x[2:] if x[:2] == '##' else x for x in tokens]
    a = 0
    # node_span_index = []
    
    for node_id, info_dict in node_spans_dict.items():
        cur_span = info_dict['surface'].lower()
        start, finish = surface_string_to_token_index(cur_span, tokens, cur_span, a, a+1)
        # node_span_index.append((start, finish))
        node_spans_dict[node_id]['token_index'] = (start, finish)
        a = start
        print((start, finish))
    return node_spans_dict

In [481]:
issue_ind = 234
tokens = tokenizer.tokenize(sentences[issue_ind])
tokens

['cr',
 '##ay',
 'computer',
 'also',
 'will',
 'face',
 'intense',
 'competition',
 ',',
 'not',
 'only',
 'from',
 'cr',
 '##ay',
 'research',
 ',',
 'which',
 'has',
 'about',
 '60',
 '%',
 'of',
 'the',
 'world',
 '-',
 'wide',
 'super',
 '##com',
 '##put',
 '##er',
 'market',
 'and',
 'which',
 'is',
 'expected',
 'to',
 'roll',
 'out',
 'the',
 'c',
 '-',
 '90',
 'machine',
 ',',
 'a',
 'direct',
 'competitor',
 'with',
 'the',
 'cr',
 '##ay',
 '-',
 '3',
 ',',
 'in',
 '1991',
 '.']

In [475]:
node_spans_dict = eds_nodes_to_surface_char_level(edses[issue_ind], sentences[issue_ind])
node_spans_dict

{'_1': {'surface': 'Cray Computer', 'start_char': 0, 'stop_char': 13},
 'e10': {'surface': 'Cray Computer', 'start_char': 0, 'stop_char': 13},
 '_2': {'surface': 'Cray', 'start_char': 0, 'stop_char': 4},
 'x9': {'surface': 'Cray', 'start_char': 0, 'stop_char': 4},
 'x6': {'surface': 'Computer', 'start_char': 5, 'stop_char': 13},
 'e15': {'surface': 'also', 'start_char': 14, 'stop_char': 18},
 'e3': {'surface': 'face', 'start_char': 24, 'stop_char': 28},
 '_3': {'surface': 'intense competition, not only from Cray Research, which has about 60% of the world-wide supercomputer market and which is expected to roll out the C-90 machine, a direct competitor with the Cray-3, in 1991.',
  'start_char': 29,
  'stop_char': 235},
 'e23': {'surface': 'intense', 'start_char': 29, 'stop_char': 36},
 'x18': {'surface': 'competition,', 'start_char': 37, 'stop_char': 49},
 'e24': {'surface': 'not only', 'start_char': 50, 'stop_char': 58},
 'e25': {'surface': 'from', 'start_char': 59, 'stop_char': 63},
 

In [479]:
eds = edses[issue_ind]
sentence = sentences[issue_ind]

In [480]:
sentence

'"You either believe Seymour can do it again or you don'

In [476]:
match_node_spans_with_token_index(node_spans_dict, tokens)

(0, 3)
(0, 3)
(0, 2)
(0, 2)
(2, 3)
(3, 4)
(5, 6)
(6, 57)
(6, 7)
(7, 9)
(9, 11)
(11, 12)
(12, 57)
(12, 16)
(12, 14)
(12, 14)
(14, 16)
(17, 18)
(18, 19)
(19, 31)
(19, 20)
(20, 21)
(21, 22)
(22, 23)
(23, 26)
(23, 26)
(23, 26)
(23, 26)
(26, 31)
(26, 30)
(26, 30)
(30, 31)
(31, 32)
(34, 35)
(34, 35)
(36, 37)
(38, 54)
(38, 39)
(39, 44)
(39, 42)
(39, 42)
(42, 44)
(44, 45)
(45, 46)
(46, 47)
(47, 48)
(48, 49)
(49, 54)
(49, 53)
(49, 53)
(49, 54)
(54, 55)
(55, 57)
(55, 57)


{'_1': {'surface': 'Cray Computer',
  'start_char': 0,
  'stop_char': 13,
  'token_index': (0, 3)},
 'e10': {'surface': 'Cray Computer',
  'start_char': 0,
  'stop_char': 13,
  'token_index': (0, 3)},
 '_2': {'surface': 'Cray',
  'start_char': 0,
  'stop_char': 4,
  'token_index': (0, 2)},
 'x9': {'surface': 'Cray',
  'start_char': 0,
  'stop_char': 4,
  'token_index': (0, 2)},
 'x6': {'surface': 'Computer',
  'start_char': 5,
  'stop_char': 13,
  'token_index': (2, 3)},
 'e15': {'surface': 'also',
  'start_char': 14,
  'stop_char': 18,
  'token_index': (3, 4)},
 'e3': {'surface': 'face',
  'start_char': 24,
  'stop_char': 28,
  'token_index': (5, 6)},
 '_3': {'surface': 'intense competition, not only from Cray Research, which has about 60% of the world-wide supercomputer market and which is expected to roll out the C-90 machine, a direct competitor with the Cray-3, in 1991.',
  'start_char': 29,
  'stop_char': 235,
  'token_index': (6, 57)},
 'e23': {'surface': 'intense',
  'start_cha

In [447]:
ind = 2
cur_span = node_spans[ind].lower()
a = 2
tokens = [x[2:] if x[:2]=='##' else x for x in tokens]
surface_string_to_token_index(cur_span, tokens, cur_span, a, a+1)

(3, 15)

In [446]:
cur_span.startswith(tokens[4:][0])

False

In [445]:
cur_span[len(tokens[3:][0]):].strip()

'malizia negus, editor of money fund report,'

In [441]:
tokens[2:]

['said',
 'brenda',
 'mali',
 '##zia',
 'ne',
 '##gus',
 ',',
 'editor',
 'of',
 'money',
 'fund',
 'report',
 ',',
 'yields',
 '"',
 'may',
 'b',
 '##lip',
 'up',
 'again',
 'before',
 'they',
 'b',
 '##lip',
 'down',
 '"',
 'because',
 'of',
 'recent',
 'rises',
 'in',
 'short',
 '-',
 'term',
 'interest',
 'rates',
 '.']

In [471]:
issue = 'in 1991.'

In [472]:
for i in range (len(sentences)):
    if issue in sentences[i]:
        break

In [473]:
i

234

In [291]:
sentences = [ 
              "Hello I'm a single sentence",
              "And another sentence",
              "And the very very last one long long sentence",
              "Hello I'm a single sentence",
              "And another sentence",
              "And the very very last one",
              "Hello I'm a single sentence",
              "And another sentence"
            ]
tokens_list, token_embeddings = bert_featurerise(sentences)

Loading tokenizer...
Loading BERT...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.13it/s]


In [277]:
encoded['input_ids']

[[101, 7592, 1045, 1005, 1049, 1037, 2309, 6251, 102, 0, 0],
 [101, 1998, 2178, 6251, 102, 0, 0, 0, 0, 0, 0],
 [101, 1998, 1996, 2200, 2200, 2197, 2028, 2146, 2146, 6251, 102],
 [101, 7592, 1045, 1005, 1049, 1037, 2309, 6251, 102, 0, 0],
 [101, 1998, 2178, 6251, 102, 0, 0, 0, 0, 0, 0],
 [101, 1998, 1996, 2200, 2200, 2197, 2028, 102, 0, 0, 0],
 [101, 7592, 1045, 1005, 1049, 1037, 2309, 6251, 102, 0, 0],
 [101, 1998, 2178, 6251, 102, 0, 0, 0, 0, 0, 0]]

In [292]:
tokens_list

[['[CLS]', 'hello', 'i', "'", 'm', 'a', 'single', 'sentence', '[SEP]'],
 ['[CLS]', 'and', 'another', 'sentence', '[SEP]'],
 ['[CLS]',
  'and',
  'the',
  'very',
  'very',
  'last',
  'one',
  'long',
  'long',
  'sentence',
  '[SEP]'],
 ['[CLS]', 'hello', 'i', "'", 'm', 'a', 'single', 'sentence', '[SEP]'],
 ['[CLS]', 'and', 'another', 'sentence', '[SEP]'],
 ['[CLS]', 'and', 'the', 'very', 'very', 'last', 'one', '[SEP]'],
 ['[CLS]', 'hello', 'i', "'", 'm', 'a', 'single', 'sentence', '[SEP]'],
 ['[CLS]', 'and', 'another', 'sentence', '[SEP]']]

In [293]:
tokens = tokens_list[0]
tokens

['[CLS]', 'hello', 'i', "'", 'm', 'a', 'single', 'sentence', '[SEP]']

In [320]:
token_embeddings[0][1:5].shape

torch.Size([4, 768])

In [314]:
def get_node_span_embedding(eds, sentence, tokens, token_embeddings):
    surface_per_node = eds_nodes_to_surface_char_level(eds, sentence)
    span_embeddings = []
    for surface in surface_per_node:
        a, b = surface_string_to_token_index(surface, tokens)
        embeddings = token_embeddings[a:b]

        span_embeddings.append(torch.mean(torch.cat(embeddings, dim=0), dim=0).unsqueeze(0))
    return span_embeddings

In [322]:

torch.mean(token_embeddings[0][1:5], dim=0).unsqueeze(0).shape

torch.Size([1, 768])

In [429]:
'"safas'.strip()

'"safas'

In [497]:
a
b.shape

torch.Size([1, 24])

In [500]:
torch.cat([a,b],dim=1 ).shape

torch.Size([1, 48])

In [493]:
tokenizer(sentences[236:239])

{'input_ids': [[101, 13675, 4710, 3274, 2036, 2097, 2227, 6387, 2971, 1010, 2025, 2069, 2013, 13675, 4710, 2470, 1010, 2029, 2038, 2055, 3438, 1003, 1997, 1996, 2088, 1011, 2898, 3565, 9006, 18780, 2121, 3006, 1998, 2029, 2003, 3517, 2000, 4897, 2041, 1996, 1039, 1011, 3938, 3698, 1010, 1037, 3622, 12692, 2007, 1996, 13675, 4710, 1011, 1017, 1010, 1999, 2889, 1012, 102], [101, 1996, 6714, 7245, 2036, 2097, 5566, 2007, 2248, 2449, 6681, 13058, 1012, 1998, 2900, 102], [101, 2104, 3408, 1997, 1996, 6714, 7245, 1010, 13675, 4710, 2470, 4518, 17794, 2024, 2000, 4374, 2028, 13675, 4710, 3274, 3745, 2005, 2296, 2048, 13675, 4710, 2470, 6661, 2027, 2219, 1999, 1037, 4353, 3517, 2000, 5258, 1999, 2055, 2048, 3134, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0,

In [492]:
sentences[236:239]

['Cray Computer also will face intense competition, not only from Cray Research, which has about 60% of the world-wide supercomputer market and which is expected to roll out the C-90 machine, a direct competitor with the Cray-3, in 1991.',
 'The spinoff also will compete with International Business Machines Corp. and Japan',
 'Under terms of the spinoff, Cray Research stockholders are to receive one Cray Computer share for every two Cray Research shares they own in a distribution expected to occur in about two weeks.']

In [510]:
eds.nodes[3].type

'e'

In [512]:
eds.nodes[5]

<Node object (x15:named<20:27>[x]) at 139893323863872>

In [513]:
'1'.zfill(3)

'001'