In [1]:
from src.recom_search.model.beam_node_reverse import ReverseNode
from transformers import AutoTokenizer, AutoModel

import flatten_lattice as fl
import torch
from bert_models import LinearLatticeBert, LinearPOSBert
from encoding_utils import *
import pickle
import toy_helper as thelp

import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from latmask_bert_models import LatticeBertModel
import json


device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

from mask_utils import *
from encoding_utils import *


bert_tok = AutoTokenizer.from_pretrained("bert-base-cased")
mbart_tok = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")

2022-09-05 05:46:51.206401: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-05 05:46:51.206422: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Model Wrapper
class LinearPOSBertV1(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = LatticeBertModel(AutoConfig.from_pretrained('bert-base-cased'))
        self.probe = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.to(device)

    def parameters(self):
        return self.probe.parameters()
  
    def forward(self, sentences, pos_ids=None, attmasks=None):
        with torch.no_grad(): # no training of BERT parameters
            word_rep, sentence_rep = self.bert(sentences, position_ids=pos_ids, encoder_attention_mask=attmasks, attention_mask=attmasks, return_dict=False)
        return self.probe(word_rep)
    
def prepare_dataset(resset):
    x = []
    y = []
    for res in resset:
        
        cleaned = [clean_expanded(r) for r in res]
        inputs = bert_tok(cleaned, padding="max_length", max_length=500, return_tensors='pt').to(device)

        y.append(posbmodel(inputs.input_ids, attmasks = inputs.attention_mask))
        x.append(inputs.input_ids)
        
    return x, y

def check_accuracy(setpred, setlabels):
    cor = 0
    tot = 0
    for i in range(0, len(setpred)):
        ex = setpred[i]
        for j in range(0, len(ex)):
            if sum(setlabels[i][j])==0:
                continue
            elif torch.argmax(setlabels[i][j])==0:
                continue
            tot+=1
            if torch.argmax(ex[j])==torch.argmax(setlabels[i][j]):
                cor+=1
    return cor/tot

# correct posids
def mod_posids(pids):
    cop = pids
    for p in cop:
        for i in range(0, len(p)):
            if p[i]==0:
                p[i] = i
    return cop

# set posids to default
def def_posids(pids):
    cop = pids
    for p in cop:
        for i in range(0, len(p)):
            p[i] = i
    return cop

def show_labels (pred):
    res = []
    for p in pred:
        res.append(lablist[torch.argmax(p)])
    return res

In [3]:
# Load POS model, label vocabulary 
with open('./lab_vocab.json') as json_file:
    labels = json.load(json_file)
posbmodel = LinearPOSBertV1(len(list(labels.keys())))    
t = torch.load("./a3distrib/ckpt/posbert1way.pth")
posbmodel.load_state_dict(t)
posbmodel.eval()
print(torch.cuda.memory_allocated("cuda:2"))
torch.cuda.empty_cache()


867826688


In [16]:
# method that makes padding equal to 1
from mask_utils import ones_padding

def run_pipeline(inplist, resarrs = None, flat = None):
    # construct data structure for toy graph in format used on actual examples
    if flat==None:
        toygraph = thelp.create_toy_graph(inplist, mbart_tok)

        # get list of exploded candidates using same algorithm from numbers
        exploded = fl.get_all_possible_candidates(toygraph)

        # get a flattened version of toy lattice (same method as on actual examples)
        flat_toy = fl.flatten_lattice(toygraph)
    else:
        flat_toy = flat
        exploded = resarrs

    # generate mask (uses same method as actual examples), convert to -inf mask (seems to not do anything)
    mask = connect_mat(flat_toy)
    mask = torch.triu(mask)
    #mask[mask==0] = -float('inf')
    #mask = ones_padding(mask)
    

    # get gold labels for the exploded set
    dsetx, dsety = prepare_dataset([exploded])

    assert len(dsetx)==1

    # from encoding utils, get posids and relevant tokens
    sents, posids = create_inputs([flat_toy])
    
    # get gold label dictionaries for tokens in example, based on averages of tokens on dsety
    _ , tmaps = lattice_pos_goldlabels(dsetx, dsety, sents)

    # generate gold y labels using tmaps and 
    latposylabels = tmap_pos_goldlabels(tmaps, sents)

    # get generated labels for flattened lattice, def_posids can be used for default posids
    # params start as (sents.to(device), mod_posids(posids).to(device), torch.stack([mask]).to(device))
    # posids, mask can be set to None to ablate to default
    pred = posbmodel(sents.to(device), mod_posids(posids).to(device), torch.stack([mask]).to(device))
    #pred = posbmodel(sents.to(device), None, None)
    return pred, latposylabels, tmaps, sents, posids, dsetx, dsety, flat_toy, mask

lablist = [k for k in labels.keys()]
def print_results(CUTOFF):

    # sanity check to look at flat lattice 
    p = flat_toy
    tlist = fl.get_toklist(p)
    res = ""
    for s in tlist:
        res = res+" "+bert_tok.decode(s)
    decstr = res

    # number of tokens, the tokens that are passed into model for lattice
    print("INPUT")
    print(decstr)

    print("PREDICTED")
    print(show_labels(pred[0])[:CUTOFF])
    print("GOLD")
    print(show_labels(latposylabels[0])[:CUTOFF])
    
    # run explodeds through model
    indivlabs = posbmodel(dsetx[0])
    print("")
    print("Exploded paths")
    # show labels for s1, s2 when run through individually
    for i in range(len(inputlist)):
        print(inputlist[i])
        print(show_labels(indivlabs[i])[:20])

In [17]:
# get a list of input strings of the format where the start w/ the same pre-fix but have different endings
inputlist = [
    "The Fed raises interest rates.",
    "The Fed raises interest among the economists .",
    "The Fed raises the children of the future ."
]

pred, latposylabels, tmaps, sents, posids, dsetx, dsety, flat_toy, mask = run_pipeline(inputlist)
#mask[mask==0] = -float('inf')

# accuracy (assumes that gold is good, which isn't confirmed here)
print("Accuracy")
print(check_accuracy(pred, latposylabels))
# input is number of toks to print
print_results(26)

3
0
0
Accuracy
0.03
INPUT
 The Fed raises interest rates . interest among the economists . the children of the future .
PREDICTED
['<cls>', 'DT', 'NNP', 'VBZ', 'NN', 'NNS', '.', 'NN', 'IN', 'DT', 'NNS', '.', 'DT', 'NNS', 'IN', 'DT', 'NN', '.', '<sep>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
GOLD
['<cls>', 'DT', 'NNP', 'VBZ', 'NN', 'NNS', 'NNP', 'NN', 'IN', 'DT', 'NNS', 'NNP', 'DT', 'NNS', 'IN', 'DT', 'NN', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP']

Exploded paths
The Fed raises interest rates.
['<cls>', 'DT', 'NNP', 'VBZ', 'NN', 'NNS', 'RB', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
The Fed raises interest among the economists .
['<cls>', 'DT', 'NNP', 'VBZ', 'NN', 'IN', 'DT', 'NNS', 'RB', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
The Fed raises the children of the future .
['<cls>', 'DT', 'NNP', 'VBZ', 'DT', 'NN

In [13]:
lablist[torch.argmax(tmaps[0]['0'])]

'NNP'

In [9]:
# construct data structure for toy graph in format used on actual examples
toygraph = thelp.create_toy_graph(inputlist, mbart_tok)

# get list of exploded candidates using same algorithm from numbers
exploded = fl.get_all_possible_candidates(toygraph)

# get a flattened version of toy lattice (same method as on actual examples)
flat_toy = fl.flatten_lattice(toygraph)

# generate mask (uses same method as actual examples), convert to -inf mask (seems to not do anything)
mask = connect_mat(flat_toy)
mask[mask==0] = -float('inf')
mask = ones_padding(mask)


# get gold labels for the exploded set
dsetx, dsety = prepare_dataset([exploded])

assert len(dsetx)==1

# from encoding utils, get posids and relevant tokens
sents, posids = create_inputs([flat_toy])

In [10]:
sents[0][:19]

tensor([  101,  1109, 26356, 13686,  2199,  5600,   119,  2199,  1621,  1103,
        27705,   119,  1103,  1482,  1104,  1103,  2174,   119,   102],
       device='cuda:2')

In [11]:
# get gold label dictionaries for tokens in example, based on averages of tokens on dsety
_ , tmaps = lattice_pos_goldlabels(dsetx, dsety, sents)

# generate gold y labels using tmaps and 
latposylabels = tmap_pos_goldlabels(tmaps, sents)

# get generated labels for flattened lattice, def_posids can be used for default posids
# params start as (sents.to(device), mod_posids(posids).to(device), torch.stack([mask]).to(device))
# posids, mask can be set to None to ablate to default
pred = posbmodel(sents.to(device), def_posids(posids).to(device), torch.stack([mask]).to(device))#, mod_posids(posids).to(device), torch.stack([mask]).to(device))
#pred = posbmodel(sents.to(device), mod_posids(posids).to(device), None)

3
0
0


In [18]:
shortsents = torch.stack([sents[0][:19]])

In [18]:
pnobatch = posbmodel(shortsents.to(device))

In [19]:
torch.argmax(pnobatch[0][3])

tensor(18, device='cuda:2')

In [20]:
torch.argmax(pred[0][3])

tensor(18, device='cuda:2')

In [21]:
pl = []
for p in pred[0]:
    pl.append(int(torch.argmax(p)))

pnl = []
for pn in pnobatch[0]:
    pnl.append(int(torch.argmax(pn)))

In [19]:
loss = torch.nn.MSELoss()
for i in range(len(pnl)):
    print(loss(pred[0][i], pnobatch[0][i]))

NameError: name 'pnl' is not defined

In [41]:
print(list(zip(pl[:len(pnl)], pnl)))

[(0, 0), (2, 2), (3, 3), (18, 18), (5, 5), (10, 10), (17, 27), (5, 5), (1, 1), (2, 2), (10, 10), (17, 27), (2, 2), (10, 10), (1, 1), (2, 2), (23, 5), (17, 17), (0, 0)]


In [42]:
pred.shape

torch.Size([1, 500, 44])

In [12]:
loss = torch.nn.MSELoss()

ids = bert_tok("How is it going my boy.", padding="max_length", max_length=500, return_tensors='pt').to(device)
out = posbmodel(ids.input_ids, attmasks=ids.attention_mask)

In [13]:

shids = bert_tok("How is it going my boy.", return_tensors='pt').to(device)
shout = posbmodel(shids.input_ids, attmasks=shids.attention_mask)


In [14]:
loss(out[0][:9][0], shout[0][0])

tensor(6.9396e-12, device='cuda:2', grad_fn=<MseLossBackward0>)

In [5]:
print(out[0][:9][0])
print(shout[0][0])

NameError: name 'out' is not defined

In [13]:
out[0][:9].shape

torch.Size([9, 44])

In [14]:
shout[0].shape

torch.Size([9, 44])

In [6]:
import pickle

pgraphs = None
with open('./torchsaved/pgraphsall.pkl', 'rb') as file:
    pgraphs = pickle.load(file)
    
resarrs = None
with open('./torchsaved/resarrsall.pkl', 'rb') as file:
    resarrs = pickle.load(file)

In [7]:
IND = 87
pred, latposylabels, tmaps, sents, posids, dsetx, dsety, flat_toy, mask = run_pipeline(None, resarrs[IND], pgraphs[IND])
#mask[mask==0] = -float('inf')

298
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
0
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
missing token
0


In [8]:
# accuracy (assumes that gold is good, which isn't confirmed here)
print("Accuracy")
print(check_accuracy(pred, latposylabels))
# input is number of toks to print
print_results(298)

Accuracy
0.8381742738589212
INPUT
 Two days before the trial of President Mohamed Mo ##rs ##i was to begin , they went down to the street . s . the street . s . , who has been deposed , they went ou ##sted President Mohamed Mo ##rs ##i began , the two men went down to the street . s . they were out on the street . s . went to the street . s . on the street . s . into the street . s . out on the took to the street . s . , the two men went down to they were out on the street . s . took to the street . s . went into the street . s . out on the the outgoing President Mohamed Mo ##rs ##i began , the , the two they were took went into out down to the the street . s . deposed President Mohamed Mo ##rs ##i opens , the two they have taken to the street . s . took went out was due to start , they took went open , they took went begin , the two they were took went out to open , they took went begin , the two opened , the two they were on the street . s . took went to on into out into the street .

NameError: name 'inputlist' is not defined

In [15]:
lablist[torch.argmax(tmaps[0]['119'])]

'.'

In [17]:
for t in tmaps[0].keys():
    print(bert_tok.decode(int(t)), " ", lablist[torch.argmax(tmaps[0][t])])
    

[CLS]   <cls>
Two   CD
days   NNS
before   IN
the   DT
trial   NN
of   IN
President   NNP
Mohamed   NNP
Mo   NNP
##rs   NNP
##i   NNP
was   VBD
to   TO
begin   VB
,   ,
they   PRP
went   VBD
down   IN
street   NN
.   .
[SEP]   <sep>
[PAD]   <pad>
streets   NNS
who   WP
has   VBZ
been   VBN
deposed   JJ
ou   NN
##sted   VBN
began   VBD
two   CD
men   NNS
were   VBD
out   IN
on   IN
into   IN
took   VBD
outgoing   JJ
opens   VBZ
have   VBP
taken   VBN
due   VBG
start   VB
open   VB
opened   VBD


's'

In [25]:
cnt = 0
for l in latposylabels[0]:
    
    
    #print(cnt, " ", lablist[torch.argmax(l)] )
    if torch.argmax(l)==0:
        print(bert_tok.decode(sents[0][23]))
    cnt+=1

s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
s
