In [1]:
import torch
from transformers import BertTokenizer,BertModel,BertForPreTraining,BertForQuestionAnswering
import numpy as np
import glob
import os
import traceback

In [2]:
# Load the pretrained cased BERT tokenizer and encoder once. The helper functions
# defined below read these two module-level names directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# output_hidden_states=True is requested so the model output also carries the
# hidden states that get_embedding_from_bert reads from its last element.
bert_model = BertModel.from_pretrained('bert-base-cased',output_hidden_states=True)

In [3]:
def get_individual_token_ids(sentence, T=120):
    """Tokenize one sentence with the module-level ``tokenizer`` and build model inputs.

    The word pieces are wrapped in ``[CLS]`` / ``[SEP]`` and right-padded with
    ``[PAD]`` up to ``T`` positions. If the wrapped sequence already has ``T``
    or more pieces, no padding is added and nothing is truncated.

    Args:
        sentence: Sentence string to tokenize.
        T: Target length of the padded sequence.

    Returns:
        A tuple ``(tokens, token_ids, attn_mask, seg_ids)``. ``tokens`` is the
        unpadded list of pieces including ``[CLS]`` and ``[SEP]``; the other
        three are integer tensors with a leading dimension of size 1 and one
        entry per padded position.
    """
    tokens = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']

    # A negative pad count (sequence longer than T) simply yields no padding.
    pad_count = max(T - len(tokens), 0)
    padded_tokens = tokens + ['[PAD]'] * pad_count

    # 1 marks a real position, 0 marks a padding position.
    mask_values = [int(piece != '[PAD]') for piece in padded_tokens]

    # Every position, padding included, is assigned segment id 1.
    # NOTE(review): Hugging Face documents token_type_ids 0 for the first
    # segment; confirm that all-ones is intended for single sentences.
    segment_values = [1] * len(padded_tokens)

    piece_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    # Wrapping each list in an outer list adds the leading size-1 dimension.
    token_ids = torch.tensor([piece_ids])
    attn_mask = torch.tensor([mask_values])
    seg_ids = torch.tensor([segment_values])

    return tokens, token_ids, attn_mask, seg_ids

In [4]:
def get_embedding(last_1_layer, last_2_layer, last_3_layer, last_4_layer, T=120):
    """Average four hidden-state layers position by position.

    Every position present in the inputs is averaged. Earlier versions looped
    over a fixed ``range(T)``, which raised ``IndexError`` for inputs shorter
    than ``T`` and silently dropped positions beyond ``T``.

    Args:
        last_1_layer: Tensor indexed by token position along its first dimension.
        last_2_layer: Tensor with the same shape as ``last_1_layer``.
        last_3_layer: Tensor with the same shape as ``last_1_layer``.
        last_4_layer: Tensor with the same shape as ``last_1_layer``.
        T: Accepted for backward compatibility only. The number of positions is
            taken from the tensors themselves.

    Returns:
        A list with one tensor per position, each the arithmetic mean of the
        four layers at that position.

    Raises:
        ValueError: If the four layers do not all have the same shape.
    """
    layers = (last_1_layer, last_2_layer, last_3_layer, last_4_layer)
    if any(layer.shape != last_1_layer.shape for layer in layers[1:]):
        # Without this check the element-wise adds below could broadcast silently.
        raise ValueError(
            'all four layers must have the same shape, got: '
            + ', '.join(str(tuple(layer.shape)) for layer in layers)
        )

    # Same operation order as the original per-row code: ((l1 + l2) + l3) + l4, then / 4.
    summed = torch.add(last_1_layer, last_2_layer)
    summed = torch.add(summed, last_3_layer)
    summed = torch.add(summed, last_4_layer)
    mean_per_position = torch.div(summed, 4.0)

    # clone() gives each returned vector its own storage, as the per-row
    # computation did, instead of a view into one large tensor.
    return [row.clone() for row in torch.unbind(mean_per_position, dim=0)]


In [5]:
 def get_embedding_from_bert(token_ids, attn_mask, seg_ids, num_layers=4, T=120):
     """Run the module-level ``bert_model`` and return averaged embeddings for real tokens.

     Args:
         token_ids: Token id tensor passed to the model as its first argument.
         attn_mask: Attention mask passed as ``attention_mask``; its non-zero
             count decides how many leading embeddings are returned.
         seg_ids: Segment ids passed as ``token_type_ids``.
         num_layers: Size of the trailing slice taken from the last element of
             the model output. Only the first four entries of that slice are
             used, so other values do not change how many layers are averaged.
         T: Forwarded unchanged to ``get_embedding``.

     Returns:
         The list produced by ``get_embedding``, cut to the number of non-zero
         entries in ``attn_mask``.
     """
     bert_model.eval()

     # Inference only: no gradients are tracked.
     with torch.no_grad():
         outputs = bert_model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)

     # The last element of the output is indexed per layer; presumably these are
     # the hidden states enabled by output_hidden_states=True.
     top_layers = outputs[-1][-num_layers:]

     # Drop the leading dimension when it has size 1 (a single-sentence batch).
     first, second, third, fourth = (
         torch.squeeze(top_layers[k], dim=0) for k in range(4)
     )

     averaged = get_embedding(first, second, third, fourth, T)

     # Keep only the positions the attention mask marks as non-zero.
     real_token_count = np.count_nonzero(attn_mask)
     return averaged[:real_token_count]


In [6]:
def _strip_wordpiece_prefix(piece):
    """Return ``piece`` without a leading ``##`` continuation marker, if it has one."""
    if len(piece) > 2 and piece.startswith('##'):
        return piece[2:]
    return piece


def _save_embeddings(path, token_embeddings):
    """Save per-sentence embedding lists to ``path`` as a 1-D object array.

    Sentences have different lengths, so the data is ragged. The object array
    is built explicitly (one slot per sentence) because NumPy >= 1.24 raises
    ``ValueError`` when asked to create a ragged array implicitly. The file
    must be read back with ``np.load(path, allow_pickle=True)``.
    """
    per_sentence = np.empty(len(token_embeddings), dtype=object)
    for idx, sentence_emb in enumerate(token_embeddings):
        per_sentence[idx] = sentence_emb
    np.save(path, per_sentence)


def bert_embedding_individuals(file_name, sentences, tokenizer, bert_model, T=120,
                               output_path='/Users/talhindi/Documents/claim_detection/features/'):
    """Compute one embedding per whitespace token for every sentence and save them.

    Each sentence is split on whitespace, run through
    ``get_individual_token_ids`` and ``get_embedding_from_bert``, and the word
    pieces are then aligned back to the whitespace tokens. A token made of
    several pieces gets the element-wise mean of its piece embeddings.

    Args:
        file_name: Stem of the output file; ``'.bert.npy'`` is appended.
        sentences: Iterable of sentence strings.
        tokenizer: Not used inside this function; kept for call compatibility.
        bert_model: Not used inside this function; kept for call compatibility.
        T: Padded sequence length forwarded to the helper functions.
        output_path: Directory for the ``.npy`` files. It is created if missing.
            The default is the previously hard-coded location.

    Returns:
        A list with one entry per successfully processed sentence; each entry
        is a list with one embedding tensor per whitespace token. If a sentence
        fails, the sentences processed so far are saved to
        ``<file_name>_<sent_id>.bert.npy`` and returned, and the remaining
        sentences are skipped.
    """
    # Create the directory up front so an unusable path fails before the
    # expensive model calls, not at the final save.
    os.makedirs(output_path, exist_ok=True)
    token_embeddings = []

    for sent_id, sentence in enumerate(sentences):
        # Bind everything the error report prints, so the handler cannot hit
        # an unbound name when the failure happens early.
        word_idx, pos = -1, 0
        sent_tokens, tkns, token_list_embedding = [], [], []

        try:
            print('processing sentence: ', sent_id)
            sent_tokens = sentence.split()
            tkns, token_ids, attn_mask, seg_ids = get_individual_token_ids(sentence, T)
            token_list_embedding = get_embedding_from_bert(token_ids, attn_mask, seg_ids, T=T)

            assert tkns[0] == '[CLS]'
            if len(token_list_embedding) < len(tkns):
                raise ValueError(
                    'got %d embeddings for %d word pieces'
                    % (len(token_list_embedding), len(tkns))
                )

            adjusted_token_emb = []
            pos = 1                     # index of the next unconsumed word piece
            sep_index = len(tkns) - 1   # the closing [SEP] is never consumed

            for word_idx, word in enumerate(sent_tokens):
                start = pos
                rebuilt = ''
                # Consume pieces until their concatenation spells the word.
                # This covers words made of any number of pieces.
                while rebuilt != word and pos < sep_index:
                    rebuilt += _strip_wordpiece_prefix(tkns[pos])
                    pos += 1

                if rebuilt != word:
                    raise ValueError(
                        'cannot align token %d (%r) with word pieces %r'
                        % (word_idx, word, tkns[start:pos])
                    )

                pieces = token_list_embedding[start:pos]
                if len(pieces) > 5:
                    print('I have a longer list of wordpieces!')

                # dim=0 averages across pieces and keeps the hidden dimension;
                # without it torch.mean reduces everything to one scalar.
                adjusted_token_emb.append(torch.mean(torch.stack(pieces), dim=0))

            assert tkns[pos] == '[SEP]'
            assert len(sent_tokens) == len(adjusted_token_emb)

        except Exception:
            # Deliberate best effort: report, keep what was computed so far, stop.
            print(word_idx, pos, len(sent_tokens), len(tkns), len(token_list_embedding))
            _save_embeddings(
                os.path.join(output_path, file_name + '_' + str(sent_id) + '.bert.npy'),
                token_embeddings,
            )
            traceback.print_exc()
            return token_embeddings

        token_embeddings.append(adjusted_token_emb)

    _save_embeddings(os.path.join(output_path, file_name + '.bert.npy'), token_embeddings)

    return token_embeddings
 

In [168]:
# Read the training file. Each non-blank line holds two whitespace-separated
# fields (token, label); a blank line ends a sentence.
with open('../data/SG2017_claim/train.txt', 'r') as train_file:
    train = train_file.readlines()

sent_tokens, sentences = [], []
for line in train:
    if line == '\n':
        # Sentence boundary: store the collected tokens as one space-joined string.
        sent = ' '.join(sent_tokens)
        sentences.append(sent)
        sent_tokens = []
    else:
        # Only the token is kept here; the label is not used in this cell.
        token, label = line.rstrip().split()
        sent_tokens.append(token)

# Keep the last sentence when the file does not end with a blank line.
if sent_tokens:
    sentences.append(' '.join(sent_tokens))
    sent_tokens = []

In [193]:
# Only sentences from index 3773 onward are embedded, so the saved
# 'train_claim_emb.bert.npy' covers just that tail of the training set.
# NOTE(review): 3773 is a hard-coded offset, presumably resuming after an
# earlier run stopped at that sentence; confirm, and combine with the earlier
# partial output if the full set is needed.
embeddings = bert_embedding_individuals('train_claim_emb', sentences[3773:], tokenizer, bert_model)

processing sentence:  0
processing sentence:  1
processing sentence:  2
processing sentence:  3
processing sentence:  4
processing sentence:  5
processing sentence:  6
processing sentence:  7
processing sentence:  8
processing sentence:  9
processing sentence:  10
processing sentence:  11
processing sentence:  12
processing sentence:  13
processing sentence:  14
processing sentence:  15
processing sentence:  16
processing sentence:  17
processing sentence:  18
processing sentence:  19
processing sentence:  20
processing sentence:  21
processing sentence:  22
processing sentence:  23
processing sentence:  24
processing sentence:  25
processing sentence:  26
processing sentence:  27
processing sentence:  28
processing sentence:  29
processing sentence:  30
processing sentence:  31
processing sentence:  32
processing sentence:  33
processing sentence:  34
processing sentence:  35
processing sentence:  36
processing sentence:  37
processing sentence:  38
processing sentence:  39
processing

processing sentence:  320
processing sentence:  321
processing sentence:  322
processing sentence:  323
processing sentence:  324
processing sentence:  325
processing sentence:  326
processing sentence:  327
processing sentence:  328
processing sentence:  329
processing sentence:  330
processing sentence:  331
processing sentence:  332
processing sentence:  333
processing sentence:  334
processing sentence:  335
processing sentence:  336
processing sentence:  337
processing sentence:  338
processing sentence:  339
processing sentence:  340
processing sentence:  341
processing sentence:  342
processing sentence:  343
processing sentence:  344
processing sentence:  345
processing sentence:  346
processing sentence:  347
processing sentence:  348
processing sentence:  349
processing sentence:  350
processing sentence:  351
processing sentence:  352
processing sentence:  353
processing sentence:  354
processing sentence:  355
processing sentence:  356
processing sentence:  357
processing s

processing sentence:  634
processing sentence:  635
processing sentence:  636
processing sentence:  637
processing sentence:  638
processing sentence:  639
processing sentence:  640
processing sentence:  641
processing sentence:  642
processing sentence:  643
processing sentence:  644
processing sentence:  645
processing sentence:  646
processing sentence:  647
processing sentence:  648
processing sentence:  649
processing sentence:  650
processing sentence:  651
processing sentence:  652
processing sentence:  653
processing sentence:  654
processing sentence:  655
processing sentence:  656
processing sentence:  657
processing sentence:  658
processing sentence:  659
processing sentence:  660
processing sentence:  661
processing sentence:  662
processing sentence:  663
processing sentence:  664
processing sentence:  665
processing sentence:  666
processing sentence:  667
processing sentence:  668
processing sentence:  669
processing sentence:  670
processing sentence:  671
processing s

processing sentence:  947
processing sentence:  948
processing sentence:  949
processing sentence:  950
processing sentence:  951
I have a longer list of wordpieces!
processing sentence:  952
processing sentence:  953
processing sentence:  954
processing sentence:  955
processing sentence:  956
processing sentence:  957
processing sentence:  958
processing sentence:  959
processing sentence:  960
processing sentence:  961
processing sentence:  962
processing sentence:  963
processing sentence:  964
processing sentence:  965
processing sentence:  966
processing sentence:  967
processing sentence:  968
processing sentence:  969
processing sentence:  970
processing sentence:  971
processing sentence:  972
processing sentence:  973
processing sentence:  974
processing sentence:  975
processing sentence:  976
processing sentence:  977
processing sentence:  978
processing sentence:  979
processing sentence:  980
processing sentence:  981
processing sentence:  982
processing sentence:  983
pr

processing sentence:  1250
processing sentence:  1251
processing sentence:  1252
processing sentence:  1253
processing sentence:  1254
processing sentence:  1255
processing sentence:  1256
processing sentence:  1257
processing sentence:  1258
processing sentence:  1259
processing sentence:  1260
processing sentence:  1261
processing sentence:  1262
processing sentence:  1263
processing sentence:  1264
processing sentence:  1265
processing sentence:  1266
processing sentence:  1267
processing sentence:  1268
processing sentence:  1269
processing sentence:  1270
processing sentence:  1271
processing sentence:  1272
processing sentence:  1273
processing sentence:  1274
processing sentence:  1275
processing sentence:  1276
processing sentence:  1277
processing sentence:  1278
processing sentence:  1279
processing sentence:  1280
processing sentence:  1281
processing sentence:  1282
processing sentence:  1283
processing sentence:  1284
processing sentence:  1285
processing sentence:  1286
p

processing sentence:  1549
processing sentence:  1550
processing sentence:  1551
processing sentence:  1552
processing sentence:  1553
processing sentence:  1554
processing sentence:  1555
processing sentence:  1556
processing sentence:  1557
processing sentence:  1558
processing sentence:  1559
processing sentence:  1560
processing sentence:  1561
processing sentence:  1562
processing sentence:  1563
processing sentence:  1564
processing sentence:  1565
processing sentence:  1566
processing sentence:  1567
processing sentence:  1568
processing sentence:  1569
processing sentence:  1570
processing sentence:  1571
processing sentence:  1572
processing sentence:  1573
processing sentence:  1574
processing sentence:  1575
processing sentence:  1576
processing sentence:  1577
processing sentence:  1578
processing sentence:  1579
processing sentence:  1580
processing sentence:  1581
processing sentence:  1582
processing sentence:  1583
processing sentence:  1584
processing sentence:  1585
p

processing sentence:  1852
processing sentence:  1853
processing sentence:  1854
processing sentence:  1855
processing sentence:  1856
processing sentence:  1857
processing sentence:  1858
processing sentence:  1859
processing sentence:  1860
processing sentence:  1861
processing sentence:  1862
processing sentence:  1863
processing sentence:  1864
processing sentence:  1865




In [195]:
# Read the test file. Each non-blank line holds two whitespace-separated
# fields (token, label); a blank line ends a sentence.
with open('../data/SG2017_claim/test.txt', 'r') as test_file:
    test = test_file.readlines()

sent_tokens, sentences = [], []
for line in test:
    if line == '\n':
        # Sentence boundary: store the collected tokens as one space-joined string.
        sent = ' '.join(sent_tokens)
        sentences.append(sent)
        sent_tokens = []
    else:
        # Only the token is kept here; the label is not used in this cell.
        token, label = line.rstrip().split()
        sent_tokens.append(token)

# Keep the last sentence when the file does not end with a blank line.
if sent_tokens:
    sentences.append(' '.join(sent_tokens))
    sent_tokens = []

In [196]:
# Embed every test sentence. The result is also written to
# 'test_claim_emb.bert.npy' inside the function's output directory.
embeddings = bert_embedding_individuals('test_claim_emb', sentences, tokenizer, bert_model)

processing sentence:  0
processing sentence:  1
processing sentence:  2
processing sentence:  3
processing sentence:  4
processing sentence:  5
processing sentence:  6
processing sentence:  7
processing sentence:  8
processing sentence:  9
I have a longer list of wordpieces!
processing sentence:  10
processing sentence:  11
processing sentence:  12
processing sentence:  13
processing sentence:  14
processing sentence:  15
processing sentence:  16
processing sentence:  17
processing sentence:  18
processing sentence:  19
processing sentence:  20
processing sentence:  21
processing sentence:  22
processing sentence:  23
processing sentence:  24
processing sentence:  25
processing sentence:  26
processing sentence:  27
processing sentence:  28
processing sentence:  29
processing sentence:  30
processing sentence:  31
processing sentence:  32
processing sentence:  33
processing sentence:  34
processing sentence:  35
processing sentence:  36
processing sentence:  37
processing sentence:  38

processing sentence:  318
processing sentence:  319
processing sentence:  320
processing sentence:  321
processing sentence:  322
processing sentence:  323
processing sentence:  324
processing sentence:  325
processing sentence:  326
processing sentence:  327
processing sentence:  328
processing sentence:  329
processing sentence:  330
processing sentence:  331
processing sentence:  332
processing sentence:  333
processing sentence:  334
processing sentence:  335
processing sentence:  336
processing sentence:  337
processing sentence:  338
processing sentence:  339
processing sentence:  340
processing sentence:  341
processing sentence:  342
processing sentence:  343
I have a longer list of wordpieces!
processing sentence:  344
processing sentence:  345
processing sentence:  346
processing sentence:  347
processing sentence:  348
processing sentence:  349
processing sentence:  350
processing sentence:  351
processing sentence:  352
processing sentence:  353
processing sentence:  354
pr

processing sentence:  631
processing sentence:  632
processing sentence:  633
processing sentence:  634
processing sentence:  635
processing sentence:  636
processing sentence:  637
processing sentence:  638
processing sentence:  639
processing sentence:  640
processing sentence:  641
processing sentence:  642
processing sentence:  643
processing sentence:  644
processing sentence:  645
processing sentence:  646
processing sentence:  647
processing sentence:  648
processing sentence:  649
processing sentence:  650
processing sentence:  651
processing sentence:  652
processing sentence:  653
processing sentence:  654
processing sentence:  655
processing sentence:  656
processing sentence:  657
processing sentence:  658
processing sentence:  659
processing sentence:  660
processing sentence:  661
processing sentence:  662
processing sentence:  663
processing sentence:  664
processing sentence:  665
processing sentence:  666
processing sentence:  667
processing sentence:  668
processing s

processing sentence:  945
processing sentence:  946
processing sentence:  947
processing sentence:  948
processing sentence:  949
processing sentence:  950
processing sentence:  951
processing sentence:  952
processing sentence:  953
processing sentence:  954
processing sentence:  955
processing sentence:  956
processing sentence:  957
processing sentence:  958
processing sentence:  959
processing sentence:  960
processing sentence:  961
processing sentence:  962
processing sentence:  963
processing sentence:  964
processing sentence:  965
processing sentence:  966
processing sentence:  967
processing sentence:  968
processing sentence:  969
processing sentence:  970
processing sentence:  971
processing sentence:  972
processing sentence:  973
processing sentence:  974
processing sentence:  975
processing sentence:  976
processing sentence:  977
processing sentence:  978
processing sentence:  979
processing sentence:  980
processing sentence:  981
processing sentence:  982
processing s

processing sentence:  1250
processing sentence:  1251
processing sentence:  1252
processing sentence:  1253
processing sentence:  1254
processing sentence:  1255
processing sentence:  1256
processing sentence:  1257
processing sentence:  1258
processing sentence:  1259
processing sentence:  1260
processing sentence:  1261
processing sentence:  1262
processing sentence:  1263
processing sentence:  1264
processing sentence:  1265
processing sentence:  1266
processing sentence:  1267
processing sentence:  1268
processing sentence:  1269
processing sentence:  1270
processing sentence:  1271
processing sentence:  1272
processing sentence:  1273
processing sentence:  1274
processing sentence:  1275
processing sentence:  1276
processing sentence:  1277
processing sentence:  1278
processing sentence:  1279
processing sentence:  1280
processing sentence:  1281
processing sentence:  1282
processing sentence:  1283
processing sentence:  1284
processing sentence:  1285
processing sentence:  1286
p



In [8]:
# Read the arg_clean_45_1 test file. Each non-blank line holds two
# whitespace-separated fields (token, label); a blank line ends a sentence.
with open('../../data_wm/arg_clean_45_1/test.txt', 'r') as test_file:
    test = test_file.readlines()

sent_tokens, sentences = [], []
for line in test:
    if line == '\n':
        # Sentence boundary: store the collected tokens as one space-joined string.
        sent = ' '.join(sent_tokens)
        sentences.append(sent)
        sent_tokens = []
    else:
        token, label = line.rstrip().split()
        # Tokens of 25+ characters and tokens containing 'www' are dropped.
        # NOTE(review): the sentence then has fewer tokens than the file has
        # labels; verify that downstream label alignment accounts for this.
        if len(token) < 25 and 'www' not in token:
            sent_tokens.append(token)

# Keep the last sentence when the file does not end with a blank line.
if sent_tokens:
    sentences.append(' '.join(sent_tokens))
    sent_tokens = []

In [14]:
# Only sentences from index 546 onward are embedded, so the saved
# 'wm1_claim_emb.bert.npy' covers just that tail of the sentence list.
# NOTE(review): 546 is a hard-coded offset, presumably resuming after an
# earlier run stopped at that sentence; confirm before using the output.
embeddings = bert_embedding_individuals('wm1_claim_emb', sentences[546:], tokenizer, bert_model)

processing sentence:  0
I have a longer list of wordpieces!
processing sentence:  1
processing sentence:  2
processing sentence:  3
processing sentence:  4
processing sentence:  5
processing sentence:  6
processing sentence:  7
processing sentence:  8
processing sentence:  9
processing sentence:  10
processing sentence:  11
processing sentence:  12
processing sentence:  13
processing sentence:  14
processing sentence:  15
processing sentence:  16
processing sentence:  17
processing sentence:  18
processing sentence:  19
processing sentence:  20
processing sentence:  21
processing sentence:  22
processing sentence:  23
processing sentence:  24
processing sentence:  25
processing sentence:  26
processing sentence:  27
processing sentence:  28
processing sentence:  29
processing sentence:  30
processing sentence:  31
processing sentence:  32
processing sentence:  33
processing sentence:  34
processing sentence:  35
processing sentence:  36
processing sentence:  37
processing sentence:  38

processing sentence:  314
processing sentence:  315
processing sentence:  316
processing sentence:  317
processing sentence:  318
processing sentence:  319
processing sentence:  320
processing sentence:  321
processing sentence:  322
processing sentence:  323
processing sentence:  324
processing sentence:  325
processing sentence:  326
processing sentence:  327
processing sentence:  328
processing sentence:  329
processing sentence:  330
processing sentence:  331
processing sentence:  332
processing sentence:  333
processing sentence:  334
processing sentence:  335
processing sentence:  336
processing sentence:  337
processing sentence:  338
processing sentence:  339
processing sentence:  340
processing sentence:  341
processing sentence:  342
processing sentence:  343
processing sentence:  344
processing sentence:  345
processing sentence:  346
processing sentence:  347
processing sentence:  348
processing sentence:  349
processing sentence:  350
processing sentence:  351
processing s

processing sentence:  627
processing sentence:  628
processing sentence:  629
processing sentence:  630
processing sentence:  631
processing sentence:  632
processing sentence:  633
processing sentence:  634
processing sentence:  635
processing sentence:  636
processing sentence:  637
processing sentence:  638
processing sentence:  639
processing sentence:  640
processing sentence:  641
processing sentence:  642
processing sentence:  643
processing sentence:  644
processing sentence:  645
processing sentence:  646
processing sentence:  647
processing sentence:  648
processing sentence:  649
processing sentence:  650
processing sentence:  651
processing sentence:  652
processing sentence:  653
processing sentence:  654
processing sentence:  655
processing sentence:  656
processing sentence:  657
processing sentence:  658
processing sentence:  659
processing sentence:  660
processing sentence:  661
processing sentence:  662
I have a longer list of wordpieces!
processing sentence:  663
pr



In [7]:
# Read the arg_clean_45_2 training file. Each non-blank line holds two
# whitespace-separated fields (token, label); a blank line ends a sentence.
with open('../../data_wm/arg_clean_45_2/train.txt', 'r') as train_wm_file:
    train_wm = train_wm_file.readlines()

sent_tokens, sentences = [], []
for line in train_wm:
    if line == '\n':
        # Sentence boundary: store the collected tokens as one space-joined string.
        sent = ' '.join(sent_tokens)
        sentences.append(sent)
        sent_tokens = []
    else:
        token, label = line.rstrip().split()
        # Tokens of 25+ characters and tokens containing 'www' are dropped.
        # NOTE(review): the sentence then has fewer tokens than the file has
        # labels; verify that downstream label alignment accounts for this.
        if len(token) < 25 and 'www' not in token:
            sent_tokens.append(token)

# Keep the last sentence when the file does not end with a blank line.
if sent_tokens:
    sentences.append(' '.join(sent_tokens))
    sent_tokens = []

In [8]:
# Embed every sentence read from the arg_clean_45_2 file. The result is also
# written to 'wm2_claim_emb.bert.npy' inside the function's output directory.
embeddings = bert_embedding_individuals('wm2_claim_emb', sentences, tokenizer, bert_model)

processing sentence:  0
processing sentence:  1
processing sentence:  2
processing sentence:  3
processing sentence:  4
processing sentence:  5
processing sentence:  6
processing sentence:  7
processing sentence:  8
processing sentence:  9
processing sentence:  10
processing sentence:  11
processing sentence:  12
processing sentence:  13
processing sentence:  14
processing sentence:  15
processing sentence:  16
processing sentence:  17
processing sentence:  18
processing sentence:  19
processing sentence:  20
processing sentence:  21
processing sentence:  22
processing sentence:  23
processing sentence:  24
processing sentence:  25
processing sentence:  26
processing sentence:  27
processing sentence:  28
processing sentence:  29
processing sentence:  30
processing sentence:  31
processing sentence:  32
processing sentence:  33
processing sentence:  34
processing sentence:  35
processing sentence:  36
processing sentence:  37
processing sentence:  38
processing sentence:  39
processing

processing sentence:  320
processing sentence:  321
processing sentence:  322
processing sentence:  323
processing sentence:  324
processing sentence:  325
processing sentence:  326
processing sentence:  327
processing sentence:  328
processing sentence:  329
processing sentence:  330
processing sentence:  331
processing sentence:  332
processing sentence:  333
processing sentence:  334
processing sentence:  335
processing sentence:  336
processing sentence:  337
processing sentence:  338
processing sentence:  339
processing sentence:  340
processing sentence:  341
processing sentence:  342
processing sentence:  343
processing sentence:  344
processing sentence:  345
processing sentence:  346
processing sentence:  347
processing sentence:  348
processing sentence:  349
processing sentence:  350
processing sentence:  351
processing sentence:  352
processing sentence:  353
processing sentence:  354
processing sentence:  355
processing sentence:  356
processing sentence:  357
processing s

processing sentence:  636
processing sentence:  637
processing sentence:  638
processing sentence:  639
processing sentence:  640
processing sentence:  641
processing sentence:  642
processing sentence:  643
processing sentence:  644
processing sentence:  645
processing sentence:  646
processing sentence:  647
processing sentence:  648
processing sentence:  649
processing sentence:  650
processing sentence:  651
processing sentence:  652
processing sentence:  653
processing sentence:  654
processing sentence:  655
processing sentence:  656
processing sentence:  657
processing sentence:  658
processing sentence:  659
processing sentence:  660
processing sentence:  661
processing sentence:  662
processing sentence:  663
processing sentence:  664
processing sentence:  665
processing sentence:  666
processing sentence:  667
processing sentence:  668
processing sentence:  669
processing sentence:  670
processing sentence:  671
processing sentence:  672
processing sentence:  673
processing s

processing sentence:  950
processing sentence:  951
processing sentence:  952
processing sentence:  953
processing sentence:  954
processing sentence:  955
processing sentence:  956
processing sentence:  957
processing sentence:  958
processing sentence:  959
processing sentence:  960
processing sentence:  961
processing sentence:  962
processing sentence:  963
processing sentence:  964
processing sentence:  965
processing sentence:  966
processing sentence:  967
processing sentence:  968
processing sentence:  969
processing sentence:  970
processing sentence:  971
processing sentence:  972
processing sentence:  973
processing sentence:  974
processing sentence:  975
processing sentence:  976
processing sentence:  977
processing sentence:  978
processing sentence:  979
processing sentence:  980
processing sentence:  981
processing sentence:  982
processing sentence:  983
processing sentence:  984
processing sentence:  985
processing sentence:  986
processing sentence:  987
processing s

processing sentence:  1256
processing sentence:  1257
processing sentence:  1258
processing sentence:  1259
processing sentence:  1260
processing sentence:  1261
processing sentence:  1262
processing sentence:  1263
processing sentence:  1264
processing sentence:  1265
processing sentence:  1266
processing sentence:  1267
processing sentence:  1268
processing sentence:  1269
processing sentence:  1270
processing sentence:  1271
processing sentence:  1272
processing sentence:  1273
processing sentence:  1274
processing sentence:  1275
processing sentence:  1276
processing sentence:  1277
processing sentence:  1278
processing sentence:  1279
processing sentence:  1280
processing sentence:  1281
processing sentence:  1282
processing sentence:  1283
processing sentence:  1284
processing sentence:  1285
processing sentence:  1286
processing sentence:  1287
processing sentence:  1288
processing sentence:  1289
processing sentence:  1290
processing sentence:  1291
processing sentence:  1292
p

processing sentence:  1553
processing sentence:  1554
processing sentence:  1555
processing sentence:  1556
processing sentence:  1557
processing sentence:  1558
processing sentence:  1559
processing sentence:  1560
processing sentence:  1561
processing sentence:  1562
processing sentence:  1563
processing sentence:  1564
processing sentence:  1565
processing sentence:  1566
processing sentence:  1567
processing sentence:  1568
processing sentence:  1569
processing sentence:  1570
processing sentence:  1571
processing sentence:  1572
processing sentence:  1573
processing sentence:  1574
processing sentence:  1575
processing sentence:  1576
processing sentence:  1577
processing sentence:  1578
processing sentence:  1579
processing sentence:  1580
processing sentence:  1581
processing sentence:  1582
processing sentence:  1583
processing sentence:  1584
processing sentence:  1585
processing sentence:  1586
processing sentence:  1587
processing sentence:  1588
processing sentence:  1589
p

processing sentence:  1857
processing sentence:  1858
processing sentence:  1859
processing sentence:  1860
processing sentence:  1861




In [7]:
# Read the wm_narrative test file. Each non-blank line holds two
# whitespace-separated fields (token, label); a blank line ends a sentence.
with open('../../data_wm/wm_narrative/test.txt', 'r') as narrative_file:
    narrative = narrative_file.readlines()

sent_tokens, sentences = [], []
for line in narrative:
    if line == '\n':
        # Sentence boundary: store the collected tokens as one space-joined string.
        sent = ' '.join(sent_tokens)
        sentences.append(sent)
        sent_tokens = []
    else:
        token, label = line.rstrip().split()
        # Tokens of 25+ characters and tokens containing 'www' are dropped.
        # NOTE(review): the sentence then has fewer tokens than the file has
        # labels; verify that downstream label alignment accounts for this.
        if len(token) < 25 and 'www' not in token:
            sent_tokens.append(token)

# Keep the last sentence when the file does not end with a blank line.
if sent_tokens:
    sentences.append(' '.join(sent_tokens))
    sent_tokens = []

In [8]:
# Embed every sentence read from the wm_narrative file. The result is also
# written to 'narrative.bert.npy' inside the function's output directory.
embeddings = bert_embedding_individuals('narrative', sentences, tokenizer, bert_model)

processing sentence:  0
processing sentence:  1
processing sentence:  2
processing sentence:  3
processing sentence:  4
processing sentence:  5
processing sentence:  6
processing sentence:  7
processing sentence:  8
processing sentence:  9
processing sentence:  10
processing sentence:  11
processing sentence:  12
processing sentence:  13
processing sentence:  14
processing sentence:  15
processing sentence:  16
processing sentence:  17
processing sentence:  18
processing sentence:  19
processing sentence:  20
processing sentence:  21
processing sentence:  22
processing sentence:  23
processing sentence:  24
processing sentence:  25
processing sentence:  26
processing sentence:  27
processing sentence:  28
processing sentence:  29
processing sentence:  30
processing sentence:  31
processing sentence:  32
processing sentence:  33
processing sentence:  34
processing sentence:  35
processing sentence:  36
processing sentence:  37
processing sentence:  38
processing sentence:  39
processing

processing sentence:  318
processing sentence:  319
processing sentence:  320
processing sentence:  321
processing sentence:  322
processing sentence:  323
processing sentence:  324
processing sentence:  325
processing sentence:  326
processing sentence:  327
processing sentence:  328
processing sentence:  329
processing sentence:  330
processing sentence:  331
processing sentence:  332
processing sentence:  333
processing sentence:  334
processing sentence:  335
processing sentence:  336
processing sentence:  337
processing sentence:  338
processing sentence:  339
processing sentence:  340
processing sentence:  341
processing sentence:  342
processing sentence:  343
processing sentence:  344
processing sentence:  345
processing sentence:  346
processing sentence:  347
processing sentence:  348
processing sentence:  349
processing sentence:  350
processing sentence:  351
processing sentence:  352
processing sentence:  353
processing sentence:  354
processing sentence:  355
processing s

processing sentence:  632
processing sentence:  633
processing sentence:  634
processing sentence:  635
processing sentence:  636
processing sentence:  637
processing sentence:  638
processing sentence:  639
processing sentence:  640
processing sentence:  641
processing sentence:  642
processing sentence:  643
processing sentence:  644
processing sentence:  645
processing sentence:  646
processing sentence:  647
processing sentence:  648
processing sentence:  649
processing sentence:  650
processing sentence:  651
processing sentence:  652
processing sentence:  653
processing sentence:  654
processing sentence:  655
processing sentence:  656
processing sentence:  657
processing sentence:  658
processing sentence:  659
processing sentence:  660
processing sentence:  661
processing sentence:  662
processing sentence:  663
processing sentence:  664
processing sentence:  665
processing sentence:  666
processing sentence:  667
processing sentence:  668
processing sentence:  669
processing s

processing sentence:  949
processing sentence:  950
processing sentence:  951
processing sentence:  952
processing sentence:  953
processing sentence:  954
processing sentence:  955
processing sentence:  956
processing sentence:  957
processing sentence:  958
processing sentence:  959
processing sentence:  960
processing sentence:  961
processing sentence:  962
processing sentence:  963
processing sentence:  964
processing sentence:  965
processing sentence:  966
processing sentence:  967
processing sentence:  968
processing sentence:  969
processing sentence:  970
processing sentence:  971
processing sentence:  972
processing sentence:  973
processing sentence:  974
processing sentence:  975
processing sentence:  976
processing sentence:  977
processing sentence:  978
processing sentence:  979
processing sentence:  980
processing sentence:  981
processing sentence:  982
processing sentence:  983
processing sentence:  984
processing sentence:  985
processing sentence:  986
processing s

processing sentence:  1253
processing sentence:  1254
processing sentence:  1255
processing sentence:  1256
processing sentence:  1257
processing sentence:  1258
processing sentence:  1259
processing sentence:  1260
processing sentence:  1261
processing sentence:  1262
processing sentence:  1263
processing sentence:  1264
processing sentence:  1265
processing sentence:  1266
processing sentence:  1267
processing sentence:  1268
processing sentence:  1269
processing sentence:  1270
processing sentence:  1271
processing sentence:  1272
processing sentence:  1273
processing sentence:  1274
processing sentence:  1275
processing sentence:  1276
processing sentence:  1277
processing sentence:  1278
processing sentence:  1279
processing sentence:  1280
processing sentence:  1281
processing sentence:  1282
processing sentence:  1283
processing sentence:  1284
processing sentence:  1285
processing sentence:  1286
processing sentence:  1287
processing sentence:  1288
processing sentence:  1289
p

